From d24dacf68e9607e81c258a21872ab9d1539c8743 Mon Sep 17 00:00:00 2001
From: Elias Bermudez
Date: Fri, 31 Oct 2025 16:14:01 -0700
Subject: [PATCH 1/2] Initial automation for cli options

---
 .pre-commit-config.yaml          |   6 +
 Makefile                         |  12 +-
 docs/cli_options.md              | 563 +++++++++++++++++++++----------
 tools/generate_cli_options_md.py | 197 +++++++++++
 4 files changed, 607 insertions(+), 171 deletions(-)
 create mode 100755 tools/generate_cli_options_md.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ccff7e889..ad5960766 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -42,6 +42,12 @@ repos:
         types: [python]
         pass_filenames: false
         args: []
+      - id: update-cli-docs
+        name: update-cli-docs
+        entry: python tools/generate_cli_options_md.py
+        language: python
+        pass_filenames: false
+        files: ^(src/aiperf/cli.*\.py|src/aiperf/common/config/.*\.py|docs/cli_options\.md)$
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.11.8
     hooks:

diff --git a/Makefile b/Makefile
index 38e7d6d19..841e194c4 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,8 @@
 test-verbose init-files setup-venv setup-mkinit install-mock-server \
 integration-tests integration-tests-ci integration-tests-verbose \
 test-integration test-integration-ci test-integration-verbose \
- test-stress stress-tests internal-help help
+ test-stress stress-tests internal-help help \
+ update-cli-docs check-cli-docs
 
 # Include user-defined environment variables
@@ -95,6 +96,15 @@ internal-help:
 init-files: #? run mkinit to generate the __init__.py files.
 	$(activate_venv) && tools/generate_init_files.sh
 
+update-cli-docs: #? regenerate docs/cli_options.md from aiperf profile --help.
+	@printf "$(bold)$(green)Regenerating CLI options documentation...$(reset)\n"
+	$(activate_venv) && python tools/generate_cli_options_md.py
+	@printf "$(bold)$(green)Done! docs/cli_options.md has been updated.$(reset)\n"
+
+check-cli-docs: #? check if docs/cli_options.md is in sync with aiperf profile --help.
+	@printf "$(bold)$(blue)Checking if CLI options documentation is up to date...$(reset)\n"
+	$(activate_venv) && python tools/generate_cli_options_md.py --check
+
 ruff lint: #? run the ruff linters
 	$(activate_venv) && ruff check . $(args)

diff --git a/docs/cli_options.md b/docs/cli_options.md
index 238e0e091..7705ec904 100644
--- a/docs/cli_options.md
+++ b/docs/cli_options.md
@@ -1,5 +1,5 @@
@@ -7,173 +7,396 @@ SPDX-License-Identifier: Apache-2.0
 Use these options to profile with AIPerf.
 
 ```
-╭─ Endpoint ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ * MODEL-NAMES --model-names --model -m                Model name(s) to be benchmarked. Can be a comma-separated list or a single model name. [required] │
-│   MODEL-SELECTION-STRATEGY --model-selection-strategy When multiple models are specified, this is how a specific model should be assigned to a prompt. round_robin: nth prompt │
-│                                                       in the list gets assigned to n-mod len(models). random: assignment is uniformly random [choices: round-robin, random] │
-│                                                       [default: round-robin] │
-│   CUSTOM-ENDPOINT --custom-endpoint --endpoint        Set a custom endpoint that differs from the OpenAI defaults. │
-│   ENDPOINT-TYPE --endpoint-type                       The endpoint type to send requests to on the server. [choices: chat, completions, embeddings, rankings] │
-│                                                       [default: chat] │
-│   STREAMING --streaming                               An option to enable the use of the streaming API.
[default: False] │ -│ URL --url -u URL of the endpoint to target for benchmarking. [default: localhost:8000] │ -│ REQUEST-TIMEOUT-SECONDS --request-timeout-seconds The timeout in floating-point seconds for each request to the endpoint. [default: 600.0] │ -│ API-KEY --api-key The API key to use for the endpoint. If provided, it will be sent with every request as a header: Authorization: Bearer │ -│ . │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ EXTRA-INPUTS --extra-inputs Provide additional inputs to include with every request. Inputs should be in an 'input_name:value' │ -│ format. Alternatively, a string representing a json formatted dict can be provided. [default: []] │ -│ HEADER --header -H Adds a custom header to the requests. Headers must be specified as 'Header:Value' pairs. Alternatively, │ -│ a string representing a json formatted dict can be provided. [default: []] │ -│ INPUT-FILE --input-file The file or directory path that contains the dataset to use for profiling. This parameter is used in │ -│ conjunction with the custom_dataset_type parameter to support different types of user provided │ -│ datasets. │ -│ FIXED-SCHEDULE --fixed-schedule Specifies to run a fixed schedule of requests. This is normally inferred from the --input-file │ -│ parameter, but can be set manually here. [default: False] │ -│ FIXED-SCHEDULE-AUTO-OFFSET --fixed-schedule-auto-offset Specifies to automatically offset the timestamps in the fixed schedule, such that the first timestamp │ -│ is considered 0, and the rest are shifted accordingly. If disabled, the timestamps will be assumed to │ -│ be relative to 0. [default: False] │ -│ FIXED-SCHEDULE-START-OFFSET --fixed-schedule-start-offset Specifies the offset in milliseconds to start the fixed schedule at. By default, the schedule starts at │ -│ 0, but this option can be used to start at a reference point further in the schedule. This option │ -│ cannot be used in conjunction with the --fixed-schedule-auto-offset. The schedule will include any │ -│ requests at the start offset. │ -│ FIXED-SCHEDULE-END-OFFSET --fixed-schedule-end-offset Specifies the offset in milliseconds to end the fixed schedule at. By default, the schedule ends at the │ -│ last timestamp in the trace dataset, but this option can be used to only run a subset of the trace. The │ -│ schedule will include any requests at the end offset. │ -│ PUBLIC-DATASET --public-dataset The public dataset to use for the requests. [choices: sharegpt] │ -│ CUSTOM-DATASET-TYPE --custom-dataset-type The type of custom dataset to use. This parameter is used in conjunction with the --input-file │ -│ parameter. [choices: single_turn, multi_turn, random_pool, mooncake_trace] │ -│ DATASET-SAMPLING-STRATEGY --dataset-sampling-strategy The strategy to use for sampling the dataset. sequential: Iterate through the dataset sequentially, │ -│ then wrap around to the beginning. random: Randomly select a conversation from the dataset. Will │ -│ randomly sample with replacement. shuffle: Shuffle the dataset and iterate through it. Will randomly │ -│ sample without replacement. Once the end of the dataset is reached, shuffle the dataset again and start │ -│ over. 
[choices: sequential, random, shuffle] │ -│ RANDOM-SEED --random-seed The seed used to generate random values. Set to some value to make the synthetic data generation │ -│ deterministic. It will use system default if not provided. │ -│ GOODPUT --goodput Specify service level objectives (SLOs) for goodput as space-separated 'KEY:VALUE' pairs, where KEY is │ -│ a metric tag and VALUE is a number in the metric’s display unit (falls back to its base unit if no │ -│ display unit is defined). Examples: 'request_latency:250' (ms), 'inter_token_latency:10' (ms), │ -│ output_token_throughput_per_user:600 (tokens/s). Only metrics applicable to the current endpoint/config │ -│ are considered. For more context on the definition of goodput, refer to DistServe paper: │ -│ https://arxiv.org/pdf/2401.09670 and the blog: https://hao-ai-lab.github.io/blogs/distserve │ -╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ OUTPUT-ARTIFACT-DIR --output-artifact-dir --artifact-dir The directory to store all the (output) artifacts generated by AIPerf. [default: artifacts] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Tokenizer ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ TOKENIZER --tokenizer The Hugging Face tokenizer to use to interpret token metrics from prompts and responses. The value can be the name of a │ -│ tokenizer or the filepath of the tokenizer. The default value is the model name. │ -│ TOKENIZER-REVISION --tokenizer-revision The specific model version to use. It can be a branch name, tag name, or commit ID. [default: main] │ -│ TOKENIZER-TRUST-REMOTE-CODE --tokenizer-trust-remote-code Allows custom tokenizer to be downloaded and executed. This carries security risks and should only be used for │ -│ repositories you trust. This is only necessary for custom tokenizers stored in Hugging Face Hub. [default: False] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Load Generator ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ BENCHMARK-DURATION --benchmark-duration The duration in seconds for benchmarking. │ -│ BENCHMARK-GRACE-PERIOD --benchmark-grace-period The grace period in seconds to wait for responses after benchmark duration ends. Only applies when │ -│ --benchmark-duration is set. Responses received within this period are included in metrics. [default: 30.0] │ -│ CONCURRENCY --concurrency The concurrency value to benchmark. │ -│ REQUEST-RATE --request-rate Sets the request rate for the load generated by AIPerf. Unit: requests/second │ -│ REQUEST-RATE-MODE --request-rate-mode Sets the request rate mode for the load generated by AIPerf. Valid values: constant, poisson. constant: Generate │ -│ requests at a fixed rate. 
poisson: Generate requests using a poisson distribution. [default: poisson] │ -│ REQUEST-COUNT --request-count --num-requests The number of requests to use for measurement. [default: 10] │ -│ WARMUP-REQUEST-COUNT --warmup-request-count --num-warmup-requests The number of warmup requests to send before benchmarking. [default: 0] │ -│ REQUEST-CANCELLATION-RATE --request-cancellation-rate The percentage of requests to cancel. [default: 0.0] │ -│ REQUEST-CANCELLATION-DELAY --request-cancellation-delay The delay in seconds before cancelling requests. This is used when --request-cancellation-rate is greater than 0. │ -│ [default: 0.0] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Conversation Input ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ CONVERSATION-NUM --conversation-num --num-conversations --num-sessions The total number of unique conversations to generate. Each conversation represents a single request session between client and server. Supported on synthetic mode and │ -│ the custom random_pool dataset. The number of conversations will be used to determine the number of entries in both the custom random_pool and synthetic datasets and │ -│ will be reused until benchmarking is complete. │ -│ NUM-DATASET-ENTRIES --num-dataset-entries --num-prompts The total number of unique dataset entries to generate for the dataset. Each entry represents a single turn used in a request. [default: 100] │ -│ CONVERSATION-TURN-MEAN --conversation-turn-mean --session-turns-mean The mean number of turns within a conversation. [default: 1] │ -│ CONVERSATION-TURN-STDDEV --conversation-turn-stddev --session-turns-stddev The standard deviation of the number of turns within a conversation. [default: 0] │ -│ CONVERSATION-TURN-DELAY-MEAN --conversation-turn-delay-mean --session-turn-delay-mean The mean delay between turns within a conversation in milliseconds. [default: 0.0] │ -│ CONVERSATION-TURN-DELAY-STDDEV --conversation-turn-delay-stddev --session-turn-delay-stddev The standard deviation of the delay between turns within a conversation in milliseconds. [default: 0.0] │ -│ CONVERSATION-TURN-DELAY-RATIO --conversation-turn-delay-ratio --session-delay-ratio A ratio to scale multi-turn delays. [default: 1.0] │ -╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input Sequence Length (ISL) ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-INPUT-TOKENS-MEAN --prompt-input-tokens-mean The mean of number of tokens in the generated prompts when using synthetic data. [default: 550] │ -│ --synthetic-input-tokens-mean --isl │ -│ PROMPT-INPUT-TOKENS-STDDEV --prompt-input-tokens-stddev The standard deviation of number of tokens in the generated prompts when using synthetic data. 
[default: 0.0] │ -│ --synthetic-input-tokens-stddev --isl-stddev │ -│ PROMPT-INPUT-TOKENS-BLOCK-SIZE --prompt-input-tokens-block-size The block size of the prompt. [default: 512] │ -│ --synthetic-input-tokens-block-size --isl-block-size │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output Sequence Length (OSL) ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-OUTPUT-TOKENS-MEAN --prompt-output-tokens-mean The mean number of tokens in each output. │ -│ --output-tokens-mean --osl │ -│ PROMPT-OUTPUT-TOKENS-STDDEV --prompt-output-tokens-stddev The standard deviation of the number of tokens in each output. [default: 0] │ -│ --output-tokens-stddev --osl-stddev │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prompt ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-BATCH-SIZE --prompt-batch-size --batch-size-text -b The batch size of text requests AIPerf should send. This is currently supported with the embeddings and rankings │ -│ --batch-size endpoint types [default: 1] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prefix Prompt ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-PREFIX-POOL-SIZE --prompt-prefix-pool-size The total size of the prefix prompt pool to select prefixes from. If this value is not zero, these are prompts that are prepended │ -│ --prefix-prompt-pool-size --num-prefix-prompts to input prompts. This is useful for benchmarking models that use a K-V cache. [default: 0] │ -│ PROMPT-PREFIX-LENGTH --prompt-prefix-length The number of tokens in each prefix prompt. This is only used if "num" is greater than zero. Note that due to the prefix and user │ -│ --prefix-prompt-length prompts being concatenated, the number of tokens in the final prompt may be off by one. [default: 0] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Audio Input ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ AUDIO-BATCH-SIZE --audio-batch-size --batch-size-audio The batch size of audio requests AIPerf should send. This is currently supported with the OpenAI chat endpoint type [default: │ -│ 1] │ -│ AUDIO-LENGTH-MEAN --audio-length-mean The mean length of the audio in seconds. [default: 0.0] │ -│ AUDIO-LENGTH-STDDEV --audio-length-stddev The standard deviation of the length of the audio in seconds. [default: 0.0] │ -│ AUDIO-FORMAT --audio-format The format of the audio files (wav or mp3). 
[choices: wav, mp3] [default: wav] │ -│ AUDIO-DEPTHS --audio-depths A list of audio bit depths to randomly select from in bits. [default: [16]] │ -│ AUDIO-SAMPLE-RATES --audio-sample-rates A list of audio sample rates to randomly select from in kHz. Common sample rates are 16, 44.1, 48, 96, etc. [default: [16.0]] │ -│ AUDIO-NUM-CHANNELS --audio-num-channels The number of audio channels to use for the audio data generation. [default: 1] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Image Input ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ IMAGE-WIDTH-MEAN --image-width-mean The mean width of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-WIDTH-STDDEV --image-width-stddev The standard deviation of width of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-MEAN --image-height-mean The mean height of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-STDDEV --image-height-stddev The standard deviation of height of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-BATCH-SIZE --image-batch-size --batch-size-image The image batch size of the requests AIPerf should send. [default: 1] │ -│ IMAGE-FORMAT --image-format The compression format of the images. [choices: png, jpeg, random] [default: png] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Service ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ LOG-LEVEL --log-level Logging level [choices: trace, debug, info, notice, warning, success, error, critical] [default: info] │ -│ VERBOSE --verbose -v Equivalent to --log-level DEBUG. Enables more verbose logging output, but lacks some raw message logging. │ -│ [default: False] │ -│ EXTRA-VERBOSE --extra-verbose -vv Equivalent to --log-level TRACE. Enables the most verbose logging output possible. [default: False] │ -│ RECORD-PROCESSOR-SERVICE-COUNT --record-processor-service-count Number of services to spawn for processing records. The higher the request rate, the more services should be │ -│ --record-processors spawned in order to keep up with the incoming records. If not specified, the number of services will be │ -│ automatically determined based on the worker count. │ -│ UI-TYPE --ui-type --ui Type of UI to use [choices: dashboard, simple, none] [default: dashboard] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Workers ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ WORKERS-MAX --workers-max --max-workers Maximum number of workers to create. If not specified, the number of workers will be determined by the formula │ -│ min(concurrency, (num CPUs * 0.75) - 1), with a default max cap of 32. 
Any value provided will still be capped by the │ -│ concurrency value (if specified), but not by the max cap. │ -╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Endpoint ───────────────────────────────────────────────────────────────────╮ +│ * MODEL-NAMES --model-names -m Model name(s) to be benchmarked. Can be │ +│ --model a comma-separated list or a single │ +│ model name. [required] │ +│ MODEL-SELECTION-STRATEGY When multiple models are specified, │ +│ --model-selection-strategy this is how a specific model should be │ +│ assigned to a prompt. round_robin: nth │ +│ prompt in the list gets assigned to │ +│ n-mod len(models). random: assignment │ +│ is uniformly random [choices: │ +│ round-robin, random] [default: │ +│ round-robin] │ +│ CUSTOM-ENDPOINT Set a custom endpoint that differs from │ +│ --custom-endpoint the OpenAI defaults. │ +│ --endpoint │ +│ ENDPOINT-TYPE The endpoint type to send requests to │ +│ --endpoint-type on the server. [choices: chat, │ +│ completions, cohere-rankings, │ +│ embeddings, hf-tei-rankings, │ +│ huggingface-generate, nim-rankings, │ +│ solido-rag, template] [default: chat] │ +│ STREAMING --streaming An option to enable the use of the │ +│ streaming API. [default: False] │ +│ URL --url -u URL of the endpoint to target for │ +│ benchmarking. [default: localhost:8000] │ +│ REQUEST-TIMEOUT-SECONDS The timeout in floating-point seconds │ +│ --request-timeout-seconds for each request to the endpoint. │ +│ [default: 600.0] │ +│ API-KEY --api-key The API key to use for the endpoint. If │ +│ provided, it will be sent with every │ +│ request as a header: Authorization: │ +│ Bearer . │ +│ TRANSPORT --transport The transport to use for the endpoint. │ +│ --transport-type If not provided, it will be │ +│ auto-detected from the URL.This can │ +│ also be used to force an alternative │ +│ transport or implementation. [choices: │ +│ http] │ +╰──────────────────────────────────────────────────────────────────────────────╯ ``` +``` +╭─ Input ──────────────────────────────────────────────────────────────────────╮ +│ EXTRA-INPUTS --extra-inputs Provide additional inputs to include with │ +│ every request. Inputs should be in an │ +│ 'input_name:value' format. Alternatively, │ +│ a string representing a json formatted │ +│ dict can be provided. [default: []] │ +│ HEADER --header -H Adds a custom header to the requests. │ +│ Headers must be specified as │ +│ 'Header:Value' pairs. Alternatively, a │ +│ string representing a json formatted dict │ +│ can be provided. [default: []] │ +│ INPUT-FILE --input-file The file or directory path that contains │ +│ the dataset to use for profiling. This │ +│ parameter is used in conjunction with the │ +│ custom_dataset_type parameter to support │ +│ different types of user provided datasets. │ +│ FIXED-SCHEDULE Specifies to run a fixed schedule of │ +│ --fixed-schedule requests. This is normally inferred from │ +│ the --input-file parameter, but can be set │ +│ manually here. [default: False] │ +│ FIXED-SCHEDULE-AUTO-OFFSET Specifies to automatically offset the │ +│ --fixed-schedule-auto-offs timestamps in the fixed schedule, such │ +│ et that the first timestamp is considered 0, │ +│ and the rest are shifted accordingly. If │ +│ disabled, the timestamps will be assumed │ +│ to be relative to 0. 
[default: False] │ +│ FIXED-SCHEDULE-START-OFFSET Specifies the offset in milliseconds to │ +│ --fixed-schedule-start-off start the fixed schedule at. By default, │ +│ set the schedule starts at 0, but this option │ +│ can be used to start at a reference point │ +│ further in the schedule. This option │ +│ cannot be used in conjunction with the │ +│ --fixed-schedule-auto-offset. The schedule │ +│ will include any requests at the start │ +│ offset. │ +│ FIXED-SCHEDULE-END-OFFSET Specifies the offset in milliseconds to │ +│ --fixed-schedule-end-offse end the fixed schedule at. By default, the │ +│ t schedule ends at the last timestamp in the │ +│ trace dataset, but this option can be used │ +│ to only run a subset of the trace. The │ +│ schedule will include any requests at the │ +│ end offset. │ +│ PUBLIC-DATASET The public dataset to use for the │ +│ --public-dataset requests. [choices: sharegpt] │ +│ CUSTOM-DATASET-TYPE The type of custom dataset to use. This │ +│ --custom-dataset-type parameter is used in conjunction with the │ +│ --input-file parameter. [choices: │ +│ single_turn, multi_turn, random_pool, │ +│ mooncake_trace] │ +│ DATASET-SAMPLING-STRATEGY The strategy to use for sampling the │ +│ --dataset-sampling-strateg dataset. sequential: Iterate through the │ +│ y dataset sequentially, then wrap around to │ +│ the beginning. random: Randomly select a │ +│ conversation from the dataset. Will │ +│ randomly sample with replacement. shuffle: │ +│ Shuffle the dataset and iterate through │ +│ it. Will randomly sample without │ +│ replacement. Once the end of the dataset │ +│ is reached, shuffle the dataset again and │ +│ start over. [choices: sequential, random, │ +│ shuffle] │ +│ RANDOM-SEED --random-seed The seed used to generate random values. │ +│ Set to some value to make the synthetic │ +│ data generation deterministic. It will use │ +│ system default if not provided. │ +│ GOODPUT --goodput Specify service level objectives (SLOs) │ +│ for goodput as space-separated 'KEY:VALUE' │ +│ pairs, where KEY is a metric tag and VALUE │ +│ is a number in the metric’s display unit │ +│ (falls back to its base unit if no display │ +│ unit is defined). Examples: │ +│ 'request_latency:250' (ms), │ +│ 'inter_token_latency:10' (ms), │ +│ output_token_throughput_per_user:600 │ +│ (tokens/s). Only metrics applicable to the │ +│ current endpoint/config are considered. │ +│ For more context on the definition of │ +│ goodput, refer to DistServe paper: │ +│ https://arxiv.org/pdf/2401.09670 and the │ +│ blog: │ +│ https://hao-ai-lab.github.io/blogs/distser │ +│ ve │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Output ─────────────────────────────────────────────────────────────────────╮ +│ OUTPUT-ARTIFACT-DIR The directory to store all the (output) │ +│ --output-artifact-dir artifacts generated by AIPerf. [default: │ +│ --artifact-dir artifacts] │ +│ PROFILE-EXPORT-PREFIX The prefix for the profile export file names. │ +│ --profile-export-prefix Will be suffixed with .csv, .json, .jsonl, and │ +│ --profile-export-file _raw.jsonl.If not provided, the default profile │ +│ export file names will be used: │ +│ profile_export_aiperf.csv, │ +│ profile_export_aiperf.json, │ +│ profile_export.jsonl, and │ +│ profile_export_raw.jsonl. │ +│ EXPORT-LEVEL --export-level The level of profile export files to create. 
│ +│ --profile-export-level [choices: summary, records, raw] [default: │ +│ records] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Tokenizer ──────────────────────────────────────────────────────────────────╮ +│ TOKENIZER --tokenizer The HuggingFace tokenizer to use to interpret │ +│ token metrics from prompts and responses. The │ +│ value can be the name of a tokenizer or the │ +│ filepath of the tokenizer. The default value │ +│ is the model name. │ +│ TOKENIZER-REVISION The specific model version to use. It can be a │ +│ --tokenizer-revision branch name, tag name, or commit ID. [default: │ +│ main] │ +│ TOKENIZER-TRUST-REMOTE-CODE Allows custom tokenizer to be downloaded and │ +│ --tokenizer-trust-remote-c executed. This carries security risks and │ +│ ode should only be used for repositories you │ +│ trust. This is only necessary for custom │ +│ tokenizers stored in HuggingFace Hub. │ +│ [default: False] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Load Generator ─────────────────────────────────────────────────────────────╮ +│ BENCHMARK-DURATION The duration in seconds for benchmarking. │ +│ --benchmark-duration │ +│ BENCHMARK-GRACE-PERIOD The grace period in seconds to wait for │ +│ --benchmark-grace-period responses after benchmark duration ends. Only │ +│ applies when --benchmark-duration is set. │ +│ Responses received within this period are │ +│ included in metrics. [default: 30.0] │ +│ CONCURRENCY --concurrency The concurrency value to benchmark. │ +│ REQUEST-RATE --request-rate Sets the request rate for the load generated │ +│ by AIPerf. Unit: requests/second │ +│ REQUEST-RATE-MODE Sets the request rate mode for the load │ +│ --request-rate-mode generated by AIPerf. Valid values: constant, │ +│ poisson. constant: Generate requests at a │ +│ fixed rate. poisson: Generate requests using a │ +│ poisson distribution. [default: poisson] │ +│ REQUEST-COUNT The number of requests to use for measurement. │ +│ --request-count [default: 10] │ +│ --num-requests │ +│ WARMUP-REQUEST-COUNT The number of warmup requests to send before │ +│ --warmup-request-count benchmarking. [default: 0] │ +│ --num-warmup-requests │ +│ REQUEST-CANCELLATION-RATE The percentage of requests to cancel. │ +│ --request-cancellation-rat [default: 0.0] │ +│ e │ +│ REQUEST-CANCELLATION-DELAY The delay in seconds before cancelling │ +│ --request-cancellation-del requests. This is used when │ +│ ay --request-cancellation-rate is greater than 0. │ +│ [default: 0.0] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Conversation Input ─────────────────────────────────────────────────────────╮ +│ CONVERSATION-NUM The total number of unique conversations to │ +│ --conversation-num generate. Each conversation represents a │ +│ --num-conversations single request session between client and │ +│ --num-sessions server. Supported on synthetic mode and the │ +│ custom random_pool dataset. The number of │ +│ conversations will be used to determine the │ +│ number of entries in both the custom │ +│ random_pool and synthetic datasets and will be │ +│ reused until benchmarking is complete. │ +│ NUM-DATASET-ENTRIES The total number of unique dataset entries to │ +│ --num-dataset-entries generate for the dataset. Each entry │ +│ --num-prompts represents a single turn used in a request. 
│ +│ [default: 100] │ +│ CONVERSATION-TURN-MEAN The mean number of turns within a │ +│ --conversation-turn-mean conversation. [default: 1] │ +│ --session-turns-mean │ +│ CONVERSATION-TURN-STDDEV The standard deviation of the number of turns │ +│ --conversation-turn-stddev within a conversation. [default: 0] │ +│ --session-turns-stddev │ +│ CONVERSATION-TURN-DELAY-MEAN The mean delay between turns within a │ +│ --conversation-turn-delay- conversation in milliseconds. [default: 0.0] │ +│ mean │ +│ --session-turn-delay-mean │ +│ CONVERSATION-TURN-DELAY-STDD The standard deviation of the delay between │ +│ EV --conversation-turn-del turns within a conversation in milliseconds. │ +│ ay-stddev --session-turn-d [default: 0.0] │ +│ elay-stddev │ +│ CONVERSATION-TURN-DELAY-RATI A ratio to scale multi-turn delays. [default: │ +│ O --conversation-turn-dela 1.0] │ +│ y-ratio │ +│ --session-delay-ratio │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Input Sequence Length (ISL) ────────────────────────────────────────────────╮ +│ PROMPT-INPUT-TOKENS-MEAN The mean of number of tokens in the generated │ +│ --prompt-input-tokens-mean prompts when using synthetic data. [default: │ +│ --synthetic-input-tokens-m 550] │ +│ ean --isl │ +│ PROMPT-INPUT-TOKENS-STDDEV The standard deviation of number of tokens in │ +│ --prompt-input-tokens-stdd the generated prompts when using synthetic │ +│ ev --synthetic-input-token data. [default: 0.0] │ +│ s-stddev --isl-stddev │ +│ PROMPT-INPUT-TOKENS-BLOCK-SI The block size of the prompt. [default: 512] │ +│ ZE --prompt-input-tokens-b │ +│ lock-size --synthetic-inpu │ +│ t-tokens-block-size │ +│ --isl-block-size │ +│ SEQ-DIST --seq-dist Sequence length distribution specification for │ +│ --sequence-distribution varying ISL/OSL pairs │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Output Sequence Length (OSL) ───────────────────────────────────────────────╮ +│ PROMPT-OUTPUT-TOKENS-MEAN The mean number of tokens in each output. │ +│ --prompt-output-tokens-mea │ +│ n --output-tokens-mean │ +│ --osl │ +│ PROMPT-OUTPUT-TOKENS-STDDEV The standard deviation of the number of tokens │ +│ --prompt-output-tokens-std in each output. [default: 0] │ +│ dev --output-tokens-stddev │ +│ --osl-stddev │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Prompt ─────────────────────────────────────────────────────────────────────╮ +│ PROMPT-BATCH-SIZE -b The batch size of text requests AIPerf should │ +│ --prompt-batch-size send. This is currently supported with the │ +│ --batch-size-text embeddings and rankings endpoint types [default: │ +│ --batch-size 1] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Prefix Prompt ──────────────────────────────────────────────────────────────╮ +│ PROMPT-PREFIX-POOL-SIZE The total size of the prefix prompt pool to │ +│ --prompt-prefix-pool-size select prefixes from. If this value is not │ +│ --prefix-prompt-pool-size zero, these are prompts that are prepended to │ +│ --num-prefix-prompts input prompts. This is useful for benchmarking │ +│ models that use a K-V cache. [default: 0] │ +│ PROMPT-PREFIX-LENGTH The number of tokens in each prefix prompt. │ +│ --prompt-prefix-length This is only used if "num" is greater than │ +│ --prefix-prompt-length zero. 
Note that due to the prefix and user │ +│ prompts being concatenated, the number of │ +│ tokens in the final prompt may be off by one. │ +│ [default: 0] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Audio Input ────────────────────────────────────────────────────────────────╮ +│ AUDIO-BATCH-SIZE The batch size of audio requests AIPerf should │ +│ --audio-batch-size send. This is currently supported with the │ +│ --batch-size-audio OpenAI chat endpoint type [default: 1] │ +│ AUDIO-LENGTH-MEAN The mean length of the audio in seconds. │ +│ --audio-length-mean [default: 0.0] │ +│ AUDIO-LENGTH-STDDEV The standard deviation of the length of the │ +│ --audio-length-stddev audio in seconds. [default: 0.0] │ +│ AUDIO-FORMAT --audio-format The format of the audio files (wav or mp3). │ +│ [choices: wav, mp3] [default: wav] │ +│ AUDIO-DEPTHS --audio-depths A list of audio bit depths to randomly select │ +│ from in bits. [default: [16]] │ +│ AUDIO-SAMPLE-RATES A list of audio sample rates to randomly select │ +│ --audio-sample-rates from in kHz. Common sample rates are 16, 44.1, │ +│ 48, 96, etc. [default: [16.0]] │ +│ AUDIO-NUM-CHANNELS The number of audio channels to use for the │ +│ --audio-num-channels audio data generation. [default: 1] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Image Input ────────────────────────────────────────────────────────────────╮ +│ IMAGE-WIDTH-MEAN The mean width of images when generating │ +│ --image-width-mean synthetic image data. [default: 0.0] │ +│ IMAGE-WIDTH-STDDEV The standard deviation of width of images when │ +│ --image-width-stddev generating synthetic image data. [default: 0.0] │ +│ IMAGE-HEIGHT-MEAN The mean height of images when generating │ +│ --image-height-mean synthetic image data. [default: 0.0] │ +│ IMAGE-HEIGHT-STDDEV The standard deviation of height of images when │ +│ --image-height-stddev generating synthetic image data. [default: 0.0] │ +│ IMAGE-BATCH-SIZE The image batch size of the requests AIPerf │ +│ --image-batch-size should send. [default: 1] │ +│ --batch-size-image │ +│ IMAGE-FORMAT --image-format The compression format of the images. [choices: │ +│ png, jpeg, random] [default: png] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Video Input ────────────────────────────────────────────────────────────────╮ +│ VIDEO-BATCH-SIZE The video batch size of the requests AIPerf │ +│ --video-batch-size should send. [default: 1] │ +│ --batch-size-video │ +│ VIDEO-DURATION Seconds per clip (default: 5.0). [default: 5.0] │ +│ --video-duration │ +│ VIDEO-FPS --video-fps Frames per second (default/recommended for │ +│ Cosmos: 4). [default: 4] │ +│ VIDEO-WIDTH --video-width Video width in pixels. │ +│ VIDEO-HEIGHT --video-height Video height in pixels. │ +│ VIDEO-SYNTH-TYPE Synthetic generator type. [choices: │ +│ --video-synth-type moving-shapes, grid-clock] [default: │ +│ moving-shapes] │ +│ VIDEO-FORMAT --video-format The video format of the generated files. │ +│ [choices: mp4] [default: mp4] │ +│ VIDEO-CODEC --video-codec The video codec to use for encoding. Common │ +│ options: libx264 (CPU, widely compatible), │ +│ libx265 (CPU, smaller files), h264_nvenc │ +│ (NVIDIA GPU), hevc_nvenc (NVIDIA GPU, smaller │ +│ files). Any FFmpeg-supported codec can be used. 
│
│                                 [default: libx264]                          │
╰──────────────────────────────────────────────────────────────────────────────╯
```
```
╭─ Service ────────────────────────────────────────────────────────────────────╮
│ LOG-LEVEL --log-level           Logging level [choices: trace, debug,        │
│                                 info, notice, warning, success, error,       │
│                                 critical] [default: info]                    │
│ VERBOSE --verbose -v            Equivalent to --log-level DEBUG. Enables     │
│                                 more verbose logging output, but lacks       │
│                                 some raw message logging. [default:          │
│                                 False]                                       │
│ EXTRA-VERBOSE -vv               Equivalent to --log-level TRACE. Enables     │
│    --extra-verbose              the most verbose logging output possible.    │
│                                 [default: False]                             │
│ RECORD-PROCESSOR-SERVICE-COU    Number of services to spawn for              │
│ NT --record-processor-serv      processing records. The higher the           │
│ ice-count                       request rate, the more services should be    │
│    --record-processors          spawned in order to keep up with the         │
│                                 incoming records. If not specified, the      │
│                                 number of services will be automatically     │
│                                 determined based on the worker count.        │
│ UI-TYPE --ui-type --ui          Type of UI to use [choices: none, simple,    │
│                                 dashboard] [default: dashboard]              │
╰──────────────────────────────────────────────────────────────────────────────╯
```
```
╭─ Telemetry ──────────────────────────────────────────────────────────────────╮
│ GPU-TELEMETRY        Enable GPU telemetry console display and optionally     │
│    --gpu-telemetry   specify custom DCGM exporter URLs (e.g.,                │
│                      http://node1:9401/metrics http://node2:9401/metrics).   │
│                      Default localhost:9400 and localhost:9401 are always    │
│                      attempted                                               │
╰──────────────────────────────────────────────────────────────────────────────╯
```
```
╭─ Workers ────────────────────────────────────────────────────────────────────╮
│ WORKERS-MAX --workers-max    Maximum number of workers to create. If not     │
│    --max-workers             specified, the number of workers will be        │
│                              determined by the formula min(concurrency, (num │
│                              CPUs * 0.75) - 1), with a default max cap of 32.│
│                              Any value provided will still be capped by the  │
│                              concurrency value (if specified), but not by the│
│                              max cap.                                        │
╰──────────────────────────────────────────────────────────────────────────────╯
```
```
╭─ ZMQ Communication ──────────────────────────────────────────────────────────╮
│ ZMQ-HOST --zmq-host            Host address for TCP connections [default:    │
│                                127.0.0.1]                                    │
│ ZMQ-IPC-PATH --zmq-ipc-path    Path for IPC sockets                          │
╰──────────────────────────────────────────────────────────────────────────────╯
```
\ No newline at end of file
diff --git a/tools/generate_cli_options_md.py b/tools/generate_cli_options_md.py
new file mode 100755
index 000000000..b4b8b74ac
--- /dev/null
+++ b/tools/generate_cli_options_md.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Generate the CLI options documentation from the aiperf profile --help output.
+
+This script runs `aiperf profile --help` and formats the output into a markdown file.
+It should be run from the repository root.
+
+Usage:
+    python tools/generate_cli_options_md.py [--check]
+
+Options:
+    --check    Check if the current cli_options.md matches the generated output.
+               Returns exit code 1 if they differ, 0 if they match.
+"""
+
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+
+def get_help_output() -> str:
+    """Run aiperf profile --help and return the output."""
+    try:
+        result = subprocess.run(
+            ["aiperf", "profile", "--help"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        return result.stdout
+    except subprocess.CalledProcessError as e:
+        print(f"Error running aiperf profile --help: {e}", file=sys.stderr)
+        print(f"stderr: {e.stderr}", file=sys.stderr)
+        sys.exit(1)
+    except FileNotFoundError:
+        print(
+            "Error: aiperf command not found. Make sure it's installed and in your PATH.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+
+def format_help_as_markdown(help_output: str) -> str:
+    """Format the help output as markdown with proper header and code blocks."""
+    # Split the output into sections based on the ╭─ ... ─╮ headers
+    lines = help_output.split("\n")
+    markdown_lines = [
+        "",
+        "",
+        "# CLI Options",
+        "Use these options to profile with AIPerf.",
+        "",
+    ]
+
+    current_section = []
+    in_section = False
+
+    for line in lines:
+        # Check if this is a section header line (starts with ╭─)
+        if line.strip().startswith("╭─"):
+            # If we were in a previous section, add it
+            if current_section:
+                markdown_lines.append("```")
+                markdown_lines.extend(current_section)
+                markdown_lines.append("```")
+                current_section = []
+            # Start new section
+            in_section = True
+            current_section.append(line)
+        elif line.strip().startswith("╰─"):
+            # End of section
+            current_section.append(line)
+            markdown_lines.append("```")
+            markdown_lines.extend(current_section)
+            markdown_lines.append("```")
+            current_section = []
+            in_section = False
+        elif in_section:
+            current_section.append(line)
+
+    # Add any remaining section
+    if current_section:
+        markdown_lines.append("```")
+        markdown_lines.extend(current_section)
+        markdown_lines.append("```")
+
+    return "\n".join(markdown_lines)
+
+
+def stage_file(file_path: Path) -> None:
+    """Stage a file using git add."""
+    try:
+        subprocess.run(
+            ["git", "add", str(file_path)],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+        print(f"✓ Staged {file_path} for commit", file=sys.stderr)
+    except subprocess.CalledProcessError as e:
+        print(
+            f"Warning: Could not stage {file_path}: {e.stderr}",
+            file=sys.stderr,
+        )
+    except FileNotFoundError:
+        print(
+            "Warning: git command not found, file not staged",
+            file=sys.stderr,
+        )
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate CLI options documentation from aiperf profile --help"
+    )
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="Check if the current cli_options.md matches the generated output",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path("docs/cli_options.md"),
+        help="Output file path (default: docs/cli_options.md)",
+    )
+    parser.add_argument(
+        "--no-stage",
+        action="store_true",
+        help="Don't automatically stage the file with git add (default: auto-stage)",
+    )
+    args = parser.parse_args()
+
+    # Get the help output
+    print("Running aiperf profile --help...", file=sys.stderr)
+    help_output = get_help_output()
+
+    # Format as markdown
+    print("Formatting output as markdown...", file=sys.stderr)
+    markdown = format_help_as_markdown(help_output)
+
+    if args.check:
+        # Check mode: compare with existing file
+        if not args.output.exists():
+            print(
+                f"Error: {args.output} does not exist. Run without --check to generate it.",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+
+        current_content = args.output.read_text()
+        if current_content.strip() == markdown.strip():
+            print(f"✓ {args.output} is up to date!", file=sys.stderr)
+            sys.exit(0)
+        else:
+            print(
+                f"✗ {args.output} is out of sync with aiperf profile --help output!",
+                file=sys.stderr,
+            )
+            print(
+                "  Run 'make update-cli-docs' or 'python tools/generate_cli_options_md.py' to update it.",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+    else:
+        # Write mode: write to file and optionally stage it
+        file_existed = args.output.exists()
+        if file_existed:
+            old_content = args.output.read_text()
+
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        args.output.write_text(markdown)
+
+        # Check if content actually changed
+        content_changed = not file_existed or old_content != markdown
+
+        if content_changed:
+            print(f"✓ Generated {args.output}", file=sys.stderr)
+
+            # Auto-stage the file unless --no-stage is specified
+            if not args.no_stage:
+                stage_file(args.output)
+        else:
+            print(f"✓ {args.output} already up to date", file=sys.stderr)
+
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()

From 7fb48b9e544e5858b6e69e048bc92e774b02f8b7 Mon Sep 17 00:00:00 2001
From: Elias Bermudez
Date: Fri, 31 Oct 2025 16:57:44 -0700
Subject: [PATCH 2/2] Iterate on the cli options table

---
 docs/cli_options.md              | 560 +++++++++----------
 tools/generate_cli_options_md.py | 239 +++++++++++--
 2 files changed, 378 insertions(+), 421 deletions(-)

diff --git a/docs/cli_options.md b/docs/cli_options.md
index 7705ec904..f69b4a9c9 100644
--- a/docs/cli_options.md
+++ b/docs/cli_options.md
@@ -6,397 +6,169 @@ SPDX-License-Identifier: Apache-2.0
 # CLI Options
 Use these options to profile with AIPerf.
 
-```
-╭─ Endpoint ───────────────────────────────────────────────────────────────────╮
-│ * MODEL-NAMES --model-names -m  Model name(s) to be benchmarked. Can be      │
-│      --model                    a comma-separated list or a single           │
-│                                 model name. [required]                      │
-│   MODEL-SELECTION-STRATEGY      When multiple models are specified,          │
-│      --model-selection-strategy this is how a specific model should be      │
-│                                 assigned to a prompt. round_robin: nth      │
-│                                 prompt in the list gets assigned to         │
-│                                 n-mod len(models). random: assignment       │
-│                                 is uniformly random [choices:               │
-│                                 round-robin, random] [default:              │
-│                                 round-robin]                                │
-│   CUSTOM-ENDPOINT               Set a custom endpoint that differs from     │
-│      --custom-endpoint          the OpenAI defaults.                        │
-│      --endpoint                                                             │
-│   ENDPOINT-TYPE                 The endpoint type to send requests to       │
-│      --endpoint-type            on the server. [choices: chat,              │
-│                                 completions, cohere-rankings,               │
-│                                 embeddings, hf-tei-rankings,                │
-│                                 huggingface-generate, nim-rankings,         │
-│                                 solido-rag, template] [default: chat]       │
-│   STREAMING --streaming         An option to enable the use of the          │
-│                                 streaming API. [default: False]             │
-│   URL --url -u                  URL of the endpoint to target for           │
-│                                 benchmarking. [default: localhost:8000]     │
-│   REQUEST-TIMEOUT-SECONDS       The timeout in floating-point seconds       │
-│      --request-timeout-seconds  for each request to the endpoint.           │
-│                                 [default: 600.0]                            │
-│   API-KEY --api-key             The API key to use for the endpoint. If     │
-│                                 provided, it will be sent with every        │
-│                                 request as a header: Authorization:         │
-│                                 Bearer .                                    │
-│   TRANSPORT --transport         The transport to use for the endpoint.      │
-│      --transport-type           If not provided, it will be                 │
-│                                 auto-detected from the URL.This can         │
-│                                 also be used to force an alternative        │
-│                                 transport or implementation.
[choices: │ -│ http] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input ──────────────────────────────────────────────────────────────────────╮ -│ EXTRA-INPUTS --extra-inputs Provide additional inputs to include with │ -│ every request. Inputs should be in an │ -│ 'input_name:value' format. Alternatively, │ -│ a string representing a json formatted │ -│ dict can be provided. [default: []] │ -│ HEADER --header -H Adds a custom header to the requests. │ -│ Headers must be specified as │ -│ 'Header:Value' pairs. Alternatively, a │ -│ string representing a json formatted dict │ -│ can be provided. [default: []] │ -│ INPUT-FILE --input-file The file or directory path that contains │ -│ the dataset to use for profiling. This │ -│ parameter is used in conjunction with the │ -│ custom_dataset_type parameter to support │ -│ different types of user provided datasets. │ -│ FIXED-SCHEDULE Specifies to run a fixed schedule of │ -│ --fixed-schedule requests. This is normally inferred from │ -│ the --input-file parameter, but can be set │ -│ manually here. [default: False] │ -│ FIXED-SCHEDULE-AUTO-OFFSET Specifies to automatically offset the │ -│ --fixed-schedule-auto-offs timestamps in the fixed schedule, such │ -│ et that the first timestamp is considered 0, │ -│ and the rest are shifted accordingly. If │ -│ disabled, the timestamps will be assumed │ -│ to be relative to 0. [default: False] │ -│ FIXED-SCHEDULE-START-OFFSET Specifies the offset in milliseconds to │ -│ --fixed-schedule-start-off start the fixed schedule at. By default, │ -│ set the schedule starts at 0, but this option │ -│ can be used to start at a reference point │ -│ further in the schedule. This option │ -│ cannot be used in conjunction with the │ -│ --fixed-schedule-auto-offset. The schedule │ -│ will include any requests at the start │ -│ offset. │ -│ FIXED-SCHEDULE-END-OFFSET Specifies the offset in milliseconds to │ -│ --fixed-schedule-end-offse end the fixed schedule at. By default, the │ -│ t schedule ends at the last timestamp in the │ -│ trace dataset, but this option can be used │ -│ to only run a subset of the trace. The │ -│ schedule will include any requests at the │ -│ end offset. │ -│ PUBLIC-DATASET The public dataset to use for the │ -│ --public-dataset requests. [choices: sharegpt] │ -│ CUSTOM-DATASET-TYPE The type of custom dataset to use. This │ -│ --custom-dataset-type parameter is used in conjunction with the │ -│ --input-file parameter. [choices: │ -│ single_turn, multi_turn, random_pool, │ -│ mooncake_trace] │ -│ DATASET-SAMPLING-STRATEGY The strategy to use for sampling the │ -│ --dataset-sampling-strateg dataset. sequential: Iterate through the │ -│ y dataset sequentially, then wrap around to │ -│ the beginning. random: Randomly select a │ -│ conversation from the dataset. Will │ -│ randomly sample with replacement. shuffle: │ -│ Shuffle the dataset and iterate through │ -│ it. Will randomly sample without │ -│ replacement. Once the end of the dataset │ -│ is reached, shuffle the dataset again and │ -│ start over. [choices: sequential, random, │ -│ shuffle] │ -│ RANDOM-SEED --random-seed The seed used to generate random values. │ -│ Set to some value to make the synthetic │ -│ data generation deterministic. It will use │ -│ system default if not provided. 
│ -│ GOODPUT --goodput Specify service level objectives (SLOs) │ -│ for goodput as space-separated 'KEY:VALUE' │ -│ pairs, where KEY is a metric tag and VALUE │ -│ is a number in the metric’s display unit │ -│ (falls back to its base unit if no display │ -│ unit is defined). Examples: │ -│ 'request_latency:250' (ms), │ -│ 'inter_token_latency:10' (ms), │ -│ output_token_throughput_per_user:600 │ -│ (tokens/s). Only metrics applicable to the │ -│ current endpoint/config are considered. │ -│ For more context on the definition of │ -│ goodput, refer to DistServe paper: │ -│ https://arxiv.org/pdf/2401.09670 and the │ -│ blog: │ -│ https://hao-ai-lab.github.io/blogs/distser │ -│ ve │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output ─────────────────────────────────────────────────────────────────────╮ -│ OUTPUT-ARTIFACT-DIR The directory to store all the (output) │ -│ --output-artifact-dir artifacts generated by AIPerf. [default: │ -│ --artifact-dir artifacts] │ -│ PROFILE-EXPORT-PREFIX The prefix for the profile export file names. │ -│ --profile-export-prefix Will be suffixed with .csv, .json, .jsonl, and │ -│ --profile-export-file _raw.jsonl.If not provided, the default profile │ -│ export file names will be used: │ -│ profile_export_aiperf.csv, │ -│ profile_export_aiperf.json, │ -│ profile_export.jsonl, and │ -│ profile_export_raw.jsonl. │ -│ EXPORT-LEVEL --export-level The level of profile export files to create. │ -│ --profile-export-level [choices: summary, records, raw] [default: │ -│ records] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Tokenizer ──────────────────────────────────────────────────────────────────╮ -│ TOKENIZER --tokenizer The HuggingFace tokenizer to use to interpret │ -│ token metrics from prompts and responses. The │ -│ value can be the name of a tokenizer or the │ -│ filepath of the tokenizer. The default value │ -│ is the model name. │ -│ TOKENIZER-REVISION The specific model version to use. It can be a │ -│ --tokenizer-revision branch name, tag name, or commit ID. [default: │ -│ main] │ -│ TOKENIZER-TRUST-REMOTE-CODE Allows custom tokenizer to be downloaded and │ -│ --tokenizer-trust-remote-c executed. This carries security risks and │ -│ ode should only be used for repositories you │ -│ trust. This is only necessary for custom │ -│ tokenizers stored in HuggingFace Hub. │ -│ [default: False] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Load Generator ─────────────────────────────────────────────────────────────╮ -│ BENCHMARK-DURATION The duration in seconds for benchmarking. │ -│ --benchmark-duration │ -│ BENCHMARK-GRACE-PERIOD The grace period in seconds to wait for │ -│ --benchmark-grace-period responses after benchmark duration ends. Only │ -│ applies when --benchmark-duration is set. │ -│ Responses received within this period are │ -│ included in metrics. [default: 30.0] │ -│ CONCURRENCY --concurrency The concurrency value to benchmark. │ -│ REQUEST-RATE --request-rate Sets the request rate for the load generated │ -│ by AIPerf. Unit: requests/second │ -│ REQUEST-RATE-MODE Sets the request rate mode for the load │ -│ --request-rate-mode generated by AIPerf. Valid values: constant, │ -│ poisson. constant: Generate requests at a │ -│ fixed rate. poisson: Generate requests using a │ -│ poisson distribution. [default: poisson] │ -│ REQUEST-COUNT The number of requests to use for measurement. 
│ -│ --request-count [default: 10] │ -│ --num-requests │ -│ WARMUP-REQUEST-COUNT The number of warmup requests to send before │ -│ --warmup-request-count benchmarking. [default: 0] │ -│ --num-warmup-requests │ -│ REQUEST-CANCELLATION-RATE The percentage of requests to cancel. │ -│ --request-cancellation-rat [default: 0.0] │ -│ e │ -│ REQUEST-CANCELLATION-DELAY The delay in seconds before cancelling │ -│ --request-cancellation-del requests. This is used when │ -│ ay --request-cancellation-rate is greater than 0. │ -│ [default: 0.0] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Conversation Input ─────────────────────────────────────────────────────────╮ -│ CONVERSATION-NUM The total number of unique conversations to │ -│ --conversation-num generate. Each conversation represents a │ -│ --num-conversations single request session between client and │ -│ --num-sessions server. Supported on synthetic mode and the │ -│ custom random_pool dataset. The number of │ -│ conversations will be used to determine the │ -│ number of entries in both the custom │ -│ random_pool and synthetic datasets and will be │ -│ reused until benchmarking is complete. │ -│ NUM-DATASET-ENTRIES The total number of unique dataset entries to │ -│ --num-dataset-entries generate for the dataset. Each entry │ -│ --num-prompts represents a single turn used in a request. │ -│ [default: 100] │ -│ CONVERSATION-TURN-MEAN The mean number of turns within a │ -│ --conversation-turn-mean conversation. [default: 1] │ -│ --session-turns-mean │ -│ CONVERSATION-TURN-STDDEV The standard deviation of the number of turns │ -│ --conversation-turn-stddev within a conversation. [default: 0] │ -│ --session-turns-stddev │ -│ CONVERSATION-TURN-DELAY-MEAN The mean delay between turns within a │ -│ --conversation-turn-delay- conversation in milliseconds. [default: 0.0] │ -│ mean │ -│ --session-turn-delay-mean │ -│ CONVERSATION-TURN-DELAY-STDD The standard deviation of the delay between │ -│ EV --conversation-turn-del turns within a conversation in milliseconds. │ -│ ay-stddev --session-turn-d [default: 0.0] │ -│ elay-stddev │ -│ CONVERSATION-TURN-DELAY-RATI A ratio to scale multi-turn delays. [default: │ -│ O --conversation-turn-dela 1.0] │ -│ y-ratio │ -│ --session-delay-ratio │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input Sequence Length (ISL) ────────────────────────────────────────────────╮ -│ PROMPT-INPUT-TOKENS-MEAN The mean of number of tokens in the generated │ -│ --prompt-input-tokens-mean prompts when using synthetic data. [default: │ -│ --synthetic-input-tokens-m 550] │ -│ ean --isl │ -│ PROMPT-INPUT-TOKENS-STDDEV The standard deviation of number of tokens in │ -│ --prompt-input-tokens-stdd the generated prompts when using synthetic │ -│ ev --synthetic-input-token data. [default: 0.0] │ -│ s-stddev --isl-stddev │ -│ PROMPT-INPUT-TOKENS-BLOCK-SI The block size of the prompt. [default: 512] │ -│ ZE --prompt-input-tokens-b │ -│ lock-size --synthetic-inpu │ -│ t-tokens-block-size │ -│ --isl-block-size │ -│ SEQ-DIST --seq-dist Sequence length distribution specification for │ -│ --sequence-distribution varying ISL/OSL pairs │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output Sequence Length (OSL) ───────────────────────────────────────────────╮ -│ PROMPT-OUTPUT-TOKENS-MEAN The mean number of tokens in each output. 
│ -│ --prompt-output-tokens-mea │ -│ n --output-tokens-mean │ -│ --osl │ -│ PROMPT-OUTPUT-TOKENS-STDDEV The standard deviation of the number of tokens │ -│ --prompt-output-tokens-std in each output. [default: 0] │ -│ dev --output-tokens-stddev │ -│ --osl-stddev │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prompt ─────────────────────────────────────────────────────────────────────╮ -│ PROMPT-BATCH-SIZE -b The batch size of text requests AIPerf should │ -│ --prompt-batch-size send. This is currently supported with the │ -│ --batch-size-text embeddings and rankings endpoint types [default: │ -│ --batch-size 1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prefix Prompt ──────────────────────────────────────────────────────────────╮ -│ PROMPT-PREFIX-POOL-SIZE The total size of the prefix prompt pool to │ -│ --prompt-prefix-pool-size select prefixes from. If this value is not │ -│ --prefix-prompt-pool-size zero, these are prompts that are prepended to │ -│ --num-prefix-prompts input prompts. This is useful for benchmarking │ -│ models that use a K-V cache. [default: 0] │ -│ PROMPT-PREFIX-LENGTH The number of tokens in each prefix prompt. │ -│ --prompt-prefix-length This is only used if "num" is greater than │ -│ --prefix-prompt-length zero. Note that due to the prefix and user │ -│ prompts being concatenated, the number of │ -│ tokens in the final prompt may be off by one. │ -│ [default: 0] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Audio Input ────────────────────────────────────────────────────────────────╮ -│ AUDIO-BATCH-SIZE The batch size of audio requests AIPerf should │ -│ --audio-batch-size send. This is currently supported with the │ -│ --batch-size-audio OpenAI chat endpoint type [default: 1] │ -│ AUDIO-LENGTH-MEAN The mean length of the audio in seconds. │ -│ --audio-length-mean [default: 0.0] │ -│ AUDIO-LENGTH-STDDEV The standard deviation of the length of the │ -│ --audio-length-stddev audio in seconds. [default: 0.0] │ -│ AUDIO-FORMAT --audio-format The format of the audio files (wav or mp3). │ -│ [choices: wav, mp3] [default: wav] │ -│ AUDIO-DEPTHS --audio-depths A list of audio bit depths to randomly select │ -│ from in bits. [default: [16]] │ -│ AUDIO-SAMPLE-RATES A list of audio sample rates to randomly select │ -│ --audio-sample-rates from in kHz. Common sample rates are 16, 44.1, │ -│ 48, 96, etc. [default: [16.0]] │ -│ AUDIO-NUM-CHANNELS The number of audio channels to use for the │ -│ --audio-num-channels audio data generation. [default: 1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Image Input ────────────────────────────────────────────────────────────────╮ -│ IMAGE-WIDTH-MEAN The mean width of images when generating │ -│ --image-width-mean synthetic image data. [default: 0.0] │ -│ IMAGE-WIDTH-STDDEV The standard deviation of width of images when │ -│ --image-width-stddev generating synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-MEAN The mean height of images when generating │ -│ --image-height-mean synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-STDDEV The standard deviation of height of images when │ -│ --image-height-stddev generating synthetic image data. [default: 0.0] │ -│ IMAGE-BATCH-SIZE The image batch size of the requests AIPerf │ -│ --image-batch-size should send. 
[default: 1] │ -│ --batch-size-image │ -│ IMAGE-FORMAT --image-format The compression format of the images. [choices: │ -│ png, jpeg, random] [default: png] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Video Input ────────────────────────────────────────────────────────────────╮ -│ VIDEO-BATCH-SIZE The video batch size of the requests AIPerf │ -│ --video-batch-size should send. [default: 1] │ -│ --batch-size-video │ -│ VIDEO-DURATION Seconds per clip (default: 5.0). [default: 5.0] │ -│ --video-duration │ -│ VIDEO-FPS --video-fps Frames per second (default/recommended for │ -│ Cosmos: 4). [default: 4] │ -│ VIDEO-WIDTH --video-width Video width in pixels. │ -│ VIDEO-HEIGHT --video-height Video height in pixels. │ -│ VIDEO-SYNTH-TYPE Synthetic generator type. [choices: │ -│ --video-synth-type moving-shapes, grid-clock] [default: │ -│ moving-shapes] │ -│ VIDEO-FORMAT --video-format The video format of the generated files. │ -│ [choices: mp4] [default: mp4] │ -│ VIDEO-CODEC --video-codec The video codec to use for encoding. Common │ -│ options: libx264 (CPU, widely compatible), │ -│ libx265 (CPU, smaller files), h264_nvenc │ -│ (NVIDIA GPU), hevc_nvenc (NVIDIA GPU, smaller │ -│ files). Any FFmpeg-supported codec can be used. │ -│ [default: libx264] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Service ────────────────────────────────────────────────────────────────────╮ -│ LOG-LEVEL --log-level Logging level [choices: trace, debug, │ -│ info, notice, warning, success, error, │ -│ critical] [default: info] │ -│ VERBOSE --verbose -v Equivalent to --log-level DEBUG. Enables │ -│ more verbose logging output, but lacks │ -│ some raw message logging. [default: │ -│ False] │ -│ EXTRA-VERBOSE -vv Equivalent to --log-level TRACE. Enables │ -│ --extra-verbose the most verbose logging output possible. │ -│ [default: False] │ -│ RECORD-PROCESSOR-SERVICE-COU Number of services to spawn for │ -│ NT --record-processor-serv processing records. The higher the │ -│ ice-count request rate, the more services should be │ -│ --record-processors spawned in order to keep up with the │ -│ incoming records. If not specified, the │ -│ number of services will be automatically │ -│ determined based on the worker count. │ -│ UI-TYPE --ui-type --ui Type of UI to use [choices: none, simple, │ -│ dashboard] [default: dashboard] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Telemetry ──────────────────────────────────────────────────────────────────╮ -│ GPU-TELEMETRY Enable GPU telemetry console display and optionally │ -│ --gpu-telemetry specify custom DCGM exporter URLs (e.g., │ -│ http://node1:9401/metrics http://node2:9401/metrics). │ -│ Default localhost:9400 and localhost:9401 are always │ -│ attempted │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Workers ────────────────────────────────────────────────────────────────────╮ -│ WORKERS-MAX --workers-max Maximum number of workers to create. If not │ -│ --max-workers specified, the number of workers will be │ -│ determined by the formula min(concurrency, (num │ -│ CPUs * 0.75) - 1), with a default max cap of 32. │ -│ Any value provided will still be capped by the │ -│ concurrency value (if specified), but not by the │ -│ max cap. 
│ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ ZMQ Communication ──────────────────────────────────────────────────────────╮ -│ ZMQ-HOST --zmq-host Host address for TCP connections [default: │ -│ 127.0.0.1] │ -│ ZMQ-IPC-PATH --zmq-ipc-path Path for IPC sockets │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` \ No newline at end of file +## Endpoint + +| Option | Description | +|:-------|:-----------:| +| **`*`**
**MODEL-NAMES**
`--model-names`<br>`--model`<br>`-m` | Model name(s) to be benchmarked. Can be a comma-separated list or a single model name. [required] |
+| **MODEL-SELECTION-STRATEGY**<br>`--model-selection-strategy` | When multiple models are specified, this is how a specific model should be assigned to a prompt. round_robin: nth prompt in the list gets assigned to n-mod len(models). random: assignment is uniformly random [choices: round-robin, random] [default: round-robin] |
+| **CUSTOM-ENDPOINT**<br>`--custom-endpoint`<br>`--endpoint` | Set a custom endpoint that differs from the OpenAI defaults. |
+| **ENDPOINT-TYPE**<br>`--endpoint-type` | The endpoint type to send requests to on the server. [choices: chat, completions, cohere-rankings, embeddings, hf-tei-rankings, huggingface-generate, nim-rankings, solido-rag, template] [default: chat] |
+| **STREAMING**
`--streaming` | An option to enable the use of the streaming API. [default: False] | +| **URL**
`--url`<br>`-u` | URL of the endpoint to target for benchmarking. [default: localhost:8000] |
+| **REQUEST-TIMEOUT-SECONDS**<br>`--request-timeout-seconds` | The timeout in floating-point seconds for each request to the endpoint. [default: 600.0] |
+| **API-KEY**<br>`--api-key` | The API key to use for the endpoint. If provided, it will be sent with every request as a header: `Authorization: Bearer <api_key>`. |
+| **TRANSPORT**<br>`--transport`<br>`--transport-type` | The transport to use for the endpoint. If not provided, it will be auto-detected from the URL. This can also be used to force an alternative transport or implementation. [choices: http] |
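+
+For example, a minimal profiling run against a local OpenAI-compatible chat endpoint might look like this (the model name and URL are illustrative placeholders):
+
+```bash
+# Hypothetical invocation; adjust the model and URL for your deployment.
+aiperf profile \
+  --model-names my-model \
+  --url localhost:8000 \
+  --endpoint-type chat \
+  --streaming
+```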
+
+## Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **EXTRA-INPUTS**<br>`--extra-inputs` | Provide additional inputs to include with every request. Inputs should be in an 'input_name:value' format. Alternatively, a string representing a json formatted dict can be provided. [default: []] |
+| **HEADER**<br>`--header`<br>`-H` | Adds a custom header to the requests. Headers must be specified as 'Header:Value' pairs. Alternatively, a string representing a json formatted dict can be provided. [default: []] |
+| **INPUT-FILE**<br>`--input-file` | The file or directory path that contains the dataset to use for profiling. This parameter is used in conjunction with the custom_dataset_type parameter to support different types of user provided datasets. |
+| **FIXED-SCHEDULE**<br>`--fixed-schedule` | Specifies to run a fixed schedule of requests. This is normally inferred from the --input-file parameter, but can be set manually here. [default: False] |
+| **FIXED-SCHEDULE-AUTO-OFFSET**<br>`--fixed-schedule-auto-offset` | Specifies to automatically offset the timestamps in the fixed schedule, such that the first timestamp is considered 0, and the rest are shifted accordingly. If disabled, the timestamps will be assumed to be relative to 0. [default: False] |
+| **FIXED-SCHEDULE-START-OFFSET**<br>`--fixed-schedule-start-offset` | Specifies the offset in milliseconds to start the fixed schedule at. By default, the schedule starts at 0, but this option can be used to start at a reference point further in the schedule. This option cannot be used in conjunction with the --fixed-schedule-auto-offset. The schedule will include any requests at the start offset. |
+| **FIXED-SCHEDULE-END-OFFSET**<br>`--fixed-schedule-end-offset` | Specifies the offset in milliseconds to end the fixed schedule at. By default, the schedule ends at the last timestamp in the trace dataset, but this option can be used to only run a subset of the trace. The schedule will include any requests at the end offset. |
+| **PUBLIC-DATASET**<br>`--public-dataset` | The public dataset to use for the requests. [choices: sharegpt] |
+| **CUSTOM-DATASET-TYPE**<br>`--custom-dataset-type` | The type of custom dataset to use. This parameter is used in conjunction with the --input-file parameter. [choices: single_turn, multi_turn, random_pool, mooncake_trace] |
+| **DATASET-SAMPLING-STRATEGY**<br>`--dataset-sampling-strategy` | The strategy to use for sampling the dataset. sequential: Iterate through the dataset sequentially, then wrap around to the beginning. random: Randomly select a conversation from the dataset. Will randomly sample with replacement. shuffle: Shuffle the dataset and iterate through it. Will randomly sample without replacement. Once the end of the dataset is reached, shuffle the dataset again and start over. [choices: sequential, random, shuffle] |
+| **RANDOM-SEED**<br>`--random-seed` | The seed used to generate random values. Set to some value to make the synthetic data generation deterministic. It will use system default if not provided. |
+| **GOODPUT**<br>`--goodput` | Specify service level objectives (SLOs) for goodput as space-separated 'KEY:VALUE' pairs, where KEY is a metric tag and VALUE is a number in the metric’s display unit (falls back to its base unit if no display unit is defined). Examples: 'request_latency:250' (ms), 'inter_token_latency:10' (ms), output_token_throughput_per_user:600 (tokens/s). Only metrics applicable to the current endpoint/config are considered. For more context on the definition of goodput, refer to the DistServe paper: https://arxiv.org/pdf/2401.09670 and the blog: https://hao-ai-lab.github.io/blogs/distserve |
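+
+For example, the following sketch (values are illustrative; exact quoting may vary with your shell and the CLI parser) counts a request toward goodput only if its request latency stays within 250 ms and its inter-token latency within 10 ms:
+
+```bash
+# Hypothetical SLO definition using metric tags from the table above.
+aiperf profile \
+  --model-names my-model \
+  --goodput "request_latency:250 inter_token_latency:10"
+```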
+
+## Output
+
+| Option | Description |
+|:-------|:-----------:|
+| **OUTPUT-ARTIFACT-DIR**<br>`--output-artifact-dir`<br>`--artifact-dir` | The directory to store all the (output) artifacts generated by AIPerf. [default: artifacts] |
+| **PROFILE-EXPORT-PREFIX**<br>`--profile-export-prefix`<br>`--profile-export-file` | The prefix for the profile export file names. Will be suffixed with .csv, .json, .jsonl, and _raw.jsonl. If not provided, the default profile export file names will be used: profile_export_aiperf.csv, profile_export_aiperf.json, profile_export.jsonl, and profile_export_raw.jsonl. |
+| **EXPORT-LEVEL**<br>`--export-level`<br>`--profile-export-level` | The level of profile export files to create. [choices: summary, records, raw] [default: records] |
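+
+For example, a run configured as below (the directory and prefix are illustrative) would write my_run.csv, my_run.json, my_run.jsonl, and my_run_raw.jsonl under ./results:
+
+```bash
+# Hypothetical export configuration using the options above.
+aiperf profile \
+  --model-names my-model \
+  --output-artifact-dir results \
+  --profile-export-prefix my_run \
+  --export-level records
+```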
+
+## Tokenizer
+
+| Option | Description |
+|:-------|:-----------:|
+| **TOKENIZER**<br>`--tokenizer` | The HuggingFace tokenizer to use to interpret token metrics from prompts and responses. The value can be the name of a tokenizer or the filepath of the tokenizer. The default value is the model name. |
+| **TOKENIZER-REVISION**<br>`--tokenizer-revision` | The specific model version to use. It can be a branch name, tag name, or commit ID. [default: main] |
+| **TOKENIZER-TRUST-REMOTE-CODE**<br>`--tokenizer-trust-remote-code` | Allows custom tokenizer to be downloaded and executed. This carries security risks and should only be used for repositories you trust. This is only necessary for custom tokenizers stored in HuggingFace Hub. [default: False] |
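+
+For example, to interpret token metrics with a locally stored tokenizer instead of resolving the model name on the HuggingFace Hub (the path below is an illustrative placeholder):
+
+```bash
+# Hypothetical: use a local tokenizer directory pinned to a specific revision.
+aiperf profile \
+  --model-names my-model \
+  --tokenizer /path/to/tokenizer \
+  --tokenizer-revision main
+```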
+
+## Load Generator
+
+| Option | Description |
+|:-------|:-----------:|
+| **BENCHMARK-DURATION**<br>`--benchmark-duration` | The duration in seconds for benchmarking. |
+| **BENCHMARK-GRACE-PERIOD**<br>`--benchmark-grace-period` | The grace period in seconds to wait for responses after benchmark duration ends. Only applies when --benchmark-duration is set. Responses received within this period are included in metrics. [default: 30.0] |
+| **CONCURRENCY**<br>`--concurrency` | The concurrency value to benchmark. |
+| **REQUEST-RATE**<br>`--request-rate` | Sets the request rate for the load generated by AIPerf. Unit: requests/second |
+| **REQUEST-RATE-MODE**<br>`--request-rate-mode` | Sets the request rate mode for the load generated by AIPerf. Valid values: constant, poisson. constant: Generate requests at a fixed rate. poisson: Generate requests using a poisson distribution. [default: poisson] |
+| **REQUEST-COUNT**<br>`--request-count`<br>`--num-requests` | The number of requests to use for measurement. [default: 10] |
+| **WARMUP-REQUEST-COUNT**<br>`--warmup-request-count`<br>`--num-warmup-requests` | The number of warmup requests to send before benchmarking. [default: 0] |
+| **REQUEST-CANCELLATION-RATE**<br>`--request-cancellation-rate` | The percentage of requests to cancel. [default: 0.0] |
+| **REQUEST-CANCELLATION-DELAY**<br>`--request-cancellation-delay` | The delay in seconds before cancelling requests. This is used when --request-cancellation-rate is greater than 0. [default: 0.0] |
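+
+For example, an open-loop run with a warmup phase might be sketched as follows (all values are illustrative):
+
+```bash
+# Hypothetical load profile: 500 measured requests at ~20 req/s (poisson mode by default).
+aiperf profile \
+  --model-names my-model \
+  --request-rate 20 \
+  --request-count 500 \
+  --warmup-request-count 50
+```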
+
+## Conversation Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **CONVERSATION-NUM**<br>`--conversation-num`<br>`--num-conversations`<br>`--num-sessions` | The total number of unique conversations to generate. Each conversation represents a single request session between client and server. Supported on synthetic mode and the custom random_pool dataset. The number of conversations will be used to determine the number of entries in both the custom random_pool and synthetic datasets and will be reused until benchmarking is complete. |
+| **NUM-DATASET-ENTRIES**<br>`--num-dataset-entries`<br>`--num-prompts` | The total number of unique dataset entries to generate for the dataset. Each entry represents a single turn used in a request. [default: 100] |
+| **CONVERSATION-TURN-MEAN**<br>`--conversation-turn-mean`<br>`--session-turns-mean` | The mean number of turns within a conversation. [default: 1] |
+| **CONVERSATION-TURN-STDDEV**<br>`--conversation-turn-stddev`<br>`--session-turns-stddev` | The standard deviation of the number of turns within a conversation. [default: 0] |
+| **CONVERSATION-TURN-DELAY-MEAN**<br>`--conversation-turn-delay-mean`<br>`--session-turn-delay-mean` | The mean delay between turns within a conversation in milliseconds. [default: 0.0] |
+| **CONVERSATION-TURN-DELAY-STDDEV**<br>`--conversation-turn-delay-stddev`<br>`--session-turn-delay-stddev` | The standard deviation of the delay between turns within a conversation in milliseconds. [default: 0.0] |
+| **CONVERSATION-TURN-DELAY-RATIO**<br>`--conversation-turn-delay-ratio`<br>`--session-delay-ratio` | A ratio to scale multi-turn delays. [default: 1.0] |
+
+## Input Sequence Length (ISL)
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-INPUT-TOKENS-MEAN**<br>`--prompt-input-tokens-mean`<br>`--synthetic-input-tokens-mean`<br>`--isl` | The mean number of tokens in the generated prompts when using synthetic data. [default: 550] |
+| **PROMPT-INPUT-TOKENS-STDDEV**<br>`--prompt-input-tokens-stddev`<br>`--synthetic-input-tokens-stddev`<br>`--isl-stddev` | The standard deviation of the number of tokens in the generated prompts when using synthetic data. [default: 0.0] |
+| **PROMPT-INPUT-TOKENS-BLOCK-SIZE**<br>`--prompt-input-tokens-block-size`<br>`--synthetic-input-tokens-block-size`<br>`--isl-block-size` | The block size of the prompt. [default: 512] |
+| **SEQ-DIST**<br>`--seq-dist`<br>`--sequence-distribution` | Sequence length distribution specification for varying ISL/OSL pairs. |
+
+## Output Sequence Length (OSL)
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-OUTPUT-TOKENS-MEAN**<br>`--prompt-output-tokens-mean`<br>`--output-tokens-mean`<br>`--osl` | The mean number of tokens in each output. |
+| **PROMPT-OUTPUT-TOKENS-STDDEV**<br>`--prompt-output-tokens-stddev`<br>`--output-tokens-stddev`<br>`--osl-stddev` | The standard deviation of the number of tokens in each output. [default: 0] |
+
+## Prompt
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-BATCH-SIZE**<br>`--prompt-batch-size`<br>`--batch-size-text`<br>`--batch-size`<br>`-b` | The batch size of text requests AIPerf should send. This is currently supported with the embeddings and rankings endpoint types. [default: 1] |
+
+## Prefix Prompt
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-PREFIX-POOL-SIZE**<br>`--prompt-prefix-pool-size`<br>`--prefix-prompt-pool-size`<br>`--num-prefix-prompts` | The total size of the prefix prompt pool to select prefixes from. If this value is not zero, these are prompts that are prepended to input prompts. This is useful for benchmarking models that use a K-V cache. [default: 0] |
+| **PROMPT-PREFIX-LENGTH**<br>`--prompt-prefix-length`<br>`--prefix-prompt-length` | The number of tokens in each prefix prompt. This is only used if the prefix prompt pool size is greater than zero. Note that due to the prefix and user prompts being concatenated, the number of tokens in the final prompt may be off by one. [default: 0] |
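+
+For example, to shape a synthetic workload of roughly 1024-token prompts and 256-token outputs drawn from a small prefix pool (all values are illustrative):
+
+```bash
+# Hypothetical workload shape; prefix options exercise K-V cache reuse as described above.
+aiperf profile \
+  --model-names my-model \
+  --isl 1024 --isl-stddev 64 \
+  --osl 256 \
+  --num-prefix-prompts 8 --prefix-prompt-length 128
+```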
+
+## Audio Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **AUDIO-BATCH-SIZE**<br>`--audio-batch-size`<br>`--batch-size-audio` | The batch size of audio requests AIPerf should send. This is currently supported with the OpenAI chat endpoint type. [default: 1] |
+| **AUDIO-LENGTH-MEAN**<br>`--audio-length-mean` | The mean length of the audio in seconds. [default: 0.0] |
+| **AUDIO-LENGTH-STDDEV**<br>`--audio-length-stddev` | The standard deviation of the length of the audio in seconds. [default: 0.0] |
+| **AUDIO-FORMAT**<br>`--audio-format` | The format of the audio files (wav or mp3). [choices: wav, mp3] [default: wav] |
+| **AUDIO-DEPTHS**<br>`--audio-depths` | A list of audio bit depths to randomly select from in bits. [default: [16]] |
+| **AUDIO-SAMPLE-RATES**<br>`--audio-sample-rates` | A list of audio sample rates to randomly select from in kHz. Common sample rates are 16, 44.1, 48, 96, etc. [default: [16.0]] |
+| **AUDIO-NUM-CHANNELS**<br>`--audio-num-channels` | The number of audio channels to use for the audio data generation. [default: 1] |
+
+## Image Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **IMAGE-WIDTH-MEAN**<br>`--image-width-mean` | The mean width of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-WIDTH-STDDEV**<br>`--image-width-stddev` | The standard deviation of width of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-HEIGHT-MEAN**<br>`--image-height-mean` | The mean height of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-HEIGHT-STDDEV**<br>`--image-height-stddev` | The standard deviation of height of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-BATCH-SIZE**<br>`--image-batch-size`<br>`--batch-size-image` | The image batch size of the requests AIPerf should send. [default: 1] |
+| **IMAGE-FORMAT**<br>`--image-format` | The compression format of the images. [choices: png, jpeg, random] [default: png] |
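+
+For example, to attach synthetic 512x512 PNG images to each request (dimensions are illustrative):
+
+```bash
+# Hypothetical image input configuration using the options above.
+aiperf profile \
+  --model-names my-model \
+  --image-width-mean 512 \
+  --image-height-mean 512 \
+  --image-format png
+```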
+
+## Video Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **VIDEO-BATCH-SIZE**<br>`--video-batch-size`<br>`--batch-size-video` | The video batch size of the requests AIPerf should send. [default: 1] |
+| **VIDEO-DURATION**<br>`--video-duration` | Seconds per clip. [default: 5.0] |
+| **VIDEO-FPS**<br>`--video-fps` | Frames per second (recommended for Cosmos: 4). [default: 4] |
+| **VIDEO-WIDTH**<br>`--video-width` | Video width in pixels. |
+| **VIDEO-HEIGHT**<br>`--video-height` | Video height in pixels. |
+| **VIDEO-SYNTH-TYPE**<br>`--video-synth-type` | Synthetic generator type. [choices: moving-shapes, grid-clock] [default: moving-shapes] |
+| **VIDEO-FORMAT**<br>`--video-format` | The video format of the generated files. [choices: mp4] [default: mp4] |
+| **VIDEO-CODEC**<br>`--video-codec` | The video codec to use for encoding. Common options: libx264 (CPU, widely compatible), libx265 (CPU, smaller files), h264_nvenc (NVIDIA GPU), hevc_nvenc (NVIDIA GPU, smaller files). Any FFmpeg-supported codec can be used. [default: libx264] |
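+
+For example, to generate short synthetic clips encoded with NVENC (this assumes an NVIDIA GPU; the values are illustrative):
+
+```bash
+# Hypothetical video input configuration using the options above.
+aiperf profile \
+  --model-names my-model \
+  --video-duration 5.0 \
+  --video-fps 4 \
+  --video-codec h264_nvenc
+```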
+
+## Service
+
+| Option | Description |
+|:-------|:-----------:|
+| **LOG-LEVEL**<br>`--log-level` | Logging level [choices: trace, debug, info, notice, warning, success, error, critical] [default: info] |
+| **VERBOSE**<br>`--verbose`<br>`-v` | Equivalent to --log-level DEBUG. Enables more verbose logging output, but lacks some raw message logging. [default: False] |
+| **EXTRA-VERBOSE**<br>`--extra-verbose`<br>`-vv` | Equivalent to --log-level TRACE. Enables the most verbose logging output possible. [default: False] |
+| **RECORD-PROCESSOR-SERVICE-COUNT**<br>`--record-processor-service-count`<br>`--record-processors` | Number of services to spawn for processing records. The higher the request rate, the more services should be spawned in order to keep up with the incoming records. If not specified, the number of services will be automatically determined based on the worker count. |
+| **UI-TYPE**
`--ui-type`
`--ui` | Type of UI to use [choices: none, simple, dashboard] [default: dashboard] |
+
+## Telemetry
+
+| Option | Description |
+|:-------|:-----------:|
+| **GPU-TELEMETRY**<br>`--gpu-telemetry` | Enable GPU telemetry console display and optionally specify custom DCGM exporter URLs (e.g., http://node1:9401/metrics http://node2:9401/metrics). Default localhost:9400 and localhost:9401 are always attempted. |
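+
+For example, to display telemetry from two remote DCGM exporters in addition to the default local endpoints (hostnames are illustrative, following the e.g. in the table above):
+
+```bash
+# Hypothetical multi-node GPU telemetry setup.
+aiperf profile \
+  --model-names my-model \
+  --gpu-telemetry http://node1:9401/metrics http://node2:9401/metrics
+```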
+
+## Workers
+
+| Option | Description |
+|:-------|:-----------:|
+| **WORKERS-MAX**<br>`--workers-max`<br>`--max-workers` | Maximum number of workers to create. If not specified, the number of workers will be determined by the formula min(concurrency, (num CPUs * 0.75) - 1), with a default max cap of 32. Any value provided will still be capped by the concurrency value (if specified), but not by the max cap. |
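+
+As a quick sanity check, the default worker-count formula above can be sketched in shell (integer arithmetic approximates the 0.75 factor; the concurrency value is illustrative):
+
+```bash
+# Hypothetical sketch of the documented default: min(concurrency, num CPUs * 0.75 - 1), capped at 32.
+concurrency=64
+num_cpus=$(nproc)
+limit=$(( num_cpus * 3 / 4 - 1 ))
+workers=$(( concurrency < limit ? concurrency : limit ))
+workers=$(( workers < 32 ? workers : 32 ))
+echo "default workers: $workers"
+```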
+
+## ZMQ Communication
+
+| Option | Description |
+|:-------|:-----------:|
+| **ZMQ-HOST**<br>`--zmq-host` | Host address for TCP connections [default: 127.0.0.1] |
+| **ZMQ-IPC-PATH**<br>`--zmq-ipc-path` | Path for IPC sockets |
diff --git a/tools/generate_cli_options_md.py b/tools/generate_cli_options_md.py
index b4b8b74ac..1be5b4e1d 100755
--- a/tools/generate_cli_options_md.py
+++ b/tools/generate_cli_options_md.py
@@ -45,9 +45,10 @@ def get_help_output() -> str:
 
 
 def format_help_as_markdown(help_output: str) -> str:
-    """Format the help output as markdown with proper header and code blocks."""
-    # Split the output into sections based on the ╭─ ... ─╮ headers
-    lines = help_output.split("\n")
+    """Format the help output as markdown tables."""
+    # Parse the help output into sections
+    sections = parse_help_sections(help_output)
+
     markdown_lines = [
         "