diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ccff7e889..ad5960766 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -42,6 +42,12 @@ repos: types: [python] pass_filenames: false args: [] + - id: update-cli-docs + name: update-cli-docs + entry: python tools/generate_cli_options_md.py + language: python + pass_filenames: false + files: ^(src/aiperf/cli.*\.py|src/aiperf/common/config/.*\.py|docs/cli_options\.md)$ - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.11.8 hooks: diff --git a/Makefile b/Makefile index 38e7d6d19..841e194c4 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,8 @@ test-verbose init-files setup-venv setup-mkinit install-mock-server \ integration-tests integration-tests-ci integration-tests-verbose \ test-integration test-integration-ci test-integration-verbose \ - test-stress stress-tests internal-help help + test-stress stress-tests internal-help help \ + update-cli-docs check-cli-docs # Include user-defined environment variables @@ -95,6 +96,15 @@ internal-help: init-files: #? run mkinit to generate the __init__.py files. $(activate_venv) && tools/generate_init_files.sh +update-cli-docs: #? regenerate docs/cli_options.md from aiperf profile --help. + @printf "$(bold)$(green)Regenerating CLI options documentation...$(reset)\n" + $(activate_venv) && python tools/generate_cli_options_md.py + @printf "$(bold)$(green)Done! docs/cli_options.md has been updated.$(reset)\n" + +check-cli-docs: #? check if docs/cli_options.md is in sync with aiperf profile --help. + @printf "$(bold)$(blue)Checking if CLI options documentation is up to date...$(reset)\n" + $(activate_venv) && python tools/generate_cli_options_md.py --check + ruff lint: #? run the ruff linters $(activate_venv) && ruff check . $(args) diff --git a/docs/cli_options.md b/docs/cli_options.md index 238e0e091..f69b4a9c9 100644 --- a/docs/cli_options.md +++ b/docs/cli_options.md @@ -1,179 +1,174 @@ # CLI Options Use these options to profile with AIPerf. -``` -╭─ Endpoint ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ * MODEL-NAMES --model-names --model -m Model name(s) to be benchmarked. Can be a comma-separated list or a single model name. [required] │ -│ MODEL-SELECTION-STRATEGY --model-selection-strategy When multiple models are specified, this is how a specific model should be assigned to a prompt. round_robin: nth prompt │ -│ in the list gets assigned to n-mod len(models). random: assignment is uniformly random [choices: round-robin, random] │ -│ [default: round-robin] │ -│ CUSTOM-ENDPOINT --custom-endpoint --endpoint Set a custom endpoint that differs from the OpenAI defaults. │ -│ ENDPOINT-TYPE --endpoint-type The endpoint type to send requests to on the server. [choices: chat, completions, embeddings, rankings] │ -│ [default: chat] │ -│ STREAMING --streaming An option to enable the use of the streaming API. [default: False] │ -│ URL --url -u URL of the endpoint to target for benchmarking. [default: localhost:8000] │ -│ REQUEST-TIMEOUT-SECONDS --request-timeout-seconds The timeout in floating-point seconds for each request to the endpoint. [default: 600.0] │ -│ API-KEY --api-key The API key to use for the endpoint. If provided, it will be sent with every request as a header: Authorization: Bearer │ -│ . 
│ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ EXTRA-INPUTS --extra-inputs Provide additional inputs to include with every request. Inputs should be in an 'input_name:value' │ -│ format. Alternatively, a string representing a json formatted dict can be provided. [default: []] │ -│ HEADER --header -H Adds a custom header to the requests. Headers must be specified as 'Header:Value' pairs. Alternatively, │ -│ a string representing a json formatted dict can be provided. [default: []] │ -│ INPUT-FILE --input-file The file or directory path that contains the dataset to use for profiling. This parameter is used in │ -│ conjunction with the custom_dataset_type parameter to support different types of user provided │ -│ datasets. │ -│ FIXED-SCHEDULE --fixed-schedule Specifies to run a fixed schedule of requests. This is normally inferred from the --input-file │ -│ parameter, but can be set manually here. [default: False] │ -│ FIXED-SCHEDULE-AUTO-OFFSET --fixed-schedule-auto-offset Specifies to automatically offset the timestamps in the fixed schedule, such that the first timestamp │ -│ is considered 0, and the rest are shifted accordingly. If disabled, the timestamps will be assumed to │ -│ be relative to 0. [default: False] │ -│ FIXED-SCHEDULE-START-OFFSET --fixed-schedule-start-offset Specifies the offset in milliseconds to start the fixed schedule at. By default, the schedule starts at │ -│ 0, but this option can be used to start at a reference point further in the schedule. This option │ -│ cannot be used in conjunction with the --fixed-schedule-auto-offset. The schedule will include any │ -│ requests at the start offset. │ -│ FIXED-SCHEDULE-END-OFFSET --fixed-schedule-end-offset Specifies the offset in milliseconds to end the fixed schedule at. By default, the schedule ends at the │ -│ last timestamp in the trace dataset, but this option can be used to only run a subset of the trace. The │ -│ schedule will include any requests at the end offset. │ -│ PUBLIC-DATASET --public-dataset The public dataset to use for the requests. [choices: sharegpt] │ -│ CUSTOM-DATASET-TYPE --custom-dataset-type The type of custom dataset to use. This parameter is used in conjunction with the --input-file │ -│ parameter. [choices: single_turn, multi_turn, random_pool, mooncake_trace] │ -│ DATASET-SAMPLING-STRATEGY --dataset-sampling-strategy The strategy to use for sampling the dataset. sequential: Iterate through the dataset sequentially, │ -│ then wrap around to the beginning. random: Randomly select a conversation from the dataset. Will │ -│ randomly sample with replacement. shuffle: Shuffle the dataset and iterate through it. Will randomly │ -│ sample without replacement. Once the end of the dataset is reached, shuffle the dataset again and start │ -│ over. [choices: sequential, random, shuffle] │ -│ RANDOM-SEED --random-seed The seed used to generate random values. Set to some value to make the synthetic data generation │ -│ deterministic. It will use system default if not provided. 
│ -│ GOODPUT --goodput Specify service level objectives (SLOs) for goodput as space-separated 'KEY:VALUE' pairs, where KEY is │ -│ a metric tag and VALUE is a number in the metric’s display unit (falls back to its base unit if no │ -│ display unit is defined). Examples: 'request_latency:250' (ms), 'inter_token_latency:10' (ms), │ -│ output_token_throughput_per_user:600 (tokens/s). Only metrics applicable to the current endpoint/config │ -│ are considered. For more context on the definition of goodput, refer to DistServe paper: │ -│ https://arxiv.org/pdf/2401.09670 and the blog: https://hao-ai-lab.github.io/blogs/distserve │ -╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ OUTPUT-ARTIFACT-DIR --output-artifact-dir --artifact-dir The directory to store all the (output) artifacts generated by AIPerf. [default: artifacts] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Tokenizer ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ TOKENIZER --tokenizer The Hugging Face tokenizer to use to interpret token metrics from prompts and responses. The value can be the name of a │ -│ tokenizer or the filepath of the tokenizer. The default value is the model name. │ -│ TOKENIZER-REVISION --tokenizer-revision The specific model version to use. It can be a branch name, tag name, or commit ID. [default: main] │ -│ TOKENIZER-TRUST-REMOTE-CODE --tokenizer-trust-remote-code Allows custom tokenizer to be downloaded and executed. This carries security risks and should only be used for │ -│ repositories you trust. This is only necessary for custom tokenizers stored in Hugging Face Hub. [default: False] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Load Generator ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ BENCHMARK-DURATION --benchmark-duration The duration in seconds for benchmarking. │ -│ BENCHMARK-GRACE-PERIOD --benchmark-grace-period The grace period in seconds to wait for responses after benchmark duration ends. Only applies when │ -│ --benchmark-duration is set. Responses received within this period are included in metrics. [default: 30.0] │ -│ CONCURRENCY --concurrency The concurrency value to benchmark. │ -│ REQUEST-RATE --request-rate Sets the request rate for the load generated by AIPerf. Unit: requests/second │ -│ REQUEST-RATE-MODE --request-rate-mode Sets the request rate mode for the load generated by AIPerf. Valid values: constant, poisson. constant: Generate │ -│ requests at a fixed rate. poisson: Generate requests using a poisson distribution. [default: poisson] │ -│ REQUEST-COUNT --request-count --num-requests The number of requests to use for measurement. 
[default: 10] │ -│ WARMUP-REQUEST-COUNT --warmup-request-count --num-warmup-requests The number of warmup requests to send before benchmarking. [default: 0] │ -│ REQUEST-CANCELLATION-RATE --request-cancellation-rate The percentage of requests to cancel. [default: 0.0] │ -│ REQUEST-CANCELLATION-DELAY --request-cancellation-delay The delay in seconds before cancelling requests. This is used when --request-cancellation-rate is greater than 0. │ -│ [default: 0.0] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Conversation Input ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ CONVERSATION-NUM --conversation-num --num-conversations --num-sessions The total number of unique conversations to generate. Each conversation represents a single request session between client and server. Supported on synthetic mode and │ -│ the custom random_pool dataset. The number of conversations will be used to determine the number of entries in both the custom random_pool and synthetic datasets and │ -│ will be reused until benchmarking is complete. │ -│ NUM-DATASET-ENTRIES --num-dataset-entries --num-prompts The total number of unique dataset entries to generate for the dataset. Each entry represents a single turn used in a request. [default: 100] │ -│ CONVERSATION-TURN-MEAN --conversation-turn-mean --session-turns-mean The mean number of turns within a conversation. [default: 1] │ -│ CONVERSATION-TURN-STDDEV --conversation-turn-stddev --session-turns-stddev The standard deviation of the number of turns within a conversation. [default: 0] │ -│ CONVERSATION-TURN-DELAY-MEAN --conversation-turn-delay-mean --session-turn-delay-mean The mean delay between turns within a conversation in milliseconds. [default: 0.0] │ -│ CONVERSATION-TURN-DELAY-STDDEV --conversation-turn-delay-stddev --session-turn-delay-stddev The standard deviation of the delay between turns within a conversation in milliseconds. [default: 0.0] │ -│ CONVERSATION-TURN-DELAY-RATIO --conversation-turn-delay-ratio --session-delay-ratio A ratio to scale multi-turn delays. [default: 1.0] │ -╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input Sequence Length (ISL) ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-INPUT-TOKENS-MEAN --prompt-input-tokens-mean The mean of number of tokens in the generated prompts when using synthetic data. [default: 550] │ -│ --synthetic-input-tokens-mean --isl │ -│ PROMPT-INPUT-TOKENS-STDDEV --prompt-input-tokens-stddev The standard deviation of number of tokens in the generated prompts when using synthetic data. [default: 0.0] │ -│ --synthetic-input-tokens-stddev --isl-stddev │ -│ PROMPT-INPUT-TOKENS-BLOCK-SIZE --prompt-input-tokens-block-size The block size of the prompt. 
[default: 512] │ -│ --synthetic-input-tokens-block-size --isl-block-size │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output Sequence Length (OSL) ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-OUTPUT-TOKENS-MEAN --prompt-output-tokens-mean The mean number of tokens in each output. │ -│ --output-tokens-mean --osl │ -│ PROMPT-OUTPUT-TOKENS-STDDEV --prompt-output-tokens-stddev The standard deviation of the number of tokens in each output. [default: 0] │ -│ --output-tokens-stddev --osl-stddev │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prompt ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-BATCH-SIZE --prompt-batch-size --batch-size-text -b The batch size of text requests AIPerf should send. This is currently supported with the embeddings and rankings │ -│ --batch-size endpoint types [default: 1] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prefix Prompt ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-PREFIX-POOL-SIZE --prompt-prefix-pool-size The total size of the prefix prompt pool to select prefixes from. If this value is not zero, these are prompts that are prepended │ -│ --prefix-prompt-pool-size --num-prefix-prompts to input prompts. This is useful for benchmarking models that use a K-V cache. [default: 0] │ -│ PROMPT-PREFIX-LENGTH --prompt-prefix-length The number of tokens in each prefix prompt. This is only used if "num" is greater than zero. Note that due to the prefix and user │ -│ --prefix-prompt-length prompts being concatenated, the number of tokens in the final prompt may be off by one. [default: 0] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Audio Input ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ AUDIO-BATCH-SIZE --audio-batch-size --batch-size-audio The batch size of audio requests AIPerf should send. This is currently supported with the OpenAI chat endpoint type [default: │ -│ 1] │ -│ AUDIO-LENGTH-MEAN --audio-length-mean The mean length of the audio in seconds. [default: 0.0] │ -│ AUDIO-LENGTH-STDDEV --audio-length-stddev The standard deviation of the length of the audio in seconds. [default: 0.0] │ -│ AUDIO-FORMAT --audio-format The format of the audio files (wav or mp3). [choices: wav, mp3] [default: wav] │ -│ AUDIO-DEPTHS --audio-depths A list of audio bit depths to randomly select from in bits. [default: [16]] │ -│ AUDIO-SAMPLE-RATES --audio-sample-rates A list of audio sample rates to randomly select from in kHz. 
Common sample rates are 16, 44.1, 48, 96, etc. [default: [16.0]] │ -│ AUDIO-NUM-CHANNELS --audio-num-channels The number of audio channels to use for the audio data generation. [default: 1] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Image Input ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ IMAGE-WIDTH-MEAN --image-width-mean The mean width of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-WIDTH-STDDEV --image-width-stddev The standard deviation of width of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-MEAN --image-height-mean The mean height of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-STDDEV --image-height-stddev The standard deviation of height of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-BATCH-SIZE --image-batch-size --batch-size-image The image batch size of the requests AIPerf should send. [default: 1] │ -│ IMAGE-FORMAT --image-format The compression format of the images. [choices: png, jpeg, random] [default: png] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Service ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ LOG-LEVEL --log-level Logging level [choices: trace, debug, info, notice, warning, success, error, critical] [default: info] │ -│ VERBOSE --verbose -v Equivalent to --log-level DEBUG. Enables more verbose logging output, but lacks some raw message logging. │ -│ [default: False] │ -│ EXTRA-VERBOSE --extra-verbose -vv Equivalent to --log-level TRACE. Enables the most verbose logging output possible. [default: False] │ -│ RECORD-PROCESSOR-SERVICE-COUNT --record-processor-service-count Number of services to spawn for processing records. The higher the request rate, the more services should be │ -│ --record-processors spawned in order to keep up with the incoming records. If not specified, the number of services will be │ -│ automatically determined based on the worker count. │ -│ UI-TYPE --ui-type --ui Type of UI to use [choices: dashboard, simple, none] [default: dashboard] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Workers ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ WORKERS-MAX --workers-max --max-workers Maximum number of workers to create. If not specified, the number of workers will be determined by the formula │ -│ min(concurrency, (num CPUs * 0.75) - 1), with a default max cap of 32. Any value provided will still be capped by the │ -│ concurrency value (if specified), but not by the max cap. 
│ -╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` +## Endpoint + +| Option | Description | +|:-------|:-----------:| +| **`*`**
**MODEL-NAMES**
`--model-names`<br>`--model`<br>`-m` | Model name(s) to be benchmarked. Can be a comma-separated list or a single model name. [required] |
+| **MODEL-SELECTION-STRATEGY**<br>`--model-selection-strategy` | When multiple models are specified, this is how a specific model should be assigned to a prompt. round_robin: nth prompt in the list gets assigned to n-mod len(models). random: assignment is uniformly random [choices: round-robin, random] [default: round-robin] |
+| **CUSTOM-ENDPOINT**<br>`--custom-endpoint`<br>`--endpoint` | Set a custom endpoint that differs from the OpenAI defaults. |
+| **ENDPOINT-TYPE**<br>`--endpoint-type` | The endpoint type to send requests to on the server. [choices: chat, completions, cohere-rankings, embeddings, hf-tei-rankings, huggingface-generate, nim-rankings, solido-rag, template] [default: chat] |
+| **STREAMING**
`--streaming` | An option to enable the use of the streaming API. [default: False] | +| **URL**
`--url`<br>`-u` | URL of the endpoint to target for benchmarking. [default: localhost:8000] |
+| **REQUEST-TIMEOUT-SECONDS**<br>`--request-timeout-seconds` | The timeout in floating-point seconds for each request to the endpoint. [default: 600.0] |
+| **API-KEY**
`--api-key` | The API key to use for the endpoint. If provided, it will be sent with every request as a header: Authorization: Bearer . | +| **TRANSPORT**
`--transport`<br>`--transport-type` | The transport to use for the endpoint. If not provided, it will be auto-detected from the URL. This can also be used to force an alternative transport or implementation. [choices: http] |
+
+## Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **EXTRA-INPUTS**
`--extra-inputs` | Provide additional inputs to include with every request. Inputs should be in an 'input_name:value' format. Alternatively, a string representing a json formatted dict can be provided. [default: []] | +| **HEADER**
`--header`<br>`-H` | Adds a custom header to the requests. Headers must be specified as 'Header:Value' pairs. Alternatively, a string representing a json formatted dict can be provided. [default: []] |
+| **INPUT-FILE**
`--input-file` | The file or directory path that contains the dataset to use for profiling. This parameter is used in conjunction with the custom_dataset_type parameter to support different types of user provided datasets. |
+| **FIXED-SCHEDULE**<br>`--fixed-schedule` | Specifies to run a fixed schedule of requests. This is normally inferred from the --input-file parameter, but can be set manually here. [default: False] |
+| **FIXED-SCHEDULE-AUTO-OFFSET**<br>`--fixed-schedule-auto-offset` | Specifies to automatically offset the timestamps in the fixed schedule, such that the first timestamp is considered 0, and the rest are shifted accordingly. If disabled, the timestamps will be assumed to be relative to 0. [default: False] |
+| **FIXED-SCHEDULE-START-OFFSET**<br>`--fixed-schedule-start-offset` | Specifies the offset in milliseconds to start the fixed schedule at. By default, the schedule starts at 0, but this option can be used to start at a reference point further in the schedule. This option cannot be used in conjunction with the --fixed-schedule-auto-offset. The schedule will include any requests at the start offset. |
+| **FIXED-SCHEDULE-END-OFFSET**<br>`--fixed-schedule-end-offset` | Specifies the offset in milliseconds to end the fixed schedule at. By default, the schedule ends at the last timestamp in the trace dataset, but this option can be used to only run a subset of the trace. The schedule will include any requests at the end offset. |
+| **PUBLIC-DATASET**<br>`--public-dataset` | The public dataset to use for the requests. [choices: sharegpt] |
+| **CUSTOM-DATASET-TYPE**<br>`--custom-dataset-type` | The type of custom dataset to use. This parameter is used in conjunction with the --input-file parameter. [choices: single_turn, multi_turn, random_pool, mooncake_trace] |
+| **DATASET-SAMPLING-STRATEGY**<br>`--dataset-sampling-strategy` | The strategy to use for sampling the dataset. sequential: Iterate through the dataset sequentially, then wrap around to the beginning. random: Randomly select a conversation from the dataset. Will randomly sample with replacement. shuffle: Shuffle the dataset and iterate through it. Will randomly sample without replacement. Once the end of the dataset is reached, shuffle the dataset again and start over. [choices: sequential, random, shuffle] |
+| **RANDOM-SEED**
`--random-seed` | The seed used to generate random values. Set to some value to make the synthetic data generation deterministic. It will use system default if not provided. | +| **GOODPUT**
`--goodput` | Specify service level objectives (SLOs) for goodput as space-separated 'KEY:VALUE' pairs, where KEY is a metric tag and VALUE is a number in the metric’s display unit (falls back to its base unit if no display unit is defined). Examples: 'request_latency:250' (ms), 'inter_token_latency:10' (ms), output_token_throughput_per_user:600 (tokens/s). Only metrics applicable to the current endpoint/config are considered. For more context on the definition of goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 and the blog: https://hao-ai-lab.github.io/blogs/distserve |
+
+## Output
+
+| Option | Description |
+|:-------|:-----------:|
+| **OUTPUT-ARTIFACT-DIR**<br>`--output-artifact-dir`<br>`--artifact-dir` | The directory to store all the (output) artifacts generated by AIPerf. [default: artifacts] |
+| **PROFILE-EXPORT-PREFIX**<br>`--profile-export-prefix`<br>`--profile-export-file` | The prefix for the profile export file names. Will be suffixed with .csv, .json, .jsonl, and _raw.jsonl. If not provided, the default profile export file names will be used: profile_export_aiperf.csv, profile_export_aiperf.json, profile_export.jsonl, and profile_export_raw.jsonl. |
+| **EXPORT-LEVEL**
`--export-level`<br>`--profile-export-level` | The level of profile export files to create. [choices: summary, records, raw] [default: records] |
+
+## Tokenizer
+
+| Option | Description |
+|:-------|:-----------:|
+| **TOKENIZER**
`--tokenizer` | The HuggingFace tokenizer to use to interpret token metrics from prompts and responses. The value can be the name of a tokenizer or the filepath of the tokenizer. The default value is the model name. |
+| **TOKENIZER-REVISION**<br>`--tokenizer-revision` | The specific model version to use. It can be a branch name, tag name, or commit ID. [default: main] |
+| **TOKENIZER-TRUST-REMOTE-CODE**<br>`--tokenizer-trust-remote-code` | Allows custom tokenizer to be downloaded and executed. This carries security risks and should only be used for repositories you trust. This is only necessary for custom tokenizers stored in HuggingFace Hub. [default: False] |
+
+## Load Generator
+
+| Option | Description |
+|:-------|:-----------:|
+| **BENCHMARK-DURATION**<br>`--benchmark-duration` | The duration in seconds for benchmarking. |
+| **BENCHMARK-GRACE-PERIOD**<br>`--benchmark-grace-period` | The grace period in seconds to wait for responses after benchmark duration ends. Only applies when --benchmark-duration is set. Responses received within this period are included in metrics. [default: 30.0] |
+| **CONCURRENCY**
`--concurrency` | The concurrency value to benchmark. | +| **REQUEST-RATE**
`--request-rate` | Sets the request rate for the load generated by AIPerf. Unit: requests/second |
+| **REQUEST-RATE-MODE**<br>`--request-rate-mode` | Sets the request rate mode for the load generated by AIPerf. Valid values: constant, poisson. constant: Generate requests at a fixed rate. poisson: Generate requests using a poisson distribution. [default: poisson] |
+| **REQUEST-COUNT**<br>`--request-count`<br>`--num-requests` | The number of requests to use for measurement. [default: 10] |
+| **WARMUP-REQUEST-COUNT**<br>`--warmup-request-count`<br>`--num-warmup-requests` | The number of warmup requests to send before benchmarking. [default: 0] |
+| **REQUEST-CANCELLATION-RATE**<br>`--request-cancellation-rate` | The percentage of requests to cancel. [default: 0.0] |
+| **REQUEST-CANCELLATION-DELAY**<br>`--request-cancellation-delay` | The delay in seconds before cancelling requests. This is used when --request-cancellation-rate is greater than 0. [default: 0.0] |
+
+## Conversation Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **CONVERSATION-NUM**<br>`--conversation-num`<br>`--num-conversations`<br>`--num-sessions` | The total number of unique conversations to generate. Each conversation represents a single request session between client and server. Supported on synthetic mode and the custom random_pool dataset. The number of conversations will be used to determine the number of entries in both the custom random_pool and synthetic datasets and will be reused until benchmarking is complete. |
+| **NUM-DATASET-ENTRIES**<br>`--num-dataset-entries`<br>`--num-prompts` | The total number of unique dataset entries to generate for the dataset. Each entry represents a single turn used in a request. [default: 100] |
+| **CONVERSATION-TURN-MEAN**<br>`--conversation-turn-mean`<br>`--session-turns-mean` | The mean number of turns within a conversation. [default: 1] |
+| **CONVERSATION-TURN-STDDEV**<br>`--conversation-turn-stddev`<br>`--session-turns-stddev` | The standard deviation of the number of turns within a conversation. [default: 0] |
+| **CONVERSATION-TURN-DELAY-MEAN**<br>`--conversation-turn-delay-mean`<br>`--session-turn-delay-mean` | The mean delay between turns within a conversation in milliseconds. [default: 0.0] |
+| **CONVERSATION-TURN-DELAY-STDDEV**<br>`--conversation-turn-delay-stddev`<br>`--session-turn-delay-stddev` | The standard deviation of the delay between turns within a conversation in milliseconds. [default: 0.0] |
+| **CONVERSATION-TURN-DELAY-RATIO**<br>`--conversation-turn-delay-ratio`<br>`--session-delay-ratio` | A ratio to scale multi-turn delays. [default: 1.0] |
+
+## Input Sequence Length (ISL)
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-INPUT-TOKENS-MEAN**<br>`--prompt-input-tokens-mean`<br>`--synthetic-input-tokens-mean`<br>`--isl` | The mean of number of tokens in the generated prompts when using synthetic data. [default: 550] |
+| **PROMPT-INPUT-TOKENS-STDDEV**<br>`--prompt-input-tokens-stddev`<br>`--synthetic-input-tokens-stddev`<br>`--isl-stddev` | The standard deviation of number of tokens in the generated prompts when using synthetic data. [default: 0.0] |
+| **PROMPT-INPUT-TOKENS-BLOCK-SIZE**<br>`--prompt-input-tokens-block-size`<br>`--synthetic-input-tokens-block-size`<br>`--isl-block-size` | The block size of the prompt. [default: 512] |
+| **SEQ-DIST**
`--seq-dist`<br>`--sequence-distribution` | Sequence length distribution specification for varying ISL/OSL pairs |
+
+## Output Sequence Length (OSL)
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-OUTPUT-TOKENS-MEAN**<br>`--prompt-output-tokens-mean`<br>`--output-tokens-mean`<br>`--osl` | The mean number of tokens in each output. |
+| **PROMPT-OUTPUT-TOKENS-STDDEV**<br>`--prompt-output-tokens-stddev`<br>`--output-tokens-stddev`<br>`--osl-stddev` | The standard deviation of the number of tokens in each output. [default: 0] |
+
+## Prompt
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-BATCH-SIZE**<br>`--prompt-batch-size`<br>`--batch-size-text`<br>`--batch-size`<br>`-b` | The batch size of text requests AIPerf should send. This is currently supported with the embeddings and rankings endpoint types [default: 1] |
+
+## Prefix Prompt
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-PREFIX-POOL-SIZE**<br>`--prompt-prefix-pool-size`<br>`--prefix-prompt-pool-size`<br>`--num-prefix-prompts` | The total size of the prefix prompt pool to select prefixes from. If this value is not zero, these are prompts that are prepended to input prompts. This is useful for benchmarking models that use a K-V cache. [default: 0] |
+| **PROMPT-PREFIX-LENGTH**<br>`--prompt-prefix-length`<br>`--prefix-prompt-length` | The number of tokens in each prefix prompt. This is only used if "num" is greater than zero. Note that due to the prefix and user prompts being concatenated, the number of tokens in the final prompt may be off by one. [default: 0] |
+
+## Audio Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **AUDIO-BATCH-SIZE**<br>`--audio-batch-size`<br>`--batch-size-audio` | The batch size of audio requests AIPerf should send. This is currently supported with the OpenAI chat endpoint type [default: 1] |
+| **AUDIO-LENGTH-MEAN**<br>`--audio-length-mean` | The mean length of the audio in seconds. [default: 0.0] |
+| **AUDIO-LENGTH-STDDEV**<br>`--audio-length-stddev` | The standard deviation of the length of the audio in seconds. [default: 0.0] |
+| **AUDIO-FORMAT**
`--audio-format` | The format of the audio files (wav or mp3). [choices: wav, mp3] [default: wav] | +| **AUDIO-DEPTHS**
`--audio-depths` | A list of audio bit depths to randomly select from in bits. [default: [16]] |
+| **AUDIO-SAMPLE-RATES**<br>`--audio-sample-rates` | A list of audio sample rates to randomly select from in kHz. Common sample rates are 16, 44.1, 48, 96, etc. [default: [16.0]] |
+| **AUDIO-NUM-CHANNELS**<br>`--audio-num-channels` | The number of audio channels to use for the audio data generation. [default: 1] |
+
+## Image Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **IMAGE-WIDTH-MEAN**<br>`--image-width-mean` | The mean width of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-WIDTH-STDDEV**<br>`--image-width-stddev` | The standard deviation of width of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-HEIGHT-MEAN**<br>`--image-height-mean` | The mean height of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-HEIGHT-STDDEV**<br>`--image-height-stddev` | The standard deviation of height of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-BATCH-SIZE**<br>`--image-batch-size`<br>`--batch-size-image` | The image batch size of the requests AIPerf should send. [default: 1] |
+| **IMAGE-FORMAT**
`--image-format` | The compression format of the images. [choices: png, jpeg, random] [default: png] |
+
+## Video Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **VIDEO-BATCH-SIZE**<br>`--video-batch-size`<br>`--batch-size-video` | The video batch size of the requests AIPerf should send. [default: 1] |
+| **VIDEO-DURATION**<br>`--video-duration` | Seconds per clip (default: 5.0). [default: 5.0] |
+| **VIDEO-FPS**
`--video-fps` | Frames per second (default/recommended for Cosmos: 4). [default: 4] | +| **VIDEO-WIDTH**
`--video-width` | Video width in pixels. | +| **VIDEO-HEIGHT**
`--video-height` | Video height in pixels. |
+| **VIDEO-SYNTH-TYPE**<br>`--video-synth-type` | Synthetic generator type. [choices: moving-shapes, grid-clock] [default: moving-shapes] |
+| **VIDEO-FORMAT**
`--video-format` | The video format of the generated files. [choices: mp4] [default: mp4] | +| **VIDEO-CODEC**
`--video-codec` | The video codec to use for encoding. Common options: libx264 (CPU, widely compatible), libx265 (CPU, smaller files), h264_nvenc (NVIDIA GPU), hevc_nvenc (NVIDIA GPU, smaller files). Any FFmpeg-supported codec can be used. [default: libx264] | + +## Service + +| Option | Description | +|:-------|:-----------:| +| **LOG-LEVEL**
`--log-level` | Logging level [choices: trace, debug, info, notice, warning, success, error, critical] [default: info] | +| **VERBOSE**
`--verbose`<br>`-v` | Equivalent to --log-level DEBUG. Enables more verbose logging output, but lacks some raw message logging. [default: False] |
+| **EXTRA-VERBOSE**<br>`--extra-verbose`<br>`-vv` | Equivalent to --log-level TRACE. Enables the most verbose logging output possible. [default: False] |
+| **RECORD-PROCESSOR-SERVICE-COUNT**<br>`--record-processor-service-count`<br>`--record-processors` | Number of services to spawn for processing records. The higher the request rate, the more services should be spawned in order to keep up with the incoming records. If not specified, the number of services will be automatically determined based on the worker count. |
+| **UI-TYPE**
`--ui-type`
`--ui` | Type of UI to use [choices: none, simple, dashboard] [default: dashboard] |
+
+## Telemetry
+
+| Option | Description |
+|:-------|:-----------:|
+| **GPU-TELEMETRY**<br>`--gpu-telemetry` | Enable GPU telemetry console display and optionally specify custom DCGM exporter URLs (e.g., http://node1:9401/metrics http://node2:9401/metrics). Default localhost:9400 and localhost:9401 are always attempted |
+
+## Workers
+
+| Option | Description |
+|:-------|:-----------:|
+| **WORKERS-MAX**
`--workers-max`<br>`--max-workers` | Maximum number of workers to create. If not specified, the number of workers will be determined by the formula min(concurrency, (num CPUs * 0.75) - 1), with a default max cap of 32. Any value provided will still be capped by the concurrency value (if specified), but not by the max cap. |
+
+## ZMQ Communication
+
+| Option | Description |
+|:-------|:-----------:|
+| **ZMQ-HOST**
`--zmq-host` | Host address for TCP connections [default: 127.0.0.1] | +| **ZMQ-IPC-PATH**
`--zmq-ipc-path` | Path for IPC sockets | diff --git a/tools/generate_cli_options_md.py b/tools/generate_cli_options_md.py new file mode 100755 index 000000000..1be5b4e1d --- /dev/null +++ b/tools/generate_cli_options_md.py @@ -0,0 +1,382 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Generate the CLI options documentation from the aiperf profile --help output. + +This script runs `aiperf profile --help` and formats the output into a markdown file. +It should be run from the repository root. + +Usage: + python tools/generate_cli_options_md.py [--check] + +Options: + --check Check if the current cli_options.md matches the generated output. + Returns exit code 1 if they differ, 0 if they match. +""" + +import argparse +import subprocess +import sys +from pathlib import Path + + +def get_help_output() -> str: + """Run aiperf profile --help and return the output.""" + try: + result = subprocess.run( + ["aiperf", "profile", "--help"], + capture_output=True, + text=True, + check=True, + ) + return result.stdout + except subprocess.CalledProcessError as e: + print(f"Error running aiperf profile --help: {e}", file=sys.stderr) + print(f"stderr: {e.stderr}", file=sys.stderr) + sys.exit(1) + except FileNotFoundError: + print( + "Error: aiperf command not found. Make sure it's installed and in your PATH.", + file=sys.stderr, + ) + sys.exit(1) + + +def format_help_as_markdown(help_output: str) -> str: + """Format the help output as markdown tables.""" + # Parse the help output into sections + sections = parse_help_sections(help_output) + + markdown_lines = [ + "", + "", + "# CLI Options", + "Use these options to profile with AIPerf.", + "", + ] + + # Convert each section to a markdown table + for section_name, options in sections.items(): + if not options: + continue + + markdown_lines.append(f"## {section_name}") + markdown_lines.append("") + + # Create table header + markdown_lines.append("| Option | Description |") + markdown_lines.append("|:-------|:-----------:|") + + # Add each option as a table row + for option in options: + option_col = format_option_column(option) + desc_col = option["description"] + markdown_lines.append(f"| {option_col} | {desc_col} |") + + markdown_lines.append("") + + return "\n".join(markdown_lines) + + +def parse_help_sections(help_output: str) -> dict: + """Parse CLI help output into sections with options. + + Returns: + Dict mapping section names to lists of option dicts + """ + lines = help_output.split("\n") + sections = {} + current_section = None + current_section_lines = [] + in_section = False + + for line in lines: + # Check if this is a section header (╭─ ... 
─╮) + if line.strip().startswith("╭─"): + # Save previous section if exists + if current_section and current_section_lines: + sections[current_section] = parse_section_options(current_section_lines) + + # Extract section name + title_start = line.find("─ ") + 2 + title_end = line.rfind(" ─") + if title_start > 1 and title_end > title_start: + current_section = line[title_start:title_end].strip() + else: + current_section = "Options" + + current_section_lines = [] + in_section = True + + elif line.strip().startswith("╰─"): + # End of section - save it + if current_section and current_section_lines: + sections[current_section] = parse_section_options(current_section_lines) + current_section = None + current_section_lines = [] + in_section = False + + elif in_section and line.strip(): + # Content line - strip box borders + if len(line) > 4 and line.startswith("│") and line.endswith("│"): + content = line[2:-2] # Remove │ and surrounding spaces + current_section_lines.append(content) + + # Add any remaining section (shouldn't happen with proper box format) + if current_section and current_section_lines: + sections[current_section] = parse_section_options(current_section_lines) + + return sections + + +def parse_section_options(lines: list[str]) -> list[dict]: + """Parse option lines into structured option dictionaries. + + Returns: + List of dicts with 'name', 'aliases', 'short', 'description', 'required' + """ + options = [] + current_option = None + + for line in lines: + # Detect new option based on indentation: + # - Starts with * (required, no leading space) + # - Starts with exactly 3 spaces (some options in Endpoint section) + # - Starts with NO spaces and uppercase (options in Input, Output, etc.) + # - More than 3 spaces or starts with many spaces = continuation + is_new_option = False + + if not line or line.isspace(): + continue + + if line[0] == "*": + # Required option (no leading space) + is_new_option = True + elif line.startswith(" ") and not line.startswith(" "): + # Exactly 3 spaces = new option (Endpoint section style) + is_new_option = True + elif not line.startswith(" ") and line[0].isupper(): + # No leading space and uppercase = new option (Input/Output section style) + is_new_option = True + + if is_new_option: + # Save previous option + if current_option: + options.append(current_option) + + # Start new option + current_option = parse_option_line(line.lstrip()) + elif current_option: + # Continuation of description + desc = line.strip() + if desc: + # Add space only if description already has content + if current_option["description"]: + current_option["description"] += " " + desc + else: + current_option["description"] = desc + + # Add the last option + if current_option: + options.append(current_option) + + return options + + +def parse_option_line(line: str) -> dict: + """Parse a single option line into components. 
+ + Returns: + Dict with 'name', 'aliases', 'short', 'description', 'required' + """ + import re + + option = { + "name": "", + "aliases": [], + "short": "", + "description": "", + "required": False, + } + + # Check if required (starts with *) + if line.lstrip().startswith("*"): + option["required"] = True + line = line.lstrip()[1:].lstrip() # Remove the * + + # Split on multiple spaces to separate option names from description + parts = re.split(r"\s{2,}", line.strip()) + + if not parts: + return option + + # First part contains option names + option_names = parts[0] + + # Extract option name and aliases + # Pattern: OPTION-NAME --long-name --alias -s + tokens = option_names.split() + + for token in tokens: + token = token.strip() + if not token: + continue + + if token.startswith("--"): + # Long option + if not option["name"]: + option["name"] = token + else: + option["aliases"].append(token) + elif token.startswith("-") and len(token) == 2: + # Short option + option["short"] = token + elif (token.isupper() or (token and token[0].isupper())) and not option["name"]: + # Environment variable style name + option["name"] = token + + # Description is everything after the option names + if len(parts) > 1: + option["description"] = " ".join(parts[1:]) + + return option + + +def format_option_column(option: dict) -> str: + """Format the option column with name, aliases, and required marker. + + Returns: + Formatted string for the option column + """ + parts = [] + + # Required marker + if option["required"]: + parts.append("**`*`**") + + # Main option name + if option["name"]: + name = option["name"] + # Format as code + if name.startswith("--") or name.startswith("-"): + parts.append(f"`{name}`") + else: + parts.append(f"**{name}**") + + # Aliases + for alias in option["aliases"]: + parts.append(f"`{alias}`") + + # Short option + if option["short"]: + parts.append(f"`{option['short']}`") + + return "
".join(parts) if parts else "" + + +def stage_file(file_path: Path) -> None: + """Stage a file using git add.""" + try: + subprocess.run( + ["git", "add", str(file_path)], + check=True, + capture_output=True, + text=True, + ) + print(f"✓ Staged {file_path} for commit", file=sys.stderr) + except subprocess.CalledProcessError as e: + print( + f"Warning: Could not stage {file_path}: {e.stderr}", + file=sys.stderr, + ) + except FileNotFoundError: + print( + "Warning: git command not found, file not staged", + file=sys.stderr, + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Generate CLI options documentation from aiperf profile --help" + ) + parser.add_argument( + "--check", + action="store_true", + help="Check if the current cli_options.md matches the generated output", + ) + parser.add_argument( + "--output", + type=Path, + default=Path("docs/cli_options.md"), + help="Output file path (default: docs/cli_options.md)", + ) + parser.add_argument( + "--no-stage", + action="store_true", + help="Don't automatically stage the file with git add (default: auto-stage)", + ) + args = parser.parse_args() + + # Get the help output + print("Running aiperf profile --help...", file=sys.stderr) + help_output = get_help_output() + + # Format as markdown + print("Formatting output as markdown...", file=sys.stderr) + markdown = format_help_as_markdown(help_output) + + if args.check: + # Check mode: compare with existing file + if not args.output.exists(): + print( + f"Error: {args.output} does not exist. Run without --check to generate it.", + file=sys.stderr, + ) + sys.exit(1) + + current_content = args.output.read_text() + if current_content.strip() == markdown.strip(): + print(f"✓ {args.output} is up to date!", file=sys.stderr) + sys.exit(0) + else: + print( + f"✗ {args.output} is out of sync with aiperf profile --help output!", + file=sys.stderr, + ) + print( + " Run 'make update-cli-docs' or 'python tools/generate_cli_options_md.py' to update it.", + file=sys.stderr, + ) + sys.exit(1) + else: + # Write mode: write to file and optionally stage it + file_existed = args.output.exists() + if file_existed: + old_content = args.output.read_text() + + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(markdown) + + # Check if content actually changed + content_changed = not file_existed or old_content != markdown + + if content_changed: + print(f"✓ Generated {args.output}", file=sys.stderr) + + # Auto-stage the file unless --no-stage is specified + if not args.no_stage: + stage_file(args.output) + else: + print(f"✓ {args.output} already up to date", file=sys.stderr) + + sys.exit(0) + + +if __name__ == "__main__": + main()