From d7c643c04434494b3707073128e7690b366abd16 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 16 Mar 2026 17:52:30 -0500 Subject: [PATCH 1/6] Initial benchmark porting to ASV --- .github/workflows/rocm-ci.yml | 73 +++++++++++ .gitignore | 1 + asv.conf.json | 16 +++ benchmarks/asv/README.md | 166 ++++++++++++++++++++++++++ benchmarks/asv/__init__.py | 0 benchmarks/asv/bench_attention.py | 56 +++++++++ benchmarks/asv/bench_casting.py | 51 ++++++++ benchmarks/asv/bench_gemm.py | 55 +++++++++ benchmarks/asv/bench_gemm_fp8.py | 60 ++++++++++ benchmarks/asv/bench_grouped_gemm.py | 64 ++++++++++ benchmarks/asv/bench_normalization.py | 36 ++++++ 11 files changed, 578 insertions(+) create mode 100644 asv.conf.json create mode 100644 benchmarks/asv/README.md create mode 100644 benchmarks/asv/__init__.py create mode 100644 benchmarks/asv/bench_attention.py create mode 100644 benchmarks/asv/bench_casting.py create mode 100644 benchmarks/asv/bench_gemm.py create mode 100644 benchmarks/asv/bench_gemm_fp8.py create mode 100644 benchmarks/asv/bench_grouped_gemm.py create mode 100644 benchmarks/asv/bench_normalization.py diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 32c3cb2a2..c35e5d0ad 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -368,6 +368,79 @@ jobs: EOF )" + - name: Restore previous ASV results + if: github.event_name == 'push' && github.ref_name == 'dev' + continue-on-error: true + env: + ARTIFACTORY_API_KEY: ${{ secrets.ARTIFACTORY_API_KEY }} + run: | + set -x + BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results" + ARTIFACT_URL="${BASE_URL}/${{ matrix.runner }}/results.tar.gz" + + curl -sf -H "X-JFrog-Art-Api:${ARTIFACTORY_API_KEY}" \ + -o /tmp/asv-results.tar.gz "$ARTIFACT_URL" || { + echo "::notice::No previous ASV results found. Starting fresh." 
+ exit 0 + } + + mkdir -p asv-results + tar xzf /tmp/asv-results.tar.gz -C asv-results/ + + # Copy into the container's ASV results directory + docker exec te-runner mkdir -p /workspace/benchmarks/.asv/results + docker cp asv-results/. te-runner:/workspace/benchmarks/.asv/results/ + echo "Restored previous ASV results from Artifactory." + + - name: Performance benchmarks (ASV) + if: github.event_name == 'push' && github.ref_name == 'dev' + continue-on-error: true + env: + RUNNER_NAME: ${{ matrix.runner }} + run: | + set -ex + + # Derive a stable machine name from the runner label + case "${RUNNER_NAME}" in + linux-te-mi325*) MACHINE_NAME="mi325" ;; + linux-te-mi355*) MACHINE_NAME="mi355" ;; + *) MACHINE_NAME="${RUNNER_NAME}" ;; + esac + + docker exec -e MACHINE_NAME="$MACHINE_NAME" te-runner bash -c "$(cat <<'OUTER' + set -ex + pip install asv + cd /workspace + asv machine --yes --machine "$MACHINE_NAME" + asv run --python=same --launch-method spawn \ + 2>&1 | tee /workspace/asv_results.txt + OUTER + )" + + # Copy results out of the container for upload + rm -rf asv-results + docker cp te-runner:/workspace/benchmarks/.asv/results/. ./asv-results/ || true + + - name: Upload ASV results + if: github.event_name == 'push' && github.ref_name == 'dev' + continue-on-error: true + env: + ARTIFACTORY_API_KEY: ${{ secrets.ARTIFACTORY_API_KEY }} + run: | + set -ex + if [[ ! -d asv-results ]] || [[ -z "$(ls -A asv-results)" ]]; then + echo "::notice::No ASV results to upload." + exit 0 + fi + + BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results" + tar czf /tmp/asv-results.tar.gz -C asv-results . + + curl -sf -H "X-JFrog-Art-Api:${ARTIFACTORY_API_KEY}" \ + -T /tmp/asv-results.tar.gz \ + "${BASE_URL}/${{ matrix.runner }}/results.tar.gz" + echo "Uploaded ASV results to Artifactory." 
+ - name: Check Test Failure Status if: always() run: | diff --git a/.gitignore b/.gitignore index d3b18b358..a5fd89b4b 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,4 @@ artifacts/ **/times.csv transformer_engine/build_info.txt transformer_engine/common/util/hip_nvml.* +.asv/ diff --git a/asv.conf.json b/asv.conf.json new file mode 100644 index 000000000..dc71bf345 --- /dev/null +++ b/asv.conf.json @@ -0,0 +1,16 @@ +{ + "version": 1, + "project": "TransformerEngine", + "project_url": "https://github.com/ROCm/TransformerEngine", + "repo": ".", + "branches": ["dev"], + "environment_type": "existing", + "install_command": [], + "build_command": [], + "benchmark_dir": "benchmarks/asv", + "results_dir": "benchmarks/.asv/results", + "html_dir": "benchmarks/.asv/html", + "install_timeout": 600, + "benchmark_timeout": 1200, + "launch_method": "spawn" +} diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md new file mode 100644 index 000000000..7de4fd6c5 --- /dev/null +++ b/benchmarks/asv/README.md @@ -0,0 +1,166 @@ +# ASV Benchmarks for TransformerEngine + +Performance benchmarks built on [ASV (Air Speed Velocity)](https://asv.readthedocs.io/), +a framework for benchmarking Python packages over their lifetime. + +## Prerequisites + +- TransformerEngine must already be built and installed in the current Python environment. +- A ROCm or CUDA GPU must be available. +- Install ASV: `pip install asv` + +ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the repo root), +meaning it uses the current Python environment directly — it does not create virtualenvs or +attempt to build TE itself. + +## Local usage + +All commands are run from the **repository root** (where `asv.conf.json` lives). + +### Register your machine + +```bash +asv machine --yes --machine my-machine-name +``` + +This creates a machine profile in `benchmarks/.asv/results/my-machine-name/machine.json`. 
+Use a descriptive name (e.g., `mi325`, `mi300x-dev`) — results are stored per machine, so +the name must be consistent across runs for historical comparison. + +### Run all benchmarks + +```bash +asv run --python=same --launch-method spawn +``` + +- `--python=same` — use the current interpreter (required with `environment_type: "existing"`) +- `--launch-method spawn` — required for CUDA (fork causes "Cannot re-initialize CUDA in forked subprocess") + +### Run a single suite + +```bash +asv run --python=same --launch-method spawn --bench bench_casting +``` + +The `--bench` argument accepts a regex that matches benchmark file or class names. + +### Quick smoke test + +```bash +asv run --python=same --launch-method spawn --quick --bench bench_casting +``` + +`--quick` runs each benchmark only once with no statistical analysis. Useful for verifying +benchmarks work, but note that results are **not saved to disk** in quick mode. + +### Compare two commits + +```bash +asv continuous --python=same --launch-method spawn HEAD~1 HEAD +``` + +This checks out each commit, runs benchmarks on both, and reports regressions. +Note: this only works if the benchmark files exist on both commits. + +### Generate an HTML dashboard + +```bash +asv publish +asv preview +``` + +`asv publish` generates static HTML from stored results into `benchmarks/.asv/html/`. +`asv preview` serves it locally on `http://localhost:8080`. + +## How results are stored + +### Local results + +ASV stores results as JSON files under `benchmarks/.asv/results/`: + +``` +benchmarks/.asv/results/ + my-machine-name/ + machine.json # Hardware/OS metadata + .json # Timing results for that commit + .json + ... +``` + +Each commit JSON contains the wall-clock timings for every benchmark + parameter combination +run on that machine. The `benchmarks/.asv/` directory is in `.gitignore`. + +### CI results (Artifactory) + +In CI, benchmarks run **only on pushes to `dev`** (not on PRs). 
This builds a historical +record of performance on the main branch. + +The CI pipeline (`.github/workflows/rocm-ci.yml`) follows this flow: + +1. **Restore** — Download `results.tar.gz` from Artifactory for the current runner +2. **Benchmark** — Run `asv run`, which appends a new `{commit}.json` to the results directory +3. **Upload** — Tar up the results directory and upload back to Artifactory + +Results are stored per machine at: +``` +https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results/ + linux-te-mi325-8/results.tar.gz + linux-te-mi355-8/results.tar.gz +``` + +Each tarball contains the full ASV results directory for that machine, accumulating +a new commit JSON on every push to `dev`. ASV machine names map to hardware: +`mi325` for MI325X runners, `mi355` for MI355X runners. + +### Downloading CI results locally + +To inspect CI results on your local machine (requires Artifactory access): + +```bash +# Download results for a specific machine +curl -sf -H "X-JFrog-Art-Api:${ARTIFACTORY_API_KEY}" \ + -o results.tar.gz \ + "https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results/linux-te-mi325-8/results.tar.gz" + +# Extract into your local ASV results directory +mkdir -p benchmarks/.asv/results +tar xzf results.tar.gz -C benchmarks/.asv/results/ + +# Generate and view the dashboard +asv publish +asv preview +``` + +This can also be provided statically via github pages. + +## Writing new benchmarks + +Create a new file in `benchmarks/asv/` following the naming convention `bench_.py`. + +```python +import torch +import transformer_engine.pytorch as te + +class BenchSomething: + params = [[1024, 4096], ["config_a", "config_b"]] + param_names = ["M", "config"] + timeout = 300 # seconds, per parameter combination + + def setup(self, M, config): + # Allocate tensors, create modules. + # This runs before each time_* method but is NOT timed. + ... 
+ + def time_forward(self, M, config): + # ASV times this method (adaptive iterations + statistics). + # MUST call torch.cuda.synchronize() to ensure GPU work completes. + self.module(self.x) + torch.cuda.synchronize() +``` + +Key rules: +- Method names starting with `time_` are automatically timed by ASV. +- Always call `torch.cuda.synchronize()` at the end of `time_*` methods. +- Clear `.grad` attributes in backward benchmarks to prevent memory accumulation. +- ASV runs each `time_*` method in a **separate subprocess** — no shared state between methods. +- The `params` list defines a cross-product; keep the matrix size reasonable. diff --git a/benchmarks/asv/__init__.py b/benchmarks/asv/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/asv/bench_attention.py b/benchmarks/asv/bench_attention.py new file mode 100644 index 000000000..9c64888f6 --- /dev/null +++ b/benchmarks/asv/bench_attention.py @@ -0,0 +1,56 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""Fused multi-head attention (GQA) benchmarks via te.DotProductAttention. 
+ +Forward FLOPs = 4 * batch * num_q_heads * seq_len^2 * head_dim +Backward FLOPs ~ 2x forward +""" + +import torch +import transformer_engine.pytorch as te + +BATCH = 2 + +# (num_q_heads, num_kv_heads, head_dim, tp) +MODELS = { + "Llama3-8B_TP1": (32, 8, 128, 1), + "Llama3-8B_TP8": (32, 8, 128, 8), + "Llama3-70B_TP8": (64, 8, 128, 8), + "Llama3-405B_TP8": (128, 8, 128, 8), + "Qwen2.5-7B_TP1": (28, 4, 128, 1), + "Qwen2.5-72B_TP8": (64, 8, 128, 8), +} + + +class BenchAttention: + params = [[1024, 2048, 4096, 8192], list(MODELS)] + param_names = ["seq_len", "model"] + timeout = 300 + + def setup(self, seq_len, model): + n_q, n_kv, hd, tp = MODELS[model] + qh, kvh = n_q // tp, n_kv // tp + dtype = torch.bfloat16 + + self.attn = te.DotProductAttention( + num_attention_heads=qh, kv_channels=hd, + num_gqa_groups=kvh, attn_mask_type="causal", + ).to(device="cuda", dtype=dtype) + + self.q = torch.randn(seq_len, BATCH, qh, hd, dtype=dtype, device="cuda", requires_grad=True) + self.k = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True) + self.v = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True) + self.grad_out = torch.randn_like(self.attn(self.q, self.k, self.v)) + + def time_forward(self, seq_len, model): + self.attn(self.q, self.k, self.v) + torch.cuda.synchronize() + + def time_forward_backward(self, seq_len, model): + out = self.attn(self.q, self.k, self.v) + out.backward(self.grad_out) + self.q.grad = self.k.grad = self.v.grad = None + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_casting.py b/benchmarks/asv/bench_casting.py new file mode 100644 index 000000000..7195a01ab --- /dev/null +++ b/benchmarks/asv/bench_casting.py @@ -0,0 +1,51 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. 
+############################################################################### +"""FP8 casting micro-benchmarks. + +Memory-bound quantization/dequantization between BF16 and FP8 formats. +""" + +import torch + +if hasattr(torch, "float8_e4m3fnuz"): + FP8_E4M3 = torch.float8_e4m3fnuz + FP8_E5M2 = torch.float8_e5m2fnuz +else: + FP8_E4M3 = torch.float8_e4m3fn + FP8_E5M2 = torch.float8_e5m2 + +HIDDEN_SIZES = { + "Llama3-8B": 4096, + "Llama3-70B": 8192, + "Llama3-405B": 16384, + "Qwen2.5-7B": 3584, + "Qwen2.5-72B": 8192, +} + +CAST_CONFIGS = { + "BF16_to_E4M3": (torch.bfloat16, FP8_E4M3), + "E4M3_to_BF16": (FP8_E4M3, torch.bfloat16), + "BF16_to_E5M2": (torch.bfloat16, FP8_E5M2), + "E5M2_to_BF16": (FP8_E5M2, torch.bfloat16), +} + + +class BenchCasting: + params = [[1024, 2048, 4096, 8192], list(HIDDEN_SIZES), list(CAST_CONFIGS)] + param_names = ["M", "model", "cast"] + timeout = 120 + + def setup(self, M, model, cast): + hidden = HIDDEN_SIZES[model] + src_dtype, self.dst_dtype = CAST_CONFIGS[cast] + if src_dtype in (FP8_E4M3, FP8_E5M2): + self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda").to(src_dtype) + else: + self.x = torch.randn(M, hidden, dtype=src_dtype, device="cuda") + + def time_cast(self, M, model, cast): + self.x.to(self.dst_dtype) + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_gemm.py b/benchmarks/asv/bench_gemm.py new file mode 100644 index 000000000..6a09a08b5 --- /dev/null +++ b/benchmarks/asv/bench_gemm.py @@ -0,0 +1,55 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""BF16 GEMM benchmarks via te.Linear. + +GEMM shapes derived from transformer layer projections: + QKV, AttnOut, GateUp (SwiGLU), Down. 
+""" + +import torch +import transformer_engine.pytorch as te + +# (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp) +MODELS = { + "Llama3-8B_TP1": (4096, 14336, 32, 8, 128, 1), + "Llama3-8B_TP8": (4096, 14336, 32, 8, 128, 8), + "Llama3-70B_TP8": (8192, 28672, 64, 8, 128, 8), + "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8), + "Qwen2.5-7B_TP1": (3584, 18944, 28, 4, 128, 1), + "Qwen2.5-72B_TP8": (8192, 29568, 64, 8, 128, 8), +} + +# Pre-compute (N, K) for each GEMM shape +SHAPES = {} +for _name, (h, inter, nq, nkv, hd, tp) in MODELS.items(): + SHAPES[f"{_name}-QKV"] = ((nq * hd + 2 * nkv * hd) // tp, h) + SHAPES[f"{_name}-AttnOut"] = (h, (nq * hd) // tp) + SHAPES[f"{_name}-GateUp"] = ((2 * inter) // tp, h) + SHAPES[f"{_name}-Down"] = (h, inter // tp) + + +class BenchGemm: + params = [[1024, 2048, 4096, 8192], list(SHAPES)] + param_names = ["M", "shape"] + timeout = 300 + + def setup(self, M, shape): + N, K = SHAPES[shape] + dtype = torch.bfloat16 + self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype) + self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True) + self.grad_out = torch.randn_like(self.linear(self.x)) + + def time_forward(self, M, shape): + self.linear(self.x) + torch.cuda.synchronize() + + def time_forward_backward(self, M, shape): + out = self.linear(self.x) + out.backward(self.grad_out) + self.x.grad = None + self.linear.weight.grad = None + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_gemm_fp8.py b/benchmarks/asv/bench_gemm_fp8.py new file mode 100644 index 000000000..9d70d8879 --- /dev/null +++ b/benchmarks/asv/bench_gemm_fp8.py @@ -0,0 +1,60 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""FP8 GEMM benchmarks via te.Linear under fp8_autocast. 
+ +Same shapes as bench_gemm.py but with FP8 quantized compute. +""" + +import torch +import transformer_engine.pytorch as te +from transformer_engine.common.recipe import DelayedScaling, Format + +# (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp) +MODELS = { + "Llama3-8B_TP1": (4096, 14336, 32, 8, 128, 1), + "Llama3-8B_TP8": (4096, 14336, 32, 8, 128, 8), + "Llama3-70B_TP8": (8192, 28672, 64, 8, 128, 8), + "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8), + "Qwen2.5-7B_TP1": (3584, 18944, 28, 4, 128, 1), + "Qwen2.5-72B_TP8": (8192, 29568, 64, 8, 128, 8), +} + +SHAPES = {} +for _name, (h, inter, nq, nkv, hd, tp) in MODELS.items(): + SHAPES[f"{_name}-QKV"] = ((nq * hd + 2 * nkv * hd) // tp, h) + SHAPES[f"{_name}-AttnOut"] = (h, (nq * hd) // tp) + SHAPES[f"{_name}-GateUp"] = ((2 * inter) // tp, h) + SHAPES[f"{_name}-Down"] = (h, inter // tp) + +FP8_RECIPE = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max", +) + + +class BenchGemmFP8: + params = [[1024, 2048, 4096, 8192], list(SHAPES)] + param_names = ["M", "shape"] + timeout = 300 + + def setup(self, M, shape): + N, K = SHAPES[shape] + dtype = torch.bfloat16 + self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype) + self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True) + self.grad_out = torch.randn(M, N, dtype=dtype, device="cuda") + + def time_forward(self, M, shape): + with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + self.linear(self.x) + torch.cuda.synchronize() + + def time_forward_backward(self, M, shape): + with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + out = self.linear(self.x) + out.backward(self.grad_out) + self.x.grad = None + self.linear.weight.grad = None + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_grouped_gemm.py b/benchmarks/asv/bench_grouped_gemm.py new file mode 100644 index 000000000..3c35737f5 --- /dev/null +++ b/benchmarks/asv/bench_grouped_gemm.py @@ -0,0 +1,64 
@@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""Grouped GEMM benchmarks via te.GroupedLinear. + +MoE model configurations with GateUp and Down projections. +""" + +import torch +import transformer_engine.pytorch as te + +# (n_routed_experts, moe_intermediate_size, hidden_size) +MOE_MODELS = { + "DSV2-Lite": (64, 1408, 2048), + "DSV2": (160, 1536, 5120), + "DSV3": (256, 2048, 7168), + "Grok-V2": (8, 16384, 8192), +} + +# Build (config_key -> (num_gemms, N, K)) mapping +CONFIGS = {} +for model, (n_experts, inter, hidden) in MOE_MODELS.items(): + for ep in [32, 16, 8]: + if n_experts % ep != 0: + continue + B = n_experts // ep + CONFIGS[f"{model}_EP{ep}-GateUp"] = (B, 2 * inter, hidden) + CONFIGS[f"{model}_EP{ep}-Down"] = (B, hidden, inter) + + +class BenchGroupedGemm: + params = [[512, 1024, 2048, 4096], list(CONFIGS)] + param_names = ["M", "config"] + timeout = 300 + + def setup(self, M, config): + B, N, K = CONFIGS[config] + dtype = torch.bfloat16 + + self.module = te.GroupedLinear( + num_gemms=B, in_features=K, out_features=N, bias=False, + ).to(device="cuda", dtype=dtype) + + self.xs = [ + torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True) + for _ in range(B) + ] + outs = self.module(self.xs) + self.grad_outs = [torch.randn_like(o) for o in outs] + + def time_forward(self, M, config): + self.module(self.xs) + torch.cuda.synchronize() + + def time_forward_backward(self, M, config): + outs = self.module(self.xs) + torch.autograd.backward(outs, self.grad_outs) + for x in self.xs: + x.grad = None + for p in self.module.parameters(): + p.grad = None + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_normalization.py b/benchmarks/asv/bench_normalization.py new file mode 100644 index 000000000..f68b60a51 --- 
/dev/null +++ b/benchmarks/asv/bench_normalization.py @@ -0,0 +1,36 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""RMSNorm and LayerNorm benchmarks on activation-sized tensors.""" + +import torch +import transformer_engine.pytorch as te + +NORMS = {"RMSNorm": te.RMSNorm, "LayerNorm": te.LayerNorm} +HIDDEN_SIZES = [3584, 4096, 8192, 16384] + + +class BenchNormalization: + params = [[1024, 2048, 4096, 8192], HIDDEN_SIZES, list(NORMS)] + param_names = ["M", "hidden", "norm_type"] + timeout = 120 + + def setup(self, M, hidden, norm_type): + dtype = torch.bfloat16 + self.norm = NORMS[norm_type](hidden).to(device="cuda", dtype=dtype) + self.x = torch.randn(M, hidden, dtype=dtype, device="cuda", requires_grad=True) + self.grad_out = torch.randn_like(self.norm(self.x)) + + def time_forward(self, M, hidden, norm_type): + self.norm(self.x) + torch.cuda.synchronize() + + def time_forward_backward(self, M, hidden, norm_type): + out = self.norm(self.x) + out.backward(self.grad_out) + self.x.grad = None + for p in self.norm.parameters(): + p.grad = None + torch.cuda.synchronize() From b8291223203b89bdb098a27a54728fdb174fd755 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 17 Mar 2026 08:51:18 -0500 Subject: [PATCH 2/6] Update casting benchmark --- benchmarks/asv/bench_casting.py | 42 ++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/benchmarks/asv/bench_casting.py b/benchmarks/asv/bench_casting.py index 7195a01ab..fb594a7c2 100644 --- a/benchmarks/asv/bench_casting.py +++ b/benchmarks/asv/bench_casting.py @@ -5,17 +5,13 @@ ############################################################################### """FP8 casting micro-benchmarks. 
-Memory-bound quantization/dequantization between BF16 and FP8 formats. +Memory-bound quantization/dequantization between BF16 and FP8 formats +using Transformer Engine's quantized tensor infrastructure. """ import torch - -if hasattr(torch, "float8_e4m3fnuz"): - FP8_E4M3 = torch.float8_e4m3fnuz - FP8_E5M2 = torch.float8_e5m2fnuz -else: - FP8_E4M3 = torch.float8_e4m3fn - FP8_E5M2 = torch.float8_e5m2 +from transformer_engine.pytorch import Float8CurrentScalingQuantizer +from transformer_engine_torch import DType as TE_DType HIDDEN_SIZES = { "Llama3-8B": 4096, @@ -26,10 +22,10 @@ } CAST_CONFIGS = { - "BF16_to_E4M3": (torch.bfloat16, FP8_E4M3), - "E4M3_to_BF16": (FP8_E4M3, torch.bfloat16), - "BF16_to_E5M2": (torch.bfloat16, FP8_E5M2), - "E5M2_to_BF16": (FP8_E5M2, torch.bfloat16), + "BF16_to_E4M3": ("quantize", TE_DType.kFloat8E4M3), + "E4M3_to_BF16": ("dequantize", TE_DType.kFloat8E4M3), + "BF16_to_E5M2": ("quantize", TE_DType.kFloat8E5M2), + "E5M2_to_BF16": ("dequantize", TE_DType.kFloat8E5M2), } @@ -40,12 +36,24 @@ class BenchCasting: def setup(self, M, model, cast): hidden = HIDDEN_SIZES[model] - src_dtype, self.dst_dtype = CAST_CONFIGS[cast] - if src_dtype in (FP8_E4M3, FP8_E5M2): - self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda").to(src_dtype) + direction, fp8_dtype = CAST_CONFIGS[cast] + self.direction = direction + quantizer = Float8CurrentScalingQuantizer( + fp8_dtype=fp8_dtype, + device=torch.device("cuda"), + rowwise=True, + columnwise=False, + ) + if direction == "dequantize": + bf16_tensor = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda") + self.x = quantizer.quantize(bf16_tensor) else: - self.x = torch.randn(M, hidden, dtype=src_dtype, device="cuda") + self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda") + self.quantizer = quantizer def time_cast(self, M, model, cast): - self.x.to(self.dst_dtype) + if self.direction == "quantize": + self.quantizer.quantize(self.x) + else: + 
self.x.dequantize(dtype=torch.bfloat16) torch.cuda.synchronize() From 21678b41426f3bc7e30f56022c93f66444a462d6 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 18 Mar 2026 16:26:16 -0500 Subject: [PATCH 3/6] Added helper script and documentation --- benchmarks/asv/README.md | 31 +++++++++++++++- benchmarks/asv/run_benchmarks.sh | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100755 benchmarks/asv/run_benchmarks.sh diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md index 7de4fd6c5..bd02b4991 100644 --- a/benchmarks/asv/README.md +++ b/benchmarks/asv/README.md @@ -13,7 +13,36 @@ ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the meaning it uses the current Python environment directly — it does not create virtualenvs or attempt to build TE itself. -## Local usage +## Helper script + +A convenience wrapper (`benchmarks/asv/run_benchmarks.sh`) is provided for common tasks. +It can be run from anywhere — it automatically `cd`s to the repo root. Available benchmark +suites are discovered dynamically from `bench_*.py` files. + +```bash +bash benchmarks/asv/run_benchmarks.sh [options] +``` + +| Command | Description | +|---|---| +| `setup [name]` | Register machine with ASV (defaults to `hostname`) | +| `run [suite]` | Run all benchmarks, or a single suite (e.g. 
`bench_casting`) | +| `quick [suite]` | Smoke test — single iteration, results not saved | +| `compare [ref] [new]` | Compare two commits (defaults to `HEAD~1` vs `HEAD`) | +| `view` | Generate HTML dashboard and serve on `localhost:8080` | +| `list` | List available benchmark suites | + +Examples: + +```bash +bash benchmarks/asv/run_benchmarks.sh setup mi325 +bash benchmarks/asv/run_benchmarks.sh run bench_casting +bash benchmarks/asv/run_benchmarks.sh quick +bash benchmarks/asv/run_benchmarks.sh compare HEAD~3 HEAD +bash benchmarks/asv/run_benchmarks.sh view +``` + +## Local usage (manual ASV commands) All commands are run from the **repository root** (where `asv.conf.json` lives). diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh new file mode 100755 index 000000000..5f07c23ff --- /dev/null +++ b/benchmarks/asv/run_benchmarks.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Helper script for common ASV benchmark tasks. +# Run from the repository root (where asv.conf.json lives). +set -euo pipefail + +cd "$(git rev-parse --show-toplevel)" + +BENCH_DIR="benchmarks/asv" +mapfile -t SUITES < <(find "$BENCH_DIR" -maxdepth 1 -name 'bench_*.py' -printf '%f\n' | sed 's/\.py$//' | sort) + +usage() { + cat < [options] + +Commands: + setup Register this machine with ASV + run [SUITE] Run all benchmarks, or a single suite (e.g. 
bench_casting) + quick [SUITE] Smoke-test run (single iteration, results not saved) + compare [REF] [NEW] Compare two commits (default: HEAD~1 vs HEAD) + view Generate HTML dashboard and open preview server + list List available benchmark suites + +EOF +} + +case "${1:-}" in + setup) + MACHINE="${2:-$(hostname)}" + echo "Registering machine as: $MACHINE" + asv machine --yes --machine "$MACHINE" + ;; + run) + CMD=(asv run --python=same --launch-method spawn) + [[ -n "${2:-}" ]] && CMD+=(--bench "$2") + echo "Running: ${CMD[*]}" + "${CMD[@]}" + ;; + quick) + CMD=(asv run --python=same --launch-method spawn --quick) + [[ -n "${2:-}" ]] && CMD+=(--bench "$2") + echo "Running (quick): ${CMD[*]}" + "${CMD[@]}" + ;; + compare) + REF="${2:-HEAD~1}" + NEW="${3:-HEAD}" + echo "Comparing $REF vs $NEW" + asv continuous --python=same --launch-method spawn "$REF" "$NEW" + ;; + view) + asv publish + echo "Starting preview server at http://localhost:8080" + asv preview + ;; + list) + echo "Available benchmark suites:" + for s in "${SUITES[@]}"; do echo " $s"; done + ;; + *) + usage + exit 1 + ;; +esac From 6cb91a56a86bfb3653d333de851a282bf7b8ee97 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 19 Mar 2026 13:09:32 -0500 Subject: [PATCH 4/6] Corrected local benchmarking --- asv.conf.json | 2 +- benchmarks/asv/README.md | 13 ++++++++----- benchmarks/asv/run_benchmarks.sh | 6 ++++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/asv.conf.json b/asv.conf.json index dc71bf345..1b17c9c9e 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -3,7 +3,7 @@ "project": "TransformerEngine", "project_url": "https://github.com/ROCm/TransformerEngine", "repo": ".", - "branches": ["dev"], + "branches": ["HEAD", "dev"], "environment_type": "existing", "install_command": [], "build_command": [], diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md index bd02b4991..5d2686ed2 100644 --- a/benchmarks/asv/README.md +++ b/benchmarks/asv/README.md @@ -11,7 +11,8 @@ a 
framework for benchmarking Python packages over their lifetime. ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the repo root), meaning it uses the current Python environment directly — it does not create virtualenvs or -attempt to build TE itself. +attempt to build TE itself. The config sets `branches: ["HEAD", "dev"]` so that `asv publish` +accepts results from both the currently checked-out branch and `dev` (for CI history). ## Helper script @@ -26,7 +27,7 @@ bash benchmarks/asv/run_benchmarks.sh [options] | Command | Description | |---|---| | `setup [name]` | Register machine with ASV (defaults to `hostname`) | -| `run [suite]` | Run all benchmarks, or a single suite (e.g. `bench_casting`) | +| `run [suite]` | Run benchmarks for the current commit (optionally a single suite) | | `quick [suite]` | Smoke test — single iteration, results not saved | | `compare [ref] [new]` | Compare two commits (defaults to `HEAD~1` vs `HEAD`) | | `view` | Generate HTML dashboard and serve on `localhost:8080` | @@ -59,16 +60,18 @@ the name must be consistent across runs for historical comparison. ### Run all benchmarks ```bash -asv run --python=same --launch-method spawn +asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) ``` - `--python=same` — use the current interpreter (required with `environment_type: "existing"`) - `--launch-method spawn` — required for CUDA (fork causes "Cannot re-initialize CUDA in forked subprocess") +- `--set-commit-hash` — **required** with `environment_type: "existing"`. Without it, ASV + runs benchmarks but silently discards results. The helper script sets this automatically. ### Run a single suite ```bash -asv run --python=same --launch-method spawn --bench bench_casting +asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) --bench bench_casting ``` The `--bench` argument accepts a regex that matches benchmark file or class names. 
@@ -76,7 +79,7 @@ The `--bench` argument accepts a regex that matches benchmark file or class name ### Quick smoke test ```bash -asv run --python=same --launch-method spawn --quick --bench bench_casting +asv run --python=same --launch-method spawn --quick --set-commit-hash $(git rev-parse HEAD) --bench bench_casting ``` `--quick` runs each benchmark only once with no statistical analysis. Useful for verifying diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh index 5f07c23ff..7e9a21d23 100755 --- a/benchmarks/asv/run_benchmarks.sh +++ b/benchmarks/asv/run_benchmarks.sh @@ -30,13 +30,15 @@ case "${1:-}" in asv machine --yes --machine "$MACHINE" ;; run) - CMD=(asv run --python=same --launch-method spawn) + CMD=(asv run --python=same --launch-method spawn + --set-commit-hash "$(git rev-parse HEAD)") [[ -n "${2:-}" ]] && CMD+=(--bench "$2") echo "Running: ${CMD[*]}" "${CMD[@]}" ;; quick) - CMD=(asv run --python=same --launch-method spawn --quick) + CMD=(asv run --python=same --launch-method spawn --quick + --set-commit-hash "$(git rev-parse HEAD)") [[ -n "${2:-}" ]] && CMD+=(--bench "$2") echo "Running (quick): ${CMD[*]}" "${CMD[@]}" From 1a98989da5e79b9f53da90485b0157d5d256670a Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 19 Mar 2026 13:28:09 -0500 Subject: [PATCH 5/6] Added direct-run option to bypass subprocess overhead --- asv.conf.json | 2 +- benchmarks/asv/README.md | 8 ++- benchmarks/asv/direct_run.py | 92 ++++++++++++++++++++++++++++++++ benchmarks/asv/run_benchmarks.sh | 11 ++++ 4 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 benchmarks/asv/direct_run.py diff --git a/asv.conf.json b/asv.conf.json index 1b17c9c9e..482e20c60 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -3,7 +3,7 @@ "project": "TransformerEngine", "project_url": "https://github.com/ROCm/TransformerEngine", "repo": ".", - "branches": ["HEAD", "dev"], + "branches": ["HEAD"], "environment_type": "existing", "install_command": [], 
"build_command": [], diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md index 5d2686ed2..5d88d8ad3 100644 --- a/benchmarks/asv/README.md +++ b/benchmarks/asv/README.md @@ -11,8 +11,9 @@ a framework for benchmarking Python packages over their lifetime. ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the repo root), meaning it uses the current Python environment directly — it does not create virtualenvs or -attempt to build TE itself. The config sets `branches: ["HEAD", "dev"]` so that `asv publish` -accepts results from both the currently checked-out branch and `dev` (for CI history). +attempt to build TE itself. The config sets `branches: ["HEAD"]` so that `asv publish` accepts results from +whichever branch is currently checked out — this works for both local development +and CI (where `HEAD` points to `dev`). ## Helper script @@ -29,6 +30,7 @@ bash benchmarks/asv/run_benchmarks.sh [options] | `setup [name]` | Register machine with ASV (defaults to `hostname`) | | `run [suite]` | Run benchmarks for the current commit (optionally a single suite) | | `quick [suite]` | Smoke test — single iteration, results not saved | +| `direct suite [method]` | Fast in-process run — no subprocesses, no ASV overhead | | `compare [ref] [new]` | Compare two commits (defaults to `HEAD~1` vs `HEAD`) | | `view` | Generate HTML dashboard and serve on `localhost:8080` | | `list` | List available benchmark suites | @@ -39,6 +41,8 @@ Examples: bash benchmarks/asv/run_benchmarks.sh setup mi325 bash benchmarks/asv/run_benchmarks.sh run bench_casting bash benchmarks/asv/run_benchmarks.sh quick +bash benchmarks/asv/run_benchmarks.sh direct bench_casting +bash benchmarks/asv/run_benchmarks.sh direct bench_gemm time_forward bash benchmarks/asv/run_benchmarks.sh compare HEAD~3 HEAD bash benchmarks/asv/run_benchmarks.sh view ``` diff --git a/benchmarks/asv/direct_run.py b/benchmarks/asv/direct_run.py new file mode 100644 index 000000000..1e59e99e8 --- 
/dev/null +++ b/benchmarks/asv/direct_run.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +"""Run ASV benchmark classes directly in-process, bypassing subprocess overhead. + +Usage: + python benchmarks/asv/direct_run.py [options] [method_filter] + +Examples: + python benchmarks/asv/direct_run.py bench_casting + python benchmarks/asv/direct_run.py bench_gemm time_forward + python benchmarks/asv/direct_run.py -w 5 -n 20 bench_casting +""" + +import argparse +import importlib +import itertools +import math +import sys +import time + + +def run_class(cls, class_name, method_filter=None, warmup=3, iters=7): + methods = sorted(m for m in dir(cls) if m.startswith("time_")) + if method_filter: + methods = [m for m in methods if method_filter in m] + if not methods: + return + + params = getattr(cls, "params", [[]]) + param_names = getattr(cls, "param_names", []) + combos = list(itertools.product(*params)) + + print(f"\n{class_name} ({len(combos)} combos x {len(methods)} methods, " + f"{warmup} warmup, {iters} timed)") + print("-" * 90) + print(f" {'median':>10} {'mean':>10} {'stdev':>10} {'method':<30} params") + print("-" * 90) + + for combo in combos: + label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo)) + instance = cls() + try: + instance.setup(*combo) + except Exception as e: + print(f" SKIP {label} setup failed: {e}") + continue + + for method_name in methods: + method = getattr(instance, method_name) + + for _ in range(warmup): + method(*combo) + + times = [] + for _ in range(iters): + t0 = time.perf_counter() + method(*combo) + times.append(time.perf_counter() - t0) + + times.sort() + median = times[len(times) // 2] + mean = sum(times) / len(times) + stdev = math.sqrt(sum((t - mean) ** 2 for t in times) / len(times)) + print(f" {median*1000:>8.3f}ms {mean*1000:>8.3f}ms " + f"{stdev*1000:>8.3f}ms {method_name:<30} {label}") + + +def main(): + parser = argparse.ArgumentParser( + description="Run ASV benchmarks directly in-process (no subprocess overhead).") 
+ parser.add_argument("suite", help="Benchmark module name (e.g. bench_casting)") + parser.add_argument("method_filter", nargs="?", default=None, + help="Only run time_* methods containing this string") + parser.add_argument("-w", "--warmup", type=int, default=3, + help="Number of warmup iterations (default: 3)") + parser.add_argument("-n", "--iters", type=int, default=7, + help="Number of timed iterations (default: 7)") + args = parser.parse_args() + + mod = importlib.import_module(args.suite) + + for name in sorted(dir(mod)): + obj = getattr(mod, name) + if isinstance(obj, type) and name.startswith("Bench"): + run_class(obj, name, args.method_filter, args.warmup, args.iters) + + +if __name__ == "__main__": + import os + + os.chdir(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, ".") + main() diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh index 7e9a21d23..4ca71881c 100755 --- a/benchmarks/asv/run_benchmarks.sh +++ b/benchmarks/asv/run_benchmarks.sh @@ -16,6 +16,8 @@ Commands: setup Register this machine with ASV run [SUITE] Run all benchmarks, or a single suite (e.g. 
bench_casting) quick [SUITE] Smoke-test run (single iteration, results not saved) + direct [-w W] [-n N] SUITE [METHOD] + Fast in-process run (no subprocesses, no ASV overhead) compare [REF] [NEW] Compare two commits (default: HEAD~1 vs HEAD) view Generate HTML dashboard and open preview server list List available benchmark suites @@ -43,6 +45,15 @@ case "${1:-}" in echo "Running (quick): ${CMD[*]}" "${CMD[@]}" ;; + direct) + shift + if [[ $# -eq 0 ]]; then + echo "Usage: $0 direct [options] SUITE [METHOD]" + echo "Options: -w WARMUP -n ITERS" + exit 1 + fi + python "$BENCH_DIR/direct_run.py" "$@" + ;; compare) REF="${2:-HEAD~1}" NEW="${3:-HEAD}" From 498f16d02eab404ba5f24be01fb804f9c440e59e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 20 Mar 2026 14:01:35 -0500 Subject: [PATCH 6/6] Refactor to prefer direct runs, and moved asv conf --- benchmarks/asv/README.md | 39 ++- asv.conf.json => benchmarks/asv/asv.conf.json | 8 +- benchmarks/asv/direct_run.py | 279 ++++++++++++++++-- benchmarks/asv/run_benchmarks.sh | 58 ++-- 4 files changed, 313 insertions(+), 71 deletions(-) rename asv.conf.json => benchmarks/asv/asv.conf.json (69%) diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md index 5d88d8ad3..9189522e1 100644 --- a/benchmarks/asv/README.md +++ b/benchmarks/asv/README.md @@ -9,7 +9,7 @@ a framework for benchmarking Python packages over their lifetime. - A ROCm or CUDA GPU must be available. - Install ASV: `pip install asv` -ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the repo root), +ASV is configured with `environment_type: "existing"` (in `benchmarks/asv/asv.conf.json`), meaning it uses the current Python environment directly — it does not create virtualenvs or attempt to build TE itself. 
The config sets `branches: ["HEAD"]` so that `asv publish` accepts results from whichever branch is currently checked out — this works for both local development @@ -28,33 +28,40 @@ bash benchmarks/asv/run_benchmarks.sh [options] | Command | Description | |---|---| | `setup [name]` | Register machine with ASV (defaults to `hostname`) | -| `run [suite]` | Run benchmarks for the current commit (optionally a single suite) | -| `quick [suite]` | Smoke test — single iteration, results not saved | -| `direct suite [method]` | Fast in-process run — no subprocesses, no ASV overhead | +| `run [suite] [method]` | Run benchmarks in-process (fast, saves ASV-compatible results) | +| `run --asv [suite]` | Run benchmarks via ASV (subprocess isolation per benchmark) | +| `quick [suite]` | Smoke test via ASV — single iteration, results not saved | | `compare [ref] [new]` | Compare two commits (defaults to `HEAD~1` vs `HEAD`) | | `view` | Generate HTML dashboard and serve on `localhost:8080` | | `list` | List available benchmark suites | +The default `run` command executes benchmarks directly in-process, avoiding the +significant subprocess-per-benchmark overhead that ASV imposes. Results are saved in +ASV-compatible format and can be viewed with `view`. Use `run --asv` when you need +ASV's subprocess isolation (e.g. for CI or statistical rigor). 
+ Examples: ```bash bash benchmarks/asv/run_benchmarks.sh setup mi325 -bash benchmarks/asv/run_benchmarks.sh run bench_casting -bash benchmarks/asv/run_benchmarks.sh quick -bash benchmarks/asv/run_benchmarks.sh direct bench_casting -bash benchmarks/asv/run_benchmarks.sh direct bench_gemm time_forward +bash benchmarks/asv/run_benchmarks.sh run # all suites +bash benchmarks/asv/run_benchmarks.sh run bench_casting # one suite +bash benchmarks/asv/run_benchmarks.sh run bench_gemm time_forward # one method +bash benchmarks/asv/run_benchmarks.sh run -w 5 -n 20 bench_casting # custom iterations +bash benchmarks/asv/run_benchmarks.sh run --asv bench_casting # via ASV subprocesses bash benchmarks/asv/run_benchmarks.sh compare HEAD~3 HEAD bash benchmarks/asv/run_benchmarks.sh view ``` ## Local usage (manual ASV commands) -All commands are run from the **repository root** (where `asv.conf.json` lives). +All manual `asv` commands require `--config` with an **absolute path** to the config file +and should be run from the **repository root**. ASV does not resolve relative `--config` paths. ### Register your machine ```bash -asv machine --yes --machine my-machine-name +asv machine --config $(pwd)/benchmarks/asv/asv.conf.json --yes --machine my-machine-name ``` This creates a machine profile in `benchmarks/.asv/results/my-machine-name/machine.json`. @@ -64,7 +71,7 @@ the name must be consistent across runs for historical comparison. 
### Run all benchmarks ```bash -asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) +asv run --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) ``` - `--python=same` — use the current interpreter (required with `environment_type: "existing"`) @@ -75,7 +82,7 @@ asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HE ### Run a single suite ```bash -asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) --bench bench_casting +asv run --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) --bench bench_casting ``` The `--bench` argument accepts a regex that matches benchmark file or class names. @@ -83,7 +90,7 @@ The `--bench` argument accepts a regex that matches benchmark file or class name ### Quick smoke test ```bash -asv run --python=same --launch-method spawn --quick --set-commit-hash $(git rev-parse HEAD) --bench bench_casting +asv run --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn --quick --set-commit-hash $(git rev-parse HEAD) --bench bench_casting ``` `--quick` runs each benchmark only once with no statistical analysis. Useful for verifying @@ -92,7 +99,7 @@ benchmarks work, but note that results are **not saved to disk** in quick mode. ### Compare two commits ```bash -asv continuous --python=same --launch-method spawn HEAD~1 HEAD +asv continuous --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn HEAD~1 HEAD ``` This checks out each commit, runs benchmarks on both, and reports regressions. @@ -101,8 +108,8 @@ Note: this only works if the benchmark files exist on both commits. 
### Generate an HTML dashboard ```bash -asv publish -asv preview +asv publish --config $(pwd)/benchmarks/asv/asv.conf.json +asv preview --config $(pwd)/benchmarks/asv/asv.conf.json ``` `asv publish` generates static HTML from stored results into `benchmarks/.asv/html/`. diff --git a/asv.conf.json b/benchmarks/asv/asv.conf.json similarity index 69% rename from asv.conf.json rename to benchmarks/asv/asv.conf.json index 482e20c60..3c1616aac 100644 --- a/asv.conf.json +++ b/benchmarks/asv/asv.conf.json @@ -2,14 +2,14 @@ "version": 1, "project": "TransformerEngine", "project_url": "https://github.com/ROCm/TransformerEngine", - "repo": ".", + "repo": "../..", "branches": ["HEAD"], "environment_type": "existing", "install_command": [], "build_command": [], - "benchmark_dir": "benchmarks/asv", - "results_dir": "benchmarks/.asv/results", - "html_dir": "benchmarks/.asv/html", + "benchmark_dir": ".", + "results_dir": "../.asv/results", + "html_dir": "../.asv/html", "install_timeout": 600, "benchmark_timeout": 1200, "launch_method": "spawn" diff --git a/benchmarks/asv/direct_run.py b/benchmarks/asv/direct_run.py index 1e59e99e8..dab841a6e 100644 --- a/benchmarks/asv/direct_run.py +++ b/benchmarks/asv/direct_run.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 """Run ASV benchmark classes directly in-process, bypassing subprocess overhead. +Results are saved in ASV-compatible format so they can be viewed with +``asv publish && asv preview``. 
+ Usage: python benchmarks/asv/direct_run.py [options] [method_filter] @@ -11,19 +14,174 @@ """ import argparse +import hashlib import importlib +import inspect import itertools +import json import math +import os +import platform +import subprocess import sys +import textwrap import time -def run_class(cls, class_name, method_filter=None, warmup=3, iters=7): +# --------------------------------------------------------------------------- +# ASV result generation +# --------------------------------------------------------------------------- + +def _get_benchmark_version(cls, method_name): + """Compute the version hash the same way ASV does. + + ASV hashes a code string built from the time_* and setup methods. + The string is class header + indented time method + indented setup, + with no trailing newline. + """ + time_src = textwrap.dedent(inspect.getsource(getattr(cls, method_name))) + setup_src = textwrap.dedent(inspect.getsource(cls.setup)) + code = ( + f"class {cls.__name__}:\n" + + textwrap.indent(time_src, " ") + "\n" + + textwrap.indent(setup_src, " ") + ).rstrip("\n") + return hashlib.sha256(code.encode()).hexdigest() + + +def _format_param_value(v): + """Format a parameter value the way ASV stores it in JSON.""" + if isinstance(v, str): + return f"'{v}'" + return repr(v) + + +def _get_machine_info(): + """Build the params/machine dict ASV expects.""" + machine = platform.node() + info = { + "arch": platform.machine(), + "cpu": "", + "machine": machine, + "num_cpu": str(os.cpu_count()), + "os": f"{platform.system()} {platform.release()}", + "ram": "", + } + try: + with open("/proc/cpuinfo") as f: + for line in f: + if line.startswith("model name"): + info["cpu"] = line.split(":", 1)[1].strip() + break + with open("/proc/meminfo") as f: + for line in f: + if line.startswith("MemTotal"): + info["ram"] = line.split()[1] # kB + break + except OSError: + pass + return machine, info + + +def _get_commit_hash(): + """Get the current git HEAD hash.""" + try: + return 
subprocess.check_output( + ["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL + ).decode().strip() + except Exception: + return "unknown" + + +def _compute_stats(samples): + """Compute statistics from a list of timing samples.""" + if not samples: + return None, None, None, None, None + s = sorted(samples) + n = len(s) + median = s[n // 2] + mean = sum(s) / n + q25 = s[max(0, n // 4)] + q75 = s[min(n - 1, 3 * n // 4)] + stdev = math.sqrt(sum((t - mean) ** 2 for t in s) / n) + ci_lo = max(0, mean - 2.576 * stdev / math.sqrt(n)) # 99% CI + ci_hi = mean + 2.576 * stdev / math.sqrt(n) + return median, ci_lo, ci_hi, q25, q75 + + +def _get_results_dir(): + """Read results_dir from asv.conf.json, resolved to an absolute path.""" + conf_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "asv.conf.json") + with open(conf_path) as f: + conf = json.load(f) + conf_dir = os.path.dirname(conf_path) + return os.path.normpath(os.path.join(conf_dir, conf["results_dir"])) + + +def save_asv_results(all_results): + """Write results to ASV's results directory.""" + commit_hash = _get_commit_hash() + machine_name, machine_info = _get_machine_info() + env_name = "existing-" + sys.executable.replace("/", "_").strip("_") + results_dir = _get_results_dir() + machine_dir = os.path.join(results_dir, machine_name) + os.makedirs(machine_dir, exist_ok=True) + + # Write machine.json if missing + machine_json = os.path.join(machine_dir, "machine.json") + if not os.path.exists(machine_json): + with open(machine_json, "w") as f: + json.dump({**machine_info, "version": 1}, f, indent=4) + + # Load existing result file or start fresh + filename = f"{commit_hash[:8]}-{env_name}.json" + result_path = os.path.join(machine_dir, filename) + if os.path.exists(result_path): + with open(result_path) as f: + data = json.load(f) + else: + data = { + "commit_hash": commit_hash, + "env_name": env_name, + "date": int(time.time() * 1000), + "params": {**machine_info, "python": sys.executable}, + 
"python": sys.executable, + "requirements": {}, + "env_vars": {}, + "result_columns": [ + "result", "params", "version", + "started_at", "duration", + "stats_ci_99_a", "stats_ci_99_b", + "stats_q_25", "stats_q_75", + "stats_number", "stats_repeat", + "samples", + ], + "results": {}, + "durations": {}, + "version": 2, + } + + # Merge new results + for bench_key, bench_data in all_results.items(): + data["results"][bench_key] = bench_data + + with open(result_path, "w") as f: + json.dump(data, f, indent=2) + + print(f"\nResults saved to {result_path}") + + +# --------------------------------------------------------------------------- +# Benchmark runner +# --------------------------------------------------------------------------- + +def run_class(suite_name, cls, class_name, method_filter=None, warmup=3, iters=7): + """Run all benchmarks in a class, returning ASV-formatted results.""" methods = sorted(m for m in dir(cls) if m.startswith("time_")) if method_filter: methods = [m for m in methods if method_filter in m] if not methods: - return + return {} params = getattr(cls, "params", [[]]) param_names = getattr(cls, "param_names", []) @@ -31,37 +189,96 @@ def run_class(cls, class_name, method_filter=None, warmup=3, iters=7): print(f"\n{class_name} ({len(combos)} combos x {len(methods)} methods, " f"{warmup} warmup, {iters} timed)") - print("-" * 90) - print(f" {'median':>10} {'mean':>10} {'stdev':>10} {'method':<30} params") - print("-" * 90) - - for combo in combos: - label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo)) - instance = cls() - try: - instance.setup(*combo) - except Exception as e: - print(f" SKIP {label} setup failed: {e}") - continue - - for method_name in methods: + HDR = (f" {'median':>10} {'mean':>10} {'stdev':>10}" + f" {'q25':>10} {'q75':>10} {'min':>10} {'max':>10}" + f" {'method':<30} params") + print("-" * len(HDR)) + print(HDR) + print("-" * len(HDR)) + + # ASV stores params as lists of string representations + asv_params = 
[[_format_param_value(v) for v in dim] for dim in params] + + all_results = {} + + for method_name in methods: + bench_key = f"{suite_name}.{class_name}.{method_name}" + version = _get_benchmark_version(cls, method_name) + + medians = [] + ci_los = [] + ci_his = [] + q25s = [] + q75s = [] + numbers = [] + repeats = [] + started_at = int(time.time() * 1000) + t_start = time.perf_counter() + + for combo in combos: + label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo)) + instance = cls() + try: + instance.setup(*combo) + except Exception as e: + print(f" SKIP {label} setup failed: {e}") + medians.append(None) + ci_los.append(None) + ci_his.append(None) + q25s.append(None) + q75s.append(None) + numbers.append(None) + repeats.append(None) + continue + method = getattr(instance, method_name) for _ in range(warmup): method(*combo) - times = [] + samples = [] for _ in range(iters): t0 = time.perf_counter() method(*combo) - times.append(time.perf_counter() - t0) + samples.append(time.perf_counter() - t0) + + median, ci_lo, ci_hi, q25, q75 = _compute_stats(samples) + mean = sum(samples) / len(samples) + stdev = math.sqrt(sum((t - mean) ** 2 for t in samples) / len(samples)) + s_min, s_max = min(samples), max(samples) + + medians.append(median) + ci_los.append(ci_lo) + ci_his.append(ci_hi) + q25s.append(q25) + q75s.append(q75) + numbers.append(1) + repeats.append(iters) - times.sort() - median = times[len(times) // 2] - mean = sum(times) / len(times) - stdev = math.sqrt(sum((t - mean) ** 2 for t in times) / len(times)) print(f" {median*1000:>8.3f}ms {mean*1000:>8.3f}ms " - f"{stdev*1000:>8.3f}ms {method_name:<30} {label}") + f"{stdev*1000:>8.3f}ms {q25*1000:>8.3f}ms {q75*1000:>8.3f}ms " + f"{s_min*1000:>8.3f}ms {s_max*1000:>8.3f}ms " + f"{method_name:<30} {label}") + + duration = time.perf_counter() - t_start + + # ASV result row (trailing "samples" column omitted; ASV treats missing + # trailing columns as null): [result, params, version, started_at, duration, ci_99_a, ci_99_b, q_25, q_75, number, repeat] + all_results[bench_key] =
[ + medians, + asv_params, + version, + started_at, + round(duration, 2), + ci_los, + ci_his, + q25s, + q75s, + numbers, + repeats, + ] + + return all_results def main(): @@ -74,19 +291,27 @@ def main(): help="Number of warmup iterations (default: 3)") parser.add_argument("-n", "--iters", type=int, default=7, help="Number of timed iterations (default: 7)") + parser.add_argument("--no-save", action="store_true", + help="Skip saving results to ASV format") args = parser.parse_args() mod = importlib.import_module(args.suite) + all_results = {} for name in sorted(dir(mod)): obj = getattr(mod, name) if isinstance(obj, type) and name.startswith("Bench"): - run_class(obj, name, args.method_filter, args.warmup, args.iters) + results = run_class( + args.suite, obj, name, + args.method_filter, args.warmup, args.iters) + all_results.update(results) + if all_results and not args.no_save: + save_asv_results(all_results) -if __name__ == "__main__": - import os - os.chdir(os.path.dirname(os.path.abspath(__file__))) +if __name__ == "__main__": + script_dir = os.path.dirname(os.path.abspath(__file__)) + os.chdir(script_dir) sys.path.insert(0, ".") main() diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh index 4ca71881c..e8020a960 100755 --- a/benchmarks/asv/run_benchmarks.sh +++ b/benchmarks/asv/run_benchmarks.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash # Helper script for common ASV benchmark tasks. -# Run from the repository root (where asv.conf.json lives). set -euo pipefail cd "$(git rev-parse --show-toplevel)" BENCH_DIR="benchmarks/asv" +ASV_CONF="$(pwd)/$BENCH_DIR/asv.conf.json" mapfile -t SUITES < <(find "$BENCH_DIR" -maxdepth 1 -name 'bench_*.py' -printf '%f\n' | sed 's/\.py$//' | sort) usage() { @@ -14,10 +14,10 @@ Usage: bash benchmarks/asv/run_benchmarks.sh [options] Commands: setup Register this machine with ASV - run [SUITE] Run all benchmarks, or a single suite (e.g. 
bench_casting) - quick [SUITE] Smoke-test run (single iteration, results not saved) - direct [-w W] [-n N] SUITE [METHOD] - Fast in-process run (no subprocesses, no ASV overhead) + run [-w W] [-n N] [SUITE] [METHOD] + Run benchmarks in-process (fast, saves ASV-compatible results) + run --asv [SUITE] Run benchmarks via ASV (subprocess isolation per benchmark) + quick [SUITE] Smoke-test via ASV (single iteration, results not saved) compare [REF] [NEW] Compare two commits (default: HEAD~1 vs HEAD) view Generate HTML dashboard and open preview server list List available benchmark suites @@ -29,41 +29,51 @@ case "${1:-}" in setup) MACHINE="${2:-$(hostname)}" echo "Registering machine as: $MACHINE" - asv machine --yes --machine "$MACHINE" + asv machine --yes --machine "$MACHINE" --config "$ASV_CONF" ;; run) - CMD=(asv run --python=same --launch-method spawn - --set-commit-hash "$(git rev-parse HEAD)") - [[ -n "${2:-}" ]] && CMD+=(--bench "$2") - echo "Running: ${CMD[*]}" - "${CMD[@]}" + shift + if [[ "${1:-}" == "--asv" ]]; then + shift + CMD=(asv run --config "$ASV_CONF" --python=same --launch-method spawn + --set-commit-hash "$(git rev-parse HEAD)") + [[ -n "${1:-}" ]] && CMD+=(--bench "$1") + echo "Running (asv): ${CMD[*]}" + "${CMD[@]}" + else + # Default: fast in-process run + ARGS=() + while [[ $# -gt 0 ]]; do + ARGS+=("$1") + shift + done + if [[ ${#ARGS[@]} -eq 0 ]]; then + # Run all suites + for s in "${SUITES[@]}"; do + python "$BENCH_DIR/direct_run.py" "$s" + done + else + python "$BENCH_DIR/direct_run.py" "${ARGS[@]}" + fi + fi ;; quick) - CMD=(asv run --python=same --launch-method spawn --quick + CMD=(asv run --config "$ASV_CONF" --python=same --launch-method spawn --quick --set-commit-hash "$(git rev-parse HEAD)") [[ -n "${2:-}" ]] && CMD+=(--bench "$2") echo "Running (quick): ${CMD[*]}" "${CMD[@]}" ;; - direct) - shift - if [[ $# -eq 0 ]]; then - echo "Usage: $0 direct [options] SUITE [METHOD]" - echo "Options: -w WARMUP -n ITERS" - exit 1 - fi - python 
"$BENCH_DIR/direct_run.py" "$@" - ;; compare) REF="${2:-HEAD~1}" NEW="${3:-HEAD}" echo "Comparing $REF vs $NEW" - asv continuous --python=same --launch-method spawn "$REF" "$NEW" + asv continuous --config "$ASV_CONF" --python=same --launch-method spawn "$REF" "$NEW" ;; view) - asv publish + asv publish --config "$ASV_CONF" echo "Starting preview server at http://localhost:8080" - asv preview + asv preview --config "$ASV_CONF" ;; list) echo "Available benchmark suites:"