# ASV demo #487
Base branch: `dev`

Changes from all commits: `d7c643c`, `b829122`, `21678b4`, `6cb91a5`, `1a98989`, `498f16d`
`.github/workflows/rocm-ci.yml`:

```yaml
@@ -368,6 +368,79 @@ jobs:
          EOF
          )"

      - name: Restore previous ASV results
        if: github.event_name == 'push' && github.ref_name == 'dev'
        continue-on-error: true
        env:
          ARTIFACTORY_API_KEY: ${{ secrets.ARTIFACTORY_API_KEY }}
        run: |
          set -x
          BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results"
          ARTIFACT_URL="${BASE_URL}/${{ matrix.runner }}/results.tar.gz"

          curl -sf -H "X-JFrog-Art-Api:${ARTIFACTORY_API_KEY}" \
            -o /tmp/asv-results.tar.gz "$ARTIFACT_URL" || {
            echo "::notice::No previous ASV results found. Starting fresh."
            exit 0
          }

          mkdir -p asv-results
          tar xzf /tmp/asv-results.tar.gz -C asv-results/

          # Copy into the container's ASV results directory
          docker exec te-runner mkdir -p /workspace/benchmarks/.asv/results
          docker cp asv-results/. te-runner:/workspace/benchmarks/.asv/results/
          echo "Restored previous ASV results from Artifactory."

      - name: Performance benchmarks (ASV)
        if: github.event_name == 'push' && github.ref_name == 'dev'
        continue-on-error: true
        env:
          RUNNER_NAME: ${{ matrix.runner }}
        run: |
          set -ex

          # Derive a stable machine name from the runner label
          case "${RUNNER_NAME}" in
            linux-te-mi325*) MACHINE_NAME="mi325" ;;
```
> **Collaborator:** Why do we need it if results are uploaded with just the `matrix.runner` name?
>
> **Author:** My understanding is that the `matrix.runner` name is not 1-1 with the underlying system, i.e. different systems with different machine names can be part of a pool with the same runner name. ASV by default stores results by machine name. Here, we are manually specifying a generic machine name indexed by GPU arch so that each e.g. mi325 runner will store its results in a compatible way. Ideally, we would have dedicated machines for benchmarking (since this would likely run on every commit, or even nightly), but that's a constraint we'll need to discuss.
```yaml
            linux-te-mi355*) MACHINE_NAME="mi355" ;;
            *) MACHINE_NAME="${RUNNER_NAME}" ;;
          esac

          docker exec -e MACHINE_NAME="$MACHINE_NAME" te-runner bash -c "$(cat <<'OUTER'
          set -ex
          pip install asv
          cd /workspace
          asv machine --yes --machine "$MACHINE_NAME"
```
> **Collaborator:** Will it re-register the machine if it exists already?
>
> **Author:** Yes, but it's registered in the container, so it's transient.
```yaml
          asv run --python=same --launch-method spawn \
            2>&1 | tee /workspace/asv_results.txt
          OUTER
          )"

          # Copy results out of the container for upload
          rm -rf asv-results
          docker cp te-runner:/workspace/benchmarks/.asv/results/. ./asv-results/ || true

      - name: Upload ASV results
        if: github.event_name == 'push' && github.ref_name == 'dev'
        continue-on-error: true
        env:
          ARTIFACTORY_API_KEY: ${{ secrets.ARTIFACTORY_API_KEY }}
        run: |
          set -ex
          if [[ ! -d asv-results ]] || [[ -z "$(ls -A asv-results)" ]]; then
            echo "::notice::No ASV results to upload."
            exit 0
          fi

          BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results"
          tar czf /tmp/asv-results.tar.gz -C asv-results .

          curl -sf -H "X-JFrog-Art-Api:${ARTIFACTORY_API_KEY}" \
            -T /tmp/asv-results.tar.gz \
            "${BASE_URL}/${{ matrix.runner }}/results.tar.gz"
          echo "Uploaded ASV results to Artifactory."

      - name: Check Test Failure Status
        if: always()
        run: |
```
`.gitignore`:

```diff
@@ -55,3 +55,4 @@ artifacts/
 **/times.csv
 transformer_engine/build_info.txt
 transformer_engine/common/util/hip_nvml.*
+.asv/
```
`benchmarks/asv/README.md` (new file):

# ASV Benchmarks for TransformerEngine

Performance benchmarks built on [ASV (Air Speed Velocity)](https://asv.readthedocs.io/),
a framework for benchmarking Python packages over their lifetime.

## Prerequisites

- TransformerEngine must already be built and installed in the current Python environment.
- A ROCm or CUDA GPU must be available.
- Install ASV: `pip install asv`

ASV is configured with `environment_type: "existing"` (in `benchmarks/asv/asv.conf.json`),
meaning it uses the current Python environment directly: it does not create virtualenvs or
attempt to build TE itself. The config sets `branches: ["HEAD"]` so that `asv publish` accepts results from
whichever branch is currently checked out; this works for both local development
and CI (where `HEAD` points to `dev`).

## Helper script

A convenience wrapper (`benchmarks/asv/run_benchmarks.sh`) is provided for common tasks.
It can be run from anywhere; it automatically `cd`s to the repo root. Available benchmark
suites are discovered dynamically from `bench_*.py` files.
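The suite-discovery step can be sketched in a few lines of Python. This is an illustration only; `discover_suites` is a hypothetical helper, and the actual wrapper implements the same idea in shell:

```python
from pathlib import Path


def discover_suites(bench_dir: str) -> list[str]:
    """Return the suite names for every bench_*.py file in a directory."""
    return sorted(p.stem for p in Path(bench_dir).glob("bench_*.py"))
```

A file named `bench_gemm.py` thus yields the suite name `bench_gemm`, which is what the wrapper's `run [suite]` argument expects.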
```bash
bash benchmarks/asv/run_benchmarks.sh <command> [options]
```

| Command | Description |
|---|---|
| `setup [name]` | Register machine with ASV (defaults to `hostname`) |
| `run [suite] [method]` | Run benchmarks in-process (fast, saves ASV-compatible results) |
| `run --asv [suite]` | Run benchmarks via ASV (subprocess isolation per benchmark) |
| `quick [suite]` | Smoke test via ASV: single iteration, results not saved |
| `compare [ref] [new]` | Compare two commits (defaults to `HEAD~1` vs `HEAD`) |
| `view` | Generate HTML dashboard and serve on `localhost:8080` |
| `list` | List available benchmark suites |

The default `run` command executes benchmarks directly in-process, avoiding the
significant subprocess-per-benchmark overhead that ASV imposes. Results are saved in
ASV-compatible format and can be viewed with `view`. Use `run --asv` when you need
ASV's subprocess isolation (e.g. for CI or statistical rigor).

Examples:

```bash
bash benchmarks/asv/run_benchmarks.sh setup mi325
bash benchmarks/asv/run_benchmarks.sh run                          # all suites
bash benchmarks/asv/run_benchmarks.sh run bench_casting            # one suite
bash benchmarks/asv/run_benchmarks.sh run bench_gemm time_forward  # one method
bash benchmarks/asv/run_benchmarks.sh run -w 5 -n 20 bench_casting # custom iterations
bash benchmarks/asv/run_benchmarks.sh run --asv bench_casting      # via ASV subprocesses
bash benchmarks/asv/run_benchmarks.sh compare HEAD~3 HEAD
bash benchmarks/asv/run_benchmarks.sh view
```

## Local usage (manual ASV commands)

All manual `asv` commands require `--config` with an **absolute path** to the config file
and should be run from the **repository root**. ASV does not resolve relative `--config` paths.

### Register your machine

```bash
asv machine --config $(pwd)/benchmarks/asv/asv.conf.json --yes --machine my-machine-name
```

This creates a machine profile in `benchmarks/.asv/results/my-machine-name/machine.json`.
Use a descriptive name (e.g., `mi325`, `mi300x-dev`); results are stored per machine, so
the name must be consistent across runs for historical comparison.
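For reference, the machine profile is a small JSON file whose fields ASV fills in from whatever it can detect about the host. The values below are purely illustrative, not output from a real machine:

```json
{
  "arch": "x86_64",
  "cpu": "AMD EPYC 9654 96-Core Processor",
  "machine": "my-machine-name",
  "num_cpu": "192",
  "os": "Linux 5.15.0",
  "ram": "1585248260",
  "version": 1
}
```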
### Run all benchmarks

```bash
asv run --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD)
```

- `--python=same`: use the current interpreter (required with `environment_type: "existing"`)
- `--launch-method spawn`: required for CUDA (fork causes "Cannot re-initialize CUDA in forked subprocess")
- `--set-commit-hash`: **required** with `environment_type: "existing"`. Without it, ASV
  runs benchmarks but silently discards results. The helper script sets this automatically.

### Run a single suite

```bash
asv run --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) --bench bench_casting
```

The `--bench` argument accepts a regex that matches benchmark file or class names.

### Quick smoke test

```bash
asv run --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn --quick --set-commit-hash $(git rev-parse HEAD) --bench bench_casting
```

`--quick` runs each benchmark only once with no statistical analysis. Useful for verifying
that benchmarks work, but note that results are **not saved to disk** in quick mode.

### Compare two commits

```bash
asv continuous --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn HEAD~1 HEAD
```

This checks out each commit, runs benchmarks on both, and reports regressions.
Note: this only works if the benchmark files exist on both commits.

### Generate an HTML dashboard

```bash
asv publish --config $(pwd)/benchmarks/asv/asv.conf.json
asv preview --config $(pwd)/benchmarks/asv/asv.conf.json
```

`asv publish` generates static HTML from stored results into `benchmarks/.asv/html/`.
`asv preview` serves it locally on `http://localhost:8080`.

## How results are stored

### Local results

ASV stores results as JSON files under `benchmarks/.asv/results/`:

```
benchmarks/.asv/results/
  my-machine-name/
    machine.json          # Hardware/OS metadata
    <commit-hash>.json    # Timing results for that commit
    <commit-hash>.json
    ...
```

Each commit JSON contains the wall-clock timings for every benchmark + parameter combination
run on that machine. The `benchmarks/.asv/` directory is in `.gitignore`.
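As a rough illustration of how a consumer might read those files: the sketch below assumes the per-benchmark data sits under a top-level `"results"` key, which is how ASV result files are commonly laid out, but the exact schema is version-dependent; `extract_timings` is a hypothetical helper, not part of ASV or this repo:

```python
import json


def extract_timings(commit_result: dict) -> dict:
    """Map benchmark names to their stored result values.

    Assumes the common ASV layout where per-benchmark entries live under
    the top-level "results" key; returns an empty dict if it is absent.
    """
    return dict(commit_result.get("results", {}))


# Hypothetical usage against a real results file:
# with open("benchmarks/.asv/results/mi325/<commit-hash>.json") as f:
#     timings = extract_timings(json.load(f))
```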
### CI results (Artifactory)

In CI, benchmarks run **only on pushes to `dev`** (not on PRs). This builds a historical
record of performance on the main branch.

The CI pipeline (`.github/workflows/rocm-ci.yml`) follows this flow:

1. **Restore**: download `results.tar.gz` from Artifactory for the current runner
2. **Benchmark**: run `asv run`, which appends a new `{commit}.json` to the results directory
3. **Upload**: tar up the results directory and upload it back to Artifactory

Results are stored per machine at:
```
https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results/
  linux-te-mi325-8/results.tar.gz
  linux-te-mi355-8/results.tar.gz
```

Each tarball contains the full ASV results directory for that machine, accumulating
a new commit JSON on every push to `dev`. ASV machine names map to hardware:
`mi325` for MI325X runners, `mi355` for MI355X runners.
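The runner-label-to-machine-name mapping that the workflow performs with a shell `case` statement can be sketched in Python. `machine_name` is a hypothetical helper that mirrors the workflow logic, shown here only to make the mapping explicit:

```python
def machine_name(runner_label: str) -> str:
    """Map a CI runner label to a stable ASV machine name keyed by GPU arch."""
    prefixes = {
        "linux-te-mi325": "mi325",
        "linux-te-mi355": "mi355",
    }
    for prefix, name in prefixes.items():
        if runner_label.startswith(prefix):
            return name
    # Unknown runners fall back to the raw label, as in the workflow's `*)` arm
    return runner_label
```

This is why several runners in the same pool (e.g. `linux-te-mi325-8`) all contribute to one `mi325` history.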
### Downloading CI results locally

To inspect CI results on your local machine (requires Artifactory access):

```bash
# Download results for a specific machine
curl -sf -H "X-JFrog-Art-Api:${ARTIFACTORY_API_KEY}" \
  -o results.tar.gz \
  "https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results/linux-te-mi325-8/results.tar.gz"

# Extract into your local ASV results directory
mkdir -p benchmarks/.asv/results
tar xzf results.tar.gz -C benchmarks/.asv/results/

# Generate and view the dashboard
asv publish --config $(pwd)/benchmarks/asv/asv.conf.json
asv preview --config $(pwd)/benchmarks/asv/asv.conf.json
```

The dashboard could also be served statically via GitHub Pages.

## Writing new benchmarks

Create a new file in `benchmarks/asv/` following the naming convention `bench_<name>.py`.

```python
import torch
import transformer_engine.pytorch as te


class BenchSomething:
    params = [[1024, 4096], ["config_a", "config_b"]]
    param_names = ["M", "config"]
    timeout = 300  # seconds, per parameter combination

    def setup(self, M, config):
        # Allocate tensors, create modules.
        # This runs before each time_* method but is NOT timed.
        ...

    def time_forward(self, M, config):
        # ASV times this method (adaptive iterations + statistics).
        # MUST call torch.cuda.synchronize() to ensure GPU work completes.
        self.module(self.x)
        torch.cuda.synchronize()
```

Key rules:
- Method names starting with `time_` are automatically timed by ASV.
- Always call `torch.cuda.synchronize()` at the end of `time_*` methods.
- Clear `.grad` attributes in backward benchmarks to prevent memory accumulation.
- ASV runs each `time_*` method in a **separate subprocess**: no shared state between methods.
- The `params` list defines a cross-product; keep the matrix size reasonable.
`benchmarks/asv/asv.conf.json` (new file):

> **Collaborator:** Does it need to be in the root of TE?
>
> **Author:** No, I've updated it.

```json
{
  "version": 1,
  "project": "TransformerEngine",
  "project_url": "https://github.com/ROCm/TransformerEngine",
  "repo": "../..",
  "branches": ["HEAD"],
  "environment_type": "existing",
  "install_command": [],
  "build_command": [],
  "benchmark_dir": ".",
  "results_dir": "../.asv/results",
  "html_dir": "../.asv/html",
  "install_timeout": 600,
  "benchmark_timeout": 1200,
  "launch_method": "spawn"
}
```
New attention benchmark suite (a `bench_*.py` file under `benchmarks/asv/`):

```python
###############################################################################
# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
"""Fused multi-head attention (GQA) benchmarks via te.DotProductAttention.

Forward FLOPs = 4 * batch * num_q_heads * seq_len^2 * head_dim
Backward FLOPs ~ 2x forward
"""

import torch
import transformer_engine.pytorch as te

BATCH = 2

# (num_q_heads, num_kv_heads, head_dim, tp)
MODELS = {
    "Llama3-8B_TP1": (32, 8, 128, 1),
    "Llama3-8B_TP8": (32, 8, 128, 8),
    "Llama3-70B_TP8": (64, 8, 128, 8),
    "Llama3-405B_TP8": (128, 8, 128, 8),
    "Qwen2.5-7B_TP1": (28, 4, 128, 1),
    "Qwen2.5-72B_TP8": (64, 8, 128, 8),
}


class BenchAttention:
    params = [[1024, 2048, 4096, 8192], list(MODELS)]
    param_names = ["seq_len", "model"]
    timeout = 300

    def setup(self, seq_len, model):
        n_q, n_kv, hd, tp = MODELS[model]
        qh, kvh = n_q // tp, n_kv // tp
        dtype = torch.bfloat16

        self.attn = te.DotProductAttention(
            num_attention_heads=qh, kv_channels=hd,
            num_gqa_groups=kvh, attn_mask_type="causal",
        ).to(device="cuda", dtype=dtype)

        self.q = torch.randn(seq_len, BATCH, qh, hd, dtype=dtype, device="cuda", requires_grad=True)
        self.k = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
        self.v = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
        self.grad_out = torch.randn_like(self.attn(self.q, self.k, self.v))

    def time_forward(self, seq_len, model):
        self.attn(self.q, self.k, self.v)
        torch.cuda.synchronize()

    def time_forward_backward(self, seq_len, model):
        out = self.attn(self.q, self.k, self.v)
        out.backward(self.grad_out)
        self.q.grad = self.k.grad = self.v.grad = None
        torch.cuda.synchronize()
```
> **Collaborator:** I think the benchmarks should go in a separate workflow from CI, i.e. both these microbenchmarks and the ones that already run with CI.
>
> **Author:** Will doing so require a separate TE build and setup? I added it here so that we'd piggy-back off of the already-running CI.