From d7c643c04434494b3707073128e7690b366abd16 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 16 Mar 2026 17:52:30 -0500 Subject: [PATCH 1/6] Initial benchmark porting to ASV --- .github/workflows/rocm-ci.yml | 73 +++++++++++ .gitignore | 1 + asv.conf.json | 16 +++ benchmarks/asv/README.md | 166 ++++++++++++++++++++++++++ benchmarks/asv/__init__.py | 0 benchmarks/asv/bench_attention.py | 56 +++++++++ benchmarks/asv/bench_casting.py | 51 ++++++++ benchmarks/asv/bench_gemm.py | 55 +++++++++ benchmarks/asv/bench_gemm_fp8.py | 60 ++++++++++ benchmarks/asv/bench_grouped_gemm.py | 64 ++++++++++ benchmarks/asv/bench_normalization.py | 36 ++++++ 11 files changed, 578 insertions(+) create mode 100644 asv.conf.json create mode 100644 benchmarks/asv/README.md create mode 100644 benchmarks/asv/__init__.py create mode 100644 benchmarks/asv/bench_attention.py create mode 100644 benchmarks/asv/bench_casting.py create mode 100644 benchmarks/asv/bench_gemm.py create mode 100644 benchmarks/asv/bench_gemm_fp8.py create mode 100644 benchmarks/asv/bench_grouped_gemm.py create mode 100644 benchmarks/asv/bench_normalization.py diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 32c3cb2a2..c35e5d0ad 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -368,6 +368,79 @@ jobs: EOF )" + - name: Restore previous ASV results + if: github.event_name == 'push' && github.ref_name == 'dev' + continue-on-error: true + env: + ARTIFACTORY_API_KEY: ${{ secrets.ARTIFACTORY_API_KEY }} + run: | + set -x + BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results" + ARTIFACT_URL="${BASE_URL}/${{ matrix.runner }}/results.tar.gz" + + curl -sf -H "X-JFrog-Art-Api:${ARTIFACTORY_API_KEY}" \ + -o /tmp/asv-results.tar.gz "$ARTIFACT_URL" || { + echo "::notice::No previous ASV results found. Starting fresh." 
+ exit 0 + } + + mkdir -p asv-results + tar xzf /tmp/asv-results.tar.gz -C asv-results/ + + # Copy into the container's ASV results directory + docker exec te-runner mkdir -p /workspace/benchmarks/.asv/results + docker cp asv-results/. te-runner:/workspace/benchmarks/.asv/results/ + echo "Restored previous ASV results from Artifactory." + + - name: Performance benchmarks (ASV) + if: github.event_name == 'push' && github.ref_name == 'dev' + continue-on-error: true + env: + RUNNER_NAME: ${{ matrix.runner }} + run: | + set -ex + + # Derive a stable machine name from the runner label + case "${RUNNER_NAME}" in + linux-te-mi325*) MACHINE_NAME="mi325" ;; + linux-te-mi355*) MACHINE_NAME="mi355" ;; + *) MACHINE_NAME="${RUNNER_NAME}" ;; + esac + + docker exec -e MACHINE_NAME="$MACHINE_NAME" te-runner bash -c "$(cat <<'OUTER' + set -ex + pip install asv + cd /workspace + asv machine --yes --machine "$MACHINE_NAME" + asv run --python=same --launch-method spawn \ + 2>&1 | tee /workspace/asv_results.txt + OUTER + )" + + # Copy results out of the container for upload + rm -rf asv-results + docker cp te-runner:/workspace/benchmarks/.asv/results/. ./asv-results/ || true + + - name: Upload ASV results + if: github.event_name == 'push' && github.ref_name == 'dev' + continue-on-error: true + env: + ARTIFACTORY_API_KEY: ${{ secrets.ARTIFACTORY_API_KEY }} + run: | + set -ex + if [[ ! -d asv-results ]] || [[ -z "$(ls -A asv-results)" ]]; then + echo "::notice::No ASV results to upload." + exit 0 + fi + + BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results" + tar czf /tmp/asv-results.tar.gz -C asv-results . + + curl -sf -H "X-JFrog-Art-Api:${ARTIFACTORY_API_KEY}" \ + -T /tmp/asv-results.tar.gz \ + "${BASE_URL}/${{ matrix.runner }}/results.tar.gz" + echo "Uploaded ASV results to Artifactory." 
+ - name: Check Test Failure Status if: always() run: | diff --git a/.gitignore b/.gitignore index d3b18b358..a5fd89b4b 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,4 @@ artifacts/ **/times.csv transformer_engine/build_info.txt transformer_engine/common/util/hip_nvml.* +.asv/ diff --git a/asv.conf.json b/asv.conf.json new file mode 100644 index 000000000..dc71bf345 --- /dev/null +++ b/asv.conf.json @@ -0,0 +1,16 @@ +{ + "version": 1, + "project": "TransformerEngine", + "project_url": "https://github.com/ROCm/TransformerEngine", + "repo": ".", + "branches": ["dev"], + "environment_type": "existing", + "install_command": [], + "build_command": [], + "benchmark_dir": "benchmarks/asv", + "results_dir": "benchmarks/.asv/results", + "html_dir": "benchmarks/.asv/html", + "install_timeout": 600, + "benchmark_timeout": 1200, + "launch_method": "spawn" +} diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md new file mode 100644 index 000000000..7de4fd6c5 --- /dev/null +++ b/benchmarks/asv/README.md @@ -0,0 +1,166 @@ +# ASV Benchmarks for TransformerEngine + +Performance benchmarks built on [ASV (Air Speed Velocity)](https://asv.readthedocs.io/), +a framework for benchmarking Python packages over their lifetime. + +## Prerequisites + +- TransformerEngine must already be built and installed in the current Python environment. +- A ROCm or CUDA GPU must be available. +- Install ASV: `pip install asv` + +ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the repo root), +meaning it uses the current Python environment directly — it does not create virtualenvs or +attempt to build TE itself. + +## Local usage + +All commands are run from the **repository root** (where `asv.conf.json` lives). + +### Register your machine + +```bash +asv machine --yes --machine my-machine-name +``` + +This creates a machine profile in `benchmarks/.asv/results/my-machine-name/machine.json`. 
+Use a descriptive name (e.g., `mi325`, `mi300x-dev`) — results are stored per machine, so +the name must be consistent across runs for historical comparison. + +### Run all benchmarks + +```bash +asv run --python=same --launch-method spawn +``` + +- `--python=same` — use the current interpreter (required with `environment_type: "existing"`) +- `--launch-method spawn` — required for CUDA (fork causes "Cannot re-initialize CUDA in forked subprocess") + +### Run a single suite + +```bash +asv run --python=same --launch-method spawn --bench bench_casting +``` + +The `--bench` argument accepts a regex that matches benchmark file or class names. + +### Quick smoke test + +```bash +asv run --python=same --launch-method spawn --quick --bench bench_casting +``` + +`--quick` runs each benchmark only once with no statistical analysis. Useful for verifying +benchmarks work, but note that results are **not saved to disk** in quick mode. + +### Compare two commits + +```bash +asv continuous --python=same --launch-method spawn HEAD~1 HEAD +``` + +This checks out each commit, runs benchmarks on both, and reports regressions. +Note: this only works if the benchmark files exist on both commits. + +### Generate an HTML dashboard + +```bash +asv publish +asv preview +``` + +`asv publish` generates static HTML from stored results into `benchmarks/.asv/html/`. +`asv preview` serves it locally on `http://localhost:8080`. + +## How results are stored + +### Local results + +ASV stores results as JSON files under `benchmarks/.asv/results/`: + +``` +benchmarks/.asv/results/ + my-machine-name/ + machine.json # Hardware/OS metadata + .json # Timing results for that commit + .json + ... +``` + +Each commit JSON contains the wall-clock timings for every benchmark + parameter combination +run on that machine. The `benchmarks/.asv/` directory is in `.gitignore`. + +### CI results (Artifactory) + +In CI, benchmarks run **only on pushes to `dev`** (not on PRs). 
This builds a historical +record of performance on the main branch. + +The CI pipeline (`.github/workflows/rocm-ci.yml`) follows this flow: + +1. **Restore** — Download `results.tar.gz` from Artifactory for the current runner +2. **Benchmark** — Run `asv run`, which appends a new `{commit}.json` to the results directory +3. **Upload** — Tar up the results directory and upload back to Artifactory + +Results are stored per machine at: +``` +https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results/ + linux-te-mi325-8/results.tar.gz + linux-te-mi355-8/results.tar.gz +``` + +Each tarball contains the full ASV results directory for that machine, accumulating +a new commit JSON on every push to `dev`. ASV machine names map to hardware: +`mi325` for MI325X runners, `mi355` for MI355X runners. + +### Downloading CI results locally + +To inspect CI results on your local machine (requires Artifactory access): + +```bash +# Download results for a specific machine +curl -sf -H "X-JFrog-Art-Api:${ARTIFACTORY_API_KEY}" \ + -o results.tar.gz \ + "https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/asv-results/linux-te-mi325-8/results.tar.gz" + +# Extract into your local ASV results directory +mkdir -p benchmarks/.asv/results +tar xzf results.tar.gz -C benchmarks/.asv/results/ + +# Generate and view the dashboard +asv publish +asv preview +``` + +This can also be provided statically via github pages. + +## Writing new benchmarks + +Create a new file in `benchmarks/asv/` following the naming convention `bench_.py`. + +```python +import torch +import transformer_engine.pytorch as te + +class BenchSomething: + params = [[1024, 4096], ["config_a", "config_b"]] + param_names = ["M", "config"] + timeout = 300 # seconds, per parameter combination + + def setup(self, M, config): + # Allocate tensors, create modules. + # This runs before each time_* method but is NOT timed. + ... 
+ + def time_forward(self, M, config): + # ASV times this method (adaptive iterations + statistics). + # MUST call torch.cuda.synchronize() to ensure GPU work completes. + self.module(self.x) + torch.cuda.synchronize() +``` + +Key rules: +- Method names starting with `time_` are automatically timed by ASV. +- Always call `torch.cuda.synchronize()` at the end of `time_*` methods. +- Clear `.grad` attributes in backward benchmarks to prevent memory accumulation. +- ASV runs each `time_*` method in a **separate subprocess** — no shared state between methods. +- The `params` list defines a cross-product; keep the matrix size reasonable. diff --git a/benchmarks/asv/__init__.py b/benchmarks/asv/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/asv/bench_attention.py b/benchmarks/asv/bench_attention.py new file mode 100644 index 000000000..9c64888f6 --- /dev/null +++ b/benchmarks/asv/bench_attention.py @@ -0,0 +1,56 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""Fused multi-head attention (GQA) benchmarks via te.DotProductAttention. 
+ +Forward FLOPs = 4 * batch * num_q_heads * seq_len^2 * head_dim +Backward FLOPs ~ 2x forward +""" + +import torch +import transformer_engine.pytorch as te + +BATCH = 2 + +# (num_q_heads, num_kv_heads, head_dim, tp) +MODELS = { + "Llama3-8B_TP1": (32, 8, 128, 1), + "Llama3-8B_TP8": (32, 8, 128, 8), + "Llama3-70B_TP8": (64, 8, 128, 8), + "Llama3-405B_TP8": (128, 8, 128, 8), + "Qwen2.5-7B_TP1": (28, 4, 128, 1), + "Qwen2.5-72B_TP8": (64, 8, 128, 8), +} + + +class BenchAttention: + params = [[1024, 2048, 4096, 8192], list(MODELS)] + param_names = ["seq_len", "model"] + timeout = 300 + + def setup(self, seq_len, model): + n_q, n_kv, hd, tp = MODELS[model] + qh, kvh = n_q // tp, n_kv // tp + dtype = torch.bfloat16 + + self.attn = te.DotProductAttention( + num_attention_heads=qh, kv_channels=hd, + num_gqa_groups=kvh, attn_mask_type="causal", + ).to(device="cuda", dtype=dtype) + + self.q = torch.randn(seq_len, BATCH, qh, hd, dtype=dtype, device="cuda", requires_grad=True) + self.k = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True) + self.v = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True) + self.grad_out = torch.randn_like(self.attn(self.q, self.k, self.v)) + + def time_forward(self, seq_len, model): + self.attn(self.q, self.k, self.v) + torch.cuda.synchronize() + + def time_forward_backward(self, seq_len, model): + out = self.attn(self.q, self.k, self.v) + out.backward(self.grad_out) + self.q.grad = self.k.grad = self.v.grad = None + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_casting.py b/benchmarks/asv/bench_casting.py new file mode 100644 index 000000000..7195a01ab --- /dev/null +++ b/benchmarks/asv/bench_casting.py @@ -0,0 +1,51 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. 
+############################################################################### +"""FP8 casting micro-benchmarks. + +Memory-bound quantization/dequantization between BF16 and FP8 formats. +""" + +import torch + +if hasattr(torch, "float8_e4m3fnuz"): + FP8_E4M3 = torch.float8_e4m3fnuz + FP8_E5M2 = torch.float8_e5m2fnuz +else: + FP8_E4M3 = torch.float8_e4m3fn + FP8_E5M2 = torch.float8_e5m2 + +HIDDEN_SIZES = { + "Llama3-8B": 4096, + "Llama3-70B": 8192, + "Llama3-405B": 16384, + "Qwen2.5-7B": 3584, + "Qwen2.5-72B": 8192, +} + +CAST_CONFIGS = { + "BF16_to_E4M3": (torch.bfloat16, FP8_E4M3), + "E4M3_to_BF16": (FP8_E4M3, torch.bfloat16), + "BF16_to_E5M2": (torch.bfloat16, FP8_E5M2), + "E5M2_to_BF16": (FP8_E5M2, torch.bfloat16), +} + + +class BenchCasting: + params = [[1024, 2048, 4096, 8192], list(HIDDEN_SIZES), list(CAST_CONFIGS)] + param_names = ["M", "model", "cast"] + timeout = 120 + + def setup(self, M, model, cast): + hidden = HIDDEN_SIZES[model] + src_dtype, self.dst_dtype = CAST_CONFIGS[cast] + if src_dtype in (FP8_E4M3, FP8_E5M2): + self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda").to(src_dtype) + else: + self.x = torch.randn(M, hidden, dtype=src_dtype, device="cuda") + + def time_cast(self, M, model, cast): + self.x.to(self.dst_dtype) + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_gemm.py b/benchmarks/asv/bench_gemm.py new file mode 100644 index 000000000..6a09a08b5 --- /dev/null +++ b/benchmarks/asv/bench_gemm.py @@ -0,0 +1,55 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""BF16 GEMM benchmarks via te.Linear. + +GEMM shapes derived from transformer layer projections: + QKV, AttnOut, GateUp (SwiGLU), Down. 
+""" + +import torch +import transformer_engine.pytorch as te + +# (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp) +MODELS = { + "Llama3-8B_TP1": (4096, 14336, 32, 8, 128, 1), + "Llama3-8B_TP8": (4096, 14336, 32, 8, 128, 8), + "Llama3-70B_TP8": (8192, 28672, 64, 8, 128, 8), + "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8), + "Qwen2.5-7B_TP1": (3584, 18944, 28, 4, 128, 1), + "Qwen2.5-72B_TP8": (8192, 29568, 64, 8, 128, 8), +} + +# Pre-compute (N, K) for each GEMM shape +SHAPES = {} +for _name, (h, inter, nq, nkv, hd, tp) in MODELS.items(): + SHAPES[f"{_name}-QKV"] = ((nq * hd + 2 * nkv * hd) // tp, h) + SHAPES[f"{_name}-AttnOut"] = (h, (nq * hd) // tp) + SHAPES[f"{_name}-GateUp"] = ((2 * inter) // tp, h) + SHAPES[f"{_name}-Down"] = (h, inter // tp) + + +class BenchGemm: + params = [[1024, 2048, 4096, 8192], list(SHAPES)] + param_names = ["M", "shape"] + timeout = 300 + + def setup(self, M, shape): + N, K = SHAPES[shape] + dtype = torch.bfloat16 + self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype) + self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True) + self.grad_out = torch.randn_like(self.linear(self.x)) + + def time_forward(self, M, shape): + self.linear(self.x) + torch.cuda.synchronize() + + def time_forward_backward(self, M, shape): + out = self.linear(self.x) + out.backward(self.grad_out) + self.x.grad = None + self.linear.weight.grad = None + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_gemm_fp8.py b/benchmarks/asv/bench_gemm_fp8.py new file mode 100644 index 000000000..9d70d8879 --- /dev/null +++ b/benchmarks/asv/bench_gemm_fp8.py @@ -0,0 +1,60 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""FP8 GEMM benchmarks via te.Linear under fp8_autocast. 
+ +Same shapes as bench_gemm.py but with FP8 quantized compute. +""" + +import torch +import transformer_engine.pytorch as te +from transformer_engine.common.recipe import DelayedScaling, Format + +# (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp) +MODELS = { + "Llama3-8B_TP1": (4096, 14336, 32, 8, 128, 1), + "Llama3-8B_TP8": (4096, 14336, 32, 8, 128, 8), + "Llama3-70B_TP8": (8192, 28672, 64, 8, 128, 8), + "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8), + "Qwen2.5-7B_TP1": (3584, 18944, 28, 4, 128, 1), + "Qwen2.5-72B_TP8": (8192, 29568, 64, 8, 128, 8), +} + +SHAPES = {} +for _name, (h, inter, nq, nkv, hd, tp) in MODELS.items(): + SHAPES[f"{_name}-QKV"] = ((nq * hd + 2 * nkv * hd) // tp, h) + SHAPES[f"{_name}-AttnOut"] = (h, (nq * hd) // tp) + SHAPES[f"{_name}-GateUp"] = ((2 * inter) // tp, h) + SHAPES[f"{_name}-Down"] = (h, inter // tp) + +FP8_RECIPE = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max", +) + + +class BenchGemmFP8: + params = [[1024, 2048, 4096, 8192], list(SHAPES)] + param_names = ["M", "shape"] + timeout = 300 + + def setup(self, M, shape): + N, K = SHAPES[shape] + dtype = torch.bfloat16 + self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype) + self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True) + self.grad_out = torch.randn(M, N, dtype=dtype, device="cuda") + + def time_forward(self, M, shape): + with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + self.linear(self.x) + torch.cuda.synchronize() + + def time_forward_backward(self, M, shape): + with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE): + out = self.linear(self.x) + out.backward(self.grad_out) + self.x.grad = None + self.linear.weight.grad = None + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_grouped_gemm.py b/benchmarks/asv/bench_grouped_gemm.py new file mode 100644 index 000000000..3c35737f5 --- /dev/null +++ b/benchmarks/asv/bench_grouped_gemm.py @@ -0,0 +1,64 
@@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""Grouped GEMM benchmarks via te.GroupedLinear. + +MoE model configurations with GateUp and Down projections. +""" + +import torch +import transformer_engine.pytorch as te + +# (n_routed_experts, moe_intermediate_size, hidden_size) +MOE_MODELS = { + "DSV2-Lite": (64, 1408, 2048), + "DSV2": (160, 1536, 5120), + "DSV3": (256, 2048, 7168), + "Grok-V2": (8, 16384, 8192), +} + +# Build (config_key -> (num_gemms, N, K)) mapping +CONFIGS = {} +for model, (n_experts, inter, hidden) in MOE_MODELS.items(): + for ep in [32, 16, 8]: + if n_experts % ep != 0: + continue + B = n_experts // ep + CONFIGS[f"{model}_EP{ep}-GateUp"] = (B, 2 * inter, hidden) + CONFIGS[f"{model}_EP{ep}-Down"] = (B, hidden, inter) + + +class BenchGroupedGemm: + params = [[512, 1024, 2048, 4096], list(CONFIGS)] + param_names = ["M", "config"] + timeout = 300 + + def setup(self, M, config): + B, N, K = CONFIGS[config] + dtype = torch.bfloat16 + + self.module = te.GroupedLinear( + num_gemms=B, in_features=K, out_features=N, bias=False, + ).to(device="cuda", dtype=dtype) + + self.xs = [ + torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True) + for _ in range(B) + ] + outs = self.module(self.xs) + self.grad_outs = [torch.randn_like(o) for o in outs] + + def time_forward(self, M, config): + self.module(self.xs) + torch.cuda.synchronize() + + def time_forward_backward(self, M, config): + outs = self.module(self.xs) + torch.autograd.backward(outs, self.grad_outs) + for x in self.xs: + x.grad = None + for p in self.module.parameters(): + p.grad = None + torch.cuda.synchronize() diff --git a/benchmarks/asv/bench_normalization.py b/benchmarks/asv/bench_normalization.py new file mode 100644 index 000000000..f68b60a51 --- 
/dev/null +++ b/benchmarks/asv/bench_normalization.py @@ -0,0 +1,36 @@ +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +"""RMSNorm and LayerNorm benchmarks on activation-sized tensors.""" + +import torch +import transformer_engine.pytorch as te + +NORMS = {"RMSNorm": te.RMSNorm, "LayerNorm": te.LayerNorm} +HIDDEN_SIZES = [3584, 4096, 8192, 16384] + + +class BenchNormalization: + params = [[1024, 2048, 4096, 8192], HIDDEN_SIZES, list(NORMS)] + param_names = ["M", "hidden", "norm_type"] + timeout = 120 + + def setup(self, M, hidden, norm_type): + dtype = torch.bfloat16 + self.norm = NORMS[norm_type](hidden).to(device="cuda", dtype=dtype) + self.x = torch.randn(M, hidden, dtype=dtype, device="cuda", requires_grad=True) + self.grad_out = torch.randn_like(self.norm(self.x)) + + def time_forward(self, M, hidden, norm_type): + self.norm(self.x) + torch.cuda.synchronize() + + def time_forward_backward(self, M, hidden, norm_type): + out = self.norm(self.x) + out.backward(self.grad_out) + self.x.grad = None + for p in self.norm.parameters(): + p.grad = None + torch.cuda.synchronize() From b8291223203b89bdb098a27a54728fdb174fd755 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 17 Mar 2026 08:51:18 -0500 Subject: [PATCH 2/6] Update casting benchmark --- benchmarks/asv/bench_casting.py | 42 ++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/benchmarks/asv/bench_casting.py b/benchmarks/asv/bench_casting.py index 7195a01ab..fb594a7c2 100644 --- a/benchmarks/asv/bench_casting.py +++ b/benchmarks/asv/bench_casting.py @@ -5,17 +5,13 @@ ############################################################################### """FP8 casting micro-benchmarks. 
-Memory-bound quantization/dequantization between BF16 and FP8 formats. +Memory-bound quantization/dequantization between BF16 and FP8 formats +using Transformer Engine's quantized tensor infrastructure. """ import torch - -if hasattr(torch, "float8_e4m3fnuz"): - FP8_E4M3 = torch.float8_e4m3fnuz - FP8_E5M2 = torch.float8_e5m2fnuz -else: - FP8_E4M3 = torch.float8_e4m3fn - FP8_E5M2 = torch.float8_e5m2 +from transformer_engine.pytorch import Float8CurrentScalingQuantizer +from transformer_engine_torch import DType as TE_DType HIDDEN_SIZES = { "Llama3-8B": 4096, @@ -26,10 +22,10 @@ } CAST_CONFIGS = { - "BF16_to_E4M3": (torch.bfloat16, FP8_E4M3), - "E4M3_to_BF16": (FP8_E4M3, torch.bfloat16), - "BF16_to_E5M2": (torch.bfloat16, FP8_E5M2), - "E5M2_to_BF16": (FP8_E5M2, torch.bfloat16), + "BF16_to_E4M3": ("quantize", TE_DType.kFloat8E4M3), + "E4M3_to_BF16": ("dequantize", TE_DType.kFloat8E4M3), + "BF16_to_E5M2": ("quantize", TE_DType.kFloat8E5M2), + "E5M2_to_BF16": ("dequantize", TE_DType.kFloat8E5M2), } @@ -40,12 +36,24 @@ class BenchCasting: def setup(self, M, model, cast): hidden = HIDDEN_SIZES[model] - src_dtype, self.dst_dtype = CAST_CONFIGS[cast] - if src_dtype in (FP8_E4M3, FP8_E5M2): - self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda").to(src_dtype) + direction, fp8_dtype = CAST_CONFIGS[cast] + self.direction = direction + quantizer = Float8CurrentScalingQuantizer( + fp8_dtype=fp8_dtype, + device=torch.device("cuda"), + rowwise=True, + columnwise=False, + ) + if direction == "dequantize": + bf16_tensor = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda") + self.x = quantizer.quantize(bf16_tensor) else: - self.x = torch.randn(M, hidden, dtype=src_dtype, device="cuda") + self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda") + self.quantizer = quantizer def time_cast(self, M, model, cast): - self.x.to(self.dst_dtype) + if self.direction == "quantize": + self.quantizer.quantize(self.x) + else: + 
self.x.dequantize(dtype=torch.bfloat16) torch.cuda.synchronize() From 21678b41426f3bc7e30f56022c93f66444a462d6 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 18 Mar 2026 16:26:16 -0500 Subject: [PATCH 3/6] Added helper script and documentation --- benchmarks/asv/README.md | 31 +++++++++++++++- benchmarks/asv/run_benchmarks.sh | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100755 benchmarks/asv/run_benchmarks.sh diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md index 7de4fd6c5..bd02b4991 100644 --- a/benchmarks/asv/README.md +++ b/benchmarks/asv/README.md @@ -13,7 +13,36 @@ ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the meaning it uses the current Python environment directly — it does not create virtualenvs or attempt to build TE itself. -## Local usage +## Helper script + +A convenience wrapper (`benchmarks/asv/run_benchmarks.sh`) is provided for common tasks. +It can be run from anywhere — it automatically `cd`s to the repo root. Available benchmark +suites are discovered dynamically from `bench_*.py` files. + +```bash +bash benchmarks/asv/run_benchmarks.sh [options] +``` + +| Command | Description | +|---|---| +| `setup [name]` | Register machine with ASV (defaults to `hostname`) | +| `run [suite]` | Run all benchmarks, or a single suite (e.g. 
`bench_casting`) | +| `quick [suite]` | Smoke test — single iteration, results not saved | +| `compare [ref] [new]` | Compare two commits (defaults to `HEAD~1` vs `HEAD`) | +| `view` | Generate HTML dashboard and serve on `localhost:8080` | +| `list` | List available benchmark suites | + +Examples: + +```bash +bash benchmarks/asv/run_benchmarks.sh setup mi325 +bash benchmarks/asv/run_benchmarks.sh run bench_casting +bash benchmarks/asv/run_benchmarks.sh quick +bash benchmarks/asv/run_benchmarks.sh compare HEAD~3 HEAD +bash benchmarks/asv/run_benchmarks.sh view +``` + +## Local usage (manual ASV commands) All commands are run from the **repository root** (where `asv.conf.json` lives). diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh new file mode 100755 index 000000000..5f07c23ff --- /dev/null +++ b/benchmarks/asv/run_benchmarks.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Helper script for common ASV benchmark tasks. +# Run from the repository root (where asv.conf.json lives). +set -euo pipefail + +cd "$(git rev-parse --show-toplevel)" + +BENCH_DIR="benchmarks/asv" +mapfile -t SUITES < <(find "$BENCH_DIR" -maxdepth 1 -name 'bench_*.py' -printf '%f\n' | sed 's/\.py$//' | sort) + +usage() { + cat < [options] + +Commands: + setup Register this machine with ASV + run [SUITE] Run all benchmarks, or a single suite (e.g. 
bench_casting) + quick [SUITE] Smoke-test run (single iteration, results not saved) + compare [REF] [NEW] Compare two commits (default: HEAD~1 vs HEAD) + view Generate HTML dashboard and open preview server + list List available benchmark suites + +EOF +} + +case "${1:-}" in + setup) + MACHINE="${2:-$(hostname)}" + echo "Registering machine as: $MACHINE" + asv machine --yes --machine "$MACHINE" + ;; + run) + CMD=(asv run --python=same --launch-method spawn) + [[ -n "${2:-}" ]] && CMD+=(--bench "$2") + echo "Running: ${CMD[*]}" + "${CMD[@]}" + ;; + quick) + CMD=(asv run --python=same --launch-method spawn --quick) + [[ -n "${2:-}" ]] && CMD+=(--bench "$2") + echo "Running (quick): ${CMD[*]}" + "${CMD[@]}" + ;; + compare) + REF="${2:-HEAD~1}" + NEW="${3:-HEAD}" + echo "Comparing $REF vs $NEW" + asv continuous --python=same --launch-method spawn "$REF" "$NEW" + ;; + view) + asv publish + echo "Starting preview server at http://localhost:8080" + asv preview + ;; + list) + echo "Available benchmark suites:" + for s in "${SUITES[@]}"; do echo " $s"; done + ;; + *) + usage + exit 1 + ;; +esac From 6cb91a56a86bfb3653d333de851a282bf7b8ee97 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 19 Mar 2026 13:09:32 -0500 Subject: [PATCH 4/6] Corrected local benchmarking --- asv.conf.json | 2 +- benchmarks/asv/README.md | 13 ++++++++----- benchmarks/asv/run_benchmarks.sh | 6 ++++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/asv.conf.json b/asv.conf.json index dc71bf345..1b17c9c9e 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -3,7 +3,7 @@ "project": "TransformerEngine", "project_url": "https://github.com/ROCm/TransformerEngine", "repo": ".", - "branches": ["dev"], + "branches": ["HEAD", "dev"], "environment_type": "existing", "install_command": [], "build_command": [], diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md index bd02b4991..5d2686ed2 100644 --- a/benchmarks/asv/README.md +++ b/benchmarks/asv/README.md @@ -11,7 +11,8 @@ a 
framework for benchmarking Python packages over their lifetime. ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the repo root), meaning it uses the current Python environment directly — it does not create virtualenvs or -attempt to build TE itself. +attempt to build TE itself. The config sets `branches: ["HEAD", "dev"]` so that `asv publish` +accepts results from both the currently checked-out branch and `dev` (for CI history). ## Helper script @@ -26,7 +27,7 @@ bash benchmarks/asv/run_benchmarks.sh [options] | Command | Description | |---|---| | `setup [name]` | Register machine with ASV (defaults to `hostname`) | -| `run [suite]` | Run all benchmarks, or a single suite (e.g. `bench_casting`) | +| `run [suite]` | Run benchmarks for the current commit (optionally a single suite) | | `quick [suite]` | Smoke test — single iteration, results not saved | | `compare [ref] [new]` | Compare two commits (defaults to `HEAD~1` vs `HEAD`) | | `view` | Generate HTML dashboard and serve on `localhost:8080` | @@ -59,16 +60,18 @@ the name must be consistent across runs for historical comparison. ### Run all benchmarks ```bash -asv run --python=same --launch-method spawn +asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) ``` - `--python=same` — use the current interpreter (required with `environment_type: "existing"`) - `--launch-method spawn` — required for CUDA (fork causes "Cannot re-initialize CUDA in forked subprocess") +- `--set-commit-hash` — **required** with `environment_type: "existing"`. Without it, ASV + runs benchmarks but silently discards results. The helper script sets this automatically. ### Run a single suite ```bash -asv run --python=same --launch-method spawn --bench bench_casting +asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) --bench bench_casting ``` The `--bench` argument accepts a regex that matches benchmark file or class names. 
@@ -76,7 +79,7 @@ The `--bench` argument accepts a regex that matches benchmark file or class name ### Quick smoke test ```bash -asv run --python=same --launch-method spawn --quick --bench bench_casting +asv run --python=same --launch-method spawn --quick --set-commit-hash $(git rev-parse HEAD) --bench bench_casting ``` `--quick` runs each benchmark only once with no statistical analysis. Useful for verifying diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh index 5f07c23ff..7e9a21d23 100755 --- a/benchmarks/asv/run_benchmarks.sh +++ b/benchmarks/asv/run_benchmarks.sh @@ -30,13 +30,15 @@ case "${1:-}" in asv machine --yes --machine "$MACHINE" ;; run) - CMD=(asv run --python=same --launch-method spawn) + CMD=(asv run --python=same --launch-method spawn + --set-commit-hash "$(git rev-parse HEAD)") [[ -n "${2:-}" ]] && CMD+=(--bench "$2") echo "Running: ${CMD[*]}" "${CMD[@]}" ;; quick) - CMD=(asv run --python=same --launch-method spawn --quick) + CMD=(asv run --python=same --launch-method spawn --quick + --set-commit-hash "$(git rev-parse HEAD)") [[ -n "${2:-}" ]] && CMD+=(--bench "$2") echo "Running (quick): ${CMD[*]}" "${CMD[@]}" From 1a98989da5e79b9f53da90485b0157d5d256670a Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 19 Mar 2026 13:28:09 -0500 Subject: [PATCH 5/6] Added direct-run option to bypass subprocess overhead --- asv.conf.json | 2 +- benchmarks/asv/README.md | 8 ++- benchmarks/asv/direct_run.py | 92 ++++++++++++++++++++++++++++++++ benchmarks/asv/run_benchmarks.sh | 11 ++++ 4 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 benchmarks/asv/direct_run.py diff --git a/asv.conf.json b/asv.conf.json index 1b17c9c9e..482e20c60 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -3,7 +3,7 @@ "project": "TransformerEngine", "project_url": "https://github.com/ROCm/TransformerEngine", "repo": ".", - "branches": ["HEAD", "dev"], + "branches": ["HEAD"], "environment_type": "existing", "install_command": [], 
"build_command": [], diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md index 5d2686ed2..5d88d8ad3 100644 --- a/benchmarks/asv/README.md +++ b/benchmarks/asv/README.md @@ -11,8 +11,9 @@ a framework for benchmarking Python packages over their lifetime. ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the repo root), meaning it uses the current Python environment directly — it does not create virtualenvs or -attempt to build TE itself. The config sets `branches: ["HEAD", "dev"]` so that `asv publish` -accepts results from both the currently checked-out branch and `dev` (for CI history). +attempt to build TE itself. The config sets `branches: ["HEAD"]` so that `asv publish` accepts results from +whichever branch is currently checked out — this works for both local development +and CI (where `HEAD` points to `dev`). ## Helper script @@ -29,6 +30,7 @@ bash benchmarks/asv/run_benchmarks.sh [options] | `setup [name]` | Register machine with ASV (defaults to `hostname`) | | `run [suite]` | Run benchmarks for the current commit (optionally a single suite) | | `quick [suite]` | Smoke test — single iteration, results not saved | +| `direct suite [method]` | Fast in-process run — no subprocesses, no ASV overhead | | `compare [ref] [new]` | Compare two commits (defaults to `HEAD~1` vs `HEAD`) | | `view` | Generate HTML dashboard and serve on `localhost:8080` | | `list` | List available benchmark suites | @@ -39,6 +41,8 @@ Examples: bash benchmarks/asv/run_benchmarks.sh setup mi325 bash benchmarks/asv/run_benchmarks.sh run bench_casting bash benchmarks/asv/run_benchmarks.sh quick +bash benchmarks/asv/run_benchmarks.sh direct bench_casting +bash benchmarks/asv/run_benchmarks.sh direct bench_gemm time_forward bash benchmarks/asv/run_benchmarks.sh compare HEAD~3 HEAD bash benchmarks/asv/run_benchmarks.sh view ``` diff --git a/benchmarks/asv/direct_run.py b/benchmarks/asv/direct_run.py new file mode 100644 index 000000000..1e59e99e8 --- 
/dev/null +++ b/benchmarks/asv/direct_run.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +"""Run ASV benchmark classes directly in-process, bypassing subprocess overhead. + +Usage: + python benchmarks/asv/direct_run.py [options] [method_filter] + +Examples: + python benchmarks/asv/direct_run.py bench_casting + python benchmarks/asv/direct_run.py bench_gemm time_forward + python benchmarks/asv/direct_run.py -w 5 -n 20 bench_casting +""" + +import argparse +import importlib +import itertools +import math +import sys +import time + + +def run_class(cls, class_name, method_filter=None, warmup=3, iters=7): + methods = sorted(m for m in dir(cls) if m.startswith("time_")) + if method_filter: + methods = [m for m in methods if method_filter in m] + if not methods: + return + + params = getattr(cls, "params", [[]]) + param_names = getattr(cls, "param_names", []) + combos = list(itertools.product(*params)) + + print(f"\n{class_name} ({len(combos)} combos x {len(methods)} methods, " + f"{warmup} warmup, {iters} timed)") + print("-" * 90) + print(f" {'median':>10} {'mean':>10} {'stdev':>10} {'method':<30} params") + print("-" * 90) + + for combo in combos: + label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo)) + instance = cls() + try: + instance.setup(*combo) + except Exception as e: + print(f" SKIP {label} setup failed: {e}") + continue + + for method_name in methods: + method = getattr(instance, method_name) + + for _ in range(warmup): + method(*combo) + + times = [] + for _ in range(iters): + t0 = time.perf_counter() + method(*combo) + times.append(time.perf_counter() - t0) + + times.sort() + median = times[len(times) // 2] + mean = sum(times) / len(times) + stdev = math.sqrt(sum((t - mean) ** 2 for t in times) / len(times)) + print(f" {median*1000:>8.3f}ms {mean*1000:>8.3f}ms " + f"{stdev*1000:>8.3f}ms {method_name:<30} {label}") + + +def main(): + parser = argparse.ArgumentParser( + description="Run ASV benchmarks directly in-process (no subprocess overhead).") 
+ parser.add_argument("suite", help="Benchmark module name (e.g. bench_casting)") + parser.add_argument("method_filter", nargs="?", default=None, + help="Only run time_* methods containing this string") + parser.add_argument("-w", "--warmup", type=int, default=3, + help="Number of warmup iterations (default: 3)") + parser.add_argument("-n", "--iters", type=int, default=7, + help="Number of timed iterations (default: 7)") + args = parser.parse_args() + + mod = importlib.import_module(args.suite) + + for name in sorted(dir(mod)): + obj = getattr(mod, name) + if isinstance(obj, type) and name.startswith("Bench"): + run_class(obj, name, args.method_filter, args.warmup, args.iters) + + +if __name__ == "__main__": + import os + + os.chdir(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, ".") + main() diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh index 7e9a21d23..4ca71881c 100755 --- a/benchmarks/asv/run_benchmarks.sh +++ b/benchmarks/asv/run_benchmarks.sh @@ -16,6 +16,8 @@ Commands: setup Register this machine with ASV run [SUITE] Run all benchmarks, or a single suite (e.g. 
bench_casting) quick [SUITE] Smoke-test run (single iteration, results not saved) + direct [-w W] [-n N] SUITE [METHOD] + Fast in-process run (no subprocesses, no ASV overhead) compare [REF] [NEW] Compare two commits (default: HEAD~1 vs HEAD) view Generate HTML dashboard and open preview server list List available benchmark suites @@ -43,6 +45,15 @@ case "${1:-}" in echo "Running (quick): ${CMD[*]}" "${CMD[@]}" ;; + direct) + shift + if [[ $# -eq 0 ]]; then + echo "Usage: $0 direct [options] SUITE [METHOD]" + echo "Options: -w WARMUP -n ITERS" + exit 1 + fi + python "$BENCH_DIR/direct_run.py" "$@" + ;; compare) REF="${2:-HEAD~1}" NEW="${3:-HEAD}" From 498f16d02eab404ba5f24be01fb804f9c440e59e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 20 Mar 2026 14:01:35 -0500 Subject: [PATCH 6/6] Refactor to prefer direct runs, and moved asv conf --- benchmarks/asv/README.md | 39 ++- asv.conf.json => benchmarks/asv/asv.conf.json | 8 +- benchmarks/asv/direct_run.py | 279 ++++++++++++++++-- benchmarks/asv/run_benchmarks.sh | 58 ++-- 4 files changed, 313 insertions(+), 71 deletions(-) rename asv.conf.json => benchmarks/asv/asv.conf.json (69%) diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md index 5d88d8ad3..9189522e1 100644 --- a/benchmarks/asv/README.md +++ b/benchmarks/asv/README.md @@ -9,7 +9,7 @@ a framework for benchmarking Python packages over their lifetime. - A ROCm or CUDA GPU must be available. - Install ASV: `pip install asv` -ASV is configured with `environment_type: "existing"` (in `asv.conf.json` at the repo root), +ASV is configured with `environment_type: "existing"` (in `benchmarks/asv/asv.conf.json`), meaning it uses the current Python environment directly — it does not create virtualenvs or attempt to build TE itself. 
The config sets `branches: ["HEAD"]` so that `asv publish` accepts results from whichever branch is currently checked out — this works for both local development @@ -28,33 +28,40 @@ bash benchmarks/asv/run_benchmarks.sh [options] | Command | Description | |---|---| | `setup [name]` | Register machine with ASV (defaults to `hostname`) | -| `run [suite]` | Run benchmarks for the current commit (optionally a single suite) | -| `quick [suite]` | Smoke test — single iteration, results not saved | -| `direct suite [method]` | Fast in-process run — no subprocesses, no ASV overhead | +| `run [suite] [method]` | Run benchmarks in-process (fast, saves ASV-compatible results) | +| `run --asv [suite]` | Run benchmarks via ASV (subprocess isolation per benchmark) | +| `quick [suite]` | Smoke test via ASV — single iteration, results not saved | | `compare [ref] [new]` | Compare two commits (defaults to `HEAD~1` vs `HEAD`) | | `view` | Generate HTML dashboard and serve on `localhost:8080` | | `list` | List available benchmark suites | +The default `run` command executes benchmarks directly in-process, avoiding the +significant subprocess-per-benchmark overhead that ASV imposes. Results are saved in +ASV-compatible format and can be viewed with `view`. Use `run --asv` when you need +ASV's subprocess isolation (e.g. for CI or statistical rigor). 
+ Examples: ```bash bash benchmarks/asv/run_benchmarks.sh setup mi325 -bash benchmarks/asv/run_benchmarks.sh run bench_casting -bash benchmarks/asv/run_benchmarks.sh quick -bash benchmarks/asv/run_benchmarks.sh direct bench_casting -bash benchmarks/asv/run_benchmarks.sh direct bench_gemm time_forward +bash benchmarks/asv/run_benchmarks.sh run # all suites +bash benchmarks/asv/run_benchmarks.sh run bench_casting # one suite +bash benchmarks/asv/run_benchmarks.sh run bench_gemm time_forward # one method +bash benchmarks/asv/run_benchmarks.sh run -w 5 -n 20 bench_casting # custom iterations +bash benchmarks/asv/run_benchmarks.sh run --asv bench_casting # via ASV subprocesses bash benchmarks/asv/run_benchmarks.sh compare HEAD~3 HEAD bash benchmarks/asv/run_benchmarks.sh view ``` ## Local usage (manual ASV commands) -All commands are run from the **repository root** (where `asv.conf.json` lives). +All manual `asv` commands require `--config` with an **absolute path** to the config file +and should be run from the **repository root**. ASV does not resolve relative `--config` paths. ### Register your machine ```bash -asv machine --yes --machine my-machine-name +asv machine --config $(pwd)/benchmarks/asv/asv.conf.json --yes --machine my-machine-name ``` This creates a machine profile in `benchmarks/.asv/results/my-machine-name/machine.json`. @@ -64,7 +71,7 @@ the name must be consistent across runs for historical comparison. 
### Run all benchmarks ```bash -asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) +asv run --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) ``` - `--python=same` — use the current interpreter (required with `environment_type: "existing"`) @@ -75,7 +82,7 @@ asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HE ### Run a single suite ```bash -asv run --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) --bench bench_casting +asv run --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn --set-commit-hash $(git rev-parse HEAD) --bench bench_casting ``` The `--bench` argument accepts a regex that matches benchmark file or class names. @@ -83,7 +90,7 @@ The `--bench` argument accepts a regex that matches benchmark file or class name ### Quick smoke test ```bash -asv run --python=same --launch-method spawn --quick --set-commit-hash $(git rev-parse HEAD) --bench bench_casting +asv run --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn --quick --set-commit-hash $(git rev-parse HEAD) --bench bench_casting ``` `--quick` runs each benchmark only once with no statistical analysis. Useful for verifying @@ -92,7 +99,7 @@ benchmarks work, but note that results are **not saved to disk** in quick mode. ### Compare two commits ```bash -asv continuous --python=same --launch-method spawn HEAD~1 HEAD +asv continuous --config $(pwd)/benchmarks/asv/asv.conf.json --python=same --launch-method spawn HEAD~1 HEAD ``` This checks out each commit, runs benchmarks on both, and reports regressions. @@ -101,8 +108,8 @@ Note: this only works if the benchmark files exist on both commits. 
### Generate an HTML dashboard ```bash -asv publish -asv preview +asv publish --config $(pwd)/benchmarks/asv/asv.conf.json +asv preview --config $(pwd)/benchmarks/asv/asv.conf.json ``` `asv publish` generates static HTML from stored results into `benchmarks/.asv/html/`. diff --git a/asv.conf.json b/benchmarks/asv/asv.conf.json similarity index 69% rename from asv.conf.json rename to benchmarks/asv/asv.conf.json index 482e20c60..3c1616aac 100644 --- a/asv.conf.json +++ b/benchmarks/asv/asv.conf.json @@ -2,14 +2,14 @@ "version": 1, "project": "TransformerEngine", "project_url": "https://github.com/ROCm/TransformerEngine", - "repo": ".", + "repo": "../..", "branches": ["HEAD"], "environment_type": "existing", "install_command": [], "build_command": [], - "benchmark_dir": "benchmarks/asv", - "results_dir": "benchmarks/.asv/results", - "html_dir": "benchmarks/.asv/html", + "benchmark_dir": ".", + "results_dir": "../.asv/results", + "html_dir": "../.asv/html", "install_timeout": 600, "benchmark_timeout": 1200, "launch_method": "spawn" diff --git a/benchmarks/asv/direct_run.py b/benchmarks/asv/direct_run.py index 1e59e99e8..dab841a6e 100644 --- a/benchmarks/asv/direct_run.py +++ b/benchmarks/asv/direct_run.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 """Run ASV benchmark classes directly in-process, bypassing subprocess overhead. +Results are saved in ASV-compatible format so they can be viewed with +``asv publish && asv preview``. 
+ Usage: python benchmarks/asv/direct_run.py [options] [method_filter] @@ -11,19 +14,174 @@ """ import argparse +import hashlib import importlib +import inspect import itertools +import json import math +import os +import platform +import subprocess import sys +import textwrap import time -def run_class(cls, class_name, method_filter=None, warmup=3, iters=7): +# --------------------------------------------------------------------------- +# ASV result generation +# --------------------------------------------------------------------------- + +def _get_benchmark_version(cls, method_name): + """Compute the version hash the same way ASV does. + + ASV hashes a code string built from the time_* and setup methods. + The string is class header + indented time method + indented setup, + with no trailing newline. + """ + time_src = textwrap.dedent(inspect.getsource(getattr(cls, method_name))) + setup_src = textwrap.dedent(inspect.getsource(cls.setup)) + code = ( + f"class {cls.__name__}:\n" + + textwrap.indent(time_src, " ") + "\n" + + textwrap.indent(setup_src, " ") + ).rstrip("\n") + return hashlib.sha256(code.encode()).hexdigest() + + +def _format_param_value(v): + """Format a parameter value the way ASV stores it in JSON.""" + if isinstance(v, str): + return f"'{v}'" + return repr(v) + + +def _get_machine_info(): + """Build the params/machine dict ASV expects.""" + machine = platform.node() + info = { + "arch": platform.machine(), + "cpu": "", + "machine": machine, + "num_cpu": str(os.cpu_count()), + "os": f"{platform.system()} {platform.release()}", + "ram": "", + } + try: + with open("/proc/cpuinfo") as f: + for line in f: + if line.startswith("model name"): + info["cpu"] = line.split(":", 1)[1].strip() + break + with open("/proc/meminfo") as f: + for line in f: + if line.startswith("MemTotal"): + info["ram"] = line.split()[1] # kB + break + except OSError: + pass + return machine, info + + +def _get_commit_hash(): + """Get the current git HEAD hash.""" + try: + return 
subprocess.check_output( + ["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL + ).decode().strip() + except Exception: + return "unknown" + + +def _compute_stats(samples): + """Compute statistics from a list of timing samples.""" + if not samples: + return None, None, None, None, None + s = sorted(samples) + n = len(s) + median = s[n // 2] + mean = sum(s) / n + q25 = s[max(0, n // 4)] + q75 = s[min(n - 1, 3 * n // 4)] + stdev = math.sqrt(sum((t - mean) ** 2 for t in s) / n) + ci_lo = max(0, mean - 2.576 * stdev / math.sqrt(n)) # 99% CI + ci_hi = mean + 2.576 * stdev / math.sqrt(n) + return median, ci_lo, ci_hi, q25, q75 + + +def _get_results_dir(): + """Read results_dir from asv.conf.json, resolved to an absolute path.""" + conf_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "asv.conf.json") + with open(conf_path) as f: + conf = json.load(f) + conf_dir = os.path.dirname(conf_path) + return os.path.normpath(os.path.join(conf_dir, conf["results_dir"])) + + +def save_asv_results(all_results): + """Write results to ASV's results directory.""" + commit_hash = _get_commit_hash() + machine_name, machine_info = _get_machine_info() + env_name = "existing-" + sys.executable.replace("/", "_").strip("_") + results_dir = _get_results_dir() + machine_dir = os.path.join(results_dir, machine_name) + os.makedirs(machine_dir, exist_ok=True) + + # Write machine.json if missing + machine_json = os.path.join(machine_dir, "machine.json") + if not os.path.exists(machine_json): + with open(machine_json, "w") as f: + json.dump({**machine_info, "version": 1}, f, indent=4) + + # Load existing result file or start fresh + filename = f"{commit_hash[:8]}-{env_name}.json" + result_path = os.path.join(machine_dir, filename) + if os.path.exists(result_path): + with open(result_path) as f: + data = json.load(f) + else: + data = { + "commit_hash": commit_hash, + "env_name": env_name, + "date": int(time.time() * 1000), + "params": {**machine_info, "python": sys.executable}, + 
"python": sys.executable, + "requirements": {}, + "env_vars": {}, + "result_columns": [ + "result", "params", "version", + "started_at", "duration", + "stats_ci_99_a", "stats_ci_99_b", + "stats_q_25", "stats_q_75", + "stats_number", "stats_repeat", + "samples", + ], + "results": {}, + "durations": {}, + "version": 2, + } + + # Merge new results + for bench_key, bench_data in all_results.items(): + data["results"][bench_key] = bench_data + + with open(result_path, "w") as f: + json.dump(data, f, indent=2) + + print(f"\nResults saved to {result_path}") + + +# --------------------------------------------------------------------------- +# Benchmark runner +# --------------------------------------------------------------------------- + +def run_class(suite_name, cls, class_name, method_filter=None, warmup=3, iters=7): + """Run all benchmarks in a class, returning ASV-formatted results.""" methods = sorted(m for m in dir(cls) if m.startswith("time_")) if method_filter: methods = [m for m in methods if method_filter in m] if not methods: - return + return {} params = getattr(cls, "params", [[]]) param_names = getattr(cls, "param_names", []) @@ -31,37 +189,96 @@ def run_class(cls, class_name, method_filter=None, warmup=3, iters=7): print(f"\n{class_name} ({len(combos)} combos x {len(methods)} methods, " f"{warmup} warmup, {iters} timed)") - print("-" * 90) - print(f" {'median':>10} {'mean':>10} {'stdev':>10} {'method':<30} params") - print("-" * 90) - - for combo in combos: - label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo)) - instance = cls() - try: - instance.setup(*combo) - except Exception as e: - print(f" SKIP {label} setup failed: {e}") - continue - - for method_name in methods: + HDR = (f" {'median':>10} {'mean':>10} {'stdev':>10}" + f" {'q25':>10} {'q75':>10} {'min':>10} {'max':>10}" + f" {'method':<30} params") + print("-" * len(HDR)) + print(HDR) + print("-" * len(HDR)) + + # ASV stores params as lists of string representations + asv_params = 
[[_format_param_value(v) for v in dim] for dim in params] + + all_results = {} + + for method_name in methods: + bench_key = f"{suite_name}.{class_name}.{method_name}" + version = _get_benchmark_version(cls, method_name) + + medians = [] + ci_los = [] + ci_his = [] + q25s = [] + q75s = [] + numbers = [] + repeats = [] + started_at = int(time.time() * 1000) + t_start = time.perf_counter() + + for combo in combos: + label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo)) + instance = cls() + try: + instance.setup(*combo) + except Exception as e: + print(f" SKIP {label} setup failed: {e}") + medians.append(None) + ci_los.append(None) + ci_his.append(None) + q25s.append(None) + q75s.append(None) + numbers.append(None) + repeats.append(None) + continue + method = getattr(instance, method_name) for _ in range(warmup): method(*combo) - times = [] + samples = [] for _ in range(iters): t0 = time.perf_counter() method(*combo) - times.append(time.perf_counter() - t0) + samples.append(time.perf_counter() - t0) + + median, ci_lo, ci_hi, q25, q75 = _compute_stats(samples) + mean = sum(samples) / len(samples) + stdev = math.sqrt(sum((t - mean) ** 2 for t in samples) / len(samples)) + s_min, s_max = min(samples), max(samples) + + medians.append(median) + ci_los.append(ci_lo) + ci_his.append(ci_hi) + q25s.append(q25) + q75s.append(q75) + numbers.append(1) + repeats.append(iters) - times.sort() - median = times[len(times) // 2] - mean = sum(times) / len(times) - stdev = math.sqrt(sum((t - mean) ** 2 for t in times) / len(times)) print(f" {median*1000:>8.3f}ms {mean*1000:>8.3f}ms " - f"{stdev*1000:>8.3f}ms {method_name:<30} {label}") + f"{stdev*1000:>8.3f}ms {q25*1000:>8.3f}ms {q75*1000:>8.3f}ms " + f"{s_min*1000:>8.3f}ms {s_max*1000:>8.3f}ms " + f"{method_name:<30} {label}") + + duration = time.perf_counter() - t_start + + # ASV result row (trailing "samples" column omitted; ASV treats missing + # trailing columns as null): [result, params, version, started_at, duration, ci_99_a, ci_99_b, q_25, q_75, number, repeat] + all_results[bench_key] =
[ + medians, + asv_params, + version, + started_at, + round(duration, 2), + ci_los, + ci_his, + q25s, + q75s, + numbers, + repeats, + ] + + return all_results def main(): @@ -74,19 +291,27 @@ def main(): help="Number of warmup iterations (default: 3)") parser.add_argument("-n", "--iters", type=int, default=7, help="Number of timed iterations (default: 7)") + parser.add_argument("--no-save", action="store_true", + help="Skip saving results to ASV format") args = parser.parse_args() mod = importlib.import_module(args.suite) + all_results = {} for name in sorted(dir(mod)): obj = getattr(mod, name) if isinstance(obj, type) and name.startswith("Bench"): - run_class(obj, name, args.method_filter, args.warmup, args.iters) + results = run_class( + args.suite, obj, name, + args.method_filter, args.warmup, args.iters) + all_results.update(results) + if all_results and not args.no_save: + save_asv_results(all_results) -if __name__ == "__main__": - import os - os.chdir(os.path.dirname(os.path.abspath(__file__))) +if __name__ == "__main__": + script_dir = os.path.dirname(os.path.abspath(__file__)) + os.chdir(script_dir) sys.path.insert(0, ".") main() diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh index 4ca71881c..e8020a960 100755 --- a/benchmarks/asv/run_benchmarks.sh +++ b/benchmarks/asv/run_benchmarks.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash # Helper script for common ASV benchmark tasks. -# Run from the repository root (where asv.conf.json lives). set -euo pipefail cd "$(git rev-parse --show-toplevel)" BENCH_DIR="benchmarks/asv" +ASV_CONF="$(pwd)/$BENCH_DIR/asv.conf.json" mapfile -t SUITES < <(find "$BENCH_DIR" -maxdepth 1 -name 'bench_*.py' -printf '%f\n' | sed 's/\.py$//' | sort) usage() { @@ -14,10 +14,10 @@ Usage: bash benchmarks/asv/run_benchmarks.sh [options] Commands: setup Register this machine with ASV - run [SUITE] Run all benchmarks, or a single suite (e.g. 
bench_casting) - quick [SUITE] Smoke-test run (single iteration, results not saved) - direct [-w W] [-n N] SUITE [METHOD] - Fast in-process run (no subprocesses, no ASV overhead) + run [-w W] [-n N] [SUITE] [METHOD] + Run benchmarks in-process (fast, saves ASV-compatible results) + run --asv [SUITE] Run benchmarks via ASV (subprocess isolation per benchmark) + quick [SUITE] Smoke-test via ASV (single iteration, results not saved) compare [REF] [NEW] Compare two commits (default: HEAD~1 vs HEAD) view Generate HTML dashboard and open preview server list List available benchmark suites @@ -29,41 +29,51 @@ case "${1:-}" in setup) MACHINE="${2:-$(hostname)}" echo "Registering machine as: $MACHINE" - asv machine --yes --machine "$MACHINE" + asv machine --yes --machine "$MACHINE" --config "$ASV_CONF" ;; run) - CMD=(asv run --python=same --launch-method spawn - --set-commit-hash "$(git rev-parse HEAD)") - [[ -n "${2:-}" ]] && CMD+=(--bench "$2") - echo "Running: ${CMD[*]}" - "${CMD[@]}" + shift + if [[ "${1:-}" == "--asv" ]]; then + shift + CMD=(asv run --config "$ASV_CONF" --python=same --launch-method spawn + --set-commit-hash "$(git rev-parse HEAD)") + [[ -n "${1:-}" ]] && CMD+=(--bench "$1") + echo "Running (asv): ${CMD[*]}" + "${CMD[@]}" + else + # Default: fast in-process run + ARGS=() + while [[ $# -gt 0 ]]; do + ARGS+=("$1") + shift + done + if [[ ${#ARGS[@]} -eq 0 ]]; then + # Run all suites + for s in "${SUITES[@]}"; do + python "$BENCH_DIR/direct_run.py" "$s" + done + else + python "$BENCH_DIR/direct_run.py" "${ARGS[@]}" + fi + fi ;; quick) - CMD=(asv run --python=same --launch-method spawn --quick + CMD=(asv run --config "$ASV_CONF" --python=same --launch-method spawn --quick --set-commit-hash "$(git rev-parse HEAD)") [[ -n "${2:-}" ]] && CMD+=(--bench "$2") echo "Running (quick): ${CMD[*]}" "${CMD[@]}" ;; - direct) - shift - if [[ $# -eq 0 ]]; then - echo "Usage: $0 direct [options] SUITE [METHOD]" - echo "Options: -w WARMUP -n ITERS" - exit 1 - fi - python 
"$BENCH_DIR/direct_run.py" "$@" - ;; compare) REF="${2:-HEAD~1}" NEW="${3:-HEAD}" echo "Comparing $REF vs $NEW" - asv continuous --python=same --launch-method spawn "$REF" "$NEW" + asv continuous --config "$ASV_CONF" --python=same --launch-method spawn "$REF" "$NEW" ;; view) - asv publish + asv publish --config "$ASV_CONF" echo "Starting preview server at http://localhost:8080" - asv preview + asv preview --config "$ASV_CONF" ;; list) echo "Available benchmark suites:"