From 2b9430840f3f8a80c0a38dd2f092a2a9d487b8a0 Mon Sep 17 00:00:00 2001 From: amd-asalykov Date: Thu, 22 Jan 2026 16:20:41 +0000 Subject: [PATCH 1/8] add hip backend for eval_single_sample --- pyproject.toml | 11 ++- scripts/generate_and_eval_single_sample.py | 4 +- src/kernelbench/prompts/hardware/gpu_specs.py | 84 +++++++++++++++++++ .../prompts/model_new_ex_add_hip.py | 45 ++++++++++ src/kernelbench/prompts/prompts.toml | 5 ++ src/kernelbench/utils.py | 2 +- 6 files changed, 148 insertions(+), 3 deletions(-) create mode 100644 src/kernelbench/prompts/model_new_ex_add_hip.py diff --git a/pyproject.toml b/pyproject.toml index bed37150..36125475 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ requires-python = "==3.10.*" dependencies = [ # Frameworks "torch==2.9.0", - + "pytorch-triton-rocm>=3.4.0", "transformers", "datasets", "modal", @@ -49,6 +49,15 @@ dev = [ "ruff", ] +[tool.uv.sources] +torch = [{ index = "pytorch-rocm" }] +torchvision = [{ index = "pytorch-rocm" }] +pytorch-triton-rocm = [{ index = "pytorch-rocm" }] + +[[tool.uv.index]] +name = "pytorch-rocm" +url = "https://download.pytorch.org/whl/rocm6.4" +explicit = true [tool.setuptools.packages.find] where = ["src"] diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index fce1b16f..95aea5d7 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -124,6 +124,8 @@ def main(config: EvalConfig): ) if config.gpu_arch: + if (type(config.gpu_arch) is not list): + config.gpu_arch = [config.gpu_arch] set_gpu_arch(config.gpu_arch) # otherwise build for all architectures if config.log: @@ -174,7 +176,7 @@ def main(config: EvalConfig): include_hardware = include_hardware.lower() in ["true", "1", "yes"] config.include_hardware_info = include_hardware - supported_backends = {"cuda", "triton", "tilelang", "cute", "thunderkittens"} + supported_backends = {"cuda", "hip", "triton", "tilelang", "cute", 
"thunderkittens"} backend = config.backend.lower() if backend not in supported_backends: raise ValueError( diff --git a/src/kernelbench/prompts/hardware/gpu_specs.py b/src/kernelbench/prompts/hardware/gpu_specs.py index 800f20ef..ca63488f 100644 --- a/src/kernelbench/prompts/hardware/gpu_specs.py +++ b/src/kernelbench/prompts/hardware/gpu_specs.py @@ -118,6 +118,90 @@ "Maximum number of thread blocks per SM": "32", "Shared memory capacity per SM": "164 KB", "Maximum shared memory per thread block": "163 KB", + }, + "MI300X": { + "GPU Architecture": "gfx942", + "GPU Memory": "192GB", + "Memory Bandwidth": "5.3 TB/s", + "FP64 TFLOPS": "81.7", + "FP64 Matrix Core TFLOPS": "163.4", + "FP32 TFLOPS": "163.4", + "TF32 Matrix Core TFLOPS": "653.7 (1307.4 with sparsity)", + "BFLOAT16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)", + "FP16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)", + "FP8 Matrix Core TFLOPS": "2614.9 (5229.8 with sparsity)", + "INT8 Matrix Core TOPS": "2614.9 (5229.8 with sparsity)", + "Number of CU": "304", + "SIMDs per CU": "4", + "Wavefront Size": "64", + "Workgroup Max Size": "1024", + "Max Waves Per CU": "32", + "Max Threads per CU": "2048", + "Maximum number of registers per thread": "256", + "Shared memory capacity per CU": "64 KB", + }, + "MI325X": { + "GPU Architecture": "gfx942", + "GPU Memory": "256GB", + "Memory Bandwidth": "6TB/s", + "FP64 TFLOPS": "81.7", + "FP64 Matrix Core TFLOPS": "163.4", + "FP32 TFLOPS": "163.4", + "TF32 Matrix Core TFLOPS": "653.7 (1307.4 with sparsity)", + "BFLOAT16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)", + "FP16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)", + "FP8 Matrix Core TFLOPS": "2614.9 (5229.8 with sparsity)", + "INT8 Matrix Core TOPS": "2614.9 (5229.8 with sparsity)", + "Number of CU": "304", + "SIMDs per CU": "4", + "Wavefront Size": "64", + "Workgroup Max Size": "1024", + "Max Waves Per CU": "32", + "Max Threads per CU": "2048", + "Maximum number of registers per thread": 
"256", + "Shared memory capacity per CU": "64 KB", + }, + "MI350X": { + "GPU Architecture": "gfx950", + "GPU Memory": "288GB", + "Memory Bandwidth": "8TB/s", + "FP64 TFLOPS": "72.1", + "FP64 Matrix Core TFLOPS": "72.1", + "FP32 TFLOPS": "144.2", + "BFLOAT16 Matrix Core TFLOPS": "2300 (4600 with sparsity)", + "FP16 Matrix Core TFLOPS": "2300 (4600 with sparsity)", + "FP8 Matrix Core TFLOPS": "4600", + "MXFP6, MXFP4 Matrix Core TFLOPS": "9200", + "INT8 Matrix Core TOPS": "4600 (9200 with sparsity)", + "Number of CU": "256", + "SIMDs per CU": "4", + "Wavefront Size": "64", + "Workgroup Max Size": "1024", + "Max Waves Per CU": "32", + "Max Threads per CU": "2048", + "Maximum number of registers per thread": "256", + "Shared memory capacity per CU": "160 KB", + }, + "MI355X": { + "GPU Architecture": "gfx950", + "GPU Memory": "288GB", + "Memory Bandwidth": "8TB/s", + "FP64 TFLOPS": "78.6", + "FP64 Matrix Core TFLOPS": "78.6", + "FP32 TFLOPS": "157.3", + "BFLOAT16 Matrix Core TFLOPS": "2500 (5000 with sparsity)", + "FP16 Matrix Core TFLOPS": "2500 (5000 with sparsity)", + "FP8 Matrix Core TFLOPS": "5000", + "MXFP6, MXFP4 Matrix Core TFLOPS": "10000", + "INT8 Matrix Core TOPS": "5000 (10000 with sparsity)", + "Number of CU": "256", + "SIMDs per CU": "4", + "Wavefront Size": "64", + "Workgroup Max Size": "1024", + "Max Waves Per CU": "32", + "Max Threads per CU": "2048", + "Maximum number of registers per thread": "256", + "Shared memory capacity per CU": "160 KB", } } diff --git a/src/kernelbench/prompts/model_new_ex_add_hip.py b/src/kernelbench/prompts/model_new_ex_add_hip.py new file mode 100644 index 00000000..2498bc18 --- /dev/null +++ b/src/kernelbench/prompts/model_new_ex_add_hip.py @@ -0,0 +1,45 @@ +import os +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.cpp_extension import load_inline + +os.environ["CXX"] = "hipcc" + +elementwise_add_cpp_source = """ +#include + +__global__ void elementwise_add_kernel(const float* a, const 
float* b, float* out, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + out[idx] = a[idx] + b[idx]; + } +} + +torch::Tensor elementwise_add_hip(torch::Tensor a, torch::Tensor b) { + auto size = a.numel(); + auto out = torch::zeros_like(a); + + const int block_size = 256; + const int num_blocks = (size + block_size - 1) / block_size; + + elementwise_add_kernel<<>>(a.data_ptr(), b.data_ptr(), out.data_ptr(), size); + + return out; +} +""" + +elementwise_add = load_inline( + name="elementwise_add", + cpp_sources=elementwise_add_cpp_source, + functions=["elementwise_add_hip"], + verbose=True, +) + +class ModelNew(nn.Module): + def __init__(self) -> None: + super().__init__() + self.elementwise_add = elementwise_add + + def forward(self, a, b): + return self.elementwise_add.elementwise_add_hip(a, b) \ No newline at end of file diff --git a/src/kernelbench/prompts/prompts.toml b/src/kernelbench/prompts/prompts.toml index 2768aa11..61b6b15f 100644 --- a/src/kernelbench/prompts/prompts.toml +++ b/src/kernelbench/prompts/prompts.toml @@ -54,6 +54,11 @@ backend_display = "ThunderKittens kernels" one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_thunderkittens.py" # No few_shot_examples - will use one-shot when few_shot option is selected +[backends.hip] +backend_display = "HIP kernels" +one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_hip.py" +# No few_shot_examples - will use one-shot when few_shot option is selected + # ------------------------------------------------------------------------- # Precision: Precision-specific configuration # ------------------------------------------------------------------------- diff --git a/src/kernelbench/utils.py b/src/kernelbench/utils.py index cf8b0ad8..bbd6a468 100644 --- a/src/kernelbench/utils.py +++ b/src/kernelbench/utils.py @@ -42,7 +42,7 @@ def set_gpu_arch(arch_list: list[str]): """ Set env variable for torch cuda arch list to build kernels for specified architectures 
""" - valid_archs = ["Maxwell", "Pascal", "Volta", "Turing", "Ampere", "Hopper", "Ada"] + valid_archs = ["Maxwell", "Pascal", "Volta", "Turing", "Ampere", "Hopper", "Ada", "gfx942", "gfx950"] for arch in arch_list: if arch not in valid_archs: raise ValueError(f"Invalid architecture: {arch}. Must be one of {valid_archs}") From 69ab132722a76e2eabd15e6867e78b2fb29d18fd Mon Sep 17 00:00:00 2001 From: amd-asalykov Date: Wed, 28 Jan 2026 20:04:21 +0000 Subject: [PATCH 2/8] update pyproject.toml for CDNA4 --- README.md | 5 ++--- pyproject.toml | 9 +++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 7343e73b..021fd033 100644 --- a/README.md +++ b/README.md @@ -115,10 +115,9 @@ uv run python scripts/generate_and_eval_single_sample.py dataset_src=huggingface ``` **What you might need to modify** -* **`gpu_arch`** - Depend on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware. +* **`gpu_arch`** - Depend on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware. `gpu_arch` currently supported for `hip` backend: `gfx942`, `gfx950`. * **`precision`** - You can specify the precision of tensor by `precision=fp32`. Currently all of our reported results are `fp32` but we added support for `fp16` & `bf16`. -* **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. Simply specify `backend=triton`. For now we support DSLs: `cuda`, `triton`, `cute`, `tilelang`, `thunderkittens`. - +* **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. For example, simply specify `backend=triton` or `backend=hip`. For now we support DSLs: `cuda`, `hip`, `triton`, `cute`, `tilelang`, `thunderkittens`. 
Note on setting up ThunderKittens (TK) locally: to use `backend=thunderkittens`, you need to git clone the ThunderKittens repo and set the following environment variable to point to your local ThunderKittens directory, `export THUNDERKITTENS_ROOT=`, and all ThunderKitten programs as shown in the [example](src/kernelbench/prompts/model_new_ex_add_thunderkittens.py), should contain `tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")`, which enable the kernel to include the right TK primitives. In addition, we only support BF16 for TK right now. diff --git a/pyproject.toml b/pyproject.toml index 36125475..a29e98f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,10 +10,9 @@ version = "0.2.0.dev0" requires-python = "==3.10.*" dependencies = [ # Frameworks - "torch==2.9.0", - "pytorch-triton-rocm>=3.4.0", + "torch>=2.9.0", "transformers", - "datasets", + "datasets>=2.19.0", "modal", # helper @@ -52,12 +51,10 @@ dev = [ [tool.uv.sources] torch = [{ index = "pytorch-rocm" }] torchvision = [{ index = "pytorch-rocm" }] -pytorch-triton-rocm = [{ index = "pytorch-rocm" }] [[tool.uv.index]] name = "pytorch-rocm" -url = "https://download.pytorch.org/whl/rocm6.4" -explicit = true +url = "https://download.pytorch.org/whl/rocm7.1" [tool.setuptools.packages.find] where = ["src"] From fb5abb6250324788947288bbe83f806c1e60360e Mon Sep 17 00:00:00 2001 From: amd-asalykov Date: Wed, 28 Jan 2026 21:09:41 +0000 Subject: [PATCH 3/8] update --- README.md | 4 ++++ pyproject.toml | 15 ++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 021fd033..b090cbf3 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,10 @@ uv sync # Install with GPU dependencies (for local GPU evaluation) uv sync --extra gpu +# Install with ROCm backend +uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1 +uv sync + # Run commands with uv (which invoke the right env) uv run python scripts/.py ... 
``` diff --git a/pyproject.toml b/pyproject.toml index a29e98f4..cd5b2689 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ dependencies = [ "transformers", "datasets>=2.19.0", "modal", - # helper "tqdm", "packaging", @@ -23,12 +22,10 @@ dependencies = [ "ninja", "tomli", "tabulate", - # Numerics "einops", "python-dotenv", "numpy", - # LLM providers "openai", "litellm[proxy]", @@ -48,17 +45,17 @@ dev = [ "ruff", ] -[tool.uv.sources] -torch = [{ index = "pytorch-rocm" }] -torchvision = [{ index = "pytorch-rocm" }] - [[tool.uv.index]] -name = "pytorch-rocm" +name = "pytorch" url = "https://download.pytorch.org/whl/rocm7.1" + [tool.setuptools.packages.find] where = ["src"] include = ["kernelbench*"] [tool.setuptools.package-data] -kernelbench = ["prompts/**/*"] \ No newline at end of file +kernelbench = ["prompts/**/*"] + +[tool.uv.sources] +torch = { index = "pytorch" } From 062801086084c0e3708511c00fc996355a1aaf47 Mon Sep 17 00:00:00 2001 From: amd-asalykov Date: Wed, 28 Jan 2026 21:15:30 +0000 Subject: [PATCH 4/8] update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b090cbf3..c2a40ef2 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ uv sync # Install with GPU dependencies (for local GPU evaluation) uv sync --extra gpu -# Install with ROCm backend +# Install with AMD ROCm backend uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1 uv sync From 1a3bb4137ad4d47da39f45d39a819234eafddf29 Mon Sep 17 00:00:00 2001 From: amd-asalykov Date: Wed, 28 Jan 2026 21:17:02 +0000 Subject: [PATCH 5/8] update --- pyproject.toml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cd5b2689..f3f98ae2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,9 +11,11 @@ requires-python = "==3.10.*" dependencies = [ # Frameworks "torch>=2.9.0", + "transformers", "datasets>=2.19.0", "modal", + # helper "tqdm", "packaging", 
@@ -22,10 +24,12 @@ dependencies = [ "ninja", "tomli", "tabulate", + # Numerics "einops", "python-dotenv", "numpy", + # LLM providers "openai", "litellm[proxy]", @@ -45,17 +49,10 @@ dev = [ "ruff", ] -[[tool.uv.index]] -name = "pytorch" -url = "https://download.pytorch.org/whl/rocm7.1" - [tool.setuptools.packages.find] where = ["src"] include = ["kernelbench*"] [tool.setuptools.package-data] -kernelbench = ["prompts/**/*"] - -[tool.uv.sources] -torch = { index = "pytorch" } +kernelbench = ["prompts/**/*"] \ No newline at end of file From 416f70d71074a4d59a39c39b602bc4ae92f2a4d0 Mon Sep 17 00:00:00 2001 From: amd-asalykov Date: Wed, 28 Jan 2026 21:25:15 +0000 Subject: [PATCH 6/8] update --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c2a40ef2..0ff33cec 100644 --- a/README.md +++ b/README.md @@ -85,12 +85,11 @@ We have transitioned to using `pyproject.toml` and `uv` for dependency managemen # Install base dependencies (works without a local GPU) uv sync -# Install with GPU dependencies (for local GPU evaluation) -uv sync --extra gpu - # Install with AMD ROCm backend uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1 -uv sync + +# Install with GPU dependencies (for local GPU evaluation) +uv sync --extra gpu # Run commands with uv (which invoke the right env) uv run python scripts/.py ... 
From ac473febc72c828e3510d277728c796c26731f6a Mon Sep 17 00:00:00 2001 From: amd-asalykov Date: Mon, 2 Feb 2026 13:06:51 -0600 Subject: [PATCH 7/8] add ROCm version requirement --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0ff33cec..a53d5672 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ We have transitioned to using `pyproject.toml` and `uv` for dependency managemen # Install base dependencies (works without a local GPU) uv sync -# Install with AMD ROCm backend +# Install with AMD ROCm backend (ROCm>=7.1 is required) uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1 # Install with GPU dependencies (for local GPU evaluation) From 0312e01dec5189d8d7d0bf668762a7d2b9b39f35 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Sat, 21 Feb 2026 16:59:51 -0800 Subject: [PATCH 8/8] check and add more guardrails --- README.md | 10 +++-- scripts/generate_and_eval_single_sample.py | 2 +- src/kernelbench/eval.py | 19 +++++++- src/kernelbench/profile.py | 9 +++- src/kernelbench/prompts/hardware/gpu_specs.py | 1 + .../prompts/model_new_ex_add_hip.py | 1 + src/kernelbench/prompts/prompts.toml | 2 +- src/kernelbench/timing.py | 15 ++++--- src/kernelbench/utils.py | 45 ++++++++++++++++--- 9 files changed, 86 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index a53d5672..ce9348c6 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ We construct KernelBench to have 4 Levels of categories: - **Level 4 🤗**: Level Hugging Face Optimize whole model architectures from HuggingFace -We are actively extending KernelBench to other DSLs beyond `cuda` as well (see below). +We are actively extending KernelBench to other DSLs beyond `cuda` as well (see below), as well as AMD GPU support. ## ⚖️ Evaluation #### Methodology @@ -95,6 +95,8 @@ uv sync --extra gpu uv run python scripts/.py ... 
``` +For AMD GPUs (ROCm backend, ROCm>=7.1), please run `uv remove torch && uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1` to install a ROCm-compatible PyTorch build. Running inside a Docker image is recommended because of the complexity of the ROCm setup. + You can still use `conda (python=3.10)` to create your environment and install dependencies with `requirements.txt`. We use `litellm` for API calls. Please set your keys by creating a `.env` following our `.env.example`. @@ -118,9 +120,11 @@ uv run python scripts/generate_and_eval_single_sample.py dataset_src=huggingface ``` **What you might need to modify** -* **`gpu_arch`** - Depend on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware. `gpu_arch` currently supported for `hip` backend: `gfx942`, `gfx950`. +* **`gpu_arch`** - Depending on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware. * **`precision`** - You can specify the precision of tensor by `precision=fp32`. Currently all of our reported results are `fp32` but we added support for `fp16` & `bf16`. -* **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. For example, simply specify `backend=triton` or `backend=hip`. For now we support DSLs: `cuda`, `hip`, `triton`, `cute`, `tilelang`, `thunderkittens`. +* **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. For example, simply specify `backend=triton` or `backend=hip`. For now we support NVIDIA GPUs with programming frameworks and DSLs: `cuda`, `triton`, `cute`, `tilelang`, `thunderkittens`. + +Note for AMD GPUs: use the `hip` backend; the currently supported `gpu_arch` values are `gfx942` and `gfx950`.
Note on setting up ThunderKittens (TK) locally: to use `backend=thunderkittens`, you need to git clone the ThunderKittens repo and set the following environment variable to point to your local ThunderKittens directory, `export THUNDERKITTENS_ROOT=`, and all ThunderKitten programs as shown in the [example](src/kernelbench/prompts/model_new_ex_add_thunderkittens.py), should contain `tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")`, which enable the kernel to include the right TK primitives. In addition, we only support BF16 for TK right now. diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index 95aea5d7..fbde28b9 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -124,7 +124,7 @@ def main(config: EvalConfig): ) if config.gpu_arch: - if (type(config.gpu_arch) is not list): + if (type(config.gpu_arch) is not list): # normalization to list config.gpu_arch = [config.gpu_arch] set_gpu_arch(config.gpu_arch) # otherwise build for all architectures diff --git a/src/kernelbench/eval.py b/src/kernelbench/eval.py index dd79b2c0..3557dc94 100644 --- a/src/kernelbench/eval.py +++ b/src/kernelbench/eval.py @@ -429,7 +429,18 @@ def eval_kernel_against_ref( # TODO: check device is busy assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval" - if backend.lower() == "tilelang": + # Backend-GPU vendor validation + from .utils import get_gpu_vendor + vendor = get_gpu_vendor(device) + backend_lower = backend.lower() + # HIP is AMD-only + if backend_lower == "hip" and vendor != "amd": + raise ValueError(f"HIP backend requires AMD GPU, got {vendor}") + # cuda/cute/thunderkittens are NVIDIA-only (triton/tilelang work on both) + if backend_lower in ["cuda", "cute", "thunderkittens"] and vendor == "amd": + raise ValueError(f"{backend} backend requires NVIDIA GPU, got AMD") + + if backend_lower == "tilelang": assert precision == torch.float16 
or precision == torch.bfloat16, "TileLang only supports fp16 or bfloat16" torch.set_printoptions( @@ -463,7 +474,11 @@ def eval_kernel_against_ref( raise ValueError( f"device must be an int or torch.device, got {type(device)}" ) - os.environ["CUDA_VISIBLE_DEVICES"] = str(device_num) + # NVIDIA uses CUDA_VISIBLE_DEVICES, AMD uses HIP_VISIBLE_DEVICES + if vendor == "amd": + os.environ["HIP_VISIBLE_DEVICES"] = str(device_num) + else: + os.environ["CUDA_VISIBLE_DEVICES"] = str(device_num) context = {} if verbose: diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 8326324e..a4960438 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -90,13 +90,20 @@ def profile_with_nsight(func, metrics=None, num_trials=1): >>> print(results['gpu__time_duration.sum']) # Time in nanoseconds Raises: - RuntimeError: If nsight-python is not installed. + RuntimeError: If nsight-python is not installed or not on NVIDIA GPU. """ if not NSIGHT_AVAILABLE: raise RuntimeError( "nsight-python not available." ) + # NSight is NVIDIA-only + from kernelbench.utils import get_gpu_vendor + if get_gpu_vendor() != "nvidia": + raise RuntimeError( + "NSight profiling requires NVIDIA GPU. Not available on AMD." 
+ ) + # Normalize metrics to a list if metrics is None: metrics = ['sm__cycles_active.avg'] diff --git a/src/kernelbench/prompts/hardware/gpu_specs.py b/src/kernelbench/prompts/hardware/gpu_specs.py index ca63488f..7fde700e 100644 --- a/src/kernelbench/prompts/hardware/gpu_specs.py +++ b/src/kernelbench/prompts/hardware/gpu_specs.py @@ -119,6 +119,7 @@ "Shared memory capacity per SM": "164 KB", "Maximum shared memory per thread block": "163 KB", }, + # NOTE: In the future we will have a more unified format for various GPUs "MI300X": { "GPU Architecture": "gfx942", "GPU Memory": "192GB", diff --git a/src/kernelbench/prompts/model_new_ex_add_hip.py b/src/kernelbench/prompts/model_new_ex_add_hip.py index 2498bc18..806aa876 100644 --- a/src/kernelbench/prompts/model_new_ex_add_hip.py +++ b/src/kernelbench/prompts/model_new_ex_add_hip.py @@ -4,6 +4,7 @@ import torch.nn.functional as F from torch.utils.cpp_extension import load_inline +# Must include this line so PyTorch could use HIP compiler for AMD GPUs os.environ["CXX"] = "hipcc" elementwise_add_cpp_source = """ diff --git a/src/kernelbench/prompts/prompts.toml b/src/kernelbench/prompts/prompts.toml index 61b6b15f..6a7dfcaa 100644 --- a/src/kernelbench/prompts/prompts.toml +++ b/src/kernelbench/prompts/prompts.toml @@ -55,7 +55,7 @@ one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_thunderkittens.py" # No few_shot_examples - will use one-shot when few_shot option is selected [backends.hip] -backend_display = "HIP kernels" +backend_display = "AMD HIP kernels" one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_hip.py" # No few_shot_examples - will use one-shot when few_shot option is selected diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index 52a2b85e..f22920b4 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -510,20 +510,25 @@ def time_execution_with_nsight_python( """ Time a CUDA kernel function using nsight-python. 
- Note: nsight returns an average time across num_trials runs. + NOTE: NVIDIA-only. NSight Compute (ncu) does not work on AMD GPUs. + Returns a list with a single value (average time) for API consistency. GPU time from nsight is in nanoseconds, converted to milliseconds. - - Returns: - List containing one float: average elapsed time in milliseconds """ - from kernelbench.profile import profile_with_nsight + from kernelbench.utils import get_gpu_vendor if device is None: if verbose: print(f"Using current device: {torch.cuda.current_device()}") device = torch.cuda.current_device() + + # NSight is NVIDIA-only + if get_gpu_vendor(device) != "nvidia": + raise RuntimeError( + "NSight profiling requires NVIDIA GPU. " + "Use timing_method='cuda_event' or 'do_bench' for AMD." + ) with torch.cuda.device(device): # Warm ups diff --git a/src/kernelbench/utils.py b/src/kernelbench/utils.py index bbd6a468..e975cc63 100644 --- a/src/kernelbench/utils.py +++ b/src/kernelbench/utils.py @@ -38,16 +38,51 @@ # Inference Helpers ######################################################## +NVIDIA_ARCHS = ["Maxwell", "Pascal", "Volta", "Turing", "Ampere", "Hopper", "Ada", "Blackwell"] +AMD_ARCHS = ["gfx942", "gfx950"] # gfx942: CDNA3 (MI300), gfx950: CDNA4 (MI350) + + +######################################################## +# GPU Vendor Detection +######################################################## + +def get_gpu_vendor(device: torch.device | int | None = None) -> str: + """Returns 'nvidia', 'amd', or 'unknown' for the given device.""" + if not torch.cuda.is_available(): + return "unknown" + if device is None: + device = torch.cuda.current_device() + name = torch.cuda.get_device_name(device).upper() + if "NVIDIA" in name: + return "nvidia" + if "AMD" in name or "MI3" in name: + return "amd" + return "unknown" + + def set_gpu_arch(arch_list: list[str]): """ - Set env variable for torch cuda arch list to build kernels for specified architectures + Set env variable for torch to build 
kernels for specified architectures. + Supports both NVIDIA (TORCH_CUDA_ARCH_LIST) and AMD (PYTORCH_ROCM_ARCH). """ - valid_archs = ["Maxwell", "Pascal", "Volta", "Turing", "Ampere", "Hopper", "Ada", "gfx942", "gfx950"] + nvidia_archs = [] + amd_archs = [] + for arch in arch_list: - if arch not in valid_archs: - raise ValueError(f"Invalid architecture: {arch}. Must be one of {valid_archs}") + if arch in NVIDIA_ARCHS: + nvidia_archs.append(arch) + elif arch in AMD_ARCHS: + amd_archs.append(arch) + else: + raise ValueError(f"Invalid architecture: {arch}. Must be one of NVIDIA: {NVIDIA_ARCHS} or AMD: {AMD_ARCHS}") + + if nvidia_archs and amd_archs: + raise ValueError(f"Cannot mix NVIDIA and AMD architectures. Got NVIDIA: {nvidia_archs}, AMD: {amd_archs}") - os.environ["TORCH_CUDA_ARCH_LIST"] = ";".join(arch_list) + if nvidia_archs: + os.environ["TORCH_CUDA_ARCH_LIST"] = ";".join(nvidia_archs) + elif amd_archs: + os.environ["PYTORCH_ROCM_ARCH"] = ";".join(amd_archs) def query_server( prompt: str | list[dict], # string if normal prompt, list of dicts if chat prompt,