Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ We construct KernelBench to have 4 Levels of categories:
- **Level 4 🤗**: Level Hugging Face
Optimize whole model architectures from HuggingFace

We are actively extending KernelBench to other DSLs beyond `cuda` as well (see below).
We are actively extending KernelBench to other DSLs beyond `cuda` (see below), as well as adding AMD GPU support.

## ⚖️ Evaluation
#### Methodology
Expand Down Expand Up @@ -85,13 +85,18 @@ We have transitioned to using `pyproject.toml` and `uv` for dependency managemen
# Install base dependencies (works without a local GPU)
uv sync

# Install with AMD ROCm backend (ROCm>=7.1 is required)
uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1

# Install with GPU dependencies (for local GPU evaluation)
uv sync --extra gpu

# Run commands with uv (which invoke the right env)
uv run python scripts/<script_name>.py ...
```

For the AMD GPU (ROCm) backend (ROCm>=7.1 required), run `uv remove torch && uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1` to install a ROCm-compatible PyTorch build. Running in a Docker image is recommended due to the complexity of ROCm setup.

You can still use `conda (python=3.10)` to create your environment and install dependencies with `requirements.txt`.

We use `litellm` for API calls. Please set your keys by creating a `.env` following our `.env.example`.
Expand All @@ -117,8 +122,9 @@ uv run python scripts/generate_and_eval_single_sample.py dataset_src=huggingface
**What you might need to modify**
* **`gpu_arch`** - Depend on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware.
* **`precision`** - You can specify the precision of tensor by `precision=fp32`. Currently all of our reported results are `fp32` but we added support for `fp16` & `bf16`.
* **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. Simply specify `backend=triton`. For now we support DSLs: `cuda`, `triton`, `cute`, `tilelang`, `thunderkittens`.
* **`backend`** - We also support GPU programming languages beyond `cuda`: simply specify, e.g., `backend=triton` or `backend=hip`. Currently supported backends: `cuda`, `triton`, `cute`, `tilelang`, `thunderkittens` (NVIDIA) and `hip` (AMD).

Note for AMD GPUs: use the `hip` backend; currently supported `gpu_arch` values are `gfx942` and `gfx950`.

Note on setting up ThunderKittens (TK) locally: to use `backend=thunderkittens`, you need to git clone the ThunderKittens repo and set the following environment variable to point to your local ThunderKittens directory, `export THUNDERKITTENS_ROOT=<PATH to ThunderKittens folder>`, and all ThunderKitten programs as shown in the [example](src/kernelbench/prompts/model_new_ex_add_thunderkittens.py), should contain `tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")`, which enable the kernel to include the right TK primitives. In addition, we only support BF16 for TK right now.

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ version = "0.2.0.dev0"
requires-python = "==3.10.*"
dependencies = [
# Frameworks
"torch==2.9.0",
"torch>=2.9.0",

"transformers",
"datasets",
"datasets>=2.19.0",
"modal",

# helper
Expand Down
4 changes: 3 additions & 1 deletion scripts/generate_and_eval_single_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ def main(config: EvalConfig):
)

if config.gpu_arch:
if (type(config.gpu_arch) is not list): # normalization to list
config.gpu_arch = [config.gpu_arch]
set_gpu_arch(config.gpu_arch) # otherwise build for all architectures

if config.log:
Expand Down Expand Up @@ -174,7 +176,7 @@ def main(config: EvalConfig):
include_hardware = include_hardware.lower() in ["true", "1", "yes"]
config.include_hardware_info = include_hardware

supported_backends = {"cuda", "triton", "tilelang", "cute", "thunderkittens"}
supported_backends = {"cuda", "hip", "triton", "tilelang", "cute", "thunderkittens"}
backend = config.backend.lower()
if backend not in supported_backends:
raise ValueError(
Expand Down
19 changes: 17 additions & 2 deletions src/kernelbench/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,18 @@ def eval_kernel_against_ref(
# TODO: check device is busy
assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval"

if backend.lower() == "tilelang":
# Backend-GPU vendor validation
from .utils import get_gpu_vendor
vendor = get_gpu_vendor(device)
backend_lower = backend.lower()
# HIP is AMD-only
if backend_lower == "hip" and vendor != "amd":
raise ValueError(f"HIP backend requires AMD GPU, got {vendor}")
# cuda/cute/thunderkittens are NVIDIA-only (triton/tilelang work on both)
if backend_lower in ["cuda", "cute", "thunderkittens"] and vendor == "amd":
raise ValueError(f"{backend} backend requires NVIDIA GPU, got AMD")

if backend_lower == "tilelang":
assert precision == torch.float16 or precision == torch.bfloat16, "TileLang only supports fp16 or bfloat16"

torch.set_printoptions(
Expand Down Expand Up @@ -463,7 +474,11 @@ def eval_kernel_against_ref(
raise ValueError(
f"device must be an int or torch.device, got {type(device)}"
)
os.environ["CUDA_VISIBLE_DEVICES"] = str(device_num)
# NVIDIA uses CUDA_VISIBLE_DEVICES, AMD uses HIP_VISIBLE_DEVICES
if vendor == "amd":
os.environ["HIP_VISIBLE_DEVICES"] = str(device_num)
else:
os.environ["CUDA_VISIBLE_DEVICES"] = str(device_num)
context = {}

if verbose:
Expand Down
9 changes: 8 additions & 1 deletion src/kernelbench/profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,20 @@ def profile_with_nsight(func, metrics=None, num_trials=1):
>>> print(results['gpu__time_duration.sum']) # Time in nanoseconds

Raises:
RuntimeError: If nsight-python is not installed.
RuntimeError: If nsight-python is not installed or not on NVIDIA GPU.
"""
if not NSIGHT_AVAILABLE:
raise RuntimeError(
"nsight-python not available."
)

# NSight is NVIDIA-only
from kernelbench.utils import get_gpu_vendor
if get_gpu_vendor() != "nvidia":
raise RuntimeError(
"NSight profiling requires NVIDIA GPU. Not available on AMD."
)

# Normalize metrics to a list
if metrics is None:
metrics = ['sm__cycles_active.avg']
Expand Down
85 changes: 85 additions & 0 deletions src/kernelbench/prompts/hardware/gpu_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,91 @@
"Maximum number of thread blocks per SM": "32",
"Shared memory capacity per SM": "164 KB",
"Maximum shared memory per thread block": "163 KB",
},
# NOTE: In the future we will have a more unified format for various GPUs
"MI300X": {
"GPU Architecture": "gfx942",
"GPU Memory": "192GB",
"Memory Bandwidth": "5.3 TB/s",
"FP64 TFLOPS": "81.7",
"FP64 Matrix Core TFLOPS": "163.4",
"FP32 TFLOPS": "163.4",
"TF32 Matrix Core TFLOPS": "653.7 (1307.4 with sparsity)",
"BFLOAT16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)",
"FP16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)",
"FP8 Matrix Core TFLOPS": "2614.9 (5229.8 with sparsity)",
"INT8 Matrix Core TOPS": "2614.9 (5229.8 with sparsity)",
"Number of CU": "304",
"SIMDs per CU": "4",
"Wavefront Size": "64",
"Workgroup Max Size": "1024",
"Max Waves Per CU": "32",
"Max Threads per CU": "2048",
"Maximum number of registers per thread": "256",
"Shared memory capacity per CU": "64 KB",
},
"MI325X": {
"GPU Architecture": "gfx942",
"GPU Memory": "256GB",
"Memory Bandwidth": "6TB/s",
"FP64 TFLOPS": "81.7",
"FP64 Matrix Core TFLOPS": "163.4",
"FP32 TFLOPS": "163.4",
"TF32 Matrix Core TFLOPS": "653.7 (1307.4 with sparsity)",
"BFLOAT16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)",
"FP16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)",
"FP8 Matrix Core TFLOPS": "2614.9 (5229.8 with sparsity)",
"INT8 Matrix Core TOPS": "2614.9 (5229.8 with sparsity)",
"Number of CU": "304",
"SIMDs per CU": "4",
"Wavefront Size": "64",
"Workgroup Max Size": "1024",
"Max Waves Per CU": "32",
"Max Threads per CU": "2048",
"Maximum number of registers per thread": "256",
"Shared memory capacity per CU": "64 KB",
},
"MI350X": {
"GPU Architecture": "gfx950",
"GPU Memory": "288GB",
"Memory Bandwidth": "8TB/s",
"FP64 TFLOPS": "72.1",
"FP64 Matrix Core TFLOPS": "72.1",
"FP32 TFLOPS": "144.2",
"BFLOAT16 Matrix Core TFLOPS": "2300 (4600 with sparsity)",
"FP16 Matrix Core TFLOPS": "2300 (4600 with sparsity)",
"FP8 Matrix Core TFLOPS": "4600",
"MXFP6, MXFP4 Matrix Core TFLOPS": "9200",
"INT8 Matrix Core TOPS": "4600 (9200 with sparsity)",
"Number of CU": "256",
"SIMDs per CU": "4",
"Wavefront Size": "64",
"Workgroup Max Size": "1024",
"Max Waves Per CU": "32",
"Max Threads per CU": "2048",
"Maximum number of registers per thread": "256",
"Shared memory capacity per CU": "160 KB",
},
"MI355X": {
"GPU Architecture": "gfx950",
"GPU Memory": "288GB",
"Memory Bandwidth": "8TB/s",
"FP64 TFLOPS": "78.6",
"FP64 Matrix Core TFLOPS": "78.6",
"FP32 TFLOPS": "157.3",
"BFLOAT16 Matrix Core TFLOPS": "2500 (5000 with sparsity)",
"FP16 Matrix Core TFLOPS": "2500 (5000 with sparsity)",
"FP8 Matrix Core TFLOPS": "5000",
"MXFP6, MXFP4 Matrix Core TFLOPS": "10000",
"INT8 Matrix Core TOPS": "5000 (10000 with sparsity)",
"Number of CU": "256",
"SIMDs per CU": "4",
"Wavefront Size": "64",
"Workgroup Max Size": "1024",
"Max Waves Per CU": "32",
"Max Threads per CU": "2048",
"Maximum number of registers per thread": "256",
"Shared memory capacity per CU": "160 KB",
}
}

Expand Down
46 changes: 46 additions & 0 deletions src/kernelbench/prompts/model_new_ex_add_hip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.cpp_extension import load_inline

# Must include this line so PyTorch could use HIP compiler for AMD GPUs
os.environ["CXX"] = "hipcc"

elementwise_add_cpp_source = """
#include <hip/hip_runtime.h>

__global__ void elementwise_add_kernel(const float* a, const float* b, float* out, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
out[idx] = a[idx] + b[idx];
}
}

torch::Tensor elementwise_add_hip(torch::Tensor a, torch::Tensor b) {
auto size = a.numel();
auto out = torch::zeros_like(a);

const int block_size = 256;
const int num_blocks = (size + block_size - 1) / block_size;

elementwise_add_kernel<<<num_blocks, block_size>>>(a.data_ptr<float>(), b.data_ptr<float>(), out.data_ptr<float>(), size);

return out;
}
"""

elementwise_add = load_inline(
name="elementwise_add",
cpp_sources=elementwise_add_cpp_source,
functions=["elementwise_add_hip"],
verbose=True,
)

class ModelNew(nn.Module):
    """Drop-in model whose forward pass uses the custom HIP elementwise add."""

    def __init__(self) -> None:
        super().__init__()
        # Handle to the JIT-compiled HIP extension module defined above.
        self.elementwise_add = elementwise_add

    def forward(self, a, b):
        """Compute a + b via the hand-written HIP kernel."""
        result = self.elementwise_add.elementwise_add_hip(a, b)
        return result
5 changes: 5 additions & 0 deletions src/kernelbench/prompts/prompts.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ backend_display = "ThunderKittens kernels"
one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_thunderkittens.py"
# No few_shot_examples - will use one-shot when few_shot option is selected

[backends.hip]
backend_display = "AMD HIP kernels"
one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_hip.py"
# No few_shot_examples - will use one-shot when few_shot option is selected

# -------------------------------------------------------------------------
# Precision: Precision-specific configuration
# -------------------------------------------------------------------------
Expand Down
15 changes: 10 additions & 5 deletions src/kernelbench/timing.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,20 +510,25 @@ def time_execution_with_nsight_python(
"""
Time a CUDA kernel function using nsight-python.

Note: nsight returns an average time across num_trials runs.
NOTE: NVIDIA-only. NSight Compute (ncu) does not work on AMD GPUs.

Returns a list with a single value (average time) for API consistency.
GPU time from nsight is in nanoseconds, converted to milliseconds.

Returns:
List containing one float: average elapsed time in milliseconds
"""

from kernelbench.profile import profile_with_nsight
from kernelbench.utils import get_gpu_vendor

if device is None:
if verbose:
print(f"Using current device: {torch.cuda.current_device()}")
device = torch.cuda.current_device()

# NSight is NVIDIA-only
if get_gpu_vendor(device) != "nvidia":
raise RuntimeError(
"NSight profiling requires NVIDIA GPU. "
"Use timing_method='cuda_event' or 'do_bench' for AMD."
)

with torch.cuda.device(device):
# Warm ups
Expand Down
45 changes: 40 additions & 5 deletions src/kernelbench/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,51 @@
# Inference Helpers
########################################################

# Architecture names accepted by set_gpu_arch, split by vendor.
# NVIDIA names map onto TORCH_CUDA_ARCH_LIST; AMD gfx targets onto PYTORCH_ROCM_ARCH.
NVIDIA_ARCHS = ["Maxwell", "Pascal", "Volta", "Turing", "Ampere", "Hopper", "Ada", "Blackwell"]
AMD_ARCHS = ["gfx942", "gfx950"]  # gfx942: CDNA3 (MI300), gfx950: CDNA4 (MI350)


########################################################
# GPU Vendor Detection
########################################################

def get_gpu_vendor(device: torch.device | int | None = None) -> str:
"""Returns 'nvidia', 'amd', or 'unknown' for the given device."""
if not torch.cuda.is_available():
return "unknown"
if device is None:
device = torch.cuda.current_device()
name = torch.cuda.get_device_name(device).upper()
if "NVIDIA" in name:
return "nvidia"
if "AMD" in name or "MI3" in name:
return "amd"
return "unknown"


def set_gpu_arch(arch_list: list[str]):
    """
    Set the env variable torch uses to decide which GPU architectures to
    build kernels for.

    NVIDIA architectures go to TORCH_CUDA_ARCH_LIST; AMD gfx targets go to
    PYTORCH_ROCM_ARCH. Mixing vendors in one list is rejected, since a build
    targets a single vendor toolchain.

    Args:
        arch_list: Architecture names, e.g. ["Hopper"] or ["gfx942"].

    Raises:
        ValueError: If an architecture is unrecognized, or if NVIDIA and AMD
            architectures are mixed in the same list.
    """
    nvidia_archs = []
    amd_archs = []

    # Bucket each requested arch by vendor; anything unknown is an error.
    for arch in arch_list:
        if arch in NVIDIA_ARCHS:
            nvidia_archs.append(arch)
        elif arch in AMD_ARCHS:
            amd_archs.append(arch)
        else:
            raise ValueError(f"Invalid architecture: {arch}. Must be one of NVIDIA: {NVIDIA_ARCHS} or AMD: {AMD_ARCHS}")

    if nvidia_archs and amd_archs:
        raise ValueError(f"Cannot mix NVIDIA and AMD architectures. Got NVIDIA: {nvidia_archs}, AMD: {amd_archs}")

    # Only set the env var for the vendor actually requested, so a stale
    # value for the other vendor is never written.
    if nvidia_archs:
        os.environ["TORCH_CUDA_ARCH_LIST"] = ";".join(nvidia_archs)
    elif amd_archs:
        os.environ["PYTORCH_ROCM_ARCH"] = ";".join(amd_archs)

def query_server(
prompt: str | list[dict], # string if normal prompt, list of dicts if chat prompt,
Expand Down