From 2b9430840f3f8a80c0a38dd2f092a2a9d487b8a0 Mon Sep 17 00:00:00 2001
From: amd-asalykov <asalykov@amd.com>
Date: Thu, 22 Jan 2026 16:20:41 +0000
Subject: [PATCH 1/8] add hip backend for eval_single_sample

---
 pyproject.toml                                | 11 ++-
 scripts/generate_and_eval_single_sample.py    |  4 +-
 src/kernelbench/prompts/hardware/gpu_specs.py | 84 +++++++++++++++++++
 .../prompts/model_new_ex_add_hip.py           | 45 ++++++++++
 src/kernelbench/prompts/prompts.toml          |  5 ++
 src/kernelbench/utils.py                      |  2 +-
 6 files changed, 148 insertions(+), 3 deletions(-)
 create mode 100644 src/kernelbench/prompts/model_new_ex_add_hip.py

diff --git a/pyproject.toml b/pyproject.toml
index bed37150..36125475 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = "==3.10.*"
 dependencies = [
     # Frameworks
     "torch==2.9.0",
-
+    "pytorch-triton-rocm>=3.4.0",
     "transformers",
     "datasets",
     "modal",
@@ -49,6 +49,15 @@ dev = [
     "ruff",
 ]
 
+[tool.uv.sources]
+torch = [{ index = "pytorch-rocm" }]
+torchvision = [{ index = "pytorch-rocm" }]
+pytorch-triton-rocm = [{ index = "pytorch-rocm" }]
+
+[[tool.uv.index]]
+name = "pytorch-rocm"
+url = "https://download.pytorch.org/whl/rocm6.4"
+explicit = true
 
 [tool.setuptools.packages.find]
 where = ["src"]
diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py
index fce1b16f..95aea5d7 100644
--- a/scripts/generate_and_eval_single_sample.py
+++ b/scripts/generate_and_eval_single_sample.py
@@ -124,6 +124,8 @@ def main(config: EvalConfig):
     )
 
     if config.gpu_arch:
+        if (type(config.gpu_arch) is not list):
+            config.gpu_arch = [config.gpu_arch]
         set_gpu_arch(config.gpu_arch)  # otherwise build for all architectures
 
     if config.log:
@@ -174,7 +176,7 @@ def main(config: EvalConfig):
         include_hardware = include_hardware.lower() in ["true", "1", "yes"]
     config.include_hardware_info = include_hardware
 
-    supported_backends = {"cuda", "triton", "tilelang", "cute", "thunderkittens"}
+    supported_backends = {"cuda", "hip", "triton", "tilelang", "cute", "thunderkittens"}
     backend = config.backend.lower()
     if backend not in supported_backends:
         raise ValueError(
diff --git a/src/kernelbench/prompts/hardware/gpu_specs.py b/src/kernelbench/prompts/hardware/gpu_specs.py
index 800f20ef..ca63488f 100644
--- a/src/kernelbench/prompts/hardware/gpu_specs.py
+++ b/src/kernelbench/prompts/hardware/gpu_specs.py
@@ -118,6 +118,90 @@
         "Maximum number of thread blocks per SM": "32",
         "Shared memory capacity per SM": "164 KB",
         "Maximum shared memory per thread block": "163 KB",
+    },
+    "MI300X": {
+        "GPU Architecture": "gfx942",
+        "GPU Memory": "192GB",
+        "Memory Bandwidth": "5.3 TB/s",
+        "FP64 TFLOPS": "81.7",
+        "FP64 Matrix Core TFLOPS": "163.4",
+        "FP32 TFLOPS": "163.4",
+        "TF32 Matrix Core TFLOPS": "653.7 (1307.4 with sparsity)",
+        "BFLOAT16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)",
+        "FP16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)",
+        "FP8 Matrix Core TFLOPS": "2614.9 (5229.8 with sparsity)",
+        "INT8 Matrix Core TOPS": "2614.9 (5229.8 with sparsity)",
+        "Number of CU": "304",
+        "SIMDs per CU": "4",
+        "Wavefront Size": "64",
+        "Workgroup Max Size": "1024",
+        "Max Waves Per CU": "32",
+        "Max Threads per CU": "2048",
+        "Maximum number of registers per thread": "256",
+        "Shared memory capacity per CU": "64 KB",
+    },
+    "MI325X": {
+        "GPU Architecture": "gfx942",
+        "GPU Memory": "256GB",
+        "Memory Bandwidth": "6TB/s",
+        "FP64 TFLOPS": "81.7",
+        "FP64 Matrix Core TFLOPS": "163.4",
+        "FP32 TFLOPS": "163.4",
+        "TF32 Matrix Core TFLOPS": "653.7 (1307.4 with sparsity)",
+        "BFLOAT16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)",
+        "FP16 Matrix Core TFLOPS": "1307.4 (2614.9 with sparsity)",
+        "FP8 Matrix Core TFLOPS": "2614.9 (5229.8 with sparsity)",
+        "INT8 Matrix Core TOPS": "2614.9 (5229.8 with sparsity)",
+        "Number of CU": "304",
+        "SIMDs per CU": "4",
+        "Wavefront Size": "64",
+        "Workgroup Max Size": "1024",
+        "Max Waves Per CU": "32",
+        "Max Threads per CU": "2048",
+        "Maximum number of registers per thread": "256",
+        "Shared memory capacity per CU": "64 KB",
+    },
+    "MI350X": {
+        "GPU Architecture": "gfx950",
+        "GPU Memory": "288GB",
+        "Memory Bandwidth": "8TB/s",
+        "FP64 TFLOPS": "72.1",
+        "FP64 Matrix Core TFLOPS": "72.1",
+        "FP32 TFLOPS": "144.2",
+        "BFLOAT16 Matrix Core TFLOPS": "2300 (4600 with sparsity)",
+        "FP16 Matrix Core TFLOPS": "2300 (4600 with sparsity)",
+        "FP8 Matrix Core TFLOPS": "4600",
+        "MXFP6, MXFP4 Matrix Core TFLOPS": "9200",
+        "INT8 Matrix Core TOPS": "4600 (9200 with sparsity)",
+        "Number of CU": "256",
+        "SIMDs per CU": "4",
+        "Wavefront Size": "64",
+        "Workgroup Max Size": "1024",
+        "Max Waves Per CU": "32",
+        "Max Threads per CU": "2048",
+        "Maximum number of registers per thread": "256",
+        "Shared memory capacity per CU": "160 KB",
+    },
+    "MI355X": {
+        "GPU Architecture": "gfx950",
+        "GPU Memory": "288GB",
+        "Memory Bandwidth": "8TB/s",
+        "FP64 TFLOPS": "78.6",
+        "FP64 Matrix Core TFLOPS": "78.6",
+        "FP32 TFLOPS": "157.3",
+        "BFLOAT16 Matrix Core TFLOPS": "2500 (5000 with sparsity)",
+        "FP16 Matrix Core TFLOPS": "2500 (5000 with sparsity)",
+        "FP8 Matrix Core TFLOPS": "5000",
+        "MXFP6, MXFP4 Matrix Core TFLOPS": "10000",
+        "INT8 Matrix Core TOPS": "5000 (10000 with sparsity)",
+        "Number of CU": "256",
+        "SIMDs per CU": "4",
+        "Wavefront Size": "64",
+        "Workgroup Max Size": "1024",
+        "Max Waves Per CU": "32",
+        "Max Threads per CU": "2048",
+        "Maximum number of registers per thread": "256",
+        "Shared memory capacity per CU": "160 KB",
     }
 }
 
diff --git a/src/kernelbench/prompts/model_new_ex_add_hip.py b/src/kernelbench/prompts/model_new_ex_add_hip.py
new file mode 100644
index 00000000..2498bc18
--- /dev/null
+++ b/src/kernelbench/prompts/model_new_ex_add_hip.py
@@ -0,0 +1,45 @@
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.cpp_extension import load_inline
+
+os.environ["CXX"] = "hipcc"
+
+elementwise_add_cpp_source = """
+#include <hip/hip_runtime.h>
+
+__global__ void elementwise_add_kernel(const float* a, const float* b, float* out, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) {
+        out[idx] = a[idx] + b[idx];
+    }
+}
+
+torch::Tensor elementwise_add_hip(torch::Tensor a, torch::Tensor b) {
+    auto size = a.numel();
+    auto out = torch::zeros_like(a);
+
+    const int block_size = 256;
+    const int num_blocks = (size + block_size - 1) / block_size;
+
+    elementwise_add_kernel<<<num_blocks, block_size>>>(a.data_ptr<float>(), b.data_ptr<float>(), out.data_ptr<float>(), size);
+
+    return out;
+}
+"""
+
+elementwise_add = load_inline(
+    name="elementwise_add",
+    cpp_sources=elementwise_add_cpp_source,
+    functions=["elementwise_add_hip"],
+    verbose=True,
+)
+
+class ModelNew(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.elementwise_add = elementwise_add
+
+    def forward(self, a, b):
+        return self.elementwise_add.elementwise_add_hip(a, b)
\ No newline at end of file
diff --git a/src/kernelbench/prompts/prompts.toml b/src/kernelbench/prompts/prompts.toml
index 2768aa11..61b6b15f 100644
--- a/src/kernelbench/prompts/prompts.toml
+++ b/src/kernelbench/prompts/prompts.toml
@@ -54,6 +54,11 @@ backend_display = "ThunderKittens kernels"
 one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_thunderkittens.py"
 # No few_shot_examples - will use one-shot when few_shot option is selected
 
+[backends.hip]
+backend_display = "HIP kernels"
+one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_hip.py"
+# No few_shot_examples - will use one-shot when few_shot option is selected
+
 # -------------------------------------------------------------------------
 # Precision: Precision-specific configuration
 # -------------------------------------------------------------------------
diff --git a/src/kernelbench/utils.py b/src/kernelbench/utils.py
index cf8b0ad8..bbd6a468 100644
--- a/src/kernelbench/utils.py
+++ b/src/kernelbench/utils.py
@@ -42,7 +42,7 @@ def set_gpu_arch(arch_list: list[str]):
     """
     Set env variable for torch cuda arch list to build kernels for specified architectures
     """
-    valid_archs = ["Maxwell", "Pascal", "Volta", "Turing", "Ampere", "Hopper", "Ada"]
+    valid_archs = ["Maxwell", "Pascal", "Volta", "Turing", "Ampere", "Hopper", "Ada", "gfx942", "gfx950"]
     for arch in arch_list:
         if arch not in valid_archs:
             raise ValueError(f"Invalid architecture: {arch}. Must be one of {valid_archs}")

From 69ab132722a76e2eabd15e6867e78b2fb29d18fd Mon Sep 17 00:00:00 2001
From: amd-asalykov <asalykov@amd.com>
Date: Wed, 28 Jan 2026 20:04:21 +0000
Subject: [PATCH 2/8] update pyproject.toml for CDNA4

---
 README.md      | 5 ++---
 pyproject.toml | 9 +++------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 7343e73b..021fd033 100644
--- a/README.md
+++ b/README.md
@@ -115,10 +115,9 @@ uv run python scripts/generate_and_eval_single_sample.py dataset_src=huggingface
 ```
 
 **What you might need to modify**
-* **`gpu_arch`** - Depend on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware.
+* **`gpu_arch`** - Depend on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware. `gpu_arch` currently supported for `hip` backend: `gfx942`, `gfx950`.
 * **`precision`** - You can specify the precision of tensor by `precision=fp32`. Currently all of our reported results are `fp32` but we added support for `fp16` & `bf16`.
-*  **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. Simply specify `backend=triton`. For now we support DSLs: `cuda`, `triton`, `cute`, `tilelang`, `thunderkittens`.
-
+*  **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. For example, simply specify `backend=triton` or `backend=hip`. For now we support DSLs: `cuda`, `hip`, `triton`, `cute`, `tilelang`, `thunderkittens`. 
 
 Note on setting up ThunderKittens (TK) locally: to use `backend=thunderkittens`, you need to git clone the ThunderKittens repo and set the following environment variable to point to your local ThunderKittens directory, `export THUNDERKITTENS_ROOT=<PATH to ThunderKittens folder>`, and all ThunderKitten programs as shown in the [example](src/kernelbench/prompts/model_new_ex_add_thunderkittens.py), should contain `tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")`, which enable the kernel to include the right TK primitives. In addition, we only support BF16 for TK right now.
 
diff --git a/pyproject.toml b/pyproject.toml
index 36125475..a29e98f4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,10 +10,9 @@ version = "0.2.0.dev0"
 requires-python = "==3.10.*"
 dependencies = [
     # Frameworks
-    "torch==2.9.0",
-    "pytorch-triton-rocm>=3.4.0",
+    "torch>=2.9.0",
     "transformers",
-    "datasets",
+    "datasets>=2.19.0",
     "modal",
 
     # helper
@@ -52,12 +51,10 @@ dev = [
 [tool.uv.sources]
 torch = [{ index = "pytorch-rocm" }]
 torchvision = [{ index = "pytorch-rocm" }]
-pytorch-triton-rocm = [{ index = "pytorch-rocm" }]
 
 [[tool.uv.index]]
 name = "pytorch-rocm"
-url = "https://download.pytorch.org/whl/rocm6.4"
-explicit = true
+url = "https://download.pytorch.org/whl/rocm7.1"
 
 [tool.setuptools.packages.find]
 where = ["src"]

From fb5abb6250324788947288bbe83f806c1e60360e Mon Sep 17 00:00:00 2001
From: amd-asalykov <asalykov@amd.com>
Date: Wed, 28 Jan 2026 21:09:41 +0000
Subject: [PATCH 3/8] update

---
 README.md      |  4 ++++
 pyproject.toml | 15 ++++++---------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 021fd033..b090cbf3 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,10 @@ uv sync
 # Install with GPU dependencies (for local GPU evaluation)
 uv sync --extra gpu
 
+# Install with ROCm backend
+uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1
+uv sync
+
 # Run commands with uv (which invoke the right env)
 uv run python scripts/<script_name>.py ...
 ```
diff --git a/pyproject.toml b/pyproject.toml
index a29e98f4..cd5b2689 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,6 @@ dependencies = [
     "transformers",
     "datasets>=2.19.0",
     "modal",
-
     # helper
     "tqdm",
     "packaging",
@@ -23,12 +22,10 @@ dependencies = [
     "ninja",
     "tomli",
     "tabulate",
-
     # Numerics
     "einops",
     "python-dotenv",
     "numpy",
-
     # LLM providers
     "openai",
     "litellm[proxy]",
@@ -48,17 +45,17 @@ dev = [
     "ruff",
 ]
 
-[tool.uv.sources]
-torch = [{ index = "pytorch-rocm" }]
-torchvision = [{ index = "pytorch-rocm" }]
-
 [[tool.uv.index]]
-name = "pytorch-rocm"
+name = "pytorch"
 url = "https://download.pytorch.org/whl/rocm7.1"
 
+
 [tool.setuptools.packages.find]
 where = ["src"]
 include = ["kernelbench*"]
 
 [tool.setuptools.package-data]
-kernelbench = ["prompts/**/*"]
\ No newline at end of file
+kernelbench = ["prompts/**/*"]
+
+[tool.uv.sources]
+torch = { index = "pytorch" }

From 062801086084c0e3708511c00fc996355a1aaf47 Mon Sep 17 00:00:00 2001
From: amd-asalykov <asalykov@amd.com>
Date: Wed, 28 Jan 2026 21:15:30 +0000
Subject: [PATCH 4/8] update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b090cbf3..c2a40ef2 100644
--- a/README.md
+++ b/README.md
@@ -88,7 +88,7 @@ uv sync
 # Install with GPU dependencies (for local GPU evaluation)
 uv sync --extra gpu
 
-# Install with ROCm backend
+# Install with AMD ROCm backend
 uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1
 uv sync
 

From 1a3bb4137ad4d47da39f45d39a819234eafddf29 Mon Sep 17 00:00:00 2001
From: amd-asalykov <asalykov@amd.com>
Date: Wed, 28 Jan 2026 21:17:02 +0000
Subject: [PATCH 5/8] update

---
 pyproject.toml | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index cd5b2689..f3f98ae2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,9 +11,11 @@ requires-python = "==3.10.*"
 dependencies = [
     # Frameworks
     "torch>=2.9.0",
+
     "transformers",
     "datasets>=2.19.0",
     "modal",
+
     # helper
     "tqdm",
     "packaging",
@@ -22,10 +24,12 @@ dependencies = [
     "ninja",
     "tomli",
     "tabulate",
+
     # Numerics
     "einops",
     "python-dotenv",
     "numpy",
+
     # LLM providers
     "openai",
     "litellm[proxy]",
@@ -45,17 +49,10 @@ dev = [
     "ruff",
 ]
 
-[[tool.uv.index]]
-name = "pytorch"
-url = "https://download.pytorch.org/whl/rocm7.1"
-
 
 [tool.setuptools.packages.find]
 where = ["src"]
 include = ["kernelbench*"]
 
 [tool.setuptools.package-data]
-kernelbench = ["prompts/**/*"]
-
-[tool.uv.sources]
-torch = { index = "pytorch" }
+kernelbench = ["prompts/**/*"]
\ No newline at end of file

From 416f70d71074a4d59a39c39b602bc4ae92f2a4d0 Mon Sep 17 00:00:00 2001
From: amd-asalykov <asalykov@amd.com>
Date: Wed, 28 Jan 2026 21:25:15 +0000
Subject: [PATCH 6/8] update

---
 README.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index c2a40ef2..0ff33cec 100644
--- a/README.md
+++ b/README.md
@@ -85,12 +85,11 @@ We have transitioned to using `pyproject.toml` and `uv` for dependency managemen
 # Install base dependencies (works without a local GPU)
 uv sync
 
-# Install with GPU dependencies (for local GPU evaluation)
-uv sync --extra gpu
-
 # Install with AMD ROCm backend
 uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1
-uv sync
+
+# Install with GPU dependencies (for local GPU evaluation)
+uv sync --extra gpu
 
 # Run commands with uv (which invoke the right env)
 uv run python scripts/<script_name>.py ...

From ac473febc72c828e3510d277728c796c26731f6a Mon Sep 17 00:00:00 2001
From: amd-asalykov <asalykov@amd.com>
Date: Mon, 2 Feb 2026 13:06:51 -0600
Subject: [PATCH 7/8] add ROCm version requirement

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0ff33cec..a53d5672 100644
--- a/README.md
+++ b/README.md
@@ -85,7 +85,7 @@ We have transitioned to using `pyproject.toml` and `uv` for dependency managemen
 # Install base dependencies (works without a local GPU)
 uv sync
 
-# Install with AMD ROCm backend
+# Install with AMD ROCm backend (ROCm>=7.1 is required)
 uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1
 
 # Install with GPU dependencies (for local GPU evaluation)

From 0312e01dec5189d8d7d0bf668762a7d2b9b39f35 Mon Sep 17 00:00:00 2001
From: Simon Guo <simonguo@stanford.edu>
Date: Sat, 21 Feb 2026 16:59:51 -0800
Subject: [PATCH 8/8] check and add more guardrails

---
 README.md                                     | 10 +++--
 scripts/generate_and_eval_single_sample.py    |  2 +-
 src/kernelbench/eval.py                       | 19 +++++++-
 src/kernelbench/profile.py                    |  9 +++-
 src/kernelbench/prompts/hardware/gpu_specs.py |  1 +
 .../prompts/model_new_ex_add_hip.py           |  1 +
 src/kernelbench/prompts/prompts.toml          |  2 +-
 src/kernelbench/timing.py                     | 15 ++++---
 src/kernelbench/utils.py                      | 45 ++++++++++++++++---
 9 files changed, 86 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index a53d5672..ce9348c6 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ We construct KernelBench to have 4 Levels of categories:
 - **Level 4 🤗**:  Level Hugging Face 
     Optimize whole model architectures from HuggingFace
 
-We are actively extending KernelBench to other DSLs beyond `cuda` as well (see below).
+We are actively extending KernelBench to other DSLs beyond `cuda` as well (see below), as well as AMD GPU support.
 
 ## ⚖️ Evaluation
 #### Methodology
@@ -95,6 +95,8 @@ uv sync --extra gpu
 uv run python scripts/<script_name>.py ...
 ```
 
+For AMD GPU aka ROCm backend (ROCm>=7.1), please add `uv remove torch && uv add torch --index pytorch=https://download.pytorch.org/whl/rocm7.1` for RoCm compatible PyTorch to configure your dependencies. Running in a docker image is recommended for this due to complexity of ROCm setup.
+
 You can still use `conda (python=3.10)` to create your environment and install dependencies with `requirements.txt`.
 
 We use `litellm` for API calls. Please set your keys by creating a `.env` following our `.env.example`.
@@ -118,9 +120,11 @@ uv run python scripts/generate_and_eval_single_sample.py dataset_src=huggingface
 ```
 
 **What you might need to modify**
-* **`gpu_arch`** - Depend on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware. `gpu_arch` currently supported for `hip` backend: `gfx942`, `gfx950`.
+* **`gpu_arch`** - Depend on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware.
 * **`precision`** - You can specify the precision of tensor by `precision=fp32`. Currently all of our reported results are `fp32` but we added support for `fp16` & `bf16`.
-*  **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. For example, simply specify `backend=triton` or `backend=hip`. For now we support DSLs: `cuda`, `hip`, `triton`, `cute`, `tilelang`, `thunderkittens`. 
+*  **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. For example, simply specify `backend=triton` or `backend=hip`. For now we support NVIDIA GPUs with programming frameworks and DSLs: `cuda`, `triton`, `cute`, `tilelang`, `thunderkittens`. 
+
+Note for AMD GPUs: Use `hip` backend, `gpu_arch` currently supported: `gfx942`, `gfx950`.
 
 Note on setting up ThunderKittens (TK) locally: to use `backend=thunderkittens`, you need to git clone the ThunderKittens repo and set the following environment variable to point to your local ThunderKittens directory, `export THUNDERKITTENS_ROOT=<PATH to ThunderKittens folder>`, and all ThunderKitten programs as shown in the [example](src/kernelbench/prompts/model_new_ex_add_thunderkittens.py), should contain `tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")`, which enable the kernel to include the right TK primitives. In addition, we only support BF16 for TK right now.
 
diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py
index 95aea5d7..fbde28b9 100644
--- a/scripts/generate_and_eval_single_sample.py
+++ b/scripts/generate_and_eval_single_sample.py
@@ -124,7 +124,7 @@ def main(config: EvalConfig):
     )
 
     if config.gpu_arch:
-        if (type(config.gpu_arch) is not list):
+        if (type(config.gpu_arch) is not list): # normalization to list
             config.gpu_arch = [config.gpu_arch]
         set_gpu_arch(config.gpu_arch)  # otherwise build for all architectures
 
diff --git a/src/kernelbench/eval.py b/src/kernelbench/eval.py
index dd79b2c0..3557dc94 100644
--- a/src/kernelbench/eval.py
+++ b/src/kernelbench/eval.py
@@ -429,7 +429,18 @@ def eval_kernel_against_ref(
     # TODO: check device is busy
     assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval"
     
-    if backend.lower() == "tilelang":
+    # Backend-GPU vendor validation
+    from .utils import get_gpu_vendor
+    vendor = get_gpu_vendor(device)
+    backend_lower = backend.lower()
+    # HIP is AMD-only
+    if backend_lower == "hip" and vendor != "amd":
+        raise ValueError(f"HIP backend requires AMD GPU, got {vendor}")
+    # cuda/cute/thunderkittens are NVIDIA-only (triton/tilelang work on both)
+    if backend_lower in ["cuda", "cute", "thunderkittens"] and vendor == "amd":
+        raise ValueError(f"{backend} backend requires NVIDIA GPU, got AMD")
+    
+    if backend_lower == "tilelang":
         assert precision == torch.float16 or precision == torch.bfloat16, "TileLang only supports fp16 or bfloat16"
     
     torch.set_printoptions(
@@ -463,7 +474,11 @@ def eval_kernel_against_ref(
             raise ValueError(
                 f"device must be an int or torch.device, got {type(device)}"
             )
-        os.environ["CUDA_VISIBLE_DEVICES"] = str(device_num)
+        # NVIDIA uses CUDA_VISIBLE_DEVICES, AMD uses HIP_VISIBLE_DEVICES
+        if vendor == "amd":
+            os.environ["HIP_VISIBLE_DEVICES"] = str(device_num)
+        else:
+            os.environ["CUDA_VISIBLE_DEVICES"] = str(device_num)
     context = {}
 
     if verbose:
diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py
index 8326324e..a4960438 100644
--- a/src/kernelbench/profile.py
+++ b/src/kernelbench/profile.py
@@ -90,13 +90,20 @@ def profile_with_nsight(func, metrics=None, num_trials=1):
         >>> print(results['gpu__time_duration.sum'])  # Time in nanoseconds
     
     Raises:
-        RuntimeError: If nsight-python is not installed.
+        RuntimeError: If nsight-python is not installed or not on NVIDIA GPU.
     """
     if not NSIGHT_AVAILABLE:
         raise RuntimeError(
             "nsight-python not available."
         )
     
+    # NSight is NVIDIA-only
+    from kernelbench.utils import get_gpu_vendor
+    if get_gpu_vendor() != "nvidia":
+        raise RuntimeError(
+            "NSight profiling requires NVIDIA GPU. Not available on AMD."
+        )
+    
     # Normalize metrics to a list
     if metrics is None:
         metrics = ['sm__cycles_active.avg']
diff --git a/src/kernelbench/prompts/hardware/gpu_specs.py b/src/kernelbench/prompts/hardware/gpu_specs.py
index ca63488f..7fde700e 100644
--- a/src/kernelbench/prompts/hardware/gpu_specs.py
+++ b/src/kernelbench/prompts/hardware/gpu_specs.py
@@ -119,6 +119,7 @@
         "Shared memory capacity per SM": "164 KB",
         "Maximum shared memory per thread block": "163 KB",
     },
+    # NOTE: In the future we will have a more unified format for various GPUs
     "MI300X": {
         "GPU Architecture": "gfx942",
         "GPU Memory": "192GB",
diff --git a/src/kernelbench/prompts/model_new_ex_add_hip.py b/src/kernelbench/prompts/model_new_ex_add_hip.py
index 2498bc18..806aa876 100644
--- a/src/kernelbench/prompts/model_new_ex_add_hip.py
+++ b/src/kernelbench/prompts/model_new_ex_add_hip.py
@@ -4,6 +4,7 @@
 import torch.nn.functional as F
 from torch.utils.cpp_extension import load_inline
 
+# Must include this line so PyTorch could use HIP compiler for AMD GPUs
 os.environ["CXX"] = "hipcc"
 
 elementwise_add_cpp_source = """
diff --git a/src/kernelbench/prompts/prompts.toml b/src/kernelbench/prompts/prompts.toml
index 61b6b15f..6a7dfcaa 100644
--- a/src/kernelbench/prompts/prompts.toml
+++ b/src/kernelbench/prompts/prompts.toml
@@ -55,7 +55,7 @@ one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_thunderkittens.py"
 # No few_shot_examples - will use one-shot when few_shot option is selected
 
 [backends.hip]
-backend_display = "HIP kernels"
+backend_display = "AMD HIP kernels"
 one_shot_new_arch = "src/kernelbench/prompts/model_new_ex_add_hip.py"
 # No few_shot_examples - will use one-shot when few_shot option is selected
 
diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py
index 52a2b85e..f22920b4 100644
--- a/src/kernelbench/timing.py
+++ b/src/kernelbench/timing.py
@@ -510,20 +510,25 @@ def time_execution_with_nsight_python(
     """
     Time a CUDA kernel function using nsight-python.
     
-    Note: nsight returns an average time across num_trials runs.
+    NOTE: NVIDIA-only. NSight Compute (ncu) does not work on AMD GPUs.
+    
     Returns a list with a single value (average time) for API consistency.
     GPU time from nsight is in nanoseconds, converted to milliseconds.
-    
-    Returns:
-        List containing one float: average elapsed time in milliseconds
     """
-    
     from kernelbench.profile import profile_with_nsight
+    from kernelbench.utils import get_gpu_vendor
     
     if device is None:
         if verbose:
             print(f"Using current device: {torch.cuda.current_device()}")
         device = torch.cuda.current_device()
+    
+    # NSight is NVIDIA-only
+    if get_gpu_vendor(device) != "nvidia":
+        raise RuntimeError(
+            "NSight profiling requires NVIDIA GPU. "
+            "Use timing_method='cuda_event' or 'do_bench' for AMD."
+        )
 
     with torch.cuda.device(device):
         # Warm ups
diff --git a/src/kernelbench/utils.py b/src/kernelbench/utils.py
index bbd6a468..e975cc63 100644
--- a/src/kernelbench/utils.py
+++ b/src/kernelbench/utils.py
@@ -38,16 +38,51 @@
 # Inference Helpers
 ########################################################
 
+NVIDIA_ARCHS = ["Maxwell", "Pascal", "Volta", "Turing", "Ampere", "Hopper", "Ada", "Blackwell"]
+AMD_ARCHS = ["gfx942", "gfx950"] # gfx942: CDNA3 (MI300), gfx950: CDNA4 (MI350) 
+
+
+########################################################
+# GPU Vendor Detection
+########################################################
+
+def get_gpu_vendor(device: torch.device | int | None = None) -> str:
+    """Returns 'nvidia', 'amd', or 'unknown' for the given device."""
+    if not torch.cuda.is_available():
+        return "unknown"
+    if device is None:
+        device = torch.cuda.current_device()
+    name = torch.cuda.get_device_name(device).upper()
+    if "NVIDIA" in name:
+        return "nvidia"
+    if "AMD" in name or "MI3" in name:
+        return "amd"
+    return "unknown"
+
+
 def set_gpu_arch(arch_list: list[str]):
     """
-    Set env variable for torch cuda arch list to build kernels for specified architectures
+    Set env variable for torch to build kernels for specified architectures.
+    Supports both NVIDIA (TORCH_CUDA_ARCH_LIST) and AMD (PYTORCH_ROCM_ARCH).
     """
-    valid_archs = ["Maxwell", "Pascal", "Volta", "Turing", "Ampere", "Hopper", "Ada", "gfx942", "gfx950"]
+    nvidia_archs = []
+    amd_archs = []
+    
     for arch in arch_list:
-        if arch not in valid_archs:
-            raise ValueError(f"Invalid architecture: {arch}. Must be one of {valid_archs}")
+        if arch in NVIDIA_ARCHS:
+            nvidia_archs.append(arch)
+        elif arch in AMD_ARCHS:
+            amd_archs.append(arch)
+        else:
+            raise ValueError(f"Invalid architecture: {arch}. Must be one of NVIDIA: {NVIDIA_ARCHS} or AMD: {AMD_ARCHS}")
+    
+    if nvidia_archs and amd_archs:
+        raise ValueError(f"Cannot mix NVIDIA and AMD architectures. Got NVIDIA: {nvidia_archs}, AMD: {amd_archs}")
     
-    os.environ["TORCH_CUDA_ARCH_LIST"] = ";".join(arch_list)
+    if nvidia_archs:
+        os.environ["TORCH_CUDA_ARCH_LIST"] = ";".join(nvidia_archs)
+    elif amd_archs:
+        os.environ["PYTORCH_ROCM_ARCH"] = ";".join(amd_archs)
 
 def query_server(
     prompt: str | list[dict],  # string if normal prompt, list of dicts if chat prompt,