Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions flashinfer/cute_dsl/blockscaled_gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,8 +529,9 @@ def __init__(
:param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
:type cluster_shape_mn: Tuple[int, int]
"""
assert sm_version == "sm_100", (
"sm_100 is the only supported SM version for cute-dsl backend."
supported_sm_versions = ["sm_100", "sm_103"]
assert sm_version in supported_sm_versions, (
f"{supported_sm_versions} are the only supported SM versions for cute-dsl backend, but encountered {sm_version}"
)

self.acc_dtype = cutlass.Float32
Expand Down Expand Up @@ -561,7 +562,12 @@ def __init__(
self.cta_sync_bar_id = 0
self.epilog_sync_bar_id = 1
self.tmem_ptr_sync_bar_id = 2
self.smem_capacity = utils.get_smem_capacity_in_bytes(sm_version)

# HACK "sm_103" doesn't work yet for the query
# https://github.com/NVIDIA/cutlass/blob/5016493cc0d8650d5b2f6d2c2751cf49bc217e86/python/CuTeDSL/cutlass/utils/smem_allocator.py#L19
# self.smem_capacity = utils.get_smem_capacity_in_bytes(sm_version)
self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
Comment on lines +566 to +569
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

While this hack works for the currently supported SM versions, it's a bit fragile. Hardcoding "sm_100" will be incorrect if another SM version is added in the future that is supported by get_smem_capacity_in_bytes. A more robust approach would be to only apply the fallback for sm_103. This also makes the intent clearer and the hack easier to remove. I'd also recommend adding a TODO to track this technical debt.

Suggested change
# HACK "sm_103" doesn't work yet for the query
# https://github.com/NVIDIA/cutlass/blob/5016493cc0d8650d5b2f6d2c2751cf49bc217e86/python/CuTeDSL/cutlass/utils/smem_allocator.py#L19
# self.smem_capacity = utils.get_smem_capacity_in_bytes(sm_version)
self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
# TODO: Remove this workaround once nvidia-cutlass-dsl supports sm_103 for smem capacity queries.
# HACK: "sm_103" is not yet supported by get_smem_capacity_in_bytes. Using "sm_100" as a fallback.
# See: https://github.com/NVIDIA/cutlass/blob/5016493cc0d8650d5b2f6d2c2751cf49bc217e86/python/CuTeDSL/cutlass/utils/smem_allocator.py#L19
smem_query_version = "sm_100" if sm_version == "sm_103" else sm_version
self.smem_capacity = utils.get_smem_capacity_in_bytes(smem_query_version)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This get_smem_capacity_in_bytes issue has been reported to the cutlass team internally; if there is a quick turnaround for it, I'll just patch it later. Also, since this is just one kernel: if it works, it works — this is not wrong.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no problem with this at the moment, considering sm_100 and sm_103 should have the same shared memory size.


SM100_TMEM_CAPACITY_COLUMNS = 512
self.num_tmem_alloc_cols = SM100_TMEM_CAPACITY_COLUMNS

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ einops
ninja
numpy
nvidia-cudnn-frontend>=1.13.0
nvidia-cutlass-dsl>=4.2.1
nvidia-cutlass-dsl>=4.3.1
nvidia-ml-py
packaging>=24.2
requests
Expand Down
10 changes: 6 additions & 4 deletions tests/gemm/test_cute_dsl_blockscaled_gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,12 @@ def test_blockscaled_gemm_python_interface(
):
torch.manual_seed(42)
device = torch.device("cuda:0")
major, minor = torch.cuda.get_device_capability(device)

if not (major == 10 and minor == 0):
pytest.skip("Cute-dsl backend is only supported on SM100.")
device_ver = torch.cuda.get_device_capability(device)
supported_device_vers = [(10, 0), (10, 3)]
if device_ver not in supported_device_vers:
pytest.skip(
f"Cute-dsl backend is only supported on {supported_device_vers}, skipping {device_ver}."
)

l, m = lm
k, n = kn
Expand Down