Commit 77feb44
test/prototype/mx_formats/test_kernels.py
1 parent c2ecff1

1 file changed: +24 −22 lines

test/prototype/mx_formats/test_kernels.py
24 additions, 22 deletions
@@ -49,9 +49,11 @@
     is_sm_at_least_89,
     is_sm_at_least_100,
     torch_version_at_least,
+    get_current_accelerator_device,
 )
 
 torch.manual_seed(0)
+_DEVICE = get_current_accelerator_device()
 
 if not torch_version_at_least("2.8.0"):
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
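
Note: the tests now pick their device from a single helper instead of hard-coding "cuda". Below is a minimal sketch of what get_current_accelerator_device() could look like, assuming it wraps PyTorch's torch.accelerator API (the helper's actual implementation lives in torchao and may differ):

# Hypothetical sketch only; the real torchao helper may differ.
import torch

def get_current_accelerator_device() -> torch.device:
    # torch.accelerator.current_accelerator() (PyTorch 2.6+) reports the active
    # accelerator backend (cuda, xpu, mps, ...) or None on CPU-only machines.
    accel = torch.accelerator.current_accelerator()
    return accel if accel is not None else torch.device("cpu")

With this in place, device=_DEVICE resolves to the local accelerator when one exists and falls back to the CPU otherwise.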
@@ -398,9 +400,9 @@ def test_fp6_values(dtype_name):
     [
         "cpu",
         pytest.param(
-            "cuda",
+            _DEVICE,
             marks=pytest.mark.skipif(
-                not torch.cuda.is_available(), reason="CUDA not available"
+                not torch.accelerator.is_available(), reason="GPU not available"
             ),
         ),
     ],
@@ -423,11 +425,11 @@ def test_fp6_e3m2_rounding(f32_val, f6_e3m2_enc, device):
     assert f6_e3m2_unpacked.item() == (f6_e3m2_enc | 0b100000)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
 def test_fp6_e2m3_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 0.5, 7.5, -0.0], [-0.875, 1.0, -6.0, 0.125]]).to(
-        "cuda"
+        _DEVICE
     )
     orig_vals_f6_unpacked = f32_to_f6_e2m3_unpacked(orig_vals)
     orig_vals_f6_packed = pack_uint6(orig_vals_f6_unpacked)
@@ -438,11 +440,11 @@ def test_fp6_e2m3_pack_unpack():
     assert torch.all(orig_vals_f6_packed_unpacked == orig_vals)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
 def test_fp6_e3m2_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 5.0, 28.0, -0.0], [-0.25, 0.1875, 0.0625, 8.0]]).to(
-        "cuda"
+        _DEVICE
     )
     orig_vals_f6_unpacked = f32_to_f6_e3m2_unpacked(orig_vals)
     orig_vals_f6_packed = pack_uint6(orig_vals_f6_unpacked)
@@ -472,13 +474,13 @@ def triton_to_mxfp8_dim0_reference(
 
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
 @pytest.mark.skipif(
-    not is_sm_at_least_89(),
+    torch.cuda.is_available() and not is_sm_at_least_89(),
     reason="float8 in triton requires CUDA capability 8.9 or greater",
 )
 @pytest.mark.parametrize("M", (256, 2048))
 @pytest.mark.parametrize("K", (256, 2048))
 def test_triton_mxfp8_dim1_randn(M, K):
-    x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+    x = torch.randn(M, K, dtype=torch.bfloat16, device=_DEVICE)
     x_mx_ref, x_s_ref = triton_to_mxfp8_dim1_reference(x, block_size=32)
     x_mx_t, x_s_t = triton_to_mxfp8_dim1(x, inner_block_size=32)
     torch.testing.assert_close(x_mx_t, x_mx_ref, rtol=0, atol=0)
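
Note on the changed skip condition: is_sm_at_least_89() (and the is_sm_at_least_100() variants below) probe CUDA compute capability, which is meaningless on non-CUDA accelerators. Guarding the check with torch.cuda.is_available() keeps the capability skip CUDA-specific. The sketch below is an illustrative test, not part of this file, and assumes the helper is importable from torchao.utils:

# Illustrative only; import path for is_sm_at_least_89 is assumed.
import pytest
import torch
from torchao.utils import is_sm_at_least_89

@pytest.mark.skipif(
    # CUDA device with SM < 8.9  -> True  -> test skipped
    # CUDA device with SM >= 8.9 -> False -> test runs
    # non-CUDA accelerator/CPU   -> False -> test runs (capability check does not apply)
    torch.cuda.is_available() and not is_sm_at_least_89(),
    reason="float8 in triton requires CUDA capability 8.9 or greater",
)
def test_example():
    ...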
@@ -487,13 +489,13 @@ def test_triton_mxfp8_dim1_randn(M, K):
 
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
 @pytest.mark.skipif(
-    not is_sm_at_least_100(),
+    torch.cuda.is_available() and not is_sm_at_least_100(),
     reason="mxfp8 requires CUDA capability 10.0 or greater",
 )
 @pytest.mark.parametrize("M", (256, 2048, 131072))
 @pytest.mark.parametrize("K", (256, 5120, 7168))
 def test_triton_mxfp8_dim0_randn(M, K):
-    x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+    x = torch.randn(M, K, dtype=torch.bfloat16, device=_DEVICE)
     x_mx_ref, x_s_ref = triton_to_mxfp8_dim0_reference(x, block_size=32)
     x_mx_t, x_s_t = triton_to_mxfp8_dim0(x, inner_block_size=32)
     torch.testing.assert_close(x_mx_t, x_mx_ref, rtol=0, atol=0)
@@ -502,11 +504,11 @@ def test_triton_mxfp8_dim0_randn(M, K):
 
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
 @pytest.mark.skipif(
-    not is_sm_at_least_100(),
+    torch.cuda.is_available() and not is_sm_at_least_100(),
    reason="mxfp8 requires CUDA capability 10.0 or greater",
 )
 def test_triton_mxfp8_dim0_zeros():
-    x = torch.zeros(8192, 5120, dtype=torch.bfloat16, device="cuda")
+    x = torch.zeros(8192, 5120, dtype=torch.bfloat16, device=_DEVICE)
     x_mx_ref, x_s_ref = triton_to_mxfp8_dim0_reference(x, block_size=32)
     x_mx_t, x_s_t = triton_to_mxfp8_dim0(x, inner_block_size=32)
     assert not x_mx_t.isnan().any(), "quantized tensor should not contain NaNs"
@@ -516,14 +518,14 @@ def test_triton_mxfp8_dim0_zeros():
 
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
 @pytest.mark.skipif(
-    not is_sm_at_least_100(),
+    torch.cuda.is_available() and not is_sm_at_least_100(),
     reason="mxfp8 requires CUDA capability 10.0 or greater",
 )
 @pytest.mark.parametrize("M", (256, 2048, 131072))
 @pytest.mark.parametrize("K", (256, 5120, 7168))
 @pytest.mark.parametrize("orig_dtype", (torch.float32, torch.bfloat16))
 def test_triton_mxfp8_dequant_dim0(M, K, orig_dtype):
-    x = torch.zeros(M, K, dtype=orig_dtype, device="cuda")
+    x = torch.zeros(M, K, dtype=orig_dtype, device=_DEVICE)
     block_size = 32
     x_data, x_scales = triton_to_mxfp8_dim0_reference(x, block_size=32)
     hp_ref = to_dtype(
@@ -537,7 +539,7 @@ def test_triton_mxfp8_dequant_dim0(M, K, orig_dtype):
     torch.testing.assert_close(hp_t, hp_ref, rtol=0, atol=0)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.parametrize(
     "shape",
     [
@@ -552,14 +554,14 @@ def test_triton_mxfp8_dequant_dim0(M, K, orig_dtype):
     ],
 )
 def test_rearrange(shape):
-    scales = torch.randint(256, size=shape, device="cuda", dtype=torch.uint8)
+    scales = torch.randint(256, size=shape, device=_DEVICE, dtype=torch.uint8)
     eager = to_blocked(scales, False)
     triton = to_blocked(scales, True)
     torch.testing.assert_close(eager, triton, atol=0, rtol=0)
 
 
 @pytest.mark.skipif(
-    not is_sm_at_least_100(),
+    torch.cuda.is_available() and not is_sm_at_least_100(),
     reason="MXFP8 requires CUDA capability 10.0 or greater",
 )
 @pytest.mark.parametrize("M", (32, 64, 2048))
@@ -578,7 +580,7 @@ def test_cuda_mx_dim1_numerics(M, K, input_dtype, scaling_mode):
 
     # Use disinct incrementing values from 0 to M*K-1 to make debugging easier.
     x = (
-        torch.arange(0, M * K, dtype=input_dtype, device="cuda")
+        torch.arange(0, M * K, dtype=input_dtype, device=_DEVICE)
         .reshape(M, K)
         .contiguous()
     )
@@ -607,7 +609,7 @@ def test_cuda_mx_dim1_numerics(M, K, input_dtype, scaling_mode):
 
 
 @pytest.mark.skipif(
-    not is_sm_at_least_100(),
+    torch.cuda.is_available() and not is_sm_at_least_100(),
     reason="MXFP8 requires CUDA capability 10.0 or greater",
 )
 def test_cuda_mx_dim0_not_supported():
@@ -616,7 +618,7 @@ def test_cuda_mx_dim0_not_supported():
     M, K = 64, 64
     block_size = 32
     x = (
-        torch.arange(0, M * K, dtype=torch.bfloat16, device="cuda")
+        torch.arange(0, M * K, dtype=torch.bfloat16, device=_DEVICE)
         .reshape(M, K)
         .contiguous()
     )
@@ -631,15 +633,15 @@ def test_cuda_mx_dim0_not_supported():
 
 
 @pytest.mark.skipif(
-    not is_sm_at_least_100(),
+    torch.cuda.is_available() and not is_sm_at_least_100(),
     reason="MXFP8 requires CUDA capability 10.0 or greater",
 )
 def test_cuda_mx_dim1_invalid_block_size():
     from torchao.prototype import mxfp8_cuda
 
     M, K = 64, 64
     x = (
-        torch.arange(0, M * K, dtype=torch.bfloat16, device="cuda")
+        torch.arange(0, M * K, dtype=torch.bfloat16, device=_DEVICE)
         .reshape(M, K)
         .contiguous()
     )
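
Taken together, the commit makes these kernel tests accelerator-agnostic: device placement goes through _DEVICE, availability checks use torch.accelerator.is_available(), and the CUDA-capability skips only fire when CUDA is actually the active backend. A quick sanity check of the pattern (the helper's import path is assumed here):

# Illustrative snippet, not part of the commit.
import torch
from torchao.utils import get_current_accelerator_device  # assumed location

_DEVICE = get_current_accelerator_device()
print(_DEVICE)                           # e.g. cuda, xpu, or cpu
print(torch.accelerator.is_available())  # True if any accelerator backend is usable
x = torch.randn(4, 4, device=_DEVICE)    # same placement the tests now use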
