10 changes: 10 additions & 0 deletions .github/matrix.json
@@ -71,6 +71,16 @@
"container-options": "--device=/dev/kfd --device=/dev/dri",
"pytorch-version": "pytorch-nightly",
"alias": "mi325x"
},
{
"runner": "linux.g5.4xlarge.nvidia.gpu",
Contributor review comment on this line: Do we need a GPU runner for this? GPU machines are expensive.

"python-version": "3.12",
"ref-eager": false,
"image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
"runtime-version": "cpu",
"container-options": "--gpus all",
"pytorch-version": "pytorch-nightly",
"alias": "cpu"
}
]
}
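A small helper sketch (not part of this PR) that mirrors the `contains(matrix.alias, 'cpu')` guards added in the workflow below: it lists which matrix entries would be treated as CPU jobs. The top-level key of matrix.json is not visible in this hunk, so the script simply takes the first list it finds.

import json
from pathlib import Path

data = json.loads(Path(".github/matrix.json").read_text())
# The hunk above does not show the top-level key, so grab the first list of entries.
entries = next(v for v in data.values() if isinstance(v, list))
for entry in entries:
    if "cpu" in entry["alias"]:  # same condition as contains(matrix.alias, 'cpu')
        print(entry["runner"], entry["image"], entry["pytorch-version"])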
9 changes: 7 additions & 2 deletions .github/workflows/test.yml
@@ -97,7 +97,7 @@ jobs:
fi

- name: Install Triton
if: steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.9'
if: steps.cache.outputs.cache-hit != 'true' && (matrix.pytorch-version != 'pytorch-2.9' || contains(matrix.alias, 'cpu'))
run: |
set -x
source .venv/bin/activate
@@ -110,7 +110,11 @@
cd /tmp/$USER
uv pip uninstall triton pytorch-triton || true
rm -rf triton/ || true
git clone https://github.com/triton-lang/triton.git
if [[ "${{ matrix.alias }}" == *cpu* ]]; then
git clone --recursive -b main-merged https://github.com/triton-lang/triton-cpu.git triton
else
git clone https://github.com/triton-lang/triton.git triton
fi
cd triton/
uv pip install -r python/requirements.txt
MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 uv pip install .
@@ -131,6 +135,7 @@
if [[ "${{ matrix.dtype-asserts }}" == "true" ]]; then export HELION_DEBUG_DTYPE_ASSERTS=1; fi
if [[ "${{ matrix.expecttest-accept }}" == "true" ]]; then export EXPECTTEST_ACCEPT=1; fi
if [[ "${{ matrix.ref-eager }}" == "true" ]]; then export HELION_INTERPRET=1; fi
if [[ "${{ contains(matrix.alias, 'cpu') }}" == "true" ]]; then export TRITON_CPU_BACKEND=1; fi
# -rf: print failed tests
# --timeout: max allowed time for each test
pytest -rf --timeout=60
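A minimal local-reproduction sketch (assumed usage, not part of this diff): the new CI job selects the CPU path purely via the TRITON_CPU_BACKEND environment variable, which the helpers in helion/_testing.py below read, so setting it before importing the test utilities routes DEVICE to the CPU.

import os

# Mirrors the `export TRITON_CPU_BACKEND=1` added to the test step above; it must be
# set before helion._testing is imported so DEVICE is resolved against the CPU backend.
os.environ["TRITON_CPU_BACKEND"] = "1"

from helion._testing import DEVICE, is_cpu

assert is_cpu()
assert DEVICE.type == "cpu"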
37 changes: 30 additions & 7 deletions helion/_testing.py
@@ -34,19 +34,37 @@
from .runtime.kernel import Kernel


DEVICE = torch.device("xpu") if torch.xpu.is_available() else torch.device("cuda")
PROJECT_ROOT: Path = Path(__file__).parent.parent
EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"
def _get_triton_backend() -> str | None:
try:
return triton.runtime.driver.active.get_current_target().backend # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
except Exception:
return None


def is_cuda() -> bool:
"""Return True if running on CUDA (NVIDIA GPU)."""
def is_cpu() -> bool:
"""Return True if running on Triton CPU backend."""
return (
triton.runtime.driver.active.get_current_target().backend == "cuda" # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
and DEVICE.type == "cuda"
os.environ.get("TRITON_CPU_BACKEND", "0") == "1"
or _get_triton_backend() == "cpu"
)


def is_cuda() -> bool:
"""Return True if running on CUDA (NVIDIA GPU)."""
return _get_triton_backend() == "cuda" and torch.cuda.is_available()


PROJECT_ROOT: Path = Path(__file__).parent.parent
EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"

if is_cpu():
DEVICE = torch.device("cpu")
elif torch.xpu.is_available():
DEVICE = torch.device("xpu")
else:
DEVICE = torch.device("cuda")


def get_nvidia_gpu_model() -> str:
"""
Retrieves the model of the NVIDIA GPU being used.
@@ -80,6 +98,11 @@ def skipIfXPU(reason: str) -> Callable[[Callable], Callable]:
return unittest.skipIf(torch.xpu.is_available(), reason) # pyright: ignore[reportAttributeAccessIssue]


def skipIfCpu(reason: str) -> Callable[[Callable], Callable]:
"""Skip test if running on Triton CPU backend."""
return unittest.skipIf(is_cpu(), reason)


def skipIfA10G(reason: str) -> Callable[[Callable], Callable]:
"""Skip test if running on A10G GPU"""
gpu_model = get_nvidia_gpu_model()
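For context, a minimal usage sketch (hypothetical test, not part of this diff) showing how the new helpers combine in the test suite: DEVICE resolves to cpu when the Triton CPU backend is active, and skipIfCpu guards tests known to fail there, in the same way the test files below do.

import unittest

import torch

from helion._testing import DEVICE, TestCase, skipIfCpu


class ExampleTest(TestCase):
    @skipIfCpu("fails on Triton CPU backend")
    def test_add(self):
        # Runs on cuda/xpu runners; skipped when TRITON_CPU_BACKEND=1 selects the CPU path.
        x = torch.randn([16], device=DEVICE)
        torch.testing.assert_close(x + x, 2 * x)


if __name__ == "__main__":
    unittest.main()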
11 changes: 11 additions & 0 deletions test/test_autotuner.py
@@ -28,6 +28,7 @@
from helion._testing import RefEagerTestDisabled
from helion._testing import TestCase
from helion._testing import import_path
from helion._testing import skipIfCpu
from helion._testing import skipIfRocm
from helion.autotuner import DifferentialEvolutionSearch
from helion.autotuner import PatternSearch
@@ -316,6 +317,7 @@ def add(a, b):
)
torch.testing.assert_close(add(*args), sum(args))

@skipIfCpu("fails on Triton CPU backend")
def test_run_finite_search(self):
@helion.kernel(
configs=[
Expand Down Expand Up @@ -347,6 +349,7 @@ def add(a, b):
torch.testing.assert_close(add(*args), sum(args))

@skipIfRocm("too slow on rocm")
@skipIfCpu("TritonError: Error from Triton code")
def test_random_search(self):
args = (
torch.randn([512, 512], device=DEVICE),
Expand Down Expand Up @@ -436,6 +439,7 @@ def diff_count(flat):
]
self.assertEqual(sorted(pair_neighbors), sorted(expected))

@skipIfCpu("fails on Triton CPU backend")
def test_accuracy_check_filters_bad_config_wrong_output(self) -> None:
bad_config = helion.Config(block_sizes=[1], num_warps=8)
good_config = helion.Config(block_sizes=[1], num_warps=4)
Expand Down Expand Up @@ -509,6 +513,7 @@ def make_bad_config_produce_wrong_output(
run_mode("fork", expect_error=False)
run_mode("spawn", expect_error=True)

@skipIfCpu("fails on Triton CPU backend")
def test_accuracy_check_filters_bad_config_wrong_arg_mutation(self) -> None:
bad_config = helion.Config(block_sizes=[1], num_warps=8)
good_config = helion.Config(block_sizes=[1], num_warps=4)
Expand Down Expand Up @@ -591,6 +596,7 @@ def wrong_fn(*fn_args, **fn_kwargs):
run_mode("fork", expect_error=False)
run_mode("spawn", expect_error=True)

@skipIfCpu("fails on Triton CPU backend")
def test_autotune_baseline_fn(self) -> None:
"""Test that custom baseline function is used for accuracy checking."""
config1 = helion.Config(block_sizes=[32], num_warps=4)
Expand Down Expand Up @@ -631,6 +637,7 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
# Verify the result is correct
torch.testing.assert_close(result, args[0] + args[1])

@skipIfCpu("fails on Triton CPU backend")
def test_autotune_baseline_fn_filters_bad_config(self) -> None:
"""Test that custom baseline function correctly filters incorrect configs."""
bad_config = helion.Config(block_sizes=[1], num_warps=8)
Expand Down Expand Up @@ -729,6 +736,7 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
):
add(*args)

@skipIfCpu("fails on Triton CPU backend")
def test_max_generations(self):
"""Autotuner max generation respects explicit kwargs then setting override."""

@@ -772,6 +780,7 @@ def add(a, b):
result = add(*args)
torch.testing.assert_close(result, sum(args))

@skipIfCpu("fails on Triton CPU backend")
def test_autotune_effort_quick(self):
"""Test that quick effort profile uses correct default values."""
# Get the quick profile defaults
@@ -907,6 +916,7 @@ def add(a, b):
return search.samples[0]

@skipIfRocm("accuracy difference")
@skipIfCpu("fails on Triton CPU backend")
def test_autotune_random_seed_from_env_var(self) -> None:
# same env var value -> same random sample
with patch.dict(
@@ -931,6 +941,7 @@
self.assertNotEqual(first, second)

@skipIfRocm("accuracy difference")
@skipIfCpu("fails on Triton CPU backend")
def test_autotune_random_seed_from_settings(self) -> None:
# same autotune_random_seed setting -> same random sample
first = self._autotune_and_record(autotune_random_seed=4242)
4 changes: 4 additions & 0 deletions test/test_cache.py
@@ -15,6 +15,7 @@
from helion._testing import RefEagerTestDisabled
from helion._testing import TestCase
from helion._testing import import_path
from helion._testing import skipIfCpu
from helion._utils import counters
from helion.autotuner import StrictLocalAutotuneCache
from helion.autotuner.base_search import BaseSearch
@@ -73,6 +74,7 @@ def get_welford_kernel():

class TestCache(RefEagerTestDisabled, TestCase):
@parametrize("name", ("add", "matmul", "welford"))
@skipIfCpu("fails on Triton CPU backend")
def test_kernel(self, name):
kernel, args_a, result_a, args_b, result_b = KERNELS[name]()

@@ -105,6 +107,7 @@ def test_kernel(self, name):
self.assertEqual(counters["autotune"]["cache_hit"], 1)
self.assertEqual(counters["autotune"]["cache_put"], 2)

@skipIfCpu("fails on Triton CPU backend")
def test_key_affects_cache_specialization(self):
counters["autotune"].clear()
self.addCleanup(counters["autotune"].clear)
@@ -150,6 +153,7 @@ def add_one(x: torch.Tensor):
self.assertEqual(counters["autotune"]["cache_hit"], 1)
self.assertEqual(counters["autotune"]["cache_put"], 2)

@skipIfCpu("fails on Triton CPU backend")
def test_assert_cache_hit(self):
counters["autotune"].clear()
self.addCleanup(counters["autotune"].clear)
12 changes: 12 additions & 0 deletions test/test_dot.py
@@ -14,6 +14,7 @@
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import is_cuda
from helion._testing import skipIfCpu
from helion._testing import skipIfRefEager
from helion._testing import skipIfRocm
from helion._testing import skipIfXPU
@@ -293,6 +294,7 @@ def bmm(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:

@skipIfRefEager("Debug dtype codegen checks rely on compiled code")
@skipIfXPU("Failed on XPU - https://github.com/pytorch/helion/issues/772")
@skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
def test_baddbmm_pipeline_debug_dtype_asserts(self):
# Reproduces scripts/repro512.py within the test suite and asserts
# the kernel compiles and runs with debug dtype asserts enabled.
@@ -981,6 +983,16 @@ def test_matmul_reshape_n_2(self):
"float16 accumulator not supported for bf16/f32 in ref eager mode"
)(_test_func)

# CPU backend skip for specific failing dynamic-shape case
if test_name == "test_input_float16_acc_float16_dynamic_shape":
_test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(
_test_func
)
if test_name == "test_input_float16_acc_float16_static_shape":
_test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(
_test_func
)

setattr(TestDot, test_name, _test_func)


2 changes: 2 additions & 0 deletions test/test_errors.py
@@ -10,6 +10,7 @@
from helion._testing import RefEagerTestDisabled
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import skipIfCpu
from helion.autotuner.base_search import PopulationBasedSearch
from helion.autotuner.base_search import PopulationMember
from helion.autotuner.differential_evolution import DifferentialEvolutionSearch
@@ -33,6 +34,7 @@ def _test_outer_kernel_calling_inner(x: torch.Tensor) -> torch.Tensor:


class TestErrors(RefEagerTestDisabled, TestCase):
@skipIfCpu("fails on Triton CPU backend")
def test_autotune_no_valid_configs(self):
class FakeKernel:
def __init__(self) -> None:
2 changes: 2 additions & 0 deletions test/test_examples.py
@@ -16,6 +16,7 @@
from helion._testing import check_example
from helion._testing import import_path
from helion._testing import skipIfA10G
from helion._testing import skipIfCpu
from helion._testing import skipIfRefEager
from helion._testing import skipIfRocm
from helion._testing import skipIfXPU
@@ -24,6 +25,7 @@
torch.backends.cudnn.conv.fp32_precision = "tf32"


@skipIfCpu("needs to be debugged")
class TestExamples(RefEagerTestBase, TestCase):
def test_add(self):
args = (
6 changes: 6 additions & 0 deletions test/test_generate_ast.py
@@ -11,6 +11,7 @@
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import import_path
from helion._testing import skipIfCpu
from helion._testing import skipIfRefEager
import helion.language as hl

@@ -35,6 +36,7 @@ def test_add1d(self):
torch.testing.assert_close(result, args[0] + args[1])
self.assertExpectedJournal(code)

@skipIfCpu("fails on Triton CPU backend")
def test_add2d(self):
args = (
torch.randn([100, 500], device=DEVICE),
@@ -46,6 +48,7 @@ def test_add2d(self):
torch.testing.assert_close(result, args[0] + args[1])
self.assertExpectedJournal(code)

@skipIfCpu("fails on Triton CPU backend")
def test_add2d_loop_order(self):
args = (
torch.randn([100, 500], device=DEVICE),
@@ -61,6 +64,7 @@ def test_add2d_loop_order(self):
torch.testing.assert_close(result, args[0] + args[1])
self.assertExpectedJournal(code)

@skipIfCpu("fails on Triton CPU backend")
def test_add3d(self):
args = (
torch.randn([100, 500, 10], device=DEVICE),
@@ -83,6 +87,7 @@ def test_add3d_xy_grid(self):
torch.testing.assert_close(result, args[0] + args[1])
self.assertExpectedJournal(code)

@skipIfCpu("fails on Triton CPU backend")
def test_add3d_reorder(self):
args = (
torch.randn([100, 500, 10], device=DEVICE),
@@ -213,6 +218,7 @@ def test_final_cast_enforced_for_to_dtype(self):
# Ensure codegen emits a final tl.cast(..., tl.bfloat16)
assert "tl.cast" in code and "tl.bfloat16" in code

@skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
def test_sigmoid_scalar_autocast(self):
@helion.kernel(
config=helion.Config(
2 changes: 2 additions & 0 deletions test/test_indexing.py
@@ -14,6 +14,7 @@
from helion._testing import RefEagerTestBase
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import skipIfCpu
from helion._testing import skipIfLowVRAM
from helion._testing import skipIfNormalMode
from helion._testing import skipIfRefEager
@@ -396,6 +397,7 @@ def test_block_size_access(x: torch.Tensor) -> torch.Tensor:
"IndexOffsetOutOfRangeForInt32 error is not raised in ref eager mode"
)
@skipIfLowVRAM("Test requires high VRAM")
@skipIfCpu("fails on Triton CPU backend")
def test_int32_offset_out_of_range_error(self):
repro_config = helion.Config(
block_sizes=[32, 32],
2 changes: 2 additions & 0 deletions test/test_inline_asm_elementwise.py
@@ -10,6 +10,7 @@
from helion._testing import RefEagerTestDisabled
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import skipIfCpu
from helion._testing import skipIfRocm
import helion.language as hl

@@ -221,6 +222,7 @@ def kernel_empty_args(x: torch.Tensor) -> torch.Tensor:
torch.testing.assert_close(result, expected)

@skipIfRocm("only works on cuda")
@skipIfCpu("RuntimeError: failed to translate module to LLVM IR")
def test_inline_asm_basic_compilation(self):
"""Test that inline_asm_elementwise compiles without errors (no CUDA requirement)"""
