From 192f187f0d3e3cdafbc63731ba6467a9026736c5 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Mon, 27 Oct 2025 18:47:45 -0700
Subject: [PATCH 1/4] Add best effort triton-cpu support

Fixes #163

stack-info: PR: https://github.com/pytorch/helion/pull/1037, branch: oulgen/stack/163
---
 .github/matrix.json                 | 10 ++++++++
 .github/workflows/test.yml          |  9 +++++--
 helion/_testing.py                  | 37 +++++++++++++++++++++++------
 test/test_autotuner.py              | 11 +++++++++
 test/test_cache.py                  |  4 ++++
 test/test_dot.py                    | 12 ++++++++++
 test/test_errors.py                 |  2 ++
 test/test_examples.py               |  2 ++
 test/test_generate_ast.py           |  6 +++++
 test/test_indexing.py               |  2 ++
 test/test_inline_asm_elementwise.py |  2 ++
 test/test_loops.py                  |  6 +++++
 test/test_masking.py                |  4 ++++
 test/test_matmul.py                 |  2 ++
 test/test_misc.py                   |  3 +++
 test/test_persistent_kernels.py     |  2 ++
 test/test_print.py                  |  2 ++
 test/test_random.py                 |  2 ++
 test/test_reductions.py             |  2 ++
 test/test_register_tunable.py       |  2 ++
 test/test_rng.py                    |  2 ++
 test/test_signal_wait.py            |  2 ++
 test/test_specialize.py             |  2 ++
 test/test_type_propagation.py       |  3 +++
 24 files changed, 122 insertions(+), 9 deletions(-)

diff --git a/.github/matrix.json b/.github/matrix.json
index 659f596fc..2c68a0905 100644
--- a/.github/matrix.json
+++ b/.github/matrix.json
@@ -71,6 +71,16 @@
       "container-options": "--device=/dev/kfd --device=/dev/dri",
       "pytorch-version": "pytorch-nightly",
       "alias": "mi325x"
+    },
+    {
+      "runner": "linux.g5.4xlarge.nvidia.gpu",
+      "python-version": "3.12",
+      "ref-eager": false,
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "runtime-version": "cpu",
+      "container-options": "--gpus all",
+      "pytorch-version": "pytorch-nightly",
+      "alias": "cpu"
     }
   ]
 }
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 053cac5c3..3ebf327ec 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -97,7 +97,7 @@ jobs:
           fi

       - name: Install Triton
-        if: steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.9'
+        if: steps.cache.outputs.cache-hit != 'true' && (matrix.pytorch-version != 'pytorch-2.9' || contains(matrix.alias, 'cpu'))
         run: |
           set -x
           source .venv/bin/activate
@@ -110,7 +110,11 @@
           cd /tmp/$USER
           uv pip uninstall triton pytorch-triton || true
           rm -rf triton/ || true
-          git clone https://github.com/triton-lang/triton.git
+          if [[ "${{ matrix.alias }}" == *cpu* ]]; then
+            git clone --recursive -b main-merged https://github.com/triton-lang/triton-cpu.git triton
+          else
+            git clone https://github.com/triton-lang/triton.git triton
+          fi
           cd triton/
           uv pip install -r python/requirements.txt
           MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 uv pip install .
@@ -131,6 +135,7 @@ jobs:
           if [[ "${{ matrix.dtype-asserts }}" == "true" ]]; then export HELION_DEBUG_DTYPE_ASSERTS=1; fi
           if [[ "${{ matrix.expecttest-accept }}" == "true" ]]; then export EXPECTTEST_ACCEPT=1; fi
           if [[ "${{ matrix.ref-eager }}" == "true" ]]; then export HELION_INTERPRET=1; fi
+          if [[ "${{ contains(matrix.alias, 'cpu') }}" == "true" ]]; then export TRITON_CPU_BACKEND=1; fi
           # -rf: print failed tests
           # --timeout: max allowed time for each test
           pytest -rf --timeout=60
diff --git a/helion/_testing.py b/helion/_testing.py
index 6da692f17..c0776732b 100644
--- a/helion/_testing.py
+++ b/helion/_testing.py
@@ -34,19 +34,37 @@
 from .runtime.kernel import Kernel


-DEVICE = torch.device("xpu") if torch.xpu.is_available() else torch.device("cuda")
-PROJECT_ROOT: Path = Path(__file__).parent.parent
-EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"
+def _get_triton_backend() -> str | None:
+    try:
+        return triton.runtime.driver.active.get_current_target().backend  # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
+    except Exception:
+        return None


-def is_cuda() -> bool:
-    """Return True if running on CUDA (NVIDIA GPU)."""
+def is_cpu() -> bool:
+    """Return True if running on Triton CPU backend."""
     return (
-        triton.runtime.driver.active.get_current_target().backend == "cuda"  # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
-        and DEVICE.type == "cuda"
+        os.environ.get("TRITON_CPU_BACKEND", "0") == "1"
+        or _get_triton_backend() == "cpu"
     )


+def is_cuda() -> bool:
+    """Return True if running on CUDA (NVIDIA GPU)."""
+    return _get_triton_backend() == "cuda" and torch.cuda.is_available()
+
+
+PROJECT_ROOT: Path = Path(__file__).parent.parent
+EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"
+
+if is_cpu():
+    DEVICE = torch.device("cpu")
+elif torch.xpu.is_available():
+    DEVICE = torch.device("xpu")
+else:
+    DEVICE = torch.device("cuda")
+
+
 def get_nvidia_gpu_model() -> str:
     """
     Retrieves the model of the NVIDIA GPU being used.
@@ -80,6 +98,11 @@ def skipIfXPU(reason: str) -> Callable[[Callable], Callable]:
     return unittest.skipIf(torch.xpu.is_available(), reason)  # pyright: ignore[reportAttributeAccessIssue]


+def skipIfCpu(reason: str) -> Callable[[Callable], Callable]:
+    """Skip test if running on Triton CPU backend."""
+    return unittest.skipIf(is_cpu(), reason)
+
+
 def skipIfA10G(reason: str) -> Callable[[Callable], Callable]:
     """Skip test if running on A10G GPU"""
     gpu_model = get_nvidia_gpu_model()
diff --git a/test/test_autotuner.py b/test/test_autotuner.py
index 877577a2d..0daa2181d 100644
--- a/test/test_autotuner.py
+++ b/test/test_autotuner.py
@@ -28,6 +28,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 from helion.autotuner import DifferentialEvolutionSearch
 from helion.autotuner import PatternSearch
@@ -316,6 +317,7 @@ def add(a, b):
         )
         torch.testing.assert_close(add(*args), sum(args))

+    @skipIfCpu("fails on Triton CPU backend")
     def test_run_finite_search(self):
         @helion.kernel(
             configs=[
@@ -347,6 +349,7 @@ def add(a, b):
         torch.testing.assert_close(add(*args), sum(args))

     @skipIfRocm("too slow on rocm")
+    @skipIfCpu("TritonError: Error from Triton code")
     def test_random_search(self):
         args = (
             torch.randn([512, 512], device=DEVICE),
@@ -436,6 +439,7 @@ def diff_count(flat):
         ]
         self.assertEqual(sorted(pair_neighbors), sorted(expected))

+    @skipIfCpu("fails on Triton CPU backend")
     def test_accuracy_check_filters_bad_config_wrong_output(self) -> None:
         bad_config = helion.Config(block_sizes=[1], num_warps=8)
         good_config = helion.Config(block_sizes=[1], num_warps=4)
@@ -509,6 +513,7 @@ def make_bad_config_produce_wrong_output(
         run_mode("fork", expect_error=False)
         run_mode("spawn", expect_error=True)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_accuracy_check_filters_bad_config_wrong_arg_mutation(self) -> None:
         bad_config = helion.Config(block_sizes=[1], num_warps=8)
         good_config = helion.Config(block_sizes=[1], num_warps=4)
@@ -591,6 +596,7 @@ def wrong_fn(*fn_args, **fn_kwargs):
         run_mode("fork", expect_error=False)
         run_mode("spawn", expect_error=True)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_baseline_fn(self) -> None:
         """Test that custom baseline function is used for accuracy checking."""
         config1 = helion.Config(block_sizes=[32], num_warps=4)
@@ -631,6 +637,7 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
         # Verify the result is correct
         torch.testing.assert_close(result, args[0] + args[1])

+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_baseline_fn_filters_bad_config(self) -> None:
         """Test that custom baseline function correctly filters incorrect configs."""
         bad_config = helion.Config(block_sizes=[1], num_warps=8)
@@ -729,6 +736,7 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
         ):
             add(*args)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_max_generations(self):
         """Autotuner max generation respects explicit kwargs then setting override."""

@@ -772,6 +780,7 @@ def add(a, b):
         result = add(*args)
         torch.testing.assert_close(result, sum(args))

+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_effort_quick(self):
         """Test that quick effort profile uses correct default values."""
         # Get the quick profile defaults
@@ -907,6 +916,7 @@ def add(a, b):
         return search.samples[0]

     @skipIfRocm("accuracy difference")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_random_seed_from_env_var(self) -> None:
         # same env var value -> same random sample
         with patch.dict(
@@ -931,6 +941,7 @@ def test_autotune_random_seed_from_env_var(self) -> None:
         self.assertNotEqual(first, second)

     @skipIfRocm("accuracy difference")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_random_seed_from_settings(self) -> None:
         # same autotune_random_seed setting -> same random sample
         first = self._autotune_and_record(autotune_random_seed=4242)
diff --git a/test/test_cache.py b/test/test_cache.py
index d458f5f9b..c69463225 100644
--- a/test/test_cache.py
+++ b/test/test_cache.py
@@ -15,6 +15,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._utils import counters
 from helion.autotuner import StrictLocalAutotuneCache
 from helion.autotuner.base_search import BaseSearch
@@ -73,6 +74,7 @@ def get_welford_kernel():

 class TestCache(RefEagerTestDisabled, TestCase):
     @parametrize("name", ("add", "matmul", "welford"))
+    @skipIfCpu("fails on Triton CPU backend")
     def test_kernel(self, name):
         kernel, args_a, result_a, args_b, result_b = KERNELS[name]()

@@ -105,6 +107,7 @@ def test_kernel(self, name):
         self.assertEqual(counters["autotune"]["cache_hit"], 1)
         self.assertEqual(counters["autotune"]["cache_put"], 2)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_key_affects_cache_specialization(self):
         counters["autotune"].clear()
         self.addCleanup(counters["autotune"].clear)
@@ -150,6 +153,7 @@ def add_one(x: torch.Tensor):
         self.assertEqual(counters["autotune"]["cache_hit"], 1)
         self.assertEqual(counters["autotune"]["cache_put"], 2)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_assert_cache_hit(self):
         counters["autotune"].clear()
         self.addCleanup(counters["autotune"].clear)
diff --git a/test/test_dot.py b/test/test_dot.py
index 555fad1f7..5d56807bb 100644
--- a/test/test_dot.py
+++ b/test/test_dot.py
@@ -14,6 +14,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import is_cuda
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 from helion._testing import skipIfRocm
 from helion._testing import skipIfXPU
@@ -293,6 +294,7 @@ def bmm(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:

     @skipIfRefEager("Debug dtype codegen checks rely on compiled code")
     @skipIfXPU("Failed on XPU - https://github.com/pytorch/helion/issues/772")
+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_baddbmm_pipeline_debug_dtype_asserts(self):
         # Reproduces scripts/repro512.py within the test suite and asserts
         # the kernel compiles and runs with debug dtype asserts enabled.
@@ -981,6 +983,16 @@ def test_matmul_reshape_n_2(self):
             "float16 accumulator not supported for bf16/f32 in ref eager mode"
         )(_test_func)

+    # CPU backend skip for specific failing dynamic-shape case
+    if test_name == "test_input_float16_acc_float16_dynamic_shape":
+        _test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(
+            _test_func
+        )
+    if test_name == "test_input_float16_acc_float16_static_shape":
+        _test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(
+            _test_func
+        )
+
     setattr(TestDot, test_name, _test_func)
diff --git a/test/test_errors.py b/test/test_errors.py
index 20795bffc..feec2a925 100644
--- a/test/test_errors.py
+++ b/test/test_errors.py
@@ -10,6 +10,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion.autotuner.base_search import PopulationBasedSearch
 from helion.autotuner.base_search import PopulationMember
 from helion.autotuner.differential_evolution import DifferentialEvolutionSearch
@@ -33,6 +34,7 @@ def _test_outer_kernel_calling_inner(x: torch.Tensor) -> torch.Tensor:


 class TestErrors(RefEagerTestDisabled, TestCase):
+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_no_valid_configs(self):
         class FakeKernel:
             def __init__(self) -> None:
diff --git a/test/test_examples.py b/test/test_examples.py
index 61eb0a07d..15cf2c43c 100644
--- a/test/test_examples.py
+++ b/test/test_examples.py
@@ -16,6 +16,7 @@
 from helion._testing import check_example
 from helion._testing import import_path
 from helion._testing import skipIfA10G
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 from helion._testing import skipIfRocm
 from helion._testing import skipIfXPU
@@ -24,6 +25,7 @@
 torch.backends.cudnn.conv.fp32_precision = "tf32"


+@skipIfCpu("needs to be debugged")
 class TestExamples(RefEagerTestBase, TestCase):
     def test_add(self):
         args = (
diff --git a/test/test_generate_ast.py b/test/test_generate_ast.py
index d0b428a47..97878b05b 100644
--- a/test/test_generate_ast.py
+++ b/test/test_generate_ast.py
@@ -11,6 +11,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl

@@ -35,6 +36,7 @@ def test_add1d(self):
         torch.testing.assert_close(result, args[0] + args[1])
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_add2d(self):
         args = (
             torch.randn([100, 500], device=DEVICE),
@@ -46,6 +48,7 @@ def test_add2d(self):
         torch.testing.assert_close(result, args[0] + args[1])
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_add2d_loop_order(self):
         args = (
             torch.randn([100, 500], device=DEVICE),
@@ -61,6 +64,7 @@ def test_add2d_loop_order(self):
         torch.testing.assert_close(result, args[0] + args[1])
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_add3d(self):
         args = (
             torch.randn([100, 500, 10], device=DEVICE),
@@ -83,6 +87,7 @@ def test_add3d_xy_grid(self):
         torch.testing.assert_close(result, args[0] + args[1])
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_add3d_reorder(self):
         args = (
             torch.randn([100, 500, 10], device=DEVICE),
@@ -213,6 +218,7 @@ def test_final_cast_enforced_for_to_dtype(self):
         # Ensure codegen emits a final tl.cast(..., tl.bfloat16)
         assert "tl.cast" in code and "tl.bfloat16" in code

+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_sigmoid_scalar_autocast(self):
         @helion.kernel(
             config=helion.Config(
diff --git a/test/test_indexing.py b/test/test_indexing.py
index 2aeafa38a..8e1118390 100644
--- a/test/test_indexing.py
+++ b/test/test_indexing.py
@@ -14,6 +14,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfLowVRAM
 from helion._testing import skipIfNormalMode
 from helion._testing import skipIfRefEager
@@ -396,6 +397,7 @@ def test_block_size_access(x: torch.Tensor) -> torch.Tensor:
         "IndexOffsetOutOfRangeForInt32 error is not raised in ref eager mode"
     )
     @skipIfLowVRAM("Test requires high VRAM")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_int32_offset_out_of_range_error(self):
         repro_config = helion.Config(
             block_sizes=[32, 32],
diff --git a/test/test_inline_asm_elementwise.py b/test/test_inline_asm_elementwise.py
index 629ca1e3d..d21be7afb 100644
--- a/test/test_inline_asm_elementwise.py
+++ b/test/test_inline_asm_elementwise.py
@@ -10,6 +10,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 import helion.language as hl

@@ -221,6 +222,7 @@ def kernel_empty_args(x: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(result, expected)

     @skipIfRocm("only works on cuda")
+    @skipIfCpu("RuntimeError: failed to translate module to LLVM IR")
     def test_inline_asm_basic_compilation(self):
         """Test that inline_asm_elementwise compiles without errors (no CUDA requirement)"""
diff --git a/test/test_loops.py b/test/test_loops.py
index e1a6447f1..f263e930f 100644
--- a/test/test_loops.py
+++ b/test/test_loops.py
@@ -14,6 +14,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfLowVRAM
 from helion._testing import skipIfRefEager
 import helion.language as hl
@@ -76,6 +77,7 @@ def test_3d_device_loop0(self):
         self.assertExpectedJournal(code)

     @skipIfLowVRAM("Test requires high VRAM for [128, 128, 128, 128] tensors")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_3d_device_loop1(self):
         args = (torch.randn([128, 128, 128, 128], device=DEVICE),)
         code, result = code_and_output(
@@ -102,6 +104,7 @@ def test_3d_device_loop2(self):

     @patch.object(_compat, "_supports_tensor_descriptor", lambda: False)
     @skipIfLowVRAM("Test requires high VRAM for [128, 128, 128, 128] tensors")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_3d_device_loop3(self):
         args = (torch.randn([128, 128, 128, 128], device=DEVICE),)
         code, result = code_and_output(
@@ -372,6 +375,7 @@ def fn(x: torch.Tensor) -> torch.Tensor:
         self.assertEqual(spec.min_size, 32)
         self.assertEqual(spec.max_size, 256)

+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_register_block_size_codegen_size_hint(self):
         @helion.kernel(static_shapes=True)
         def kernel_fixed_block_size(
@@ -1153,6 +1157,7 @@ def kernel_with_dynamic_fill(
         expected = x + fill_value[0]
         torch.testing.assert_close(result, expected)

+    @skipIfCpu("codegen mismatch on CPU")
     def test_nested_loop_accumulator(self):
         """Test variable scoping with nested loops and accumulator pattern."""

@@ -1202,6 +1207,7 @@ def nested_loop_accumulator(x: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(result, expected, atol=1e-5, rtol=1e-5)
         self.assertExpectedJournal(code)

+    @skipIfCpu("codegen mismatch on CPU")
     def test_three_pass_kernel(self):
         """Test variable scoping with three-pass pattern like layer norm."""
diff --git a/test/test_masking.py b/test/test_masking.py
index ad7bff8f4..0290c3d61 100644
--- a/test/test_masking.py
+++ b/test/test_masking.py
@@ -11,6 +11,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl

@@ -42,6 +43,7 @@ def add1mm(x, y):
             result, (args[0] + 1) @ (args[1] + 1), rtol=1e-2, atol=1e-1
         )

+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_no_mask_views0(self):
         @helion.kernel(config={"block_sizes": [32]})
         def fn(x):
@@ -59,6 +61,7 @@ def fn(x):
         torch.testing.assert_close(result, args[0].sum(dim=1, keepdim=True))
         self.assertNotIn("tl.where", code)

+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_no_mask_views1(self):
         @helion.kernel(config={"block_sizes": [32]})
         def fn(x):
@@ -130,6 +133,7 @@ def fn(x):
         torch.testing.assert_close(result, (args[0] + 1).sum(dim=1))
         self.assertIn("tl.where", code)

+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_no_mask_inductor_ops(self):
         @helion.kernel(config={"block_sizes": [32]})
         def fn(x):
diff --git a/test/test_matmul.py b/test/test_matmul.py
index 026485ec3..fa76013e4 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -15,6 +15,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 from helion._testing import skipIfRocm
 import helion.language as hl
@@ -203,6 +204,7 @@ def test_matmul_static_shapes3(self):
         torch.testing.assert_close(output, args[0] @ args[1], atol=1e-1, rtol=1e-2)
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_matmul_packed_int4_block_size_constexpr(self):
         torch.manual_seed(0)
         M = N = K = 32
diff --git a/test/test_misc.py b/test/test_misc.py
index 643c5411b..d2d409124 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -27,10 +27,12 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl


+@skipIfCpu("need to debug")
 class TestMisc(RefEagerTestBase, TestCase):
     def test_binary_operation_duplicate_args(self):
         """Test case to reproduce issue #221: binary operations with duplicate tensor references"""
@@ -194,6 +196,7 @@ class Point2:
         torch.testing.assert_close(result[1], 4 * x)
         self.assertExpectedJournal(code)

+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_dtype_cast_preserved_before_second_dot(self):
         """Regression for issue #512: ensure p.to(v.dtype) is honored before a second dot.
diff --git a/test/test_persistent_kernels.py b/test/test_persistent_kernels.py
index dcb94fa39..f6d77a218 100644
--- a/test/test_persistent_kernels.py
+++ b/test/test_persistent_kernels.py
@@ -11,6 +11,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl

@@ -902,6 +903,7 @@ def multi_loop_kernel(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(result[0], result_flat[0], atol=0, rtol=0)
         torch.testing.assert_close(result[1], result_flat[1], atol=0, rtol=0)

+    @skipIfCpu("RuntimeError: PassManager::run failed")
     def test_persistent_interleaved_multiple_loops_with_l2_grouping(self):
         """Test persistent_interleaved with multiple top-level hl.tile loops AND l2_grouping (all 3 features combined)."""
diff --git a/test/test_print.py b/test/test_print.py
index fdfc1bb4e..e3bb21fa7 100644
--- a/test/test_print.py
+++ b/test/test_print.py
@@ -13,6 +13,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 import helion.language as hl

@@ -27,6 +28,7 @@ def _store_capfd_on_class(request, capfd):
     request.cls._capfd = capfd


+@skipIfCpu("needs to be debugged")
 class TestPrint(RefEagerTestDisabled, TestCase):
     def run_kernel_and_capture_output(self, kernel_fn, args):
         """Helper to run kernel and capture output"""
diff --git a/test/test_random.py b/test/test_random.py
index b4dd74f3a..af1611a22 100644
--- a/test/test_random.py
+++ b/test/test_random.py
@@ -9,9 +9,11 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 import helion.language as hl


+@skipIfCpu("needs to be debugged")
 class TestRandom(RefEagerTestBase, TestCase):
     def test_hl_rand_1d(self):
         @helion.kernel(static_shapes=False)
diff --git a/test/test_reductions.py b/test/test_reductions.py
index bf4a10eeb..586d53409 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -11,6 +11,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl

@@ -79,6 +80,7 @@ def sum_const_inner(x: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(out, x.sum(-1), rtol=1e-4, atol=1e-4)

     @skipIfRefEager("Does not call assert_close")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_broken_layernorm(self):
         @helion.kernel(autotune_effort="none")
         def layer_norm_fwd(
diff --git a/test/test_register_tunable.py b/test/test_register_tunable.py
index 10b50cdcd..b6ea5f959 100644
--- a/test/test_register_tunable.py
+++ b/test/test_register_tunable.py
@@ -11,6 +11,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 from helion.autotuner import EnumFragment
 from helion.autotuner import IntegerFragment
@@ -109,6 +110,7 @@ def fn(x: torch.Tensor):

     @patch.object(_compat, "_supports_tensor_descriptor", lambda: False)
     @skipIfRocm("failure on rocm")
+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_matmul_split_k(self):
         """Test matmul_split_k kernel with register_tunable"""
diff --git a/test/test_rng.py b/test/test_rng.py
index 3fec8d6d1..a27052014 100644
--- a/test/test_rng.py
+++ b/test/test_rng.py
@@ -10,9 +10,11 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 import helion.language as hl


+@skipIfCpu("needs to be debugged")
 class TestRNG(RefEagerTestBase, TestCase):
     def test_rand(self):
         """Test RNG seeding behavior, reproducibility, output range, and distribution."""
diff --git a/test/test_signal_wait.py b/test/test_signal_wait.py
index 8ed10d273..2388971e7 100644
--- a/test/test_signal_wait.py
+++ b/test/test_signal_wait.py
@@ -9,11 +9,13 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfNotCUDA
 from helion._testing import skipIfRocm
 import helion.language as hl


+@skipIfCpu("needs to be debugged")
 class TestWait(RefEagerTestDisabled, TestCase):
     @skipIfRocm("only works on cuda")
     def test_wait_basic(self):
diff --git a/test/test_specialize.py b/test/test_specialize.py
index ecb144834..2b0e9d616 100644
--- a/test/test_specialize.py
+++ b/test/test_specialize.py
@@ -10,11 +10,13 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 from helion.exc import ShapeSpecializingAllocation
 import helion.language as hl


+@skipIfCpu("needs to be debugged")
 class TestSpecialize(RefEagerTestBase, TestCase):
     maxDiff = 163842
diff --git a/test/test_type_propagation.py b/test/test_type_propagation.py
index a1ce00afc..49d70fe80 100644
--- a/test/test_type_propagation.py
+++ b/test/test_type_propagation.py
@@ -12,6 +12,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfXPU
 import helion.language as hl

@@ -97,6 +98,7 @@ def test_matmul(self):
         self.assertExpectedJournal(output)

     @skipIfXPU("CUDA-only")
+    @skipIfCpu("CUDA-only")
     def test_cuda_device_properties(self):
         @helion.kernel
         def use_device_properties(x: torch.Tensor) -> torch.Tensor:
@@ -119,6 +121,7 @@ def use_device_properties(x: torch.Tensor) -> torch.Tensor:
         self.assertExpectedJournal(output)

     @skipIfXPU("CUDA-only")
+    @skipIfCpu("CUDA-only")
     def test_cuda_device_properties_unsupported_attribute(self):
         @helion.kernel
         def use_unsupported_property(x: torch.Tensor) -> torch.Tensor:

From ada59b9525fa0532b39e3b89f9689639c87e4c33 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Sun, 2 Nov 2025 01:27:59 -0700
Subject: [PATCH 2/4] Apply suggestion from @oulgen

---
 .github/matrix.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/matrix.json b/.github/matrix.json
index 2c68a0905..945e57099 100644
--- a/.github/matrix.json
+++ b/.github/matrix.json
@@ -73,7 +73,7 @@
       "alias": "mi325x"
     },
     {
-      "runner": "linux.g5.4xlarge.nvidia.gpu",
+      "runner": "linux.24xl.spr-metal",
       "python-version": "3.12",
       "ref-eager": false,
       "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",

From a8271c558be1c93b6c8fbe4e9a10742ba7e8f011 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Sun, 2 Nov 2025 01:37:00 -0700
Subject: [PATCH 3/4] Apply suggestion from @oulgen

---
 .github/matrix.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/matrix.json b/.github/matrix.json
index 945e57099..8d6815966 100644
--- a/.github/matrix.json
+++ b/.github/matrix.json
@@ -76,9 +76,9 @@
       "runner": "linux.24xl.spr-metal",
       "python-version": "3.12",
       "ref-eager": false,
-      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "image": "ubuntu:24.04",
       "runtime-version": "cpu",
-      "container-options": "--gpus all",
+      "container-options": "--ipc=host",
       "pytorch-version": "pytorch-nightly",
       "alias": "cpu"

From 8eae8bfdca9d5a20b0989c6c2e61df46c27af675 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Sun, 2 Nov 2025 08:55:11 -0800
Subject: [PATCH 4/4] Apply suggestion from @oulgen

---
 .github/matrix.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/matrix.json b/.github/matrix.json
index 8d6815966..2c68a0905 100644
--- a/.github/matrix.json
+++ b/.github/matrix.json
@@ -73,12 +73,12 @@
       "alias": "mi325x"
     },
     {
-      "runner": "linux.24xl.spr-metal",
+      "runner": "linux.g5.4xlarge.nvidia.gpu",
       "python-version": "3.12",
       "ref-eager": false,
-      "image": "ubuntu:24.04",
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
       "runtime-version": "cpu",
-      "container-options": "--ipc=host",
+      "container-options": "--gpus all",
       "pytorch-version": "pytorch-nightly",
       "alias": "cpu"
     }