
Commit d4ce200

Add best effort triton-cpu support
Fixes #163
stack-info: PR: #1037, branch: oulgen/stack/163
Parent commit: 2d7c237

18 files changed (+89, -10 lines)
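The CI changes below drive the new job with the TRITON_CPU_BACKEND=1 environment variable and a more verbose pytest run. A minimal local sketch of the same flow, assuming a triton-cpu build is already installed and the pytest-timeout plugin is available:

    import os

    # Set before helion._testing is imported so its DEVICE constant resolves to torch.device("cpu").
    os.environ["TRITON_CPU_BACKEND"] = "1"

    import pytest

    # Mirrors the pytest invocation added to .github/workflows/test.yml.
    raise SystemExit(pytest.main(["-rf", "-vv", "--timeout=60"]))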

.github/matrix.json

Lines changed: 10 additions & 0 deletions
@@ -71,6 +71,16 @@
       "container-options": "--device=/dev/kfd --device=/dev/dri",
       "pytorch-version": "pytorch-nightly",
       "alias": "mi325x"
+    },
+    {
+      "runner": "linux.g5.4xlarge.nvidia.gpu",
+      "python-version": "3.12",
+      "ref-eager": false,
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "runtime-version": "cpu",
+      "container-options": "--gpus all",
+      "pytorch-version": "pytorch-nightly",
+      "alias": "cpu"
     }
   ]
 }

.github/workflows/test.yml

Lines changed: 8 additions & 3 deletions
@@ -97,7 +97,7 @@ jobs:
           fi
 
       - name: Install Triton
-        if: steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.9'
+        if: steps.cache.outputs.cache-hit != 'true' && (matrix.pytorch-version != 'pytorch-2.9' || contains(matrix.alias, 'cpu'))
         run: |
           set -x
           source .venv/bin/activate
@@ -110,7 +110,11 @@ jobs:
           cd /tmp/$USER
           uv pip uninstall triton pytorch-triton || true
           rm -rf triton/ || true
-          git clone https://github.com/triton-lang/triton.git
+          if [[ "${{ matrix.alias }}" == *cpu* ]]; then
+            git clone --recursive -b main-merged https://github.com/triton-lang/triton-cpu.git triton
+          else
+            git clone https://github.com/triton-lang/triton.git triton
+          fi
           cd triton/
           uv pip install -r python/requirements.txt
           MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 uv pip install .
@@ -131,9 +135,10 @@ jobs:
           if [[ "${{ matrix.dtype-asserts }}" == "true" ]]; then export HELION_DEBUG_DTYPE_ASSERTS=1; fi
           if [[ "${{ matrix.expecttest-accept }}" == "true" ]]; then export EXPECTTEST_ACCEPT=1; fi
           if [[ "${{ matrix.ref-eager }}" == "true" ]]; then export HELION_INTERPRET=1; fi
+          if [[ "${{ contains(matrix.alias, 'cpu') }}" == "true" ]]; then export TRITON_CPU_BACKEND=1; fi
           # -rf: print failed tests
           # --timeout: max allowed time for each test
-          pytest -rf --timeout=60
+          pytest -rf -vv --timeout=60
 
   test-notebooks:
     name: test-notebooks-cu128-py3.12-pytorch-2.9-a10g

helion/_testing.py

Lines changed: 30 additions & 7 deletions
@@ -34,19 +34,37 @@
     from .runtime.kernel import Kernel
 
 
-DEVICE = torch.device("xpu") if torch.xpu.is_available() else torch.device("cuda")
-PROJECT_ROOT: Path = Path(__file__).parent.parent
-EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"
+def _get_triton_backend() -> str | None:
+    try:
+        return triton.runtime.driver.active.get_current_target().backend  # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
+    except Exception:
+        return None
 
 
-def is_cuda() -> bool:
-    """Return True if running on CUDA (NVIDIA GPU)."""
+def is_cpu() -> bool:
+    """Return True if running on Triton CPU backend."""
     return (
-        triton.runtime.driver.active.get_current_target().backend == "cuda"  # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
-        and DEVICE.type == "cuda"
+        os.environ.get("TRITON_CPU_BACKEND", "0") == "1"
+        or _get_triton_backend() == "cpu"
     )
 
 
+def is_cuda() -> bool:
+    """Return True if running on CUDA (NVIDIA GPU)."""
+    return _get_triton_backend() == "cuda" and torch.cuda.is_available()
+
+
+PROJECT_ROOT: Path = Path(__file__).parent.parent
+EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"
+
+if is_cpu():
+    DEVICE = torch.device("cpu")
+elif torch.xpu.is_available():
+    DEVICE = torch.device("xpu")
+else:
+    DEVICE = torch.device("cuda")
+
+
 def get_nvidia_gpu_model() -> str:
     """
     Retrieves the model of the NVIDIA GPU being used.
@@ -80,6 +98,11 @@ def skipIfXPU(reason: str) -> Callable[[Callable], Callable]:
     return unittest.skipIf(torch.xpu.is_available(), reason)  # pyright: ignore[reportAttributeAccessIssue]
 
 
+def skipIfCpu(reason: str) -> Callable[[Callable], Callable]:
+    """Skip test if running on Triton CPU backend."""
+    return unittest.skipIf(is_cpu(), reason)
+
+
 def skipIfA10G(reason: str) -> Callable[[Callable], Callable]:
     """Skip test if running on A10G GPU"""
     gpu_model = get_nvidia_gpu_model()
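
The test files below consume these helpers directly; a minimal sketch of the pattern, with ExampleTest as a hypothetical test class rather than part of this change:

    import unittest

    import torch

    from helion._testing import DEVICE  # cpu, xpu, or cuda, as selected above
    from helion._testing import TestCase
    from helion._testing import skipIfCpu


    class ExampleTest(TestCase):
        @skipIfCpu("hypothetical reason: not yet supported on the Triton CPU backend")
        def test_elementwise_path(self):
            x = torch.randn([16, 16], device=DEVICE)
            torch.testing.assert_close(x + x, 2 * x)


    if __name__ == "__main__":
        unittest.main()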

test/test_autotuner.py

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 from helion.autotuner import DifferentialEvolutionSearch
 from helion.autotuner import PatternSearch
@@ -347,6 +348,7 @@ def add(a, b):
         torch.testing.assert_close(add(*args), sum(args))
 
     @skipIfRocm("too slow on rocm")
+    @skipIfCpu("TritonError: Error from Triton code")
     def test_random_search(self):
         args = (
             torch.randn([512, 512], device=DEVICE),

test/test_dot.py

Lines changed: 8 additions & 0 deletions
@@ -14,6 +14,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import is_cuda
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 from helion._testing import skipIfRocm
 from helion._testing import skipIfXPU
@@ -293,6 +294,7 @@ def bmm(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
 
     @skipIfRefEager("Debug dtype codegen checks rely on compiled code")
     @skipIfXPU("Failed on XPU - https://github.com/pytorch/helion/issues/772")
+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_baddbmm_pipeline_debug_dtype_asserts(self):
         # Reproduces scripts/repro512.py within the test suite and asserts
         # the kernel compiles and runs with debug dtype asserts enabled.
@@ -981,6 +983,12 @@ def test_matmul_reshape_n_2(self):
             "float16 accumulator not supported for bf16/f32 in ref eager mode"
         )(_test_func)
 
+    # CPU backend skip for specific failing dynamic-shape case
+    if test_name == "test_input_float16_acc_float16_dynamic_shape":
+        _test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(
+            _test_func
+        )
+
     setattr(TestDot, test_name, _test_func)
 
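The last hunk wraps only one of the dynamically generated dot tests with a CPU skip; a simplified sketch of that conditional wrapping, with _wrap_generated_test as a hypothetical helper standing in for the generation loop:

    from helion._testing import skipIfCpu


    def _wrap_generated_test(test_name, _test_func):
        # Apply the CPU-backend skip only to the known-failing case,
        # leaving every other generated test untouched.
        if test_name == "test_input_float16_acc_float16_dynamic_shape":
            _test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(_test_func)
        return _test_func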

test/test_examples.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from helion._testing import check_example
1717
from helion._testing import import_path
1818
from helion._testing import skipIfA10G
19+
from helion._testing import skipIfCpu
1920
from helion._testing import skipIfRefEager
2021
from helion._testing import skipIfRocm
2122
from helion._testing import skipIfXPU
@@ -24,6 +25,7 @@
2425
torch.backends.cudnn.conv.fp32_precision = "tf32"
2526

2627

28+
@skipIfCpu("needs to be debugged")
2729
class TestExamples(RefEagerTestBase, TestCase):
2830
def test_add(self):
2931
args = (

test/test_generate_ast.py

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl
 
@@ -213,6 +214,7 @@ def test_final_cast_enforced_for_to_dtype(self):
         # Ensure codegen emits a final tl.cast(..., tl.bfloat16)
         assert "tl.cast" in code and "tl.bfloat16" in code
 
+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_sigmoid_scalar_autocast(self):
         @helion.kernel(
             config=helion.Config(

test/test_inline_asm_elementwise.py

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 import helion.language as hl
 
@@ -221,6 +222,7 @@ def kernel_empty_args(x: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(result, expected)
 
     @skipIfRocm("only works on cuda")
+    @skipIfCpu("RuntimeError: failed to translate module to LLVM IR")
     def test_inline_asm_basic_compilation(self):
         """Test that inline_asm_elementwise compiles without errors (no CUDA requirement)"""
 
test/test_loops.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from helion._testing import TestCase
1515
from helion._testing import code_and_output
1616
from helion._testing import import_path
17+
from helion._testing import skipIfCpu
1718
from helion._testing import skipIfLowVRAM
1819
from helion._testing import skipIfRefEager
1920
import helion.language as hl
@@ -372,6 +373,7 @@ def fn(x: torch.Tensor) -> torch.Tensor:
372373
self.assertEqual(spec.min_size, 32)
373374
self.assertEqual(spec.max_size, 256)
374375

376+
@skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
375377
def test_register_block_size_codegen_size_hint(self):
376378
@helion.kernel(static_shapes=True)
377379
def kernel_fixed_block_size(
@@ -1153,6 +1155,7 @@ def kernel_with_dynamic_fill(
11531155
expected = x + fill_value[0]
11541156
torch.testing.assert_close(result, expected)
11551157

1158+
@skipIfCpu("codegen mismatch on CPU")
11561159
def test_nested_loop_accumulator(self):
11571160
"""Test variable scoping with nested loops and accumulator pattern."""
11581161

@@ -1202,6 +1205,7 @@ def nested_loop_accumulator(x: torch.Tensor) -> torch.Tensor:
12021205
torch.testing.assert_close(result, expected, atol=1e-5, rtol=1e-5)
12031206
self.assertExpectedJournal(code)
12041207

1208+
@skipIfCpu("codegen mismatch on CPU")
12051209
def test_three_pass_kernel(self):
12061210
"""Test variable scoping with three-pass pattern like layer norm."""
12071211

test/test_masking.py

Lines changed: 4 additions & 0 deletions
@@ -11,6 +11,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl
 
@@ -42,6 +43,7 @@ def add1mm(x, y):
             result, (args[0] + 1) @ (args[1] + 1), rtol=1e-2, atol=1e-1
         )
 
+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_no_mask_views0(self):
         @helion.kernel(config={"block_sizes": [32]})
         def fn(x):
@@ -59,6 +61,7 @@ def fn(x):
         torch.testing.assert_close(result, args[0].sum(dim=1, keepdim=True))
         self.assertNotIn("tl.where", code)
 
+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_no_mask_views1(self):
         @helion.kernel(config={"block_sizes": [32]})
         def fn(x):
@@ -130,6 +133,7 @@ def fn(x):
         torch.testing.assert_close(result, (args[0] + 1).sum(dim=1))
         self.assertIn("tl.where", code)
 
+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_no_mask_inductor_ops(self):
         @helion.kernel(config={"block_sizes": [32]})
         def fn(x):
