From 192f187f0d3e3cdafbc63731ba6467a9026736c5 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Mon, 27 Oct 2025 18:47:45 -0700
Subject: [PATCH 1/4] Add best effort triton-cpu support

Fixes #163

stack-info: PR: https://github.com/pytorch/helion/pull/1037, branch: oulgen/stack/163
---
 .github/matrix.json                 | 10 ++++++++
 .github/workflows/test.yml          |  9 +++++--
 helion/_testing.py                  | 37 +++++++++++++++++++++++------
 test/test_autotuner.py              | 11 +++++++++
 test/test_cache.py                  |  4 ++++
 test/test_dot.py                    | 12 ++++++++++
 test/test_errors.py                 |  2 ++
 test/test_examples.py               |  2 ++
 test/test_generate_ast.py           |  6 +++++
 test/test_indexing.py               |  2 ++
 test/test_inline_asm_elementwise.py |  2 ++
 test/test_loops.py                  |  6 +++++
 test/test_masking.py                |  4 ++++
 test/test_matmul.py                 |  2 ++
 test/test_misc.py                   |  3 +++
 test/test_persistent_kernels.py     |  2 ++
 test/test_print.py                  |  2 ++
 test/test_random.py                 |  2 ++
 test/test_reductions.py             |  2 ++
 test/test_register_tunable.py       |  2 ++
 test/test_rng.py                    |  2 ++
 test/test_signal_wait.py            |  2 ++
 test/test_specialize.py             |  2 ++
 test/test_type_propagation.py       |  3 +++
 24 files changed, 122 insertions(+), 9 deletions(-)

diff --git a/.github/matrix.json b/.github/matrix.json
index 659f596fc..2c68a0905 100644
--- a/.github/matrix.json
+++ b/.github/matrix.json
@@ -71,6 +71,16 @@
       "container-options": "--device=/dev/kfd --device=/dev/dri",
       "pytorch-version": "pytorch-nightly",
       "alias": "mi325x"
+    },
+    {
+      "runner": "linux.g5.4xlarge.nvidia.gpu",
+      "python-version": "3.12",
+      "ref-eager": false,
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "runtime-version": "cpu",
+      "container-options": "--gpus all",
+      "pytorch-version": "pytorch-nightly",
+      "alias": "cpu"
     }
   ]
 }
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 053cac5c3..3ebf327ec 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -97,7 +97,7 @@ jobs:
           fi

       - name: Install Triton
-        if: steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.9'
+        if: steps.cache.outputs.cache-hit != 'true' && (matrix.pytorch-version != 'pytorch-2.9' || contains(matrix.alias, 'cpu'))
         run: |
           set -x
           source .venv/bin/activate
@@ -110,7 +110,11 @@
           cd /tmp/$USER
           uv pip uninstall triton pytorch-triton || true
           rm -rf triton/ || true
-          git clone https://github.com/triton-lang/triton.git
+          if [[ "${{ matrix.alias }}" == *cpu* ]]; then
+            git clone --recursive -b main-merged https://github.com/triton-lang/triton-cpu.git triton
+          else
+            git clone https://github.com/triton-lang/triton.git triton
+          fi
           cd triton/
           uv pip install -r python/requirements.txt
           MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 uv pip install .
@@ -131,6 +135,7 @@ jobs:
           if [[ "${{ matrix.dtype-asserts }}" == "true" ]]; then export HELION_DEBUG_DTYPE_ASSERTS=1; fi
           if [[ "${{ matrix.expecttest-accept }}" == "true" ]]; then export EXPECTTEST_ACCEPT=1; fi
           if [[ "${{ matrix.ref-eager }}" == "true" ]]; then export HELION_INTERPRET=1; fi
+          if [[ "${{ contains(matrix.alias, 'cpu') }}" == "true" ]]; then export TRITON_CPU_BACKEND=1; fi
           # -rf: print failed tests
           # --timeout: max allowed time for each test
           pytest -rf --timeout=60
diff --git a/helion/_testing.py b/helion/_testing.py
index 6da692f17..c0776732b 100644
--- a/helion/_testing.py
+++ b/helion/_testing.py
@@ -34,19 +34,37 @@
 from .runtime.kernel import Kernel


-DEVICE = torch.device("xpu") if torch.xpu.is_available() else torch.device("cuda")
-PROJECT_ROOT: Path = Path(__file__).parent.parent
-EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"
+def _get_triton_backend() -> str | None:
+    try:
+        return triton.runtime.driver.active.get_current_target().backend  # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
+    except Exception:
+        return None


-def is_cuda() -> bool:
-    """Return True if running on CUDA (NVIDIA GPU)."""
+def is_cpu() -> bool:
+    """Return True if running on Triton CPU backend."""
     return (
-        triton.runtime.driver.active.get_current_target().backend == "cuda"  # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
-        and DEVICE.type == "cuda"
+        os.environ.get("TRITON_CPU_BACKEND", "0") == "1"
+        or _get_triton_backend() == "cpu"
     )


+def is_cuda() -> bool:
+    """Return True if running on CUDA (NVIDIA GPU)."""
+    return _get_triton_backend() == "cuda" and torch.cuda.is_available()
+
+
+PROJECT_ROOT: Path = Path(__file__).parent.parent
+EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"
+
+if is_cpu():
+    DEVICE = torch.device("cpu")
+elif torch.xpu.is_available():
+    DEVICE = torch.device("xpu")
+else:
+    DEVICE = torch.device("cuda")
+
+
 def get_nvidia_gpu_model() -> str:
     """
     Retrieves the model of the NVIDIA GPU being used.
@@ -80,6 +98,11 @@ def skipIfXPU(reason: str) -> Callable[[Callable], Callable]:
     return unittest.skipIf(torch.xpu.is_available(), reason)  # pyright: ignore[reportAttributeAccessIssue]


+def skipIfCpu(reason: str) -> Callable[[Callable], Callable]:
+    """Skip test if running on Triton CPU backend."""
+    return unittest.skipIf(is_cpu(), reason)
+
+
 def skipIfA10G(reason: str) -> Callable[[Callable], Callable]:
     """Skip test if running on A10G GPU"""
     gpu_model = get_nvidia_gpu_model()
diff --git a/test/test_autotuner.py b/test/test_autotuner.py
index 877577a2d..0daa2181d 100644
--- a/test/test_autotuner.py
+++ b/test/test_autotuner.py
@@ -28,6 +28,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 from helion.autotuner import DifferentialEvolutionSearch
 from helion.autotuner import PatternSearch
@@ -316,6 +317,7 @@ def add(a, b):
         )
         torch.testing.assert_close(add(*args), sum(args))

+    @skipIfCpu("fails on Triton CPU backend")
     def test_run_finite_search(self):
         @helion.kernel(
             configs=[
@@ -347,6 +349,7 @@ def add(a, b):
         torch.testing.assert_close(add(*args), sum(args))

     @skipIfRocm("too slow on rocm")
+    @skipIfCpu("TritonError: Error from Triton code")
     def test_random_search(self):
         args = (
             torch.randn([512, 512], device=DEVICE),
@@ -436,6 +439,7 @@ def diff_count(flat):
         ]
         self.assertEqual(sorted(pair_neighbors), sorted(expected))

+    @skipIfCpu("fails on Triton CPU backend")
     def test_accuracy_check_filters_bad_config_wrong_output(self) -> None:
         bad_config = helion.Config(block_sizes=[1], num_warps=8)
         good_config = helion.Config(block_sizes=[1], num_warps=4)
@@ -509,6 +513,7 @@ def make_bad_config_produce_wrong_output(
         run_mode("fork", expect_error=False)
         run_mode("spawn", expect_error=True)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_accuracy_check_filters_bad_config_wrong_arg_mutation(self) -> None:
         bad_config = helion.Config(block_sizes=[1], num_warps=8)
         good_config = helion.Config(block_sizes=[1], num_warps=4)
@@ -591,6 +596,7 @@ def wrong_fn(*fn_args, **fn_kwargs):
         run_mode("fork", expect_error=False)
         run_mode("spawn", expect_error=True)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_baseline_fn(self) -> None:
         """Test that custom baseline function is used for accuracy checking."""
         config1 = helion.Config(block_sizes=[32], num_warps=4)
@@ -631,6 +637,7 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
         # Verify the result is correct
         torch.testing.assert_close(result, args[0] + args[1])

+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_baseline_fn_filters_bad_config(self) -> None:
         """Test that custom baseline function correctly filters incorrect configs."""
         bad_config = helion.Config(block_sizes=[1], num_warps=8)
@@ -729,6 +736,7 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
         ):
             add(*args)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_max_generations(self):
         """Autotuner max generation respects explicit kwargs then setting override."""

@@ -772,6 +780,7 @@ def add(a, b):
         result = add(*args)
         torch.testing.assert_close(result, sum(args))

+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_effort_quick(self):
         """Test that quick effort profile uses correct default values."""
         # Get the quick profile defaults
@@ -907,6 +916,7 @@ def add(a, b):
         return search.samples[0]

     @skipIfRocm("accuracy difference")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_random_seed_from_env_var(self) -> None:
         # same env var value -> same random sample
         with patch.dict(
@@ -931,6 +941,7 @@ def test_autotune_random_seed_from_env_var(self) -> None:
         self.assertNotEqual(first, second)

     @skipIfRocm("accuracy difference")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_random_seed_from_settings(self) -> None:
         # same autotune_random_seed setting -> same random sample
         first = self._autotune_and_record(autotune_random_seed=4242)
diff --git a/test/test_cache.py b/test/test_cache.py
index d458f5f9b..c69463225 100644
--- a/test/test_cache.py
+++ b/test/test_cache.py
@@ -15,6 +15,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._utils import counters
 from helion.autotuner import StrictLocalAutotuneCache
 from helion.autotuner.base_search import BaseSearch
@@ -73,6 +74,7 @@ def get_welford_kernel():

 class TestCache(RefEagerTestDisabled, TestCase):
     @parametrize("name", ("add", "matmul", "welford"))
+    @skipIfCpu("fails on Triton CPU backend")
     def test_kernel(self, name):
         kernel, args_a, result_a, args_b, result_b = KERNELS[name]()

@@ -105,6 +107,7 @@ def test_kernel(self, name):
         self.assertEqual(counters["autotune"]["cache_hit"], 1)
         self.assertEqual(counters["autotune"]["cache_put"], 2)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_key_affects_cache_specialization(self):
         counters["autotune"].clear()
         self.addCleanup(counters["autotune"].clear)
@@ -150,6 +153,7 @@ def add_one(x: torch.Tensor):
         self.assertEqual(counters["autotune"]["cache_hit"], 1)
         self.assertEqual(counters["autotune"]["cache_put"], 2)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_assert_cache_hit(self):
         counters["autotune"].clear()
         self.addCleanup(counters["autotune"].clear)
diff --git a/test/test_dot.py b/test/test_dot.py
index 555fad1f7..5d56807bb 100644
--- a/test/test_dot.py
+++ b/test/test_dot.py
@@ -14,6 +14,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import is_cuda
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 from helion._testing import skipIfRocm
 from helion._testing import skipIfXPU
@@ -293,6 +294,7 @@ def bmm(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:

     @skipIfRefEager("Debug dtype codegen checks rely on compiled code")
     @skipIfXPU("Failed on XPU - https://github.com/pytorch/helion/issues/772")
+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_baddbmm_pipeline_debug_dtype_asserts(self):
         # Reproduces scripts/repro512.py within the test suite and asserts
         # the kernel compiles and runs with debug dtype asserts enabled.
@@ -981,6 +983,16 @@ def test_matmul_reshape_n_2(self):
             "float16 accumulator not supported for bf16/f32 in ref eager mode"
         )(_test_func)

+    # CPU backend skip for specific failing dynamic-shape case
+    if test_name == "test_input_float16_acc_float16_dynamic_shape":
+        _test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(
+            _test_func
+        )
+    if test_name == "test_input_float16_acc_float16_static_shape":
+        _test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(
+            _test_func
+        )
+
     setattr(TestDot, test_name, _test_func)
diff --git a/test/test_errors.py b/test/test_errors.py
index 20795bffc..feec2a925 100644
--- a/test/test_errors.py
+++ b/test/test_errors.py
@@ -10,6 +10,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion.autotuner.base_search import PopulationBasedSearch
 from helion.autotuner.base_search import PopulationMember
 from helion.autotuner.differential_evolution import DifferentialEvolutionSearch
@@ -33,6 +34,7 @@ def _test_outer_kernel_calling_inner(x: torch.Tensor) -> torch.Tensor:


 class TestErrors(RefEagerTestDisabled, TestCase):
+    @skipIfCpu("fails on Triton CPU backend")
     def test_autotune_no_valid_configs(self):
         class FakeKernel:
             def __init__(self) -> None:
diff --git a/test/test_examples.py b/test/test_examples.py
index 61eb0a07d..15cf2c43c 100644
--- a/test/test_examples.py
+++ b/test/test_examples.py
@@ -16,6 +16,7 @@
 from helion._testing import check_example
 from helion._testing import import_path
 from helion._testing import skipIfA10G
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 from helion._testing import skipIfRocm
 from helion._testing import skipIfXPU
@@ -24,6 +25,7 @@
 torch.backends.cudnn.conv.fp32_precision = "tf32"


+@skipIfCpu("needs to be debugged")
 class TestExamples(RefEagerTestBase, TestCase):
     def test_add(self):
         args = (
diff --git a/test/test_generate_ast.py b/test/test_generate_ast.py
index d0b428a47..97878b05b 100644
--- a/test/test_generate_ast.py
+++ b/test/test_generate_ast.py
@@ -11,6 +11,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl

@@ -35,6 +36,7 @@ def test_add1d(self):
         torch.testing.assert_close(result, args[0] + args[1])
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_add2d(self):
         args = (
             torch.randn([100, 500], device=DEVICE),
@@ -46,6 +48,7 @@ def test_add2d(self):
         torch.testing.assert_close(result, args[0] + args[1])
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_add2d_loop_order(self):
         args = (
             torch.randn([100, 500], device=DEVICE),
@@ -61,6 +64,7 @@ def test_add2d_loop_order(self):
         torch.testing.assert_close(result, args[0] + args[1])
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_add3d(self):
         args = (
             torch.randn([100, 500, 10], device=DEVICE),
@@ -83,6 +87,7 @@ def test_add3d_xy_grid(self):
         torch.testing.assert_close(result, args[0] + args[1])
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_add3d_reorder(self):
         args = (
             torch.randn([100, 500, 10], device=DEVICE),
@@ -213,6 +218,7 @@ def test_final_cast_enforced_for_to_dtype(self):
         # Ensure codegen emits a final tl.cast(..., tl.bfloat16)
         assert "tl.cast" in code and "tl.bfloat16" in code

+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_sigmoid_scalar_autocast(self):
         @helion.kernel(
             config=helion.Config(
diff --git a/test/test_indexing.py b/test/test_indexing.py
index 2aeafa38a..8e1118390 100644
--- a/test/test_indexing.py
+++ b/test/test_indexing.py
@@ -14,6 +14,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfLowVRAM
 from helion._testing import skipIfNormalMode
 from helion._testing import skipIfRefEager
@@ -396,6 +397,7 @@ def test_block_size_access(x: torch.Tensor) -> torch.Tensor:
         "IndexOffsetOutOfRangeForInt32 error is not raised in ref eager mode"
     )
     @skipIfLowVRAM("Test requires high VRAM")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_int32_offset_out_of_range_error(self):
         repro_config = helion.Config(
             block_sizes=[32, 32],
diff --git a/test/test_inline_asm_elementwise.py b/test/test_inline_asm_elementwise.py
index 629ca1e3d..d21be7afb 100644
--- a/test/test_inline_asm_elementwise.py
+++ b/test/test_inline_asm_elementwise.py
@@ -10,6 +10,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 import helion.language as hl

@@ -221,6 +222,7 @@ def kernel_empty_args(x: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(result, expected)

     @skipIfRocm("only works on cuda")
+    @skipIfCpu("RuntimeError: failed to translate module to LLVM IR")
     def test_inline_asm_basic_compilation(self):
         """Test that inline_asm_elementwise compiles without errors (no CUDA requirement)"""
diff --git a/test/test_loops.py b/test/test_loops.py
index e1a6447f1..f263e930f 100644
--- a/test/test_loops.py
+++ b/test/test_loops.py
@@ -14,6 +14,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfLowVRAM
 from helion._testing import skipIfRefEager
 import helion.language as hl
@@ -76,6 +77,7 @@ def test_3d_device_loop0(self):
         self.assertExpectedJournal(code)

     @skipIfLowVRAM("Test requires high VRAM for [128, 128, 128, 128] tensors")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_3d_device_loop1(self):
         args = (torch.randn([128, 128, 128, 128], device=DEVICE),)
         code, result = code_and_output(
@@ -102,6 +104,7 @@ def test_3d_device_loop2(self):

     @patch.object(_compat, "_supports_tensor_descriptor", lambda: False)
     @skipIfLowVRAM("Test requires high VRAM for [128, 128, 128, 128] tensors")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_3d_device_loop3(self):
         args = (torch.randn([128, 128, 128, 128], device=DEVICE),)
         code, result = code_and_output(
@@ -372,6 +375,7 @@ def fn(x: torch.Tensor) -> torch.Tensor:
         self.assertEqual(spec.min_size, 32)
         self.assertEqual(spec.max_size, 256)

+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_register_block_size_codegen_size_hint(self):
         @helion.kernel(static_shapes=True)
         def kernel_fixed_block_size(
@@ -1153,6 +1157,7 @@ def kernel_with_dynamic_fill(
         expected = x + fill_value[0]
         torch.testing.assert_close(result, expected)

+    @skipIfCpu("codegen mismatch on CPU")
     def test_nested_loop_accumulator(self):
         """Test variable scoping with nested loops and accumulator pattern."""

@@ -1202,6 +1207,7 @@ def nested_loop_accumulator(x: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(result, expected, atol=1e-5, rtol=1e-5)
         self.assertExpectedJournal(code)

+    @skipIfCpu("codegen mismatch on CPU")
     def test_three_pass_kernel(self):
         """Test variable scoping with three-pass pattern like layer norm."""
diff --git a/test/test_masking.py b/test/test_masking.py
index ad7bff8f4..0290c3d61 100644
--- a/test/test_masking.py
+++ b/test/test_masking.py
@@ -11,6 +11,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl

@@ -42,6 +43,7 @@ def add1mm(x, y):
             result, (args[0] + 1) @ (args[1] + 1), rtol=1e-2, atol=1e-1
         )

+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_no_mask_views0(self):
         @helion.kernel(config={"block_sizes": [32]})
         def fn(x):
@@ -59,6 +61,7 @@ def fn(x):
         torch.testing.assert_close(result, args[0].sum(dim=1, keepdim=True))
         self.assertNotIn("tl.where", code)

+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_no_mask_views1(self):
         @helion.kernel(config={"block_sizes": [32]})
         def fn(x):
@@ -130,6 +133,7 @@ def fn(x):
         torch.testing.assert_close(result, (args[0] + 1).sum(dim=1))
         self.assertIn("tl.where", code)

+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_no_mask_inductor_ops(self):
         @helion.kernel(config={"block_sizes": [32]})
         def fn(x):
diff --git a/test/test_matmul.py b/test/test_matmul.py
index 026485ec3..fa76013e4 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -15,6 +15,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 from helion._testing import skipIfRocm
 import helion.language as hl
@@ -203,6 +204,7 @@ def test_matmul_static_shapes3(self):
         torch.testing.assert_close(output, args[0] @ args[1], atol=1e-1, rtol=1e-2)
         self.assertExpectedJournal(code)

+    @skipIfCpu("fails on Triton CPU backend")
     def test_matmul_packed_int4_block_size_constexpr(self):
         torch.manual_seed(0)
         M = N = K = 32
diff --git a/test/test_misc.py b/test/test_misc.py
index 643c5411b..d2d409124 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -27,10 +27,12 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl


+@skipIfCpu("need to debug")
 class TestMisc(RefEagerTestBase, TestCase):
     def test_binary_operation_duplicate_args(self):
         """Test case to reproduce issue #221: binary operations with duplicate tensor references"""
@@ -194,6 +196,7 @@ class Point2:
         torch.testing.assert_close(result[1], 4 * x)
         self.assertExpectedJournal(code)

+    @skipIfCpu("AssertionError: Tensor-likes are not close!")
     def test_dtype_cast_preserved_before_second_dot(self):
         """Regression for issue #512: ensure p.to(v.dtype) is honored before a second dot.
diff --git a/test/test_persistent_kernels.py b/test/test_persistent_kernels.py
index dcb94fa39..f6d77a218 100644
--- a/test/test_persistent_kernels.py
+++ b/test/test_persistent_kernels.py
@@ -11,6 +11,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl

@@ -902,6 +903,7 @@ def multi_loop_kernel(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(result[0], result_flat[0], atol=0, rtol=0)
         torch.testing.assert_close(result[1], result_flat[1], atol=0, rtol=0)

+    @skipIfCpu("RuntimeError: PassManager::run failed")
     def test_persistent_interleaved_multiple_loops_with_l2_grouping(self):
         """Test persistent_interleaved with multiple top-level hl.tile loops AND l2_grouping (all 3 features combined)."""
diff --git a/test/test_print.py b/test/test_print.py
index fdfc1bb4e..e3bb21fa7 100644
--- a/test/test_print.py
+++ b/test/test_print.py
@@ -13,6 +13,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 import helion.language as hl

@@ -27,6 +28,7 @@ def _store_capfd_on_class(request, capfd):
     request.cls._capfd = capfd


+@skipIfCpu("needs to be debugged")
 class TestPrint(RefEagerTestDisabled, TestCase):
     def run_kernel_and_capture_output(self, kernel_fn, args):
         """Helper to run kernel and capture output"""
diff --git a/test/test_random.py b/test/test_random.py
index b4dd74f3a..af1611a22 100644
--- a/test/test_random.py
+++ b/test/test_random.py
@@ -9,9 +9,11 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 import helion.language as hl


+@skipIfCpu("needs to be debugged")
 class TestRandom(RefEagerTestBase, TestCase):
     def test_hl_rand_1d(self):
         @helion.kernel(static_shapes=False)
diff --git a/test/test_reductions.py b/test/test_reductions.py
index bf4a10eeb..586d53409 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -11,6 +11,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 import helion.language as hl

@@ -79,6 +80,7 @@ def sum_const_inner(x: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(out, x.sum(-1), rtol=1e-4, atol=1e-4)

     @skipIfRefEager("Does not call assert_close")
+    @skipIfCpu("fails on Triton CPU backend")
     def test_broken_layernorm(self):
         @helion.kernel(autotune_effort="none")
         def layer_norm_fwd(
diff --git a/test/test_register_tunable.py b/test/test_register_tunable.py
index 10b50cdcd..b6ea5f959 100644
--- a/test/test_register_tunable.py
+++ b/test/test_register_tunable.py
@@ -11,6 +11,7 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRocm
 from helion.autotuner import EnumFragment
 from helion.autotuner import IntegerFragment
@@ -109,6 +110,7 @@ def fn(x: torch.Tensor):

     @patch.object(_compat, "_supports_tensor_descriptor", lambda: False)
     @skipIfRocm("failure on rocm")
+    @skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
     def test_matmul_split_k(self):
         """Test matmul_split_k kernel with register_tunable"""
diff --git a/test/test_rng.py b/test/test_rng.py
index 3fec8d6d1..a27052014 100644
--- a/test/test_rng.py
+++ b/test/test_rng.py
@@ -10,9 +10,11 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 import helion.language as hl


+@skipIfCpu("needs to be debugged")
 class TestRNG(RefEagerTestBase, TestCase):
     def test_rand(self):
         """Test RNG seeding behavior, reproducibility, output range, and distribution."""
diff --git a/test/test_signal_wait.py b/test/test_signal_wait.py
index 8ed10d273..2388971e7 100644
--- a/test/test_signal_wait.py
+++ b/test/test_signal_wait.py
@@ -9,11 +9,13 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfNotCUDA
 from helion._testing import skipIfRocm
 import helion.language as hl


+@skipIfCpu("needs to be debugged")
 class TestWait(RefEagerTestDisabled, TestCase):
     @skipIfRocm("only works on cuda")
     def test_wait_basic(self):
diff --git a/test/test_specialize.py b/test/test_specialize.py
index ecb144834..2b0e9d616 100644
--- a/test/test_specialize.py
+++ b/test/test_specialize.py
@@ -10,11 +10,13 @@
 from helion._testing import RefEagerTestBase
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfCpu
 from helion._testing import skipIfRefEager
 from helion.exc import ShapeSpecializingAllocation
 import helion.language as hl


+@skipIfCpu("needs to be debugged")
 class TestSpecialize(RefEagerTestBase, TestCase):
     maxDiff = 163842
diff --git a/test/test_type_propagation.py b/test/test_type_propagation.py
index a1ce00afc..49d70fe80 100644
--- a/test/test_type_propagation.py
+++ b/test/test_type_propagation.py
@@ -12,6 +12,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import import_path
+from helion._testing import skipIfCpu
 from helion._testing import skipIfXPU
 import helion.language as hl

@@ -97,6 +98,7 @@ def test_matmul(self):
         self.assertExpectedJournal(output)

     @skipIfXPU("CUDA-only")
+    @skipIfCpu("CUDA-only")
     def test_cuda_device_properties(self):
         @helion.kernel
         def use_device_properties(x: torch.Tensor) -> torch.Tensor:
@@ -119,6 +121,7 @@ def use_device_properties(x: torch.Tensor) -> torch.Tensor:
         self.assertExpectedJournal(output)

     @skipIfXPU("CUDA-only")
+    @skipIfCpu("CUDA-only")
     def test_cuda_device_properties_unsupported_attribute(self):
         @helion.kernel
         def use_unsupported_property(x: torch.Tensor) -> torch.Tensor:

From ada59b9525fa0532b39e3b89f9689639c87e4c33 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Sun, 2 Nov 2025 01:27:59 -0700
Subject: [PATCH 2/4] Apply suggestion from @oulgen

---
 .github/matrix.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/matrix.json b/.github/matrix.json
index 2c68a0905..945e57099 100644
--- a/.github/matrix.json
+++ b/.github/matrix.json
@@ -73,7 +73,7 @@
       "alias": "mi325x"
     },
     {
-      "runner": "linux.g5.4xlarge.nvidia.gpu",
+      "runner": "linux.24xl.spr-metal",
       "python-version": "3.12",
       "ref-eager": false,
       "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",

From a8271c558be1c93b6c8fbe4e9a10742ba7e8f011 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Sun, 2 Nov 2025 01:37:00 -0700
Subject: [PATCH 3/4] Apply suggestion from @oulgen

---
 .github/matrix.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/matrix.json b/.github/matrix.json
index 945e57099..8d6815966 100644
--- a/.github/matrix.json
+++ b/.github/matrix.json
@@ -76,9 +76,9 @@
       "runner": "linux.24xl.spr-metal",
       "python-version": "3.12",
       "ref-eager": false,
-      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "image": "ubuntu:24.04",
       "runtime-version": "cpu",
-      "container-options": "--gpus all",
+      "container-options": "--ipc=host",
       "pytorch-version": "pytorch-nightly",
       "alias": "cpu"

From 8eae8bfdca9d5a20b0989c6c2e61df46c27af675 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Sun, 2 Nov 2025 08:55:11 -0800
Subject: [PATCH 4/4] Apply suggestion from @oulgen

---
 .github/matrix.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/matrix.json b/.github/matrix.json
index 8d6815966..2c68a0905 100644
--- a/.github/matrix.json
+++ b/.github/matrix.json
@@ -73,12 +73,12 @@
       "alias": "mi325x"
     },
     {
-      "runner": "linux.24xl.spr-metal",
+      "runner": "linux.g5.4xlarge.nvidia.gpu",
       "python-version": "3.12",
       "ref-eager": false,
-      "image": "ubuntu:24.04",
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
       "runtime-version": "cpu",
-      "container-options": "--ipc=host",
+      "container-options": "--gpus all",
       "pytorch-version": "pytorch-nightly",
       "alias": "cpu"
     }