10 changes: 10 additions & 0 deletions .github/matrix.json
@@ -71,6 +71,16 @@
"container-options": "--device=/dev/kfd --device=/dev/dri",
"pytorch-version": "pytorch-nightly",
"alias": "mi325x"
},
{
"runner": "linux.g5.4xlarge.nvidia.gpu",
Contributor review comment on this line: Do we need a GPU runner for this? GPU machines are expensive.

"python-version": "3.12",
"ref-eager": false,
"image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
"runtime-version": "cpu",
"container-options": "--gpus all",
"pytorch-version": "pytorch-nightly",
"alias": "cpu"
}
]
}
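A small helper sketch (not part of this PR) that mirrors the `contains(matrix.alias, 'cpu')` guards added in the workflow below: it lists which matrix entries would be treated as CPU jobs. The top-level key of matrix.json is not visible in this hunk, so the script simply takes the first list it finds.

import json
from pathlib import Path

data = json.loads(Path(".github/matrix.json").read_text())
# The hunk above does not show the top-level key, so grab the first list of entries.
entries = next(v for v in data.values() if isinstance(v, list))
for entry in entries:
    if "cpu" in entry["alias"]:  # same condition as contains(matrix.alias, 'cpu')
        print(entry["runner"], entry["image"], entry["pytorch-version"])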
9 changes: 7 additions & 2 deletions .github/workflows/test.yml
@@ -97,7 +97,7 @@ jobs:
fi

- name: Install Triton
if: steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.9'
if: steps.cache.outputs.cache-hit != 'true' && (matrix.pytorch-version != 'pytorch-2.9' || contains(matrix.alias, 'cpu'))
run: |
set -x
source .venv/bin/activate
@@ -110,7 +110,11 @@
cd /tmp/$USER
uv pip uninstall triton pytorch-triton || true
rm -rf triton/ || true
git clone https://github.com/triton-lang/triton.git
if [[ "${{ matrix.alias }}" == *cpu* ]]; then
git clone --recursive -b main-merged https://github.com/triton-lang/triton-cpu.git triton
else
git clone https://github.com/triton-lang/triton.git triton
fi
cd triton/
uv pip install -r python/requirements.txt
MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 uv pip install .
@@ -131,6 +135,7 @@
if [[ "${{ matrix.dtype-asserts }}" == "true" ]]; then export HELION_DEBUG_DTYPE_ASSERTS=1; fi
if [[ "${{ matrix.expecttest-accept }}" == "true" ]]; then export EXPECTTEST_ACCEPT=1; fi
if [[ "${{ matrix.ref-eager }}" == "true" ]]; then export HELION_INTERPRET=1; fi
if [[ "${{ contains(matrix.alias, 'cpu') }}" == "true" ]]; then export TRITON_CPU_BACKEND=1; fi
# -rf: print failed tests
# --timeout: max allowed time for each test
pytest -rf --timeout=60
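A minimal local-reproduction sketch (assumed usage, not part of this diff): the new CI job selects the CPU path purely via the TRITON_CPU_BACKEND environment variable, which the helpers in helion/_testing.py below read, so setting it before importing the test utilities routes DEVICE to the CPU.

import os

# Mirrors the `export TRITON_CPU_BACKEND=1` added to the test step above; it must be
# set before helion._testing is imported so DEVICE is resolved against the CPU backend.
os.environ["TRITON_CPU_BACKEND"] = "1"

from helion._testing import DEVICE, is_cpu

assert is_cpu()
assert DEVICE.type == "cpu"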
37 changes: 30 additions & 7 deletions helion/_testing.py
@@ -34,19 +34,37 @@
from .runtime.kernel import Kernel


DEVICE = torch.device("xpu") if torch.xpu.is_available() else torch.device("cuda")
PROJECT_ROOT: Path = Path(__file__).parent.parent
EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"
def _get_triton_backend() -> str | None:
try:
return triton.runtime.driver.active.get_current_target().backend # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
except Exception:
return None


def is_cuda() -> bool:
"""Return True if running on CUDA (NVIDIA GPU)."""
def is_cpu() -> bool:
"""Return True if running on Triton CPU backend."""
return (
triton.runtime.driver.active.get_current_target().backend == "cuda" # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
and DEVICE.type == "cuda"
os.environ.get("TRITON_CPU_BACKEND", "0") == "1"
or _get_triton_backend() == "cpu"
)


def is_cuda() -> bool:
"""Return True if running on CUDA (NVIDIA GPU)."""
return _get_triton_backend() == "cuda" and torch.cuda.is_available()


PROJECT_ROOT: Path = Path(__file__).parent.parent
EXAMPLES_DIR: Path = PROJECT_ROOT / "examples"

if is_cpu():
DEVICE = torch.device("cpu")
elif torch.xpu.is_available():
DEVICE = torch.device("xpu")
else:
DEVICE = torch.device("cuda")


def get_nvidia_gpu_model() -> str:
"""
Retrieves the model of the NVIDIA GPU being used.
@@ -80,6 +98,11 @@ def skipIfXPU(reason: str) -> Callable[[Callable], Callable]:
return unittest.skipIf(torch.xpu.is_available(), reason) # pyright: ignore[reportAttributeAccessIssue]


def skipIfCpu(reason: str) -> Callable[[Callable], Callable]:
"""Skip test if running on Triton CPU backend."""
return unittest.skipIf(is_cpu(), reason)


def skipIfA10G(reason: str) -> Callable[[Callable], Callable]:
"""Skip test if running on A10G GPU"""
gpu_model = get_nvidia_gpu_model()
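For context, a minimal usage sketch (hypothetical test, not part of this diff) showing how the new helpers combine in the test suite: DEVICE resolves to cpu when the Triton CPU backend is active, and skipIfCpu guards tests known to fail there, in the same way the test files below do.

import unittest

import torch

from helion._testing import DEVICE, TestCase, skipIfCpu


class ExampleTest(TestCase):
    @skipIfCpu("fails on Triton CPU backend")
    def test_add(self):
        # Runs on cuda/xpu runners; skipped when TRITON_CPU_BACKEND=1 selects the CPU path.
        x = torch.randn([16], device=DEVICE)
        torch.testing.assert_close(x + x, 2 * x)


if __name__ == "__main__":
    unittest.main()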
11 changes: 11 additions & 0 deletions test/test_autotuner.py
@@ -28,6 +28,7 @@
from helion._testing import RefEagerTestDisabled
from helion._testing import TestCase
from helion._testing import import_path
from helion._testing import skipIfCpu
from helion._testing import skipIfRocm
from helion.autotuner import DifferentialEvolutionSearch
from helion.autotuner import PatternSearch
@@ -316,6 +317,7 @@ def add(a, b):
)
torch.testing.assert_close(add(*args), sum(args))

@skipIfCpu("fails on Triton CPU backend")
def test_run_finite_search(self):
@helion.kernel(
configs=[
Expand Down Expand Up @@ -347,6 +349,7 @@ def add(a, b):
torch.testing.assert_close(add(*args), sum(args))

@skipIfRocm("too slow on rocm")
@skipIfCpu("TritonError: Error from Triton code")
def test_random_search(self):
args = (
torch.randn([512, 512], device=DEVICE),
Expand Down Expand Up @@ -436,6 +439,7 @@ def diff_count(flat):
]
self.assertEqual(sorted(pair_neighbors), sorted(expected))

@skipIfCpu("fails on Triton CPU backend")
def test_accuracy_check_filters_bad_config_wrong_output(self) -> None:
bad_config = helion.Config(block_sizes=[1], num_warps=8)
good_config = helion.Config(block_sizes=[1], num_warps=4)
Expand Down Expand Up @@ -509,6 +513,7 @@ def make_bad_config_produce_wrong_output(
run_mode("fork", expect_error=False)
run_mode("spawn", expect_error=True)

@skipIfCpu("fails on Triton CPU backend")
def test_accuracy_check_filters_bad_config_wrong_arg_mutation(self) -> None:
bad_config = helion.Config(block_sizes=[1], num_warps=8)
good_config = helion.Config(block_sizes=[1], num_warps=4)
Expand Down Expand Up @@ -591,6 +596,7 @@ def wrong_fn(*fn_args, **fn_kwargs):
run_mode("fork", expect_error=False)
run_mode("spawn", expect_error=True)

@skipIfCpu("fails on Triton CPU backend")
def test_autotune_baseline_fn(self) -> None:
"""Test that custom baseline function is used for accuracy checking."""
config1 = helion.Config(block_sizes=[32], num_warps=4)
Expand Down Expand Up @@ -631,6 +637,7 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
# Verify the result is correct
torch.testing.assert_close(result, args[0] + args[1])

@skipIfCpu("fails on Triton CPU backend")
def test_autotune_baseline_fn_filters_bad_config(self) -> None:
"""Test that custom baseline function correctly filters incorrect configs."""
bad_config = helion.Config(block_sizes=[1], num_warps=8)
Expand Down Expand Up @@ -729,6 +736,7 @@ def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
):
add(*args)

@skipIfCpu("fails on Triton CPU backend")
def test_max_generations(self):
"""Autotuner max generation respects explicit kwargs then setting override."""

@@ -772,6 +780,7 @@ def add(a, b):
result = add(*args)
torch.testing.assert_close(result, sum(args))

@skipIfCpu("fails on Triton CPU backend")
def test_autotune_effort_quick(self):
"""Test that quick effort profile uses correct default values."""
# Get the quick profile defaults
@@ -907,6 +916,7 @@ def add(a, b):
return search.samples[0]

@skipIfRocm("accuracy difference")
@skipIfCpu("fails on Triton CPU backend")
def test_autotune_random_seed_from_env_var(self) -> None:
# same env var value -> same random sample
with patch.dict(
@@ -931,6 +941,7 @@
self.assertNotEqual(first, second)

@skipIfRocm("accuracy difference")
@skipIfCpu("fails on Triton CPU backend")
def test_autotune_random_seed_from_settings(self) -> None:
# same autotune_random_seed setting -> same random sample
first = self._autotune_and_record(autotune_random_seed=4242)
4 changes: 4 additions & 0 deletions test/test_cache.py
@@ -15,6 +15,7 @@
from helion._testing import RefEagerTestDisabled
from helion._testing import TestCase
from helion._testing import import_path
from helion._testing import skipIfCpu
from helion._utils import counters
from helion.autotuner import StrictLocalAutotuneCache
from helion.autotuner.base_search import BaseSearch
@@ -73,6 +74,7 @@ def get_welford_kernel():

class TestCache(RefEagerTestDisabled, TestCase):
@parametrize("name", ("add", "matmul", "welford"))
@skipIfCpu("fails on Triton CPU backend")
def test_kernel(self, name):
kernel, args_a, result_a, args_b, result_b = KERNELS[name]()

@@ -105,6 +107,7 @@ def test_kernel(self, name):
self.assertEqual(counters["autotune"]["cache_hit"], 1)
self.assertEqual(counters["autotune"]["cache_put"], 2)

@skipIfCpu("fails on Triton CPU backend")
def test_key_affects_cache_specialization(self):
counters["autotune"].clear()
self.addCleanup(counters["autotune"].clear)
@@ -150,6 +153,7 @@ def add_one(x: torch.Tensor):
self.assertEqual(counters["autotune"]["cache_hit"], 1)
self.assertEqual(counters["autotune"]["cache_put"], 2)

@skipIfCpu("fails on Triton CPU backend")
def test_assert_cache_hit(self):
counters["autotune"].clear()
self.addCleanup(counters["autotune"].clear)
12 changes: 12 additions & 0 deletions test/test_dot.py
@@ -14,6 +14,7 @@
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import is_cuda
from helion._testing import skipIfCpu
from helion._testing import skipIfRefEager
from helion._testing import skipIfRocm
from helion._testing import skipIfXPU
@@ -293,6 +294,7 @@ def bmm(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:

@skipIfRefEager("Debug dtype codegen checks rely on compiled code")
@skipIfXPU("Failed on XPU - https://github.com/pytorch/helion/issues/772")
@skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
def test_baddbmm_pipeline_debug_dtype_asserts(self):
# Reproduces scripts/repro512.py within the test suite and asserts
# the kernel compiles and runs with debug dtype asserts enabled.
@@ -981,6 +983,16 @@ def test_matmul_reshape_n_2(self):
"float16 accumulator not supported for bf16/f32 in ref eager mode"
)(_test_func)

# CPU backend skip for specific failing dynamic-shape case
if test_name == "test_input_float16_acc_float16_dynamic_shape":
_test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(
_test_func
)
if test_name == "test_input_float16_acc_float16_static_shape":
_test_func = skipIfCpu("AssertionError: Tensor-likes are not close!")(
_test_func
)

setattr(TestDot, test_name, _test_func)


2 changes: 2 additions & 0 deletions test/test_errors.py
@@ -10,6 +10,7 @@
from helion._testing import RefEagerTestDisabled
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import skipIfCpu
from helion.autotuner.base_search import PopulationBasedSearch
from helion.autotuner.base_search import PopulationMember
from helion.autotuner.differential_evolution import DifferentialEvolutionSearch
@@ -33,6 +34,7 @@ def _test_outer_kernel_calling_inner(x: torch.Tensor) -> torch.Tensor:


class TestErrors(RefEagerTestDisabled, TestCase):
@skipIfCpu("fails on Triton CPU backend")
def test_autotune_no_valid_configs(self):
class FakeKernel:
def __init__(self) -> None:
2 changes: 2 additions & 0 deletions test/test_examples.py
@@ -16,6 +16,7 @@
from helion._testing import check_example
from helion._testing import import_path
from helion._testing import skipIfA10G
from helion._testing import skipIfCpu
from helion._testing import skipIfRefEager
from helion._testing import skipIfRocm
from helion._testing import skipIfXPU
@@ -24,6 +25,7 @@
torch.backends.cudnn.conv.fp32_precision = "tf32"


@skipIfCpu("needs to be debugged")
class TestExamples(RefEagerTestBase, TestCase):
def test_add(self):
args = (
6 changes: 6 additions & 0 deletions test/test_generate_ast.py
@@ -11,6 +11,7 @@
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import import_path
from helion._testing import skipIfCpu
from helion._testing import skipIfRefEager
import helion.language as hl

@@ -35,6 +36,7 @@ def test_add1d(self):
torch.testing.assert_close(result, args[0] + args[1])
self.assertExpectedJournal(code)

@skipIfCpu("fails on Triton CPU backend")
def test_add2d(self):
args = (
torch.randn([100, 500], device=DEVICE),
@@ -46,6 +48,7 @@ def test_add2d(self):
torch.testing.assert_close(result, args[0] + args[1])
self.assertExpectedJournal(code)

@skipIfCpu("fails on Triton CPU backend")
def test_add2d_loop_order(self):
args = (
torch.randn([100, 500], device=DEVICE),
@@ -61,6 +64,7 @@ def test_add2d_loop_order(self):
torch.testing.assert_close(result, args[0] + args[1])
self.assertExpectedJournal(code)

@skipIfCpu("fails on Triton CPU backend")
def test_add3d(self):
args = (
torch.randn([100, 500, 10], device=DEVICE),
@@ -83,6 +87,7 @@ def test_add3d_xy_grid(self):
torch.testing.assert_close(result, args[0] + args[1])
self.assertExpectedJournal(code)

@skipIfCpu("fails on Triton CPU backend")
def test_add3d_reorder(self):
args = (
torch.randn([100, 500, 10], device=DEVICE),
@@ -213,6 +218,7 @@ def test_final_cast_enforced_for_to_dtype(self):
# Ensure codegen emits a final tl.cast(..., tl.bfloat16)
assert "tl.cast" in code and "tl.bfloat16" in code

@skipIfCpu("Failed: Timeout (>10.0s) from pytest-timeout.")
def test_sigmoid_scalar_autocast(self):
@helion.kernel(
config=helion.Config(
2 changes: 2 additions & 0 deletions test/test_indexing.py
@@ -14,6 +14,7 @@
from helion._testing import RefEagerTestBase
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import skipIfCpu
from helion._testing import skipIfLowVRAM
from helion._testing import skipIfNormalMode
from helion._testing import skipIfRefEager
@@ -396,6 +397,7 @@ def test_block_size_access(x: torch.Tensor) -> torch.Tensor:
"IndexOffsetOutOfRangeForInt32 error is not raised in ref eager mode"
)
@skipIfLowVRAM("Test requires high VRAM")
@skipIfCpu("fails on Triton CPU backend")
def test_int32_offset_out_of_range_error(self):
repro_config = helion.Config(
block_sizes=[32, 32],
2 changes: 2 additions & 0 deletions test/test_inline_asm_elementwise.py
@@ -10,6 +10,7 @@
from helion._testing import RefEagerTestDisabled
from helion._testing import TestCase
from helion._testing import code_and_output
from helion._testing import skipIfCpu
from helion._testing import skipIfRocm
import helion.language as hl

@@ -221,6 +222,7 @@ def kernel_empty_args(x: torch.Tensor) -> torch.Tensor:
torch.testing.assert_close(result, expected)

@skipIfRocm("only works on cuda")
@skipIfCpu("RuntimeError: failed to translate module to LLVM IR")
def test_inline_asm_basic_compilation(self):
"""Test that inline_asm_elementwise compiles without errors (no CUDA requirement)"""
