Add ptxas_config autotuning option

jansel · jansel · commit f04abf5cf7d8 · 2025-10-02T20:42:10.000-07:00
stack-info: PR: #793, branch: jansel/stack/158
diff --git a/helion/_compat.py b/helion/_compat.py
@@ -88,3 +88,12 @@ def warps_to_threads(num_warps: int) -> int:
         )
         return num_warps * (props.warp_size or 32)
     return num_warps * 32
+
+
+def supports_ptxas(device: torch.device) -> bool:
+    """Return True if PTXAS controls are supported for the given device."""
+    if device.type != "cuda":
+        return False
+    if torch.version.hip is not None:
+        return False
+    return supports_tensor_descriptor()
diff --git a/helion/_compiler/compile_environment.py b/helion/_compiler/compile_environment.py
@@ -20,6 +20,7 @@
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 
 from .. import exc
+from .._compat import supports_ptxas
 from ..language.constexpr import ConstExpr
 from .loop_dependency_checker import LoopDependencyChecker
 from .source_location import SourceLocation
@@ -90,6 +91,7 @@ def __init__(self, device: torch.device, settings: Settings) -> None:
         self.block_sizes: list[BlockSizeInfo] = []
         self.debug_shape_renames: dict[sympy.Expr, sympy.Expr] = {}
         self.config_spec = ConfigSpec()
+        self.config_spec.ptxas_supported = supports_ptxas(device)
         self.kernel_tensor_sizes: dict[tuple[sympy.Expr, ...], int] = (
             collections.Counter()
         )
diff --git a/helion/_compiler/device_function.py b/helion/_compiler/device_function.py
@@ -573,6 +573,12 @@ def codegen_function_call(self) -> ast.AST:
                 f"num_stages={self.config.num_stages}",
             ]
         )
+        ptxas_config = self.config.ptxas_config
+        if ptxas_config:
+            from ..runtime.ptxas_configs import get_ptxas_option
+
+            ptx_option = get_ptxas_option(ptxas_config)
+            args.append(f"ptx_options={ptx_option!r}")
         pid = self.pid
         assert pid is not None
         # TODO(jansel): we should run CSE this statement
diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
@@ -50,6 +50,7 @@
         "num_stages",
         "pid_type",
         "indexing",
+        "ptxas_config",
     ]
 )
 VALID_PID_TYPES = ("flat", "xyz", "persistent_blocked", "persistent_interleaved")
@@ -97,6 +98,7 @@ class ConfigSpec:
         default_factory=functools.partial(tuple, VALID_PID_TYPES)
     )
     grid_block_ids: list[int] = dataclasses.field(default_factory=list)
+    ptxas_supported: bool = True
 
     @staticmethod
     def _valid_indexing_types() -> tuple[IndexingLiteral, ...]:
@@ -226,6 +228,11 @@ def normalize(self, config: helion.Config | dict[str, object]) -> None:
             else:
                 config[name] = values[0]
 
+        if self.ptxas_supported:
+            value = config.get("ptxas_config") or 0
+            if not isinstance(value, int):
+                raise InvalidConfig(f"ptxas_config must be integer, got {value!r}")
+
         # Set default values for grid indices when pid_type is not persistent
         pid_type = config["pid_type"]
         if pid_type in ("flat", "xyz") and self.grid_block_ids:
@@ -267,6 +274,10 @@ def flat_config(self, fn: Callable[[ConfigSpecFragment], object]) -> helion.Conf
             "indexing": fn(EnumFragment(self._valid_indexing_types())),
             "pid_type": fn(EnumFragment(self.allowed_pid_types)),
         }
+        if self.ptxas_supported:
+            from ..runtime.ptxas_configs import search_ptxas_configs
+
+            config["ptxas_config"] = fn(EnumFragment((0, *search_ptxas_configs())))
         # Add tunable parameters
         for key, fragment in self.user_defined_tunables.items():
             config[key] = fn(fragment)
diff --git a/helion/runtime/__init__.py b/helion/runtime/__init__.py
@@ -61,8 +61,15 @@ def default_launcher(
     *args: object,
     num_warps: int,
     num_stages: int,
+    ptx_options: str | None = None,
 ) -> object:
     """Default launcher function that executes the kernel immediately."""
-    return triton_kernel.run(
-        *args, grid=grid, warmup=False, num_warps=num_warps, num_stages=num_stages
-    )
+    run_kwargs = {
+        "grid": grid,
+        "warmup": False,
+        "num_warps": num_warps,
+        "num_stages": num_stages,
+    }
+    if ptx_options:
+        run_kwargs["ptx_options"] = ptx_options
+    return triton_kernel.run(*args, **run_kwargs)
diff --git a/helion/runtime/config.py b/helion/runtime/config.py
@@ -38,6 +38,7 @@ def __init__(
         num_stages: int | None = None,
         pid_type: PidTypeLiteral | None = None,
         indexing: IndexingLiteral | None = None,
+        ptxas_config: int | None = None,
         # For user-defined properties
         **kwargs: object,
     ) -> None:
@@ -78,6 +79,7 @@ def __init__(
             "num_stages": num_stages,
             "indexing": indexing,
             "pid_type": pid_type,
+            "ptxas_config": ptxas_config,
         }
         for key, value in core_props.items():
             if value is not None:
@@ -169,6 +171,10 @@ def pid_type(self) -> PidTypeLiteral:
     def range_unroll_factors(self) -> list[int]:
         return cast("list[int]", self.config.get("range_unroll_factors", []))
 
+    @property
+    def ptxas_config(self) -> int:
+        return cast("int", self.config.get("ptxas_config", 0))
+
     @property
     def range_warp_specializes(self) -> list[bool | None]:
         return cast("list[bool | None]", self.config.get("range_warp_specializes", []))
diff --git a/helion/runtime/ptxas_configs/__init__.py b/helion/runtime/ptxas_configs/__init__.py
@@ -0,0 +1,43 @@
+"""Utilities for working with packaged PTXAS control files."""
+
+from __future__ import annotations
+
+from functools import cache
+from pathlib import Path
+
+_CONFIG_FILES: dict[int, str] = {
+    1: "spiffy-bee-104.bin",
+}
+
+
+def _config_root() -> Path:
+    return Path(__file__).resolve().parent
+
+
+@cache
+def search_ptxas_configs() -> tuple[int, ...]:
+    """Return the sorted tuple of available PTXAS config IDs."""
+
+    return tuple(sorted(_CONFIG_FILES))
+
+
+def _config_file_path(config_id: int) -> str:
+    """Return the absolute path to the PTXAS control file for ``config_id``."""
+
+    try:
+        filename = _CONFIG_FILES[config_id]
+    except KeyError as exc:  # pragma: no cover - defensive
+        raise ValueError(f"Unknown PTXAS config id: {config_id}") from exc
+    resolved = (_config_root() / filename).resolve()
+    if not resolved.is_file():
+        raise FileNotFoundError(f"Missing PTXAS config file: {resolved}")
+    return str(resolved)
+
+
+@cache
+def get_ptxas_option(config_value: int) -> str | None:
+    """Translate a config enum value into a PTXAS option string."""
+
+    if config_value == 0:
+        return None
+    return f"--apply-controls {_config_file_path(config_value)}"
diff --git a/helion/runtime/ptxas_configs/spiffy-bee-104.bin b/helion/runtime/ptxas_configs/spiffy-bee-104.bin
diff --git a/pyproject.toml b/pyproject.toml
@@ -85,6 +85,7 @@ packages = ["helion"]
 include = [
   "helion/**/*.py",
   "helion/**/*.pyi",
+  "helion/runtime/ptxas_configs/*.bin",
   "LICENSE",
 ]
 exclude = [
diff --git a/test/test_ptxas_config.expected b/test/test_ptxas_config.expected
@@ -0,0 +1,27 @@
+This file is automatically generated by assertExpectedJournal calls in test_ptxas_config.py.
+Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.
+
+--- assertExpectedJournal(TestPtxasConfig.test_ptxas_config_apply_controls_flag)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion__copy_kernel(x_flat, out_flat, x_size_0, out_flat_stride_0, x_flat_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    load = tl.load(x_flat + indices_0 * x_flat_stride_0, mask_0, other=0)
+    tl.store(out_flat + indices_0 * out_flat_stride_0, load, mask_0)
+
+def _copy_kernel(x: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    x_flat = x.view(-1)
+    out_flat = out.view(-1)
+    _BLOCK_SIZE_0 = 32
+    _launcher(_helion__copy_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x_flat, out_flat, x.size(0), out_flat.stride(0), x_flat.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3, ptx_options='--apply-controls <path>')
+    return out
diff --git a/test/test_ptxas_config.py b/test/test_ptxas_config.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import unittest
+
+import pytest
+import torch
+
+import helion
+from helion._compat import supports_ptxas
+from helion._testing import DEVICE
+from helion._testing import TestCase
+from helion._testing import code_and_output
+from helion.exc import InvalidConfig
+import helion.language as hl
+from helion.runtime.ptxas_configs import _config_file_path
+
+
+@helion.kernel()
+def _copy_kernel(x: torch.Tensor) -> torch.Tensor:
+    out = torch.empty_like(x)
+    x_flat = x.view(-1)
+    out_flat = out.view(-1)
+    for tile in hl.tile(x_flat.numel()):
+        out_flat[tile] = x_flat[tile]
+    return out
+
+
+class TestPtxasConfig(TestCase):
+    @unittest.skipUnless(
+        supports_ptxas(DEVICE), "PTXAS controls are only available on NVIDIA GPUs"
+    )
+    def test_ptxas_config_apply_controls_flag(self) -> None:
+        x = torch.randn(128, device=DEVICE)
+        code, result = code_and_output(
+            _copy_kernel, (x,), ptxas_config=1, block_size=32
+        )
+        torch.testing.assert_close(result, x)
+
+        option = f"--apply-controls {_config_file_path(1)}"
+        self.assertIn(option, code)
+
+        self.assertExpectedJournal(code.replace(_config_file_path(1), "<path>"))
+
+    def test_ptxas_config_invalid_value(self) -> None:
+        x = torch.randn(2, device=DEVICE)
+        bound = _copy_kernel.bind((x,))
+        base = bound.config_spec.default_config()
+
+        options = base.config.copy()
+        options["ptxas_config"] = "a"
+        flagged = helion.Config(**options)
+
+        with pytest.raises(InvalidConfig):
+            bound.config_spec.normalize(flagged)

Original file line number	Diff line number	Diff line change
`@@ -573,6 +573,12 @@ def codegen_function_call(self) -> ast.AST:`
`573`	`573`	`f"num_stages={self.config.num_stages}",`
`574`	`574`	`]`
`575`	`575`	`)`
	`576`	`+ ptxas_config = self.config.ptxas_config`
	`577`	`+ if ptxas_config:`
	`578`	`+ from ..runtime.ptxas_configs import get_ptxas_option`
	`579`	`+`
	`580`	`+ ptx_option = get_ptxas_option(ptxas_config)`
	`581`	`+ args.append(f"ptx_options={ptx_option!r}")`
`576`	`582`	`pid = self.pid`
`577`	`583`	`assert pid is not None`
`578`	`584`	`# TODO(jansel): we should run CSE this statement`
Original file line number	Diff line number	Diff line change
`@@ -85,6 +85,7 @@ packages = ["helion"]`
`85`	`85`	`include = [`
`86`	`86`	`"helion/**/*.py",`
`87`	`87`	`"helion/**/*.pyi",`
	`88`	`+ "helion/runtime/ptxas_configs/*.bin",`
`88`	`89`	`"LICENSE",`
`89`	`90`	`]`
`90`	`91`	`exclude = [`