pytorch
diff --git a/‎examples/matmul.py‎
Lines changed: 1 addition & 0 deletions b/‎examples/matmul.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎helion/_compiler/compile_environment.py‎
Lines changed: 1 addition & 0 deletions b/‎helion/_compiler/compile_environment.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎helion/_compiler/device_function.py‎
Lines changed: 65 additions & 22 deletions b/‎helion/_compiler/device_function.py‎
Lines changed: 65 additions & 22 deletions
diff --git a/‎helion/_compiler/device_ir.py‎
Lines changed: 42 additions & 14 deletions b/‎helion/_compiler/device_ir.py‎
Lines changed: 42 additions & 14 deletions
@@ -34,6 +34,7 @@
         "range_unroll_factors": [0, 0],
         "range_num_stages": [0, 0],
     },
+    allow_epilogue_subtiling=True,
 )
 def matmul(
     x: Tensor,
 
@@ -99,6 +99,7 @@ def __init__(self, device: torch.device, settings: Settings) -> None:
         self.device_load_count = (
             0  # Track number of loads in all device code for eviction policy tuning
         )
+        self.device_store_count = 0  # Track number of stores for subtiling
 
     def add_kernel_tensor_size(self, sizes: Sequence[int | torch.SymInt]) -> None:
         from .device_function import contains_only_block_size_symbols
 
@@ -250,6 +250,9 @@ def __init__(self, name: str, config: Config, codegen: GenerateAST) -> None:
         self.rng_seed_count = 0
         self.device_load_index = 0  # Track which load in device code we're generating (for eviction policy tuning)
         # Name of the RNG seed buffer parameter in kernel signature
+        self.device_store_index = (
+            0  # Track which store in device code we're generating (for subtiling)
+        )
         self.rng_seed_buffer_param_name = None
 
     def has_rng_ops(self) -> bool:
@@ -421,8 +424,9 @@ def tensor_descriptor_arg(
         self, fake_value: torch.Tensor, block_size: list[int | torch.SymInt]
     ) -> TensorDescriptorArg:
         host_function = HostFunction.current()
-        block_size_expr = ", ".join(map(self.literal_expr, block_size))
+        block_size_expr = ", ".join(self.literal_expr(dim) for dim in block_size)
         key = (fake_value, block_size_expr)
+
         if key not in self._tensor_descriptor_args:
             origin = host_function.tensor_to_origin[fake_value]
             desc_name = self.new_var(origin.suggest_var_name() + "_desc")
@@ -515,22 +519,6 @@ def _format_constexpr_value(self, value: object) -> str:
         if isinstance(value, (torch.SymInt, torch.SymFloat, torch.SymBool)):
             value = value._sympy_()
 
-        # Handle sympy expressions (sanitize by replacing triton_helpers functions)
-        if isinstance(value, sympy.Expr):
-            sanitized = value.replace(  # pyright: ignore[reportAttributeAccessIssue]
-                lambda node: isinstance(node, sympy.Function)
-                and getattr(node.func, "__name__", "")
-                == "triton_helpers.div_floor_integer",
-                lambda node: sympy.floor(node.args[0] / node.args[1]),  # pyright: ignore[reportAttributeAccessIssue]
-            ).replace(  # pyright: ignore[reportAttributeAccessIssue]
-                lambda node: isinstance(node, sympy.Function)
-                and getattr(node.func, "__name__", "")
-                == "triton_helpers.remainder_integer",
-                lambda node: sympy.Mod(node.args[0], node.args[1]),  # pyright: ignore[reportAttributeAccessIssue]
-            )
-            expr = cast("sympy.Expr", sanitized)
-            return HostFunction.current().sympy_expr(expr)
-
         return HostFunction.current().literal_expr(value)
 
     def _tensor_property(
@@ -708,11 +696,19 @@ def current() -> DeviceFunction:
 
 
 class HelionTritonPrinter(TritonPrinter):
-    """Custom Triton printer that avoids wrapping float literals in tl.full().
-
-    Inductor's default TritonPrinter prints SymPy Float as a 0-D Triton value
-    via tl.full([], <val>, tl.float64). We override this to emit the raw numeric
-    literal, letting downstream type promotion and casts handle dtype.
+    """Custom Triton printer that does the following:
+
+    - Avoids wrapping float literals in tl.full().
+      Inductor's default TritonPrinter prints SymPy Float as a 0-D Triton value
+      via tl.full([], <val>, tl.float64). We override this to emit the raw numeric
+      literal, letting downstream type promotion and casts handle dtype.
+
+    - Avoids triton_helpers.div_floor_integer(...) calls when both operands are
+      provably non-negative integers. TritonPrinter by default converts
+      floor(u1/2) to triton_helpers.div_floor_integer(...). We override this to
+      emit u1 // 2 only when the numerator is known to be non-negative and the
+      denominator is a positive integer, so that we keep helper calls for cases
+      that rely on floor semantics with mixed signs.
     """
 
     def _print_Float(self, expr: sympy.Expr) -> str:
@@ -721,6 +717,53 @@ def _print_Float(self, expr: sympy.Expr) -> str:
     def _print_ToFloat(self, expr: sympy.Expr) -> str:
         return f"{expr} + 0.0"
 
+    def _is_nonnegative(self, expr: sympy.Expr) -> bool:
+        if expr.is_nonnegative is True or expr.is_zero is True:
+            return True
+        if expr.is_positive is True:
+            return True
+        try:
+            host_fn = HostFunction.current()
+        except NoCurrentFunction:
+            host_fn = None
+        if host_fn is not None:
+            origin_info = host_fn.expr_to_origin.get(expr)
+            if origin_info and isinstance(
+                origin_info.origin, (BlockSizeOrigin, TensorSizeOrigin)
+            ):
+                return True
+        if isinstance(expr, sympy.Symbol) and expr.name.startswith("_BLOCK_SIZE_"):
+            return True
+        if isinstance(expr, sympy.Number):
+            return bool(expr >= 0)
+        return False
+
+    def _format_trunc_div(self, lhs: sympy.Expr, rhs: sympy.Expr) -> str:
+        lhs_str = self._print(lhs)
+        rhs_str = self._print(rhs)
+        if not (lhs.is_Integer or lhs.is_Symbol):
+            lhs_str = f"({lhs_str})"
+        if not (rhs.is_Integer or rhs.is_Symbol):
+            rhs_str = f"({rhs_str})"
+        return f"{lhs_str} // {rhs_str}"
+
+    def _print_floor(self, expr: sympy.Expr) -> str:
+        inner = expr.args[0]
+        numer, denom = inner.as_numer_denom()
+        if (
+            isinstance(denom, sympy.Integer)
+            and denom > 1
+            and self._is_nonnegative(numer)
+        ):
+            return self._format_trunc_div(numer, denom)
+        return super()._print_floor(expr)
+
+    def _print_FloorDiv(self, expr: sympy.Expr) -> str:
+        lhs, rhs = expr.args
+        if isinstance(rhs, sympy.Integer) and rhs > 0 and self._is_nonnegative(lhs):
+            return self._format_trunc_div(lhs, rhs)
+        return super()._print_FloorDiv(expr)
+
 
 def texpr(expr: sympy.Expr) -> str:
     """Render ``expr`` to a string with a fresh ``HelionTritonPrinter``."""
     printer = HelionTritonPrinter()
     return printer.doprint(expr)
@@ -1076,7 +1076,7 @@ def visit_For(self, node: ast.For) -> None:
             self.generic_visit(node)
 
 
-def _count_device_loads(device_ir: DeviceIR) -> int:
+def _count_device_loads_and_stores(device_ir: DeviceIR) -> int:
     """Count the number of load operations in all device code for eviction policy tuning."""
     from ..language import memory_ops
 
@@ -1087,26 +1087,29 @@ def _count_device_loads(device_ir: DeviceIR) -> int:
         if info.new_graph_id is not None
     }
 
-    load_count = 0
+    load_count, store_count = 0, 0
     # Walk all graphs except rolled duplicates
     for graph_info in device_ir.graphs:
         if graph_info.graph_id in rolled_graph_ids:
             continue
 
         for node in graph_info.graph.nodes:
             # Check if this is a load operation
-            if node.op == "call_function" and node.target is memory_ops.load:
-                # Only count loads without explicit eviction policy
-                # (user can still specify eviction_policy to override tuning)
-                # Check kwargs first, then check if 4th arg (eviction_policy) is None
-                eviction_policy_arg = node.kwargs.get("eviction_policy")
-                if eviction_policy_arg is None:
-                    # Check if eviction_policy was passed as positional arg (index 3)
-                    if len(node.args) >= 4:
-                        eviction_policy_arg = node.args[3]
+            if node.op == "call_function":
+                if node.target is memory_ops.load:
+                    # Only count loads without explicit eviction policy
+                    # (user can still specify eviction_policy to override tuning)
+                    # Check kwargs first, then check if 4th arg (eviction_policy) is None
+                    eviction_policy_arg = node.kwargs.get("eviction_policy")
                     if eviction_policy_arg is None:
-                        load_count += 1
-    return load_count
+                        # Check if eviction_policy was passed as positional arg (index 3)
+                        if len(node.args) >= 4:
+                            eviction_policy_arg = node.args[3]
+                        if eviction_policy_arg is None:
+                            load_count += 1
+                elif node.target is memory_ops.store:
+                    store_count += 1
+    return load_count, store_count
 
 
 def _register_eviction_policy_tunable(load_count: int) -> None:
@@ -1125,6 +1128,24 @@ def _register_eviction_policy_tunable(load_count: int) -> None:
     env.device_load_count = load_count
 
 
+def _register_epilogue_subtile_tunable(store_count: int) -> None:
+    """Register the epilogue subtile tunable for all device stores."""
+    if store_count == 0:
+        return
+
+    from ..autotuner.config_fragment import EnumFragment
+    from ..autotuner.config_fragment import ListOf
+    from ..autotuner.config_spec import VALID_EPILOGUE_SUBTILE_SIZES
+
+    env = CompileEnvironment.current()
+    # Register a tunable for epilogue subtile for all device stores
+    fragment = ListOf(
+        EnumFragment(choices=VALID_EPILOGUE_SUBTILE_SIZES), length=store_count
+    )
+    env.config_spec.epilogue_subtiling = fragment
+    env.device_store_count = store_count
+
+
 def lower_to_device_ir(func: HostFunction) -> DeviceIR:
     device_ir = DeviceIR()
     with func, device_ir, compile_lock:
@@ -1148,9 +1169,16 @@ def lower_to_device_ir(func: HostFunction) -> DeviceIR:
             CompileEnvironment.current().config_spec.disallow_pid_type("xyz")
 
         # Count all device loads and register eviction policy tunable
-        load_count = _count_device_loads(device_ir)
+        load_count, store_count = _count_device_loads_and_stores(device_ir)
         _register_eviction_policy_tunable(load_count)
 
+        # Epilogue subtiling only for Blackwell
+        if (
+            torch.cuda.get_device_capability() >= (10, 0)
+            and CompileEnvironment.current().settings.allow_epilogue_subtiling
+        ):
+            _register_epilogue_subtile_tunable(store_count)
+
         return device_ir
Original file line number	Diff line number	Diff line change
`@@ -34,6 +34,7 @@`
`34`	`34`	`"range_unroll_factors": [0, 0],`
`35`	`35`	`"range_num_stages": [0, 0],`
`36`	`36`	`},`
	`37`	`+ allow_epilogue_subtiling=True,`
`37`	`38`	`)`
`38`	`39`	`def matmul(`
`39`	`40`	`x: Tensor,`
Original file line number	Diff line number	Diff line change
`@@ -99,6 +99,7 @@ def __init__(self, device: torch.device, settings: Settings) -> None:`
`99`	`99`	`self.device_load_count = (`
`100`	`100`	`0 # Track number of loads in all device code for eviction policy tuning`
`101`	`101`	`)`
	`102`	`+ self.device_store_count = 0 # Track number of stores for subtiling`
`102`	`103`
`103`	`104`	`def add_kernel_tensor_size(self, sizes: Sequence[int \| torch.SymInt]) -> None:`
`104`	`105`	`from .device_function import contains_only_block_size_symbols`