
Commit 3ae89e1

Add epilogue subtiling
stack-info: PR: #948, branch: PaulZhang12/stack/14
1 parent 2cb6d17 commit 3ae89e1

File tree

14 files changed: +925 -92 lines

examples/matmul.py

Lines changed: 2 additions & 0 deletions

@@ -34,6 +34,8 @@
         "range_unroll_factors": [0, 0],
         "range_num_stages": [0, 0],
     },
+    allow_epilogue_subtiling=True,
+    autotune_effort="quick",
 )
 def matmul(
     x: Tensor,
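
For orientation, this is how the new settings read at the call site; a minimal sketch assuming the example's decorator is @helion.kernel and eliding the existing config entries shown above:

    import torch
    import helion

    @helion.kernel(
        # ... existing config (block sizes, unroll factors, num stages) elided ...
        allow_epilogue_subtiling=True,  # opt this kernel into the epilogue-subtile tunable
        autotune_effort="quick",        # keep autotuning short for the example
    )
    def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        ...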

helion/_compiler/device_function.py

Lines changed: 62 additions & 22 deletions

@@ -462,8 +462,9 @@ def tensor_descriptor_arg(
         self, fake_value: torch.Tensor, block_size: list[int | torch.SymInt]
     ) -> TensorDescriptorArg:
         host_function = HostFunction.current()
-        block_size_expr = ", ".join(map(self.literal_expr, block_size))
+        block_size_expr = ", ".join(self.literal_expr(dim) for dim in block_size)
         key = (fake_value, block_size_expr)
+
         if key not in self._tensor_descriptor_args:
             origin = host_function.tensor_to_origin[fake_value]
             desc_name = self.new_var(origin.suggest_var_name() + "_desc")
@@ -556,22 +557,6 @@ def _format_constexpr_value(self, value: object) -> str:
         if isinstance(value, (torch.SymInt, torch.SymFloat, torch.SymBool)):
             value = value._sympy_()
 
-        # Handle sympy expressions (sanitize by replacing triton_helpers functions)
-        if isinstance(value, sympy.Expr):
-            sanitized = value.replace(  # pyright: ignore[reportAttributeAccessIssue]
-                lambda node: isinstance(node, sympy.Function)
-                and getattr(node.func, "__name__", "")
-                == "triton_helpers.div_floor_integer",
-                lambda node: sympy.floor(node.args[0] / node.args[1]),  # pyright: ignore[reportAttributeAccessIssue]
-            ).replace(  # pyright: ignore[reportAttributeAccessIssue]
-                lambda node: isinstance(node, sympy.Function)
-                and getattr(node.func, "__name__", "")
-                == "triton_helpers.remainder_integer",
-                lambda node: sympy.Mod(node.args[0], node.args[1]),  # pyright: ignore[reportAttributeAccessIssue]
-            )
-            expr = cast("sympy.Expr", sanitized)
-            return HostFunction.current().sympy_expr(expr)
-
         return HostFunction.current().literal_expr(value)
 
     def _tensor_property(
@@ -749,11 +734,19 @@ def current() -> DeviceFunction:
 
 
 class HelionTritonPrinter(TritonPrinter):
-    """Custom Triton printer that avoids wrapping float literals in tl.full().
-
-    Inductor's default TritonPrinter prints SymPy Float as a 0-D Triton value
-    via tl.full([], <val>, tl.float64). We override this to emit the raw numeric
-    literal, letting downstream type promotion and casts handle dtype.
+    """Custom Triton printer that does the following:
+
+    - Avoids wrapping float literals in tl.full().
+      Inductor's default TritonPrinter prints SymPy Float as a 0-D Triton value
+      via tl.full([], <val>, tl.float64). We override this to emit the raw numeric
+      literal, letting downstream type promotion and casts handle dtype.
+
+    - Avoids triton_helpers.div_floor_integer(...) calls when both operands are
+      provably non-negative integers. TritonPrinter by default converts
+      floor(u1/2) to triton_helpers.div_floor_integer(...). We override this to
+      emit u1 // 2 only when the numerator is known to be non-negative and the
+      denominator is a positive integer, so that we keep helper calls for cases
+      that rely on floor semantics with mixed signs.
     """
 
     def _print_Float(self, expr: sympy.Expr) -> str:
@@ -762,6 +755,53 @@ def _print_Float(self, expr: sympy.Expr) -> str:
     def _print_ToFloat(self, expr: sympy.Expr) -> str:
         return f"{expr} + 0.0"
 
+    def _is_nonnegative(self, expr: sympy.Expr) -> bool:
+        if expr.is_nonnegative is True or expr.is_zero is True:
+            return True
+        if expr.is_positive is True:
+            return True
+        try:
+            host_fn = HostFunction.current()
+        except NoCurrentFunction:
+            host_fn = None
+        if host_fn is not None:
+            origin_info = host_fn.expr_to_origin.get(expr)
+            if origin_info and isinstance(
+                origin_info.origin, (BlockSizeOrigin, TensorSizeOrigin)
+            ):
+                return True
+        if isinstance(expr, sympy.Symbol) and expr.name.startswith("_BLOCK_SIZE_"):
+            return True
+        if isinstance(expr, sympy.Number):
+            return bool(expr >= 0)
+        return False
+
+    def _format_trunc_div(self, lhs: sympy.Expr, rhs: sympy.Expr) -> str:
+        lhs_str = self._print(lhs)
+        rhs_str = self._print(rhs)
+        if not (lhs.is_Integer or lhs.is_Symbol):
+            lhs_str = f"({lhs_str})"
+        if not (rhs.is_Integer or rhs.is_Symbol):
+            rhs_str = f"({rhs_str})"
+        return f"{lhs_str} // {rhs_str}"
+
+    def _print_floor(self, expr: sympy.Expr) -> str:
+        inner = expr.args[0]
+        numer, denom = inner.as_numer_denom()
+        if (
+            isinstance(denom, sympy.Integer)
+            and denom > 1
+            and self._is_nonnegative(numer)
+        ):
+            return self._format_trunc_div(numer, denom)
+        return super()._print_floor(expr)
+
+    def _print_FloorDiv(self, expr: sympy.Expr) -> str:
+        lhs, rhs = expr.args
+        if isinstance(rhs, sympy.Integer) and rhs > 0 and self._is_nonnegative(lhs):
+            return self._format_trunc_div(lhs, rhs)
+        return super()._print_FloorDiv(expr)
+
 
 def texpr(expr: sympy.Expr) -> str:
     return HelionTritonPrinter().doprint(expr)
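
To make the new printing rule concrete, here is a small standalone sketch (plain sympy, not Helion's actual printer class) of the decision _print_floor now makes: emit a truncating // only when the numerator is provably non-negative and the denominator is a positive integer constant, and otherwise keep the triton_helpers fallback:

    import sympy

    def print_floor_sketch(expr: sympy.Expr) -> str:
        # Mirrors the override: floor(numer / denom) -> "numer // denom" only in the safe case.
        inner = expr.args[0]
        numer, denom = inner.as_numer_denom()
        if isinstance(denom, sympy.Integer) and denom > 1 and numer.is_nonnegative:
            return f"{numer} // {denom}"
        return f"triton_helpers.div_floor_integer({numer}, {denom})"

    u1 = sympy.Symbol("u1", integer=True, nonnegative=True)
    s0 = sympy.Symbol("s0", integer=True)  # sign unknown
    print(print_floor_sketch(sympy.floor(u1 / 2)))  # -> u1 // 2
    print(print_floor_sketch(sympy.floor(s0 / 2)))  # -> triton_helpers.div_floor_integer(s0, 2)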

helion/_compiler/device_ir.py

Lines changed: 22 additions & 0 deletions

@@ -63,6 +63,7 @@
 from .type_propagation import _eval_binary
 from .type_propagation import _eval_compare
 from .type_propagation import _eval_unary
+from .utils import _allow_epilogue_subtiling
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -1161,6 +1162,23 @@ def _register_load_store_tunables(
     )
 
 
+def _register_epilogue_subtile_tunable(store_count: int) -> None:
+    """Register the epilogue subtile tunable for all device stores."""
+    if store_count == 0:
+        return
+
+    from ..autotuner.config_fragment import EnumFragment
+    from ..autotuner.config_fragment import ListOf
+    from ..autotuner.config_spec import VALID_EPILOGUE_SUBTILE_SIZES
+
+    env = CompileEnvironment.current()
+    # Register a tunable for epilogue subtile for all device stores
+    fragment = ListOf(
+        EnumFragment(choices=VALID_EPILOGUE_SUBTILE_SIZES), length=store_count
+    )
+    env.config_spec.epilogue_subtiling = fragment
+
+
 def lower_to_device_ir(func: HostFunction) -> DeviceIR:
     device_ir = DeviceIR()
     with func, device_ir, compile_lock:
@@ -1191,6 +1209,10 @@ def lower_to_device_ir(func: HostFunction) -> DeviceIR:
             total_load_count, loads_without_eviction_policy, store_count
         )
 
+        # Epilogue subtiling only for Blackwell
+        if _allow_epilogue_subtiling():
+            _register_epilogue_subtile_tunable(store_count)
+
     return device_ir
 
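
For intuition, the fragment registered above yields one independent choice per device store, so a tuned config carries a per-store list. A hypothetical illustration (the concrete contents of VALID_EPILOGUE_SUBTILE_SIZES are an assumption here; the store codegen below treats 0 as "no subtiling" and currently only splits by 2):

    # Hypothetical tuned values for a kernel with two device stores:
    # split the first store's BLOCK_N tile in half, leave the second store alone.
    epilogue_subtiling = [2, 0]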

helion/_compiler/indexing_strategy.py

Lines changed: 168 additions & 2 deletions

@@ -15,10 +15,12 @@
 from .. import exc
 from .._compat import get_tensor_descriptor_fn_name
 from .ast_extension import expr_from_string
+from .ast_extension import statement_from_string
 from .compile_environment import CompileEnvironment
 from .device_function import DeviceFunction
 from .host_function import HostFunction
 from .tile_strategy import DeviceLoopState
+from .utils import _allow_epilogue_subtiling
 from .utils import compute_slice_size
 from .variable_origin import BlockSizeOrigin
 
@@ -352,7 +354,6 @@ def codegen_load(
         )
         assert extra_mask is None
         indexing = BlockedSubscriptIndexing.create(state, fake_tensor, subscript)
-
         # Load from tensor descriptor with permuted offsets
         load_expr = expr_from_string(
             f"{indexing.tensor_descriptor(state)}.load({indexing.offsets_str_permuted(state)})"
@@ -382,23 +383,188 @@ def codegen_store(
         )
         assert extra_mask is None
         indexing = BlockedSubscriptIndexing.create(state, fake_tensor, subscript)
+        store_value = indexing.reshape_store(state, value)
 
+        config = DeviceFunction.current().config
+        epilogue_subtiles = state.config.epilogue_subtiling
         # Apply permutation to the value being stored if needed
         desc_arg = indexing.tensor_descriptor_arg(state)
-        store_value = indexing.reshape_store(state, value)
 
         if desc_arg.permutation is not None:
             # Apply permutation to the value
             store_value = expr_from_string(
                 f"tl.permute({{store_val}}, {desc_arg.permutation!r})",
                 store_val=store_value,
             )
+
+        if _allow_epilogue_subtiling() and (
+            idx := state.device_function.device_store_index
+        ) <= len(epilogue_subtiles):
+            subtile_split = epilogue_subtiles[idx - 1]
+
+            subtile_codegen = self._codegen_epilogue_subtile_store(
+                state,
+                fake_tensor,
+                indexing,
+                store_value,
+                subtile_split,
+                config,
+            )
+            if subtile_codegen is not None:
+                return subtile_codegen
+
+            if "pointwise_in" in state.fx_node.meta:
+                # We still need to codegen pointwise if subtile_codegen is None
+                store_value = self._apply_pointwise_to_subtile(
+                    state, state.fx_node.meta["pointwise_in"], store_value
+                )
 
         return expr_from_string(
             f"{indexing.tensor_descriptor(state)}.store({indexing.offsets_str_permuted(state)}, {{value}})",
             value=store_value,
         )
 
+    def _apply_pointwise_to_subtile(
+        self, state: CodegenState, pointwise_node: torch.fx.Node, subtile_value: ast.AST
+    ) -> ast.AST:
+        """Apply a pointwise operation to a subtile value.
+
+        Args:
+            state: The codegen state
+            pointwise_node: The FX node representing the pointwise operation
+            subtile_value: The AST for the subtile value to apply the operation to
+
+        Returns:
+            AST for the result after applying the pointwise operation
+        """
+        from torch._inductor import ir
+
+        from .inductor_lowering import PointwiseLowering
+        from .inductor_lowering import install_inductor_kernel_handlers
+
+        lowering = pointwise_node.meta["lowering"]
+        assert isinstance(lowering, PointwiseLowering)
+
+        # Get the pointwise buffer
+        buffer = lowering.buffer
+        assert isinstance(buffer.data, ir.Pointwise)
+
+        # Create a temporary variable for the subtile
+        codegen = state.codegen
+        subtile_var = codegen.lift(subtile_value, prefix="subtile")
+
+        # Set up the inductor kernel handlers with the subtile as input
+        with install_inductor_kernel_handlers(
+            codegen, {lowering.input_names[0]: subtile_var}
+        ):
+            # Generate the pointwise operation
+            indices = [sympy.Symbol(f"i{n}") for n in range(len(buffer.data.ranges))]
+            from .inductor_lowering import _unpack_opsvalue
+
+            result_name = _unpack_opsvalue(buffer.data.inner_fn(indices))
+            return expr_from_string(result_name)
+
+    def _codegen_epilogue_subtile_store(
+        self,
+        state: CodegenState,
+        fake_tensor: torch.Tensor,
+        indexing: BlockedSubscriptIndexing,
+        store_value: ast.AST,
+        subtile_split: int,
+        config: Config,
+    ) -> ast.AST | None:
+        env = CompileEnvironment.current()
+        block_m, block_n = indexing.block_shape
+        block_n_hint = env.size_hint(block_n)
+        block_idx = env.get_block_id(block_n)
+        block_size = env.block_sizes[block_idx].from_config(config)
+
+        if "pointwise_in" in state.fx_node.meta:
+            fused_pointwise_node = state.fx_node.meta["pointwise_in"]
+            assert fused_pointwise_node == state.fx_node.args[2]
+        else:
+            fused_pointwise_node = None
+
+        # Currently support 2D tiles without permutations
+        if (
+            len(indexing.block_shape) != 2
+            or len(indexing.offsets) != 2
+            or subtile_split == 0
+            or block_n_hint % 2 != 0
+            or block_size <= 16
+        ):
+            return None
+
+        device_fn = state.device_function
+        codegen = state.codegen
+
+        block_m_str = device_fn.literal_expr(block_m)
+        block_n_str = device_fn.literal_expr(block_n)
+        indexing.block_shape[1] //= subtile_split
+
+        # TODO(PaulZhang12): Support more epilogue subtile configs besides 2
+        block_n_half_str = f"({block_n_str} // {subtile_split})"
+
+        # Lift the store value into a temporary variable for reuse
+        acc_var = codegen.lift(store_value, prefix="acc")
+
+        reshape_expr = expr_from_string(
+            "tl.reshape({acc}, [{dim_m}, 2, {dim_half}]).permute(0, 2, 1)",
+            acc=acc_var,
+            dim_m=expr_from_string(block_m_str),
+            dim_half=expr_from_string(block_n_half_str),
+        )
+        reshape_var = codegen.lift(reshape_expr, prefix="acc")
+
+        acc0_name = codegen.tmpvar(prefix="acc")
+        acc1_name = codegen.tmpvar(prefix="acc")
+        codegen.add_statement(
+            statement_from_string(
+                f"{acc0_name}, {acc1_name} = tl.split({{acc}})",
+                acc=reshape_var,
+            )
+        )
+
+        # Now apply the pointwise operation per-subtile if we have one
+        if fused_pointwise_node is not None:
+            acc0 = self._apply_pointwise_to_subtile(
+                state, fused_pointwise_node, expr_from_string(acc0_name)
+            )
+            acc1 = self._apply_pointwise_to_subtile(
+                state, fused_pointwise_node, expr_from_string(acc1_name)
+            )
        else:
+            acc0 = expr_from_string(acc0_name)
+            acc1 = expr_from_string(acc1_name)
+
+        desc_name = indexing.tensor_descriptor(state)
+        offset0 = expr_from_string(indexing.offsets[0])
+        offset1 = expr_from_string(indexing.offsets[1])
+
+        # First subtile store
+        codegen.add_statement(
+            statement_from_string(
+                f"{desc_name}.store([{{off0}}, {{off1}}], {{value}})",
+                off0=offset0,
+                off1=offset1,
+                value=acc0,
+            )
+        )
+
+        offset1_shifted = expr_from_string(
+            "({offset} + {half})",
+            offset=expr_from_string(indexing.offsets[1]),
+            half=expr_from_string(block_n_half_str),
+        )
+
+        # Emit second subtile store as the expression returned to the caller
+        return expr_from_string(
+            f"{desc_name}.store([{{off0}}, {{off1}}], {{value}})",
+            off0=offset0,
+            off1=offset1_shifted,
+            value=acc1,
+        )
+
 
 class StackIndexingStrategy:
     """
