Add epilogue subtiling

PaulZhang12 · PaulZhang12 · commit 2bc36d0bd9fd · 2025-10-15T18:53:11.000-07:00
stack-info: PR: #948, branch: PaulZhang12/stack/14
diff --git a/examples/matmul.py b/examples/matmul.py
@@ -44,6 +44,7 @@ def matmul(
     Returns:
         Tensor: Resulting matrix of shape [m, n].
     """
+
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f"size mismatch {k} != {k2}"
diff --git a/helion/_compiler/device_function.py b/helion/_compiler/device_function.py
@@ -420,9 +420,20 @@ def tensor_arg(
     def tensor_descriptor_arg(
         self, fake_value: torch.Tensor, block_size: list[int | torch.SymInt]
     ) -> TensorDescriptorArg:
+        import re
+
         host_function = HostFunction.current()
         block_size_expr = ", ".join(map(self.literal_expr, block_size))
+        # Rewrite `triton_helpers.div_floor_integer(<expr>, <int>)` in the
+        # rendered block sizes to `(<expr>) // <int>`.  The dividend is
+        # parenthesized so an operator inside it (e.g. `a + b`) keeps its
+        # grouping once the call syntax is gone.
+        # NOTE(review): `[^,]+` stops at the first comma, so a dividend that
+        # itself contains a comma is not rewritten correctly.
+        pattern = r"triton_helpers\.div_floor_integer\(([^,]+),\s*(\d+)\)"
+        replacement = r"(\1) // \2"
+        block_size_expr = re.sub(pattern, replacement, block_size_expr)
         key = (fake_value, block_size_expr)
         if key not in self._tensor_descriptor_args:
             origin = host_function.tensor_to_origin[fake_value]
             desc_name = self.new_var(origin.suggest_var_name() + "_desc")
diff --git a/helion/_compiler/indexing_strategy.py b/helion/_compiler/indexing_strategy.py
@@ -15,6 +15,7 @@
 from .. import exc
 from .._compat import get_tensor_descriptor_fn_name
 from .ast_extension import expr_from_string
+from .ast_extension import statement_from_string
 from .compile_environment import CompileEnvironment
 from .device_function import DeviceFunction
 from .host_function import HostFunction
@@ -384,22 +385,147 @@ def codegen_store(
         assert extra_mask is None
         indexing = BlockedSubscriptIndexing.create(state, fake_tensor, subscript)
 
+        config = DeviceFunction.current().config
+        store_value = indexing.reshape_store(state, value)
+        if config.epilogue_subtiling:
+            subtiled_store = self._codegen_epilogue_subtile_store(
+                state, fake_tensor, indexing, store_value
+            )
+            # The helper returns None when the tile cannot be split; fall
+            # through to the regular full-tile store in that case.
+            if subtiled_store is not None:
+                return subtiled_store
+
         # Apply permutation to the value being stored if needed
         desc_arg = indexing.tensor_descriptor_arg(state)
-        store_value = indexing.reshape_store(state, value)
 
         if desc_arg.permutation is not None:
             # Apply permutation to the value
             store_value = expr_from_string(
                 f"tl.permute({{store_val}}, {desc_arg.permutation!r})",
                 store_val=store_value,
             )
 
         return expr_from_string(
             f"{indexing.tensor_descriptor(state)}.store({indexing.offsets_str_permuted(state)}, {{value}})",
             value=store_value,
         )
 
+    def _codegen_epilogue_subtile_store(
+        self,
+        state: CodegenState,
+        fake_tensor: torch.Tensor,
+        indexing: BlockedSubscriptIndexing,
+        store_value: ast.AST,
+    ) -> ast.AST | None:
+        """
+        Emit the store as two half-width tensor-descriptor stores.
+
+        The value is reshaped to ``[block_m, 2, block_n // 2]``, permuted with
+        ``[0, 2, 1]`` and split via ``tl.split``.  The store of the first half
+        is added as a statement; the store of the second half (column offset
+        shifted by ``block_n // 2``) is returned as an expression.
+
+        Args:
+            state: Codegen state supplying the device function and codegen.
+            fake_tensor: Tensor being stored to; not used by this helper.
+            indexing: Blocked indexing for the store.  On success its
+                ``block_shape[1]`` is left halved.
+            store_value: Value expression to store.
+
+        Returns:
+            The second subtile store expression, or ``None`` when subtiling
+            is skipped (tile is not 2D, the block width has no size hint or an
+            odd one, or the descriptor needs a permutation).
+        """
+        # Currently support 2D tiles without permutations
+        if len(indexing.block_shape) != 2 or len(indexing.offsets) != 2:
+            return None
+
+        env = CompileEnvironment.current()
+        block_m, block_n = indexing.block_shape
+        try:
+            block_n_hint = env.size_hint(block_n)
+        except Exception:
+            # Best effort: without a size hint, skip subtiling.
+            return None
+
+        if block_n_hint % 2 != 0:
+            return None
+
+        device_fn = state.device_function
+        codegen = state.codegen
+
+        block_m_str = device_fn.literal_expr(block_m)
+        block_n_str = device_fn.literal_expr(block_n)
+        # Halve the block width before requesting the descriptor.
+        indexing.block_shape[1] //= 2
+        desc_arg = indexing.tensor_descriptor_arg(state)
+
+        if desc_arg.permutation is not None:
+            # Restore the width so the caller's fallback full-tile store does
+            # not see a halved block shape.
+            indexing.block_shape[1] = block_n
+            return None
+
+        block_n_half_str = f"({block_n_str} // 2)"
+
+        # Lift the store value into a temporary variable for reuse
+        acc_var = codegen.lift(store_value, prefix="acc")
+
+        reshape_expr = expr_from_string(
+            "tl.reshape({acc}, [{dim_m}, 2, {dim_half}])",
+            acc=acc_var,
+            dim_m=expr_from_string(block_m_str),
+            dim_half=expr_from_string(block_n_half_str),
+        )
+        reshape_var = codegen.lift(reshape_expr, prefix="acc")
+
+        permute_expr = expr_from_string(
+            "tl.permute({acc}, [0, 2, 1])",
+            acc=reshape_var,
+        )
+        permute_var = codegen.lift(permute_expr, prefix="acc")
+
+        acc0_name = codegen.tmpvar(prefix="acc")
+        acc1_name = codegen.tmpvar(prefix="acc")
+        codegen.add_statement(
+            statement_from_string(
+                f"{acc0_name}, {acc1_name} = tl.split({{acc}})",
+                acc=permute_var,
+            )
+        )
+        acc0 = expr_from_string(acc0_name)
+        acc1 = expr_from_string(acc1_name)
+
+        desc_name = indexing.tensor_descriptor(state)
+        offset0 = expr_from_string(indexing.offsets[0])
+        offset1 = expr_from_string(indexing.offsets[1])
+
+        # First subtile store
+        codegen.add_statement(
+            statement_from_string(
+                f"{desc_name}.store([{{off0}}, {{off1}}], {{value}})",
+                off0=offset0,
+                off1=offset1,
+                value=acc0,
+            )
+        )
+
+        offset1_shifted = expr_from_string(
+            "({offset} + {half})",
+            offset=expr_from_string(indexing.offsets[1]),
+            half=expr_from_string(block_n_half_str),
+        )
+
+        # Emit second subtile store as the expression returned to the caller
+        return expr_from_string(
+            f"{desc_name}.store([{{off0}}, {{off1}}], {{value}})",
+            off0=offset0,
+            off1=offset1_shifted,
+            value=acc1,
+        )
+
 
 class StackIndexingStrategy:
     """
diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
@@ -52,6 +52,7 @@
         "pid_type",
         "indexing",
         "load_eviction_policies",
+        "epilogue_subtiling",
     ]
 )
 VALID_PID_TYPES = ("flat", "xyz", "persistent_blocked", "persistent_interleaved")
@@ -105,6 +106,7 @@ class ConfigSpec:
             EnumFragment(choices=VALID_EVICTION_POLICIES), length=0
         )
     )
+    epilogue_subtiling: bool = dataclasses.field(default=False)
 
     @staticmethod
     def _valid_indexing_types() -> tuple[IndexingLiteral, ...]:
@@ -224,6 +226,7 @@ def normalize(self, config: helion.Config | dict[str, object]) -> None:
         config.setdefault(
             "load_eviction_policies", self.load_eviction_policies.default()
         )
+        config.setdefault("epilogue_subtiling", False)
         # TODO(jansel): include num_ctas and max_nreg
 
         for name, values in (
@@ -238,6 +241,13 @@ def normalize(self, config: helion.Config | dict[str, object]) -> None:
             else:
                 config[name] = values[0]
 
+        # Force epilogue subtiling off unless tensor-descriptor indexing is
+        # selected and every configured block size is at least 16.
+        if config["indexing"] != "tensor_descriptor" or any(
+            block_size < 16 for block_size in config["block_sizes"]
+        ):
+            config["epilogue_subtiling"] = False
+
         # Set default values for grid indices when pid_type is not persistent
         pid_type = config["pid_type"]
         if pid_type in ("flat", "xyz") and self.grid_block_ids:
@@ -279,6 +285,7 @@ def flat_config(self, fn: Callable[[ConfigSpecFragment], object]) -> helion.Conf
             "indexing": fn(EnumFragment(self._valid_indexing_types())),
             "pid_type": fn(EnumFragment(self.allowed_pid_types)),
             "load_eviction_policies": fn(self.load_eviction_policies),
+            "epilogue_subtiling": fn(BooleanFragment()),
         }
         # Add tunable parameters
         config.update(
diff --git a/helion/runtime/config.py b/helion/runtime/config.py
@@ -39,6 +39,7 @@ def __init__(
         num_stages: int | None = None,
         pid_type: PidTypeLiteral | None = None,
         indexing: IndexingLiteral | None = None,
+        epilogue_subtiling: bool | None = None,
         # For user-defined properties
         **kwargs: object,
     ) -> None:
@@ -61,6 +62,7 @@ def __init__(
             num_stages: Number of stages for software pipelining.
             pid_type: Program ID type strategy ("flat", "xyz", "persistent_blocked", "persistent_interleaved").
             indexing: Indexing strategy ("pointer", "tensor_descriptor", "block_ptr").
+            epilogue_subtiling: Whether to use subtiling for epilogue.
             **kwargs: Additional user-defined configuration parameters.
         """
         self.config = {}
@@ -81,6 +83,7 @@ def __init__(
             "num_stages": num_stages,
             "indexing": indexing,
             "pid_type": pid_type,
+            "epilogue_subtiling": epilogue_subtiling,
         }
         for key, value in core_props.items():
             if value is not None:
@@ -206,6 +209,10 @@ def load_eviction_policies(self) -> list[str]:
     def indexing(self) -> IndexingLiteral:
         return self.config.get("indexing", "pointer")  # type: ignore[return-value]
 
+    @property
+    def epilogue_subtiling(self) -> bool:
+        return self.config.get("epilogue_subtiling", False)  # type: ignore[return-value]
+
 
 def _list_to_tuple(x: object) -> object:
     if isinstance(x, list):