From f7752ef7ef16ad9db8f367935950dbf1ec06e6b8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 10 Jan 2025 04:24:59 +0000
Subject: [PATCH 001/432] [Frontend] Split scheduling module

---
 .../mlir/mlir_codegen_backend.py              | 156 -----------------
 PyTorchSimFrontend/mlir/mlir_scheduling.py    | 161 ++++++++++++++++++
 Scheduler/scheduler.py                        |   4 +-
 3 files changed, 164 insertions(+), 157 deletions(-)
 create mode 100644 PyTorchSimFrontend/mlir/mlir_scheduling.py

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 09c5aa34..d5336fdb 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -11,20 +11,15 @@
 from typing import Dict
 from collections import OrderedDict
 import torch
-from torch._inductor import dependencies, config
 from torch._inductor.codegen import cpp, wrapper, common
-from torch._inductor.scheduler import BaseScheduling
 from torch._inductor.virtualized import V, _ops as ops
 from torch._inductor.codecache import write_atomic, write
-from Simulator.simulator import BackendSimulator
-from PyTorchSimFrontend import extension_config
 from torch._inductor.utils import (
     IndentedBuffer,
     is_welford_reduction,
 )
 import PyTorchSimFrontend.extension_codecache as extension_codecache
 
-
 from . import mlir_common
 
 def reduction_init(reduction_type, dtype):
@@ -1319,8 +1314,6 @@ def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape
     def roundup_vectorlane(self, size, amp=1):
         return ((size + self.vector_lane - 1) // self.vector_lane) * self.vector_lane * amp
 
-from . import mlir_lowering
-
 @dataclasses.dataclass
 class LoopLevel:
     var: sympy.Expr
@@ -1364,152 +1357,3 @@ def mark_parallel(self, par_depth):
         for i in range(1, par_depth):
             loops[i].collapsed = True
         loops[0].simd = loops[par_depth - 1].simd
-
-class MLIRWrapperKenrelGroup(cpp.KernelGroup):
-    def __init__(self):
-        super().__init__()
-        self.args = mlir_common.MLIRKernelArgs()
-
-class MLIRScheduling(BaseScheduling):
-    count = 0
-    target_kernel = MLIRKernel
-    def __init__(self, scheduler):
-        self.scheduler = scheduler
-        self.kernel_group = MLIRWrapperKenrelGroup()
-        self._ready_to_flush = False
-        self.outer_function = set()
-        config.inplace_buffers = False # FIXME. inout kernel makes trouble.. So disabled it!
-
-    def _set_flush_status(self, status: bool):
-        self._ready_to_flush = status
-
-    def can_fuse_vertical(self, node1, node2):
-        return False
-        return self.can_fuse_horizontal(node1, node2) and not node1.is_reduction()
-
-    def can_fuse_horizontal(self, node1, node2):
-        return False
-        _, (vars1, reduce1) = node1.group
-        _, (vars2, reduce2) = node2.group
-        if vars1 == vars2 and reduce1 == reduce2:
-            return True
-        #TODO: Temporary solution determining the fusion condition similar to CPP/OpenMP
-        v1_total = math.prod(vars1) if len(vars1) else 0
-        v2_total = math.prod(vars2) if len(vars2) else 0
-        r1_total = math.prod(reduce1) if len(reduce1) else 0
-        r2_total = math.prod(reduce2) if len(reduce2) else 0
-        if reduce1 == () \
-            and v1_total == (v2_total + r2_total):
-            # and node1.node.layout.size == node2.node.layout.size:     #FIXME: Need to check layout too?
-            return True
-        return False
-
-    def group_fn(self, sizes):
-        return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes)
-
-    def codegen_nodes(self, nodes):
-        _, (group, reduction_group) = max(
-            nodes, key=lambda x: int(x.is_reduction())
-        ).group
-        ex_kernel = self.target_kernel()
-        ex_kernel.kernel_group = self.kernel_group
-
-        kernel_name = f"extension_kernel_{MLIRScheduling.count}"
-        MLIRScheduling.count += 1
-        src_code = ex_kernel.codegen_nodes(nodes, kernel_name)
-        self.define_kernel(src_code, kernel_name, ex_kernel.vector_lane,
-                           ex_kernel.spad_info, origins= {str(i) for i in nodes[0].node.origins})
-        ex_kernel.call_kernel(kernel_name)
-        _, args, _, _ = ex_kernel.args.mlir_argdefs()
-        args = ", ".join(args)
-        if (extension_config.CONFIG_BACKENDSIM_EAGER_MODE):
-            V.graph.wrapper_code.writeline(
-                f"yield ({kernel_name}, ({args}))"
-            )
-        self._set_flush_status(True)
-
-    def ready_to_flush(self):
-        return self._ready_to_flush
-
-    def codegen_sync(self):
-        pass
-
-    def flush(self):
-        self.kernel_group.codegen_define_and_call(V.graph.wrapper_code)
-        self.kernel_group = MLIRWrapperKenrelGroup()
-        self._set_flush_status(False)
-
-    def define_function(self, kernel):
-        code, function_name = kernel.def_function()
-        if code is not None and function_name not in self.outer_function:
-            wrapper = V.graph.wrapper_code
-            wrapper.header.writeline(code)
-            self.outer_function.add(function_name)
-
-    def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, tile_size=[1, 1, 1], loop_size=None, origins={}):
-        wrapper = V.graph.wrapper_code
-        if src_code in wrapper.src_to_kernel:
-            kernel_name = wrapper.src_to_kernel[src_code]
-        else:
-            wrapper.src_to_kernel[src_code] = kernel_name
-
-            codecache_def = IndentedBuffer()
-            codecache_def.writeline(f"custom_async_compile.mlir('''{src_code}''', ")
-            codecache_def.writeline(f"vectorlane_size={vector_lane},")
-            codecache_def.writeline(f"tile_size={tile_size},")
-            codecache_def.writeline(f"loop_size={loop_size},")
-            codecache_def.writeline(f"spad_info={spad_info},")
-            codecache_def.writeline(f"origins={origins},")
-            codecache_def.writeline("arg_attributes=arg_attributes)")
-            wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False)
-        return kernel_name
-
-    def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
-        with kernel:
-            for node in [template_node, *epilogue_nodes]:
-                    node.mark_run()
-            partial_code = render()
-            for node in epilogue_nodes:
-                ranges = node.get_ranges()
-                node.codegen(kernel.set_ranges(ranges[0], ranges[1], None))
-        with V.set_kernel_handler(kernel):
-            src_code = (
-                partial_code
-                if isinstance(partial_code, str)
-                else partial_code.finalize()
-            )
-            src_code = kernel.add_extra_global_vars(src_code)
-        return src_code
-
-    def codegen_template(self, template_node, epilogue_nodes):
-        _, (numel, rnumel) = template_node.group
-        template_buffer = template_node.node
-        kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes)
-        _, _, _, kernel.buffer_types = kernel.args.mlir_argdefs()
-
-        src_code = self.codegen_src_code(kernel, render, template_node, epilogue_nodes)
-        wrapper = V.graph.wrapper_code
-
-        if src_code in wrapper.src_to_kernel: # [CONV] check inner function is already defined
-            kernel_name = wrapper.src_to_kernel[src_code]
-            kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes, kernel_name=kernel_name) # update kernel name
-            src_code = self.codegen_src_code(kernel, render, template_node, epilogue_nodes)
-
-        with V.set_kernel_handler(kernel):
-            codegen_header(src_code, (kernel.header.getvalue(), kernel.gem5_header.getvalue()))
-            # node_schedule = [template_node, *epilogue_nodes]
-            kernel.meta_kernel()
-            kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
-                                             kernel.tile_size, kernel.loop_size, origins={str(i) for i in template_node.node.origins})
-            self.define_function(kernel)
-
-        kernel.call_kernel(kernel_name)
-        V.graph.removed_buffers |= kernel.removed_buffers
-        _, args, _, _ = kernel.args.mlir_argdefs()
-        args = ", ".join(args)
-        if (extension_config.CONFIG_BACKENDSIM_EAGER_MODE):
-            target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name
-            V.graph.wrapper_code.writeline(
-                f"yield ({target_kernel_name}, ({args}))"
-            )
-        self._set_flush_status(True)
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
new file mode 100644
index 00000000..ea2005a8
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -0,0 +1,161 @@
+from PyTorchSimFrontend import extension_config
+from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
+
+
+from torch._inductor import config
+from torch._inductor.codegen import cpp
+from torch._inductor.scheduler import BaseScheduling
+from torch._inductor.utils import IndentedBuffer
+from torch._inductor.virtualized import V
+
+from . import mlir_common
+from . import mlir_lowering
+
+class MLIRWrapperKenrelGroup(cpp.KernelGroup):
+    def __init__(self):
+        super().__init__()
+        self.args = mlir_common.MLIRKernelArgs()
+
+class MLIRScheduling(BaseScheduling):
+    count = 0
+    target_kernel = MLIRKernel
+    def __init__(self, scheduler):
+        self.scheduler = scheduler
+        self.kernel_group = MLIRWrapperKenrelGroup()
+        self._ready_to_flush = False
+        self.outer_function = set()
+        config.inplace_buffers = False # FIXME. inout kernel makes trouble.. So disabled it!
+
+    def _set_flush_status(self, status: bool):
+        self._ready_to_flush = status
+
+    def can_fuse_vertical(self, node1, node2):
+        return False
+        return self.can_fuse_horizontal(node1, node2) and not node1.is_reduction()
+
+    def can_fuse_horizontal(self, node1, node2):
+        return False
+        _, (vars1, reduce1) = node1.group
+        _, (vars2, reduce2) = node2.group
+        if vars1 == vars2 and reduce1 == reduce2:
+            return True
+        #TODO: Temporary solution determining the fusion condition similar to CPP/OpenMP
+        v1_total = math.prod(vars1) if len(vars1) else 0
+        v2_total = math.prod(vars2) if len(vars2) else 0
+        r1_total = math.prod(reduce1) if len(reduce1) else 0
+        r2_total = math.prod(reduce2) if len(reduce2) else 0
+        if reduce1 == () \
+            and v1_total == (v2_total + r2_total):
+            # and node1.node.layout.size == node2.node.layout.size:     #FIXME: Need to check layout too?
+            return True
+        return False
+
+    def group_fn(self, sizes):
+        return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes)
+
+    def codegen_nodes(self, nodes):
+        _, (group, reduction_group) = max(
+            nodes, key=lambda x: int(x.is_reduction())
+        ).group
+        ex_kernel = self.target_kernel()
+        ex_kernel.kernel_group = self.kernel_group
+
+        kernel_name = f"extension_kernel_{MLIRScheduling.count}"
+        MLIRScheduling.count += 1
+        src_code = ex_kernel.codegen_nodes(nodes, kernel_name)
+        self.define_kernel(src_code, kernel_name, ex_kernel.vector_lane,
+                           ex_kernel.spad_info, origins= {str(i) for i in nodes[0].node.origins})
+        ex_kernel.call_kernel(kernel_name)
+        _, args, _, _ = ex_kernel.args.mlir_argdefs()
+        args = ", ".join(args)
+        if (extension_config.CONFIG_BACKENDSIM_EAGER_MODE):
+            V.graph.wrapper_code.writeline(
+                f"yield ({kernel_name}, ({args}))"
+            )
+        self._set_flush_status(True)
+
+    def ready_to_flush(self):
+        return self._ready_to_flush
+
+    def codegen_sync(self):
+        pass
+
+    def flush(self):
+        self.kernel_group.codegen_define_and_call(V.graph.wrapper_code)
+        self.kernel_group = MLIRWrapperKenrelGroup()
+        self._set_flush_status(False)
+
+    def define_function(self, kernel):
+        code, function_name = kernel.def_function()
+        if code is not None and function_name not in self.outer_function:
+            wrapper = V.graph.wrapper_code
+            wrapper.header.writeline(code)
+            self.outer_function.add(function_name)
+
+    def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, tile_size=[1, 1, 1], loop_size=None, origins={}):
+        wrapper = V.graph.wrapper_code
+        if src_code in wrapper.src_to_kernel:
+            kernel_name = wrapper.src_to_kernel[src_code]
+        else:
+            wrapper.src_to_kernel[src_code] = kernel_name
+
+            codecache_def = IndentedBuffer()
+            codecache_def.writeline(f"custom_async_compile.mlir('''{src_code}''', ")
+            codecache_def.writeline(f"vectorlane_size={vector_lane},")
+            codecache_def.writeline(f"tile_size={tile_size},")
+            codecache_def.writeline(f"loop_size={loop_size},")
+            codecache_def.writeline(f"spad_info={spad_info},")
+            codecache_def.writeline(f"origins={origins},")
+            codecache_def.writeline("arg_attributes=arg_attributes)")
+            wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False)
+        return kernel_name
+
+    def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
+        with kernel:
+            for node in [template_node, *epilogue_nodes]:
+                    node.mark_run()
+            partial_code = render()
+            for node in epilogue_nodes:
+                ranges = node.get_ranges()
+                node.codegen(kernel.set_ranges(ranges[0], ranges[1], None))
+        with V.set_kernel_handler(kernel):
+            src_code = (
+                partial_code
+                if isinstance(partial_code, str)
+                else partial_code.finalize()
+            )
+            src_code = kernel.add_extra_global_vars(src_code)
+        return src_code
+
+    def codegen_template(self, template_node, epilogue_nodes):
+        _, (numel, rnumel) = template_node.group
+        template_buffer = template_node.node
+        kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes)
+        _, _, _, kernel.buffer_types = kernel.args.mlir_argdefs()
+
+        src_code = self.codegen_src_code(kernel, render, template_node, epilogue_nodes)
+        wrapper = V.graph.wrapper_code
+
+        if src_code in wrapper.src_to_kernel: # [CONV] check inner function is already defined
+            kernel_name = wrapper.src_to_kernel[src_code]
+            kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes, kernel_name=kernel_name) # update kernel name
+            src_code = self.codegen_src_code(kernel, render, template_node, epilogue_nodes)
+
+        with V.set_kernel_handler(kernel):
+            codegen_header(src_code, (kernel.header.getvalue(), kernel.gem5_header.getvalue()))
+            # node_schedule = [template_node, *epilogue_nodes]
+            kernel.meta_kernel()
+            kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
+                                             kernel.tile_size, kernel.loop_size, origins={str(i) for i in template_node.node.origins})
+            self.define_function(kernel)
+
+        kernel.call_kernel(kernel_name)
+        V.graph.removed_buffers |= kernel.removed_buffers
+        _, args, _, _ = kernel.args.mlir_argdefs()
+        args = ", ".join(args)
+        if (extension_config.CONFIG_BACKENDSIM_EAGER_MODE):
+            target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name
+            V.graph.wrapper_code.writeline(
+                f"yield ({target_kernel_name}, ({args}))"
+            )
+        self._set_flush_status(True)
\ No newline at end of file
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 1d8064f9..f1f0b35a 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -179,9 +179,11 @@ def setup_device():
             register_backend_for_device,
         )
         from PyTorchSimFrontend.mlir.mlir_codegen_backend import (
-            MLIRScheduling,
             ExtensionWrapperCodegen,
         )
+        from PyTorchSimFrontend.mlir.mlir_scheduling import (
+            MLIRScheduling
+        )
         register_backend_for_device(
             "extension_device", MLIRScheduling, ExtensionWrapperCodegen
         )

From 13cc93361aad8a29cb92f274d93399d8b1bad380 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 10 Jan 2025 04:27:08 +0000
Subject: [PATCH 002/432] [Frontend] move common function to mlir_common module

---
 .../mlir/mlir_codegen_backend.py              | 35 ------------------
 PyTorchSimFrontend/mlir/mlir_common.py        | 36 +++++++++++++++++++
 2 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d5336fdb..42e091c6 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -662,7 +662,6 @@ def __init__(self):
         self.kernel_group = None
         self.call_ranges = None
         self.ranges = None
-        self.itervars = None
         self.reduction_depth = None
         self.reduction_prefix = IndentedBuffer()
         self.reduction_suffix = IndentedBuffer()
@@ -688,40 +687,6 @@ def __init__(self):
         self.reduce_iterator = {}
         self.is_template_kernel = False
 
-    def get_constant_vector(self, expr):
-        constant_vector = [[int(expr.coeff(var)),None] for var in self.itervars]
-        return constant_vector
-
-    def get_constant_vector2(self, expr):
-        # Case 0. symbol ex) index 0
-        # Case 1. inner product form ex) 16 * index0 + 1 * index1
-        # Case 2. Complicated form ex) 16 * index0 + 8 * (index//4) + (index % 4)
-        constant_vector = []
-        if expr.is_symbol:
-            constant_vector.append(tuple([1, expr]))
-            return constant_vector
-
-        for arg in expr.args:
-            if arg.is_symbol:
-                constant_vector.append(tuple([1,arg]))
-                continue
-            if len(arg.args) == 0: #TODO: check this
-                continue
-            if arg.args[0].is_number:
-                constant_vector.append(arg.args)
-            else:
-                constant_vector.append([1, arg])
-
-        return constant_vector
-
-    def find_node_by_name(self, name):
-        if name in V.graph.graph_inputs:
-            return V.graph.graph_inputs[name]
-        else:
-            for output_node in V.graph.graph_outputs:
-                if output_node.data.name == name:
-                    return output_node
-
     def get_dma_info(self, name, index, dtype):
         current_tile = MLIRTile(self.tile_desc.n_row, self.tile_desc.n_col, self.tile_desc.vector_lane, self.tile_desc.used_vector_lane)
         cv = self.get_constant_vector(index)
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 912704b5..f0740910 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -163,6 +163,8 @@ class BaseMLIRKernel(common.Kernel, BaseMLIRHardwareInfo):
 
     def __init__(self, args=None):
         super().__init__(args)
+        self.itervars = None
+
         self.vector_compute = IndentedBuffer()
         self.reductions_suffix = IndentedBuffer()
         self.cse = common.CSE(self.newvar_prefix, self.suffix)
@@ -193,6 +195,40 @@ def check_dtype_in_args(self, args):
                 dtype = arg
         return dtype
 
+    def get_constant_vector(self, expr):
+        constant_vector = [[int(expr.coeff(var)),None] for var in self.itervars]
+        return constant_vector
+
+    def get_constant_vector2(self, expr):
+        # Case 0. symbol ex) index 0
+        # Case 1. inner product form ex) 16 * index0 + 1 * index1
+        # Case 2. Complicated form ex) 16 * index0 + 8 * (index//4) + (index % 4)
+        constant_vector = []
+        if expr.is_symbol:
+            constant_vector.append(tuple([1, expr]))
+            return constant_vector
+
+        for arg in expr.args:
+            if arg.is_symbol:
+                constant_vector.append(tuple([1,arg]))
+                continue
+            if len(arg.args) == 0: #TODO: check this
+                continue
+            if arg.args[0].is_number:
+                constant_vector.append(arg.args)
+            else:
+                constant_vector.append([1, arg])
+
+        return constant_vector
+
+    def find_node_by_name(self, name):
+        if name in V.graph.graph_inputs:
+            return V.graph.graph_inputs[name]
+        else:
+            for output_node in V.graph.graph_outputs:
+                if output_node.data.name == name:
+                    return output_node
+
     def register_var_info(self, var, var_info):
         self.var_info[var] = var_info
 

From efed9e8d6a929f85ad0916f6e3d9633cb689df6f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 10 Jan 2025 04:50:16 +0000
Subject: [PATCH 003/432] [Frontend] move common logic to mlir_common

---
 .../mlir/mlir_codegen_backend.py              |  81 +-------------
 PyTorchSimFrontend/mlir/mlir_common.py        | 101 +++++++++++++++---
 2 files changed, 89 insertions(+), 93 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 42e091c6..25eadbbe 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -604,65 +604,12 @@ def broadcast(operand1, operand2, *args, var_info=None):
     "MVOUT": 3,
 }
 
-class MLIRTile():
-    TILE_ROW_WISE = 0
-    TILE_COL_WISE = 1
-    TILE_PER_LANE_ROW_WISE = 2
-    TILE_PER_LANE_COL_WISE = 3
-    def __init__(self, n_row, n_col, vector_lane, used_vector_lane=None) -> None:
-        self.n_row = n_row
-        self.n_col = n_col
-        self.vector_lane = vector_lane
-        if used_vector_lane is None:
-            self.used_vector_lane = self.vector_lane
-        else:
-            self.used_vector_lane = used_vector_lane
-        self.tile_per_lane_layout = self.TILE_PER_LANE_ROW_WISE # How a given tile per lane is stored
-        self.tile_layout = self.TILE_ROW_WISE # How a given tile is stored per lane
-        self.vector_lane_axis = (self.n_col//self.used_vector_lane) > 0 #(0: Col major, 1: Row major)
-
-    def get_tile_size(self):
-        return self.n_row * self.n_col
-
-    def get_rows_per_lane(self):
-        if self.n_row % self.used_vector_lane != 0 and self.n_row > 1:
-            print(f"[Warning] n_row({self.n_row}) % vector_lane({self.used_vector_lane}) != 0")
-        return self.div_round_up(self.n_row, self.used_vector_lane)
-
-    def get_cols_per_lane(self):
-        if self.n_col % self.used_vector_lane != 0 and self.n_col > 1:
-            print(f"[Warning] n_col({self.n_col}) % vector_lane({self.used_vector_lane}) != 0")
-        return self.div_round_up(self.n_col, self.used_vector_lane)
-
-    def get_tile_size_per_lane(self):
-        if self.get_tile_size() % self.used_vector_lane != 0:
-            print(f"[Warning] n_col({self.n_col}) % vector_lane({self.used_vector_lane}) != 0")
-        return self.div_round_up(self.get_tile_size(), self.used_vector_lane)
-
-    def get_tile_shape(self):
-        return f"{self.n_row}x{self.n_col}"
-
-    def get_chunk_size(self):
-        if self.tile_layout == self.TILE_ROW_WISE:
-            chunk_size = self.get_tile_size_per_lane()
-        else:
-            chunk_size = self.get_cols_per_lane()
-        return chunk_size
-
-    @staticmethod
-    def div_round_up(size, round_val):
-        return (size + round_val - 1) // round_val
-
 class MLIRKernel(mlir_common.BaseMLIRKernel):
     overrides = ExtensionOverrides
     newvar_prefix = "%"
 
     def __init__(self):
         super().__init__(mlir_common.MLIRKernelArgs())
-        self.kernel_group = None
-        self.call_ranges = None
-        self.ranges = None
-        self.reduction_depth = None
         self.reduction_prefix = IndentedBuffer()
         self.reduction_suffix = IndentedBuffer()
         self.body = IndentedBuffer()
@@ -678,7 +625,6 @@ def __init__(self):
         self.map_cse = common.CSE("#", self.suffix, name_prefix="map")
         self.consts = set()
         self.tags = set()
-        self.tile_desc = MLIRTile(self.tile_row, self.tile_col, self.vector_lane)
         self.dma_cache = {}
         self.dma_counter = 1
         self.reduction_idx = {}
@@ -1067,12 +1013,6 @@ def store_reduction(self, name, index, value):
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
     def codegen_body(self):
-        # if not (
-        #     self.loads
-        #     or self.stores
-        #     or self.compute
-        # ):
-        #     return
         def template_store(options):
             subtile_size = [self.vector_lane, self.vector_lane]
             async_flag = 1
@@ -1228,25 +1168,11 @@ def adjust_tile_size(self):
             raise NotImplementedError()
 
     def set_ranges(self, lengths, reduction_lengths, read_writes):
-        self.read_writes = read_writes
-        if self.call_ranges:
-            assert self.call_ranges == tuple(lengths) + tuple(
-                reduction_lengths
-            ), f"{self.call_ranges} == {tuple(lengths)} + {tuple(reduction_lengths)}"
-            assert self.reduction_depth == len(lengths)
-        else:
-            self.call_ranges = tuple(lengths) + tuple(reduction_lengths)
-            self.ranges = [self.rename_indexing(x) for x in self.call_ranges]
-            self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))]
-            self.reduction_depth = len(lengths)
+        ret = super().set_ranges(lengths, reduction_lengths, read_writes)
 
         # Adjust time size when it is vector
         self.adjust_tile_size()
-
-        return (
-            self.itervars[: self.reduction_depth],
-            self.itervars[self.reduction_depth :],
-        )
+        return ret
 
     def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index):
         c_type = mlir_common.DTYPE_TO_C[dtype]
@@ -1276,8 +1202,7 @@ def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape
         buffer = self.cse.generate(code_buffer, f"memref.get_global @{new_name}_spad : memref<{dram_tile_shape}x{mlir_type}, 1>")
         return buffer, indices
 
-    def roundup_vectorlane(self, size, amp=1):
-        return ((size + self.vector_lane - 1) // self.vector_lane) * self.vector_lane * amp
+
 
 @dataclasses.dataclass
 class LoopLevel:
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index f0740910..df44a810 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -141,6 +141,56 @@ def set_info(outer, inner, arg_type):
             set_info(outer, inner, self.MLIR_ARGS_VAR)
         return arg_defs, call_args, arg_attributes, buffer_types
 
+
+class MLIRTile():
+    TILE_ROW_WISE = 0
+    TILE_COL_WISE = 1
+    TILE_PER_LANE_ROW_WISE = 2
+    TILE_PER_LANE_COL_WISE = 3
+    def __init__(self, n_row, n_col, vector_lane, used_vector_lane=None) -> None:
+        self.n_row = n_row
+        self.n_col = n_col
+        self.vector_lane = vector_lane
+        if used_vector_lane is None:
+            self.used_vector_lane = self.vector_lane
+        else:
+            self.used_vector_lane = used_vector_lane
+        self.tile_per_lane_layout = self.TILE_PER_LANE_ROW_WISE # How a given tile per lane is stored
+        self.tile_layout = self.TILE_ROW_WISE # How a given tile is stored per lane
+        self.vector_lane_axis = (self.n_col//self.used_vector_lane) > 0 #(0: Col major, 1: Row major)
+
+    def get_tile_size(self):
+        return self.n_row * self.n_col
+
+    def get_rows_per_lane(self):
+        if self.n_row % self.used_vector_lane != 0 and self.n_row > 1:
+            print(f"[Warning] n_row({self.n_row}) % vector_lane({self.used_vector_lane}) != 0")
+        return self.div_round_up(self.n_row, self.used_vector_lane)
+
+    def get_cols_per_lane(self):
+        if self.n_col % self.used_vector_lane != 0 and self.n_col > 1:
+            print(f"[Warning] n_col({self.n_col}) % vector_lane({self.used_vector_lane}) != 0")
+        return self.div_round_up(self.n_col, self.used_vector_lane)
+
+    def get_tile_size_per_lane(self):
+        if self.get_tile_size() % self.used_vector_lane != 0:
+            print(f"[Warning] n_col({self.n_col}) % vector_lane({self.used_vector_lane}) != 0")
+        return self.div_round_up(self.get_tile_size(), self.used_vector_lane)
+
+    def get_tile_shape(self):
+        return f"{self.n_row}x{self.n_col}"
+
+    def get_chunk_size(self):
+        if self.tile_layout == self.TILE_ROW_WISE:
+            chunk_size = self.get_tile_size_per_lane()
+        else:
+            chunk_size = self.get_cols_per_lane()
+        return chunk_size
+
+    @staticmethod
+    def div_round_up(size, round_val):
+        return (size + round_val - 1) // round_val
+
 class BaseMLIRHardwareInfo():
     def __init__(self):
         # Default HW setting
@@ -163,18 +213,43 @@ class BaseMLIRKernel(common.Kernel, BaseMLIRHardwareInfo):
 
     def __init__(self, args=None):
         super().__init__(args)
+        self.kernel_group = None
+        # Kernel iteration range info
+        self.call_ranges = None
+        self.ranges = None
+        self.reduction_depth = None
         self.itervars = None
-
+        # Code buffer
         self.vector_compute = IndentedBuffer()
         self.reductions_suffix = IndentedBuffer()
         self.cse = common.CSE(self.newvar_prefix, self.suffix)
-        self.tile_row = extension_config.CONFIG_TILE_ROW
-        if self.tile_row == -1:
-            self.tile_row = self.vlen * self.vector_lane
-        self.tile_col = extension_config.CONFIG_TILE_COL
-        if self.tile_col == -1:
-            self.tile_col = 8 # FIXME: tile_col is not always vector_lane * vlen
-        self.var_info = {}
+        # Tile size setting
+        tile_row = extension_config.CONFIG_TILE_ROW
+        if tile_row == -1:
+            tile_row = self.vlen * self.vector_lane
+        tile_col = extension_config.CONFIG_TILE_COL
+        if tile_col == -1:
+            tile_col = 8 # FIXME: tile_col is not always vector_lane * vlen
+        self.tile_desc = MLIRTile(tile_row, tile_col, self.vector_lane)
+        self.var_info = {} # MLIR variable info
+
+    def set_ranges(self, lengths, reduction_lengths, read_writes):
+        self.read_writes = read_writes
+        if self.call_ranges:
+            assert self.call_ranges == tuple(lengths) + tuple(
+                reduction_lengths
+            ), f"{self.call_ranges} == {tuple(lengths)} + {tuple(reduction_lengths)}"
+            assert self.reduction_depth == len(lengths)
+        else:
+            self.call_ranges = tuple(lengths) + tuple(reduction_lengths)
+            self.ranges = [self.rename_indexing(x) for x in self.call_ranges]
+            self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))]
+            self.reduction_depth = len(lengths)
+
+        return (
+            self.itervars[: self.reduction_depth],
+            self.itervars[self.reduction_depth :],
+        )
 
     def load(self, name: str, index: sympy.Expr):
         raise NotImplementedError()
@@ -188,13 +263,6 @@ def store(self, name, index, value, mode=None):
     def reduction(self, dtype, src_dtype, reduction_type, value):
         raise NotImplementedError()
 
-    def check_dtype_in_args(self, args):
-        dtype = torch.float32 # default dtype
-        for arg in args:
-            if arg in list(DTYPE_TO_MLIR.keys()):
-                dtype = arg
-        return dtype
-
     def get_constant_vector(self, expr):
         constant_vector = [[int(expr.coeff(var)),None] for var in self.itervars]
         return constant_vector
@@ -229,6 +297,9 @@ def find_node_by_name(self, name):
                 if output_node.data.name == name:
                     return output_node
 
+    def roundup_vectorlane(self, size, amp=1):
+        return ((size + self.vector_lane - 1) // self.vector_lane) * self.vector_lane * amp
+
     def register_var_info(self, var, var_info):
         self.var_info[var] = var_info
 

From 4b28d4a28758bcbd035809012d77ea862957fa1a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 10 Jan 2025 05:21:50 +0000
Subject: [PATCH 004/432] [Frontend] Cleanup mlir_codegen_backend module

---
 .../mlir/mlir_codegen_backend.py              | 312 ++++++++----------
 PyTorchSimFrontend/mlir/mlir_common.py        | 102 +++++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  12 +-
 PyTorchSimFrontend/mlir/mlir_template.py      |   4 +-
 4 files changed, 222 insertions(+), 208 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 25eadbbe..99c39322 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -627,116 +627,17 @@ def __init__(self):
         self.tags = set()
         self.dma_cache = {}
         self.dma_counter = 1
-        self.reduction_idx = {}
         self.affine_yield = {}
         self.welford_reduce_out = None
         self.reduce_iterator = {}
         self.is_template_kernel = False
 
-    def get_dma_info(self, name, index, dtype):
-        current_tile = MLIRTile(self.tile_desc.n_row, self.tile_desc.n_col, self.tile_desc.vector_lane, self.tile_desc.used_vector_lane)
-        cv = self.get_constant_vector(index)
-        cv2 = self.get_constant_vector2(index)
-        tile_size_per_lane = self.tile_desc.get_tile_size_per_lane()            # FIXME. move this
-        tile_size_per_lane = 2 if tile_size_per_lane==1 else tile_size_per_lane # Avoid scalar operation
-
-        if len(cv) != len(cv2) and len(cv2) == 3:
-            print("Mismatch! ", cv)
-            # FIXME. this is really shitty code :(
-            cv = cv2#[[1 if x[0] == 0 else x[0], x[1]] for x in cv]
-
-        # Case 0. Tile is 0-D scalar
-        if len(cv) == 0:
-            # Use only one vectorlane to handle scalar data
-            current_tile.n_row = 1
-            current_tile.n_col = 1
-            current_tile.tile_layout = MLIRTile.TILE_ROW_WISE
-            current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_ROW_WISE
-            mm_stride, tile_size_per_lane = 1, 1
-            chunk_size = current_tile.get_chunk_size()
-        # Case 1. Tile is 1-D vector type
-        elif len(cv) == 1 and len(cv) <= self.reduction_depth:
-            current_tile.n_row = 1
-            current_tile.n_col = self.tile_desc.get_tile_size()
-            current_tile.tile_layout = MLIRTile.TILE_ROW_WISE
-            current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case
-            chunk_size = current_tile.get_chunk_size()
-            mm_stride = current_tile.n_col
-        # Case 2. Tile is 1-D vector type with reduction
-        elif len(cv) == 1 and len(cv) == self.reduction_depth + 1:
-            # Use only one vectorlane to reduce a vector
-            current_tile.tile_layout = MLIRTile.TILE_ROW_WISE
-            current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_ROW_WISE
-            current_tile.n_row = 1
-            current_tile.n_col = self.tile_desc.get_tile_size()
-            current_tile.used_vector_lane = 1
-            chunk_size = current_tile.get_chunk_size()
-            mm_stride = 0 # don't care
-        # Case 3. Tile is 2-D tile
-        elif len(cv) == 2:
-            is_reduction = self.reduction_depth == 1
-            if cv[0][0] != 0 and cv[1][0] != 0:
-                is_transposed = cv[0][0] < cv[1][0]
-                if is_transposed:
-                    current_tile.n_row = self.tile_desc.n_col
-                    current_tile.n_col = self.tile_desc.n_row
-                    mm_stride = self.ranges[0]
-                else:
-                    current_tile.n_row = self.tile_desc.n_row
-                    current_tile.n_col = self.tile_desc.n_col
-                    mm_stride = self.ranges[1]
-
-                if is_reduction and is_transposed:
-                    current_tile.tile_layout = MLIRTile.TILE_COL_WISE
-                    current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_ROW_WISE
-                    chunk_size = current_tile.get_chunk_size()
-                elif is_reduction and not is_transposed:
-                    current_tile.tile_layout = MLIRTile.TILE_ROW_WISE
-                    current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_COL_WISE
-                    chunk_size = current_tile.get_chunk_size()
-                elif not is_reduction and is_transposed:
-                    # Transposed case
-                    current_tile.tile_layout = MLIRTile.TILE_COL_WISE
-                    current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_COL_WISE
-                    chunk_size = current_tile.get_chunk_size()
-                else: # not is_reduction and not is_transpose
-                    current_tile.tile_layout = MLIRTile.TILE_COL_WISE if self.tile_desc.vector_lane_axis else MLIRTile.TILE_ROW_WISE
-                    current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_ROW_WISE
-                    chunk_size = current_tile.get_chunk_size()
-            else:
-                # Broadcast pattern
-                current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_ROW_WISE
-                mm_stride = 0
-                if cv[0][0] == 0:
-                    current_tile.tile_layout = MLIRTile.TILE_COL_WISE if self.tile_desc.vector_lane_axis else MLIRTile.TILE_ROW_WISE
-                    current_tile.n_row = self.tile_desc.n_row
-                    current_tile.n_col = self.tile_desc.n_col
-                    chunk_size = current_tile.get_chunk_size()
-                else: # cv[1][0] == 0
-                    current_tile.n_row = self.tile_desc.n_col
-                    current_tile.n_col = self.tile_desc.n_row
-                    chunk_size = current_tile.get_cols_per_lane()
-                    if not is_reduction:
-                        current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_COL_WISE
-                        chunk_size = current_tile.n_col if self.tile_desc.vector_lane_axis else chunk_size
-        elif len(cv) == 3:
-            current_tile.tile_per_lane_layout = MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case
-            mm_stride = cv[-1][0]
-            # When current_tile.n_col stride is 1, we can access row vector
-            if mm_stride == 1:
-                current_tile.n_row = 1
-                current_tile.n_col = self.tile_desc.get_tile_size()
-            # if current_tile.n_col stride is not 1, we have to access in a column vector
-            else:
-                current_tile.n_row = self.tile_desc.get_tile_size()
-                current_tile.n_col = 1
-            chunk_size = current_tile.get_tile_size_per_lane()
-        else:
-            raise NotImplementedError()
+    def set_ranges(self, lengths, reduction_lengths, read_writes):
+        ret = super().set_ranges(lengths, reduction_lengths, read_writes)
 
-        #assert(not (dtype==torch.bool and chunk_size < 8))
-        chunk = chunk_size << 1 | (current_tile.tile_per_lane_layout == MLIRTile.TILE_PER_LANE_COL_WISE)
-        return mm_stride, chunk, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
+        # Adjust time size when it is vector
+        self.adjust_tile_size()
+        return ret
 
     def parse_indices(self, expr):
         if len(expr.args) == 0:
@@ -766,35 +667,6 @@ def parse_indices(self, expr):
         index = self.cse.generate(self.loads, f"affine.apply #{map_var}({args})")
         return index
 
-    def codegen_nodes(self, nodes, kernel_name):
-        _, (group, reduction_group) = max(
-            nodes, key=lambda x: int(x.is_reduction())
-        ).group
-
-        self.set_ranges(group, reduction_group, None)
-        with self as kernel:
-            kernel.args = kernel.kernel_group.args
-            for node in nodes:
-                vars, reduction_vars = kernel.set_ranges(group, reduction_group, node.read_writes)
-                kernel.args.tile_row = kernel.tile_desc.n_row
-                kernel.args.tile_col = kernel.tile_desc.n_col
-                _, _, _, kernel.buffer_types = kernel.args.mlir_argdefs()
-                kernel.reduction_idx = {var: i for i, var in enumerate(reduction_vars)}
-                node.run(vars, reduction_vars)
-        src_code = self.codegen_kernel(kernel_name=kernel_name)
-        self.meta_kernel()
-
-        write_path = extension_codecache.get_write_path(src_code)
-        if not os.path.exists(write_path):
-            os.makedirs(write_path)
-        spike_write_path = os.path.join(write_path, "global_var.h")
-        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
-        if not os.path.exists(spike_write_path):
-            write_atomic(spike_write_path, self.header.getvalue())
-        if not os.path.exists(gem5_write_path):
-            write_atomic(gem5_write_path, self.gem5_header.getvalue())
-        return src_code
-
     def load(self, name: str, index: sympy.Expr):
         index = self.rename_indexing(index)
         indices = self.parse_indices(index)
@@ -1000,9 +872,9 @@ def store_reduction(self, name, index, value):
         # MVOUT Encoding
         dmaType = 3 # MVIN 2, MVIN2 1, MVIN3 14, MVOUT 3
         mm_stride = tile_col
-        is_col_major = MLIRTile.TILE_PER_LANE_ROW_WISE
+        is_col_major = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
         chunk_size = self.tile_desc.get_rows_per_lane()
-        chunk = chunk_size << 1 | (is_col_major == MLIRTile.TILE_PER_LANE_COL_WISE)
+        chunk = chunk_size << 1 | (is_col_major == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
         self.consts.add(dmaType)
         self.consts.add(mm_stride)
         self.consts.add(chunk)
@@ -1031,6 +903,9 @@ def template_store(options):
         self.compute.clear()
         self.stores.clear()
 
+    def codegen_global_init(self):
+        return self.global_vars
+
     def codegen_init(self):
         code = IndentedBuffer()
         tags = sorted(self.tags)
@@ -1046,8 +921,6 @@ def codegen_loops(self):
         # Loop body part
         tile_row, tile_col = self.tile_desc.n_row, self.tile_desc.n_col
         # FIXME.
-        #if (self.tiling_idx < self.reduction_depth and len(self.reduction_idx) > 0):
-        #    tile_row, tile_col = self.tile_desc.n_col, self.tile_desc.n_row
         tile_row = self.tile_desc.get_tile_size() if len(self.itervars) == 1 else tile_row
         loops = [LoopLevel(var, size, idx-len(self.itervars), tile_row=tile_row, tile_col=tile_col) for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))]
         loops, reductions = [LoopNest(loops[: self.reduction_depth]),
@@ -1082,43 +955,125 @@ def codegen_loops(self):
         code.writeline(f"return")
         return code
 
-    def codegen_kernel(self, kernel_name):
-        wrapper = V.graph.wrapper_code
-        arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs()
-        code = self._codegen_kernel(arg_defs, kernel_name)
-        return code.getvalue()
-
-    def meta_kernel(self):
-        wrapper = V.graph.wrapper_code
-        _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
-        wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')')
-        wrapper.add_import_once(f'\nfrom PyTorchSimFrontend.extension_codecache import CustomAsyncCompile')
-        wrapper.add_import_once(f'\ncustom_async_compile = CustomAsyncCompile()')
-        # Dump loop and load/store information
-        wrapper.add_import_once(f"arg_attributes = {arg_attributes}")
-
-
-    def call_kernel(self, kernel_name):
-        wrapper = V.graph.wrapper_code
-        _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
-       # generate the code to call this
-        wrapper.generate_kernel_call(kernel_name, call_args, cuda=False)
-
-    def _codegen_kernel(self, arg_defs, kernel_name):
-        arg_defs = ",\n".ljust(25).join(arg_defs)
-        code = common.BracesBuffer()
-
-        code.splice(self.global_vars)
-        #TODO:. kernel name custom
-        kernel_decl_name = kernel_name if V.graph.cpp_wrapper else "kernel"
-        code.writeline(f'func.func @{kernel_decl_name}({arg_defs})')
-        with code.indent():
-            for old, new in self.kernel_group.args.aliases():
-                code.writeline(f"auto {old} = {new};")
-            # Loop body part
-            code.splice(self.codegen_init())
-            code.splice(self.codegen_loops())
-        return code
+    def codegen_nodes(self, nodes, kernel_name):
+        src_code = super().codegen_nodes(nodes, kernel_name)
+
+        # Create extra header for simulatoors
+        write_path = extension_codecache.get_write_path(src_code)
+        if not os.path.exists(write_path):
+            os.makedirs(write_path)
+        spike_write_path = os.path.join(write_path, "global_var.h")
+        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
+        if not os.path.exists(spike_write_path):
+            write_atomic(spike_write_path, self.header.getvalue())
+        if not os.path.exists(gem5_write_path):
+            write_atomic(gem5_write_path, self.gem5_header.getvalue())
+        return src_code
+
+    def get_dma_info(self, name, index, dtype):
+        current_tile = mlir_common.MLIRTile(self.tile_desc.n_row, self.tile_desc.n_col, self.tile_desc.vector_lane, self.tile_desc.used_vector_lane)
+        cv = self.get_constant_vector(index)
+        cv2 = self.get_constant_vector2(index)
+        tile_size_per_lane = self.tile_desc.get_tile_size_per_lane()            # FIXME. move this
+        tile_size_per_lane = 2 if tile_size_per_lane==1 else tile_size_per_lane # Avoid scalar operation
+
+        if len(cv) != len(cv2) and len(cv2) == 3:
+            print("Mismatch! ", cv)
+            # FIXME. this is really shitty code :(
+            cv = cv2#[[1 if x[0] == 0 else x[0], x[1]] for x in cv]
+
+        # Case 0. Tile is 0-D scalar
+        if len(cv) == 0:
+            # Use only one vectorlane to handle scalar data
+            current_tile.n_row = 1
+            current_tile.n_col = 1
+            current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
+            current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
+            mm_stride, tile_size_per_lane = 1, 1
+            chunk_size = current_tile.get_chunk_size()
+        # Case 1. Tile is 1-D vector type
+        elif len(cv) == 1 and len(cv) <= self.reduction_depth:
+            current_tile.n_row = 1
+            current_tile.n_col = self.tile_desc.get_tile_size()
+            current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
+            current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case
+            chunk_size = current_tile.get_chunk_size()
+            mm_stride = current_tile.n_col
+        # Case 2. Tile is 1-D vector type with reduction
+        elif len(cv) == 1 and len(cv) == self.reduction_depth + 1:
+            # Use only one vectorlane to reduce a vector
+            current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
+            current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
+            current_tile.n_row = 1
+            current_tile.n_col = self.tile_desc.get_tile_size()
+            current_tile.used_vector_lane = 1
+            chunk_size = current_tile.get_chunk_size()
+            mm_stride = 0 # don't care
+        # Case 3. Tile is 2-D tile
+        elif len(cv) == 2:
+            is_reduction = self.reduction_depth == 1
+            if cv[0][0] != 0 and cv[1][0] != 0:
+                is_transposed = cv[0][0] < cv[1][0]
+                if is_transposed:
+                    current_tile.n_row = self.tile_desc.n_col
+                    current_tile.n_col = self.tile_desc.n_row
+                    mm_stride = self.ranges[0]
+                else:
+                    current_tile.n_row = self.tile_desc.n_row
+                    current_tile.n_col = self.tile_desc.n_col
+                    mm_stride = self.ranges[1]
+
+                if is_reduction and is_transposed:
+                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE
+                    current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
+                    chunk_size = current_tile.get_chunk_size()
+                elif is_reduction and not is_transposed:
+                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
+                    current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE
+                    chunk_size = current_tile.get_chunk_size()
+                elif not is_reduction and is_transposed:
+                    # Transposed case
+                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE
+                    current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE
+                    chunk_size = current_tile.get_chunk_size()
+                else: # not is_reduction and not is_transpose
+                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE if self.tile_desc.vector_lane_axis else mlir_common.MLIRTile.TILE_ROW_WISE
+                    current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
+                    chunk_size = current_tile.get_chunk_size()
+            else:
+                # Broadcast pattern
+                current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
+                mm_stride = 0
+                if cv[0][0] == 0:
+                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE if self.tile_desc.vector_lane_axis else mlir_common.MLIRTile.TILE_ROW_WISE
+                    current_tile.n_row = self.tile_desc.n_row
+                    current_tile.n_col = self.tile_desc.n_col
+                    chunk_size = current_tile.get_chunk_size()
+                else: # cv[1][0] == 0
+                    current_tile.n_row = self.tile_desc.n_col
+                    current_tile.n_col = self.tile_desc.n_row
+                    chunk_size = current_tile.get_cols_per_lane()
+                    if not is_reduction:
+                        current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE
+                        chunk_size = current_tile.n_col if self.tile_desc.vector_lane_axis else chunk_size
+        elif len(cv) == 3:
+            current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case
+            mm_stride = cv[-1][0]
+            # When current_tile.n_col stride is 1, we can access row vector
+            if mm_stride == 1:
+                current_tile.n_row = 1
+                current_tile.n_col = self.tile_desc.get_tile_size()
+            # if current_tile.n_col stride is not 1, we have to access in a column vector
+            else:
+                current_tile.n_row = self.tile_desc.get_tile_size()
+                current_tile.n_col = 1
+            chunk_size = current_tile.get_tile_size_per_lane()
+        else:
+            raise NotImplementedError()
+
+        #assert(not (dtype==torch.bool and chunk_size < 8))
+        chunk = chunk_size << 1 | (current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
+        return mm_stride, chunk, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
 
     def adjust_tile_size(self):
         if self.read_writes is not None:
@@ -1167,13 +1122,6 @@ def adjust_tile_size(self):
         if len(self.itervars) >= 3 and self.reduction_depth < len(self.itervars):
             raise NotImplementedError()
 
-    def set_ranges(self, lengths, reduction_lengths, read_writes):
-        ret = super().set_ranges(lengths, reduction_lengths, read_writes)
-
-        # Adjust time size when it is vector
-        self.adjust_tile_size()
-        return ret
-
     def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index):
         c_type = mlir_common.DTYPE_TO_C[dtype]
         mlir_type = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -1202,8 +1150,6 @@ def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape
         buffer = self.cse.generate(code_buffer, f"memref.get_global @{new_name}_spad : memref<{dram_tile_shape}x{mlir_type}, 1>")
         return buffer, indices
 
-
-
 @dataclasses.dataclass
 class LoopLevel:
     var: sympy.Expr
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index df44a810..a949cb5d 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -1,6 +1,7 @@
 import os
 import torch
 from torch._inductor.codegen import common
+from torch._inductor.codegen import cpp
 from torch._inductor.virtualized import V
 from torch._inductor.ir import MultiOutputLayout
 import sympy
@@ -191,6 +192,11 @@ def get_chunk_size(self):
     def div_round_up(size, round_val):
         return (size + round_val - 1) // round_val
 
+class MLIRWrapperKenrelGroup(cpp.KernelGroup):
+    def __init__(self):
+        super().__init__()
+        self.args = MLIRKernelArgs()
+
 class BaseMLIRHardwareInfo():
     def __init__(self):
         # Default HW setting
@@ -213,7 +219,7 @@ class BaseMLIRKernel(common.Kernel, BaseMLIRHardwareInfo):
 
     def __init__(self, args=None):
         super().__init__(args)
-        self.kernel_group = None
+        self.kernel_group : MLIRWrapperKenrelGroup = None
         # Kernel iteration range info
         self.call_ranges = None
         self.ranges = None
@@ -232,6 +238,8 @@ def __init__(self, args=None):
             tile_col = 8 # FIXME: tile_col is not always vector_lane * vlen
         self.tile_desc = MLIRTile(tile_row, tile_col, self.vector_lane)
         self.var_info = {} # MLIR variable info
+        self.buffer_types : dict = None
+        self.read_writes = None
 
     def set_ranges(self, lengths, reduction_lengths, read_writes):
         self.read_writes = read_writes
@@ -263,6 +271,70 @@ def store(self, name, index, value, mode=None):
     def reduction(self, dtype, src_dtype, reduction_type, value):
         raise NotImplementedError()
 
+    def codegen_global_init(self):
+        raise NotImplementedError()
+
+    def codegen_loops(self):
+        raise NotImplementedError()
+
+    def codegen_init(self):
+        raise NotImplementedError()
+
+    def call_kernel(self, kernel_name):
+        wrapper = V.graph.wrapper_code
+        _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
+       # generate the code to call this
+        wrapper.generate_kernel_call(kernel_name, call_args, cuda=False)
+
+    def codegen_nodes(self, nodes, kernel_name):
+        _, (group, reduction_group) = max(
+            nodes, key=lambda x: int(x.is_reduction())
+        ).group
+
+        self.set_ranges(group, reduction_group, None)
+        with self as kernel:
+            kernel.args = kernel.kernel_group.args
+            for node in nodes:
+                vars, reduction_vars = kernel.set_ranges(group, reduction_group, node.read_writes)
+                kernel.args.tile_row = kernel.tile_desc.n_row
+                kernel.args.tile_col = kernel.tile_desc.n_col
+                _, _, _, kernel.buffer_types = kernel.args.mlir_argdefs()
+                node.run(vars, reduction_vars)
+        src_code = self.codegen_kernel(kernel_name=kernel_name)
+        self.meta_kernel()
+        return src_code
+
+    def codegen_kernel(self, kernel_name):
+        arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs()
+        code = self._codegen_kernel(arg_defs, kernel_name)
+        return code.getvalue()
+
+    def _codegen_kernel(self, arg_defs, kernel_name):
+        arg_defs = ",\n".ljust(25).join(arg_defs)
+        code = common.BracesBuffer()
+
+        #TODO:. kernel name custom
+        kernel_decl_name = kernel_name if V.graph.cpp_wrapper else "kernel"
+
+        code.splice(self.codegen_global_init())
+        code.writeline(f'func.func @{kernel_decl_name}({arg_defs})')
+        with code.indent():
+            for old, new in self.kernel_group.args.aliases():
+                code.writeline(f"auto {old} = {new};")
+            # Loop body part
+            code.splice(self.codegen_init())
+            code.splice(self.codegen_loops())
+        return code
+
+    def meta_kernel(self):
+        wrapper = V.graph.wrapper_code
+        _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
+        wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')')
+        wrapper.add_import_once(f'\nfrom PyTorchSimFrontend.extension_codecache import CustomAsyncCompile')
+        wrapper.add_import_once(f'\ncustom_async_compile = CustomAsyncCompile()')
+        # Dump loop and load/store information
+        wrapper.add_import_once(f"arg_attributes = {arg_attributes}")
+
     def get_constant_vector(self, expr):
         constant_vector = [[int(expr.coeff(var)),None] for var in self.itervars]
         return constant_vector
@@ -303,6 +375,20 @@ def roundup_vectorlane(self, size, amp=1):
     def register_var_info(self, var, var_info):
         self.var_info[var] = var_info
 
+    def rename_indexing(self, index) -> sympy.Expr:
+        # adds the necessary kernel args for index expressions
+        # and renames variables in index expressions to kernel arg names
+        if isinstance(index, (list, tuple)):
+            return [self.rename_indexing(x) for x in index]
+        index = V.graph.sizevars.simplify(index)
+        sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name)
+        replacements = {
+            x: self.args.size(x)
+            for x in sorted_symbols
+            if x.name.startswith("s") or x.name.startswith("ps")
+        }
+        return sympy_subs(index, replacements)
+
     def __enter__(self):
         class CSEProxy:
             self.name = "CSEProxy"
@@ -407,16 +493,4 @@ def bucketize(
         self.exit_stack.enter_context(V.set_kernel_handler(self))
         return self
 
-    def rename_indexing(self, index) -> sympy.Expr:
-        # adds the necessary kernel args for index expressions
-        # and renames variables in index expressions to kernel arg names
-        if isinstance(index, (list, tuple)):
-            return [self.rename_indexing(x) for x in index]
-        index = V.graph.sizevars.simplify(index)
-        sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name)
-        replacements = {
-            x: self.args.size(x)
-            for x in sorted_symbols
-            if x.name.startswith("s") or x.name.startswith("ps")
-        }
-        return sympy_subs(index, replacements)
+
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index ea2005a8..752fa8b4 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -1,9 +1,8 @@
+import math
 from PyTorchSimFrontend import extension_config
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
-
 from torch._inductor import config
-from torch._inductor.codegen import cpp
 from torch._inductor.scheduler import BaseScheduling
 from torch._inductor.utils import IndentedBuffer
 from torch._inductor.virtualized import V
@@ -11,17 +10,12 @@
 from . import mlir_common
 from . import mlir_lowering
 
-class MLIRWrapperKenrelGroup(cpp.KernelGroup):
-    def __init__(self):
-        super().__init__()
-        self.args = mlir_common.MLIRKernelArgs()
-
 class MLIRScheduling(BaseScheduling):
     count = 0
     target_kernel = MLIRKernel
     def __init__(self, scheduler):
         self.scheduler = scheduler
-        self.kernel_group = MLIRWrapperKenrelGroup()
+        self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
         self._ready_to_flush = False
         self.outer_function = set()
         config.inplace_buffers = False # FIXME. inout kernel makes trouble.. So disabled it!
@@ -82,7 +76,7 @@ def codegen_sync(self):
 
     def flush(self):
         self.kernel_group.codegen_define_and_call(V.graph.wrapper_code)
-        self.kernel_group = MLIRWrapperKenrelGroup()
+        self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
         self._set_flush_status(False)
 
     def define_function(self, kernel):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index ec1340e7..f8e2b428 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -21,8 +21,8 @@
 from torch._inductor.virtualized import V
 
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
-from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo
-from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, MLIRTile
+from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo, MLIRTile
+from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
 from . import mlir_common
 

From a4b21f8bce36c5d4eaefeceaf365c4cd86bfc0b6 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 7 Jan 2025 12:06:28 +0000
Subject: [PATCH 005/432] [Fix] #90 Scalar-Vector operations

---
 .../mlir/mlir_codegen_backend.py               |  4 ++--
 tests/MoE/test_moe.py                          | 18 +++++++++---------
 tests/test_single_perceptron.py                |  5 +++--
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 99c39322..1c28a2a8 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -777,10 +777,10 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
             shape = f"vector<{self.tile_desc.get_tile_size()}x{type_name}>"
             reduced_shape = type_name
             init = self.cse.generate(self.reduction_prefix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
-            if len(self.ranges) == 1:
+            if len(self.ranges) == 1: # 1-D vector to scalar
                 axis = "0"
                 acc_var = init
-                shape = f"vector<{self.tile_desc.get_tile_size_per_lane()}x{type_name}>"
+                shape = f"vector<{self.tile_desc.get_tile_size()}x{type_name}>" # use single vector lane
             elif len(self.ranges) == 2:
                 vec_len = self.tile_desc.get_rows_per_lane()
                 flattened_size = f"vector<{self.tile_desc.get_tile_size_per_lane()}x{type_name}>"
diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py
index ff6dd00b..9a247669 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/MoE/test_moe.py
@@ -453,15 +453,15 @@ def test_moe(device):
     total_cpu_loss.backward()
     print("MoE Backward Done!")
 
-    print("MoE Weight Bias print")
-    for i in range(num_experts):
-        print(f"\nExpert {i}")
-        print(f"FC1 Weight: {model.experts[i].fc1.weight.cpu()}")
-        print(f"FC1 Bias: {model.experts[i].fc1.bias.cpu()}")
-        print("\n")
-        print(f"FC2 Weight: {model.experts[i].fc2.weight.cpu()}")
-        print(f"FC2 Bias: {model.experts[i].fc2.bias.cpu()}")
-        print("\n")
+    # print("MoE Weight Bias print")
+    # for i in range(num_experts):
+    #     print(f"\nExpert {i}")
+    #     print(f"FC1 Weight: {model.experts[i].fc1.weight.cpu()}")
+    #     print(f"FC1 Bias: {model.experts[i].fc1.bias.cpu()}")
+    #     print("\n")
+    #     print(f"FC2 Weight: {model.experts[i].fc2.weight.cpu()}")
+    #     print(f"FC2 Bias: {model.experts[i].fc2.bias.cpu()}")
+    #     print("\n")
 
     print("MoE Weight Bias Grad")
     for i in range(num_experts):
diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py
index 7ab02656..78a6b117 100644
--- a/tests/test_single_perceptron.py
+++ b/tests/test_single_perceptron.py
@@ -41,13 +41,14 @@ def weight_update(a, b, lr):
     b2.requires_grad = True
     opt_mlp = torch.compile(dynamic=False)(perceptron)
     opt_w = torch.compile(dynamic=False)(weight_update)
-    opt_loss = torch.compile(dynamic=False)(torch.nn.MSELoss())
+    loss_fn = torch.nn.MSELoss()
+    opt_loss = torch.compile(dynamic=False)(loss_fn)
     lr = torch.tensor(5e-2).to(device=device) # learning rate
     y = opt_mlp(w1, x1, b1)
     loss = opt_loss(y, y1)
     loss.backward()
     cpu_y = perceptron(x2, w2, b2)
-    cpu_loss = torch.nn.MSELoss()(cpu_y, y2)
+    cpu_loss = loss_fn(cpu_y, y2)
     cpu_loss.backward()
     test_result("Perceptron", y, cpu_y)
     test_result("Loss", loss, cpu_loss)

From 638b714adf1e505c98c3b126ed54e75b88563d90 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 8 Jan 2025 10:23:29 +0000
Subject: [PATCH 006/432] [Frontend] #89 Efficient vector tile size

---
 .../mlir/mlir_codegen_backend.py              | 37 ++++++++++++++++++-
 PyTorchSimFrontend/mlir/mlir_common.py        |  3 ++
 tests/MoE/test_moe.py                         |  8 ++--
 tests/test_softmax.py                         | 24 ++++++++++++
 4 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 1c28a2a8..b9a5bb6f 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -638,6 +638,35 @@ def set_ranges(self, lengths, reduction_lengths, read_writes):
         # Adjust time size when it is vector
         self.adjust_tile_size()
         return ret
+    def get_constant_vector2(self, expr):
+        # Case 0. symbol ex) index 0
+        # Case 1. inner product form ex) 16 * index0 + 1 * index1
+        # Case 2. Complicated form ex) 16 * index0 + 8 * (index//4) + (index % 4)
+        constant_vector = []
+        if expr.is_symbol:
+            constant_vector.append(tuple([1, expr]))
+            return constant_vector
+
+        for arg in expr.args:
+            if arg.is_symbol:
+                constant_vector.append(tuple([1,arg]))
+                continue
+            if len(arg.args) == 0: #TODO: check this
+                continue
+            if arg.args[0].is_number:
+                constant_vector.append(arg.args)
+            else:
+                constant_vector.append([1, arg])
+
+        return constant_vector
+
+    def find_node_by_name(self, name):
+        if name in V.graph.graph_inputs:
+            return V.graph.graph_inputs[name]
+        else:
+            for output_node in V.graph.graph_outputs:
+                if output_node.data.name == name:
+                    return output_node
 
     def parse_indices(self, expr):
         if len(expr.args) == 0:
@@ -999,6 +1028,9 @@ def get_dma_info(self, name, index, dtype):
             current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case
             chunk_size = current_tile.get_chunk_size()
             mm_stride = current_tile.n_col
+            if self.is_scalar(name): # scalar to vector broadcasting
+                mm_stride = 0
+                current_tile.n_row, current_tile.n_col = current_tile.n_col, current_tile.n_row
         # Case 2. Tile is 1-D vector type with reduction
         elif len(cv) == 1 and len(cv) == self.reduction_depth + 1:
             # Use only one vectorlane to reduce a vector
@@ -1009,6 +1041,9 @@ def get_dma_info(self, name, index, dtype):
             current_tile.used_vector_lane = 1
             chunk_size = current_tile.get_chunk_size()
             mm_stride = 0 # don't care
+            tile_size_per_lane = current_tile.get_tile_size_per_lane()
+            if self.is_scalar(name): # scalar to vector broadcasting
+                current_tile.n_row, current_tile.n_col = current_tile.n_col, current_tile.n_row
         # Case 3. Tile is 2-D tile
         elif len(cv) == 2:
             is_reduction = self.reduction_depth == 1
@@ -1094,7 +1129,7 @@ def adjust_tile_size(self):
 
         # Case 1. vector kernel
         if len(self.itervars) == 1:
-            self.tile_desc.n_col = self.tile_desc.get_tile_size()
+            self.tile_desc.n_col = self.tile_desc.get_tile_size() if self.tile_desc.get_tile_size() < self.ranges[0] else self.ranges[0] # effective tile size
             self.tile_desc.n_row = 1
         elif len(self.itervars) == 0:
             self.tile_desc.n_col = 1
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index a949cb5d..21612a4c 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -369,6 +369,9 @@ def find_node_by_name(self, name):
                 if output_node.data.name == name:
                     return output_node
 
+    def is_scalar(self, name):
+        return self.buffer_types[name][1] == 1
+
     def roundup_vectorlane(self, size, amp=1):
         return ((size + self.vector_lane - 1) // self.vector_lane) * self.vector_lane * amp
 
diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py
index 9a247669..d14bf5c6 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/MoE/test_moe.py
@@ -420,15 +420,15 @@ def test_moe(device):
     x1 = copy.deepcopy(X).to(device=device)
     x2 = copy.deepcopy(X).to("cpu")
 
-    # model.train()
-    model.eval()
+    model.train()
+    # model.eval()
     model_device = model.to(device=device)
     opt_model = torch.compile(model_device, dynamic=False)
     y_hat, aux_loss = opt_model(x1)
     print("MoE Custom Device Done!")
 
-    # model_cpu.train()
-    model_cpu.eval()
+    model_cpu.train()
+    # model_cpu.eval()
     cpu_hat, cpu_aux_loss = model_cpu(x2)
     test_result("MoE Forward", y_hat, cpu_hat)
     test_result("MoE Aux Loss", aux_loss, cpu_aux_loss)
diff --git a/tests/test_softmax.py b/tests/test_softmax.py
index ca49953c..d68638f8 100644
--- a/tests/test_softmax.py
+++ b/tests/test_softmax.py
@@ -18,6 +18,29 @@ def test_softmax(device, size=(128, 128), dim=1):
     input = torch.randn(size)
     x1 = input.to(device=device)
     x2 = input.to("cpu")
+
+    # split softmax into 3 steps
+    # def softmax1(x): # find max
+    #     return x.max(dim=dim, keepdim=True).values
+    # def softmax2(x, max):
+    #     return (x - max).exp().sum(dim=dim, keepdim=True)
+    # def softmax3(x, max, sum):
+    #     return (x - max).exp().div(sum)
+
+    # opt_fn1 = torch.compile(dynamic=False)(softmax1)
+    # opt_fn2 = torch.compile(dynamic=False)(softmax2)
+    # opt_fn3 = torch.compile(dynamic=False)(softmax3)
+
+    # max = opt_fn1(x1)
+    # cpu_max = softmax1(x2)
+    # test_result("Softmax Max", max, cpu_max)
+    # sum = opt_fn2(x1, max)
+    # cpu_sum = softmax2(x2, cpu_max)
+    # test_result("Softmax Sum", sum, cpu_sum)
+    # y = opt_fn3(x1, max, sum)
+    # cpu_y = softmax3(x2, cpu_max, cpu_sum)
+    # test_result("Softmax", y, cpu_y)
+
     opt_fn = torch.compile(dynamic=False)(torch.nn.functional.softmax)
     y = opt_fn(x1, dim=dim)
     cpu_y = torch.nn.functional.softmax(x2, dim=dim)
@@ -33,3 +56,4 @@ def test_softmax(device, size=(128, 128), dim=1):
     device = module.custom_device()
     test_softmax(device, size=(64, 128))
     test_softmax(device, size=(256, 128))
+    test_softmax(device, size=(1, 16))

From 6bd61bad1321d901baae2de3235bad9df8d97eea Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 9 Jan 2025 10:37:51 +0000
Subject: [PATCH 007/432] [Frontend] minimum 1D tile size

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index b9a5bb6f..12ede0d3 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1129,7 +1129,11 @@ def adjust_tile_size(self):
 
         # Case 1. vector kernel
         if len(self.itervars) == 1:
-            self.tile_desc.n_col = self.tile_desc.get_tile_size() if self.tile_desc.get_tile_size() < self.ranges[0] else self.ranges[0] # effective tile size
+            tile_size = self.tile_desc.get_tile_size()
+            if tile_size < self.ranges[0]:
+                tile_size = self.ranges[0]
+                min_tile_size_unit = self.vector_lane * self.vlen # VCIX widening is not implemented
+                self.tile_desc.n_col = (tile_size + min_tile_size_unit - 1) // min_tile_size_unit
             self.tile_desc.n_row = 1
         elif len(self.itervars) == 0:
             self.tile_desc.n_col = 1

From 65ba39764c6185275e199ce7fdd3e12357edacd2 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 15 Jan 2025 03:43:45 +0000
Subject: [PATCH 008/432] [Frontend] various padding type

---
 .../mlir/mlir_codegen_backend.py              | 47 +++++--------------
 1 file changed, 13 insertions(+), 34 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 12ede0d3..c63f53bb 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -638,35 +638,15 @@ def set_ranges(self, lengths, reduction_lengths, read_writes):
         # Adjust time size when it is vector
         self.adjust_tile_size()
         return ret
-    def get_constant_vector2(self, expr):
-        # Case 0. symbol ex) index 0
-        # Case 1. inner product form ex) 16 * index0 + 1 * index1
-        # Case 2. Complicated form ex) 16 * index0 + 8 * (index//4) + (index % 4)
-        constant_vector = []
-        if expr.is_symbol:
-            constant_vector.append(tuple([1, expr]))
-            return constant_vector
-
-        for arg in expr.args:
-            if arg.is_symbol:
-                constant_vector.append(tuple([1,arg]))
-                continue
-            if len(arg.args) == 0: #TODO: check this
-                continue
-            if arg.args[0].is_number:
-                constant_vector.append(arg.args)
-            else:
-                constant_vector.append([1, arg])
-
-        return constant_vector
 
-    def find_node_by_name(self, name):
-        if name in V.graph.graph_inputs:
-            return V.graph.graph_inputs[name]
-        else:
-            for output_node in V.graph.graph_outputs:
-                if output_node.data.name == name:
-                    return output_node
+    # padding type 0: zero-padding 1: negative-padding(-inf) ...
+    def get_padding_type(self):
+        ops = self.current_node.node.origins
+        if self.current_node.is_reduction():
+            for op in ops:
+                if "exp" in op.name: # exponential reduciton case
+                    return 1
+        return 0
 
     def parse_indices(self, expr):
         if len(expr.args) == 0:
@@ -699,6 +679,7 @@ def parse_indices(self, expr):
     def load(self, name: str, index: sympy.Expr):
         index = self.rename_indexing(index)
         indices = self.parse_indices(index)
+        padding = self.get_padding_type()
         prefix = self.newvar_prefix
         if index.is_number:
             prefix = prefix + "c"
@@ -725,7 +706,7 @@ def load(self, name: str, index: sympy.Expr):
             self.dma_cache[dma_key] = dmaType, stride, chunk
         self.tags.add(f"{name}_tag")
         self.consts.add(0)
-        code = f"affine.dma_start %{var}[{prefix}{indices}], %{buffer}[%c0, %c0], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
+        code = f"affine.dma_start %{var}[{prefix}{indices}], %{buffer}[%c0, %c0], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32> {{padding = {padding}}}"
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
@@ -1129,11 +1110,9 @@ def adjust_tile_size(self):
 
         # Case 1. vector kernel
         if len(self.itervars) == 1:
-            tile_size = self.tile_desc.get_tile_size()
-            if tile_size < self.ranges[0]:
-                tile_size = self.ranges[0]
-                min_tile_size_unit = self.vector_lane * self.vlen # VCIX widening is not implemented
-                self.tile_desc.n_col = (tile_size + min_tile_size_unit - 1) // min_tile_size_unit
+            tile_size = self.tile_desc.get_tile_size() if self.tile_desc.get_tile_size() < self.ranges[0] else self.ranges[0]
+            min_tile_size_unit = self.vector_lane * self.vlen # TODO: VCIX widening is not implemented
+            self.tile_desc.n_col = math.ceil(tile_size / min_tile_size_unit) * min_tile_size_unit # padding
             self.tile_desc.n_row = 1
         elif len(self.itervars) == 0:
             self.tile_desc.n_col = 1

From 950912484767af23f06e4c4db7f9048c0d53d4cf Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Fri, 10 Jan 2025 11:19:13 +0000
Subject: [PATCH 009/432] [Frontend] fusion basic case debug

---
 .../mlir/mlir_codegen_backend.py              | 22 ++++++-------
 PyTorchSimFrontend/mlir/mlir_scheduling.py    | 23 ++++++++++---
 PyTorchSimFrontend/mlir/mlir_template.py      | 32 ++++++++++---------
 test_extension_backend.py                     |  1 +
 4 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index c63f53bb..b4e2d00b 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -12,6 +12,7 @@
 from collections import OrderedDict
 import torch
 from torch._inductor.codegen import cpp, wrapper, common
+from torch._inductor.scheduler import BaseScheduling
 from torch._inductor.virtualized import V, _ops as ops
 from torch._inductor.codecache import write_atomic, write
 from torch._inductor.utils import (
@@ -903,7 +904,7 @@ def template_store(options):
                    f": memref<{options['TILE_M']}x{options['TILE_N']}xf32, 1>,"\
                    f"memref<{options['M'] * options['N']}xf32>, memref<1xi32>" #FIXME: Using constant index
             self.cse.generate(self.stores, line, assignment = False)
-        self.body.splice(self.codegen_init())
+        self.body.splice(self.codegen_init('e_'))
         self.body.splice(self.loads)
         self.body.splice(self.compute)
         if len(self.stores._lines) == 0:
@@ -916,14 +917,14 @@ def template_store(options):
     def codegen_global_init(self):
         return self.global_vars
 
-    def codegen_init(self):
+    def codegen_init(self, prefix=""):
         code = IndentedBuffer()
         tags = sorted(self.tags)
         consts = sorted(self.consts)
         for tag in tags:
-            code.writeline(f"%{tag} = memref.alloc() : memref<1xi32>")
+            code.writeline(f"%{prefix}{tag} = memref.alloc() : memref<1xi32>")
         for const in consts:
-            code.writeline(f"%c{const} = arith.constant {const} : index")
+            code.writeline(f"%{prefix}c{const} = arith.constant {const} : index")
         return code
 
     def codegen_loops(self):
@@ -1140,17 +1141,14 @@ def adjust_tile_size(self):
         if len(self.itervars) >= 3 and self.reduction_depth < len(self.itervars):
             raise NotImplementedError()
 
-    def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index):
+    def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index, is_template=False):
         c_type = mlir_common.DTYPE_TO_C[dtype]
         mlir_type = mlir_common.DTYPE_TO_MLIR[dtype]
         # Make sure each lane's buffer has at least two element
         tile_size = max(self.roundup_vectorlane(tile_row * tile_col), self.vector_lane * 2)
-        if dtype == torch.bool and not self.is_template_kernel:     #FIXME: epilogue ReLU does not need this
-            if self.is_template_kernel:
-                mapping = f"template_{indices} "
-                self.map_cse.generate(self.global_vars, f"#{mapping} = affine_map<({indices}) -> ({indices} floordiv 8)>", assignment=False)
-            else:
-                mapping = self.map_cse.generate(self.global_vars, f"affine_map<({indices}) -> ({indices} floordiv 8)>")
+
+        if dtype == torch.bool and not is_template:
+            mapping = self.map_cse.generate(self.global_vars, f"affine_map<({indices}) -> ({indices} floordiv 8)>")
             indices = self.cse.generate(self.loads, f"affine.apply #{mapping}(%{indices})") # FIXME. Only loads?
 
         if name not in self.global_vars_dict:
@@ -1210,4 +1208,4 @@ def mark_parallel(self, par_depth):
         loops[0].parallel = par_depth
         for i in range(1, par_depth):
             loops[i].collapsed = True
-        loops[0].simd = loops[par_depth - 1].simd
+        loops[0].simd = loops[par_depth - 1].simd
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 752fa8b4..7425728c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -3,7 +3,7 @@
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
 from torch._inductor import config
-from torch._inductor.scheduler import BaseScheduling
+from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode
 from torch._inductor.utils import IndentedBuffer
 from torch._inductor.virtualized import V
 
@@ -24,15 +24,30 @@ def _set_flush_status(self, status: bool):
         self._ready_to_flush = status
 
     def can_fuse_vertical(self, node1, node2):
-        return False
         return self.can_fuse_horizontal(node1, node2) and not node1.is_reduction()
 
     def can_fuse_horizontal(self, node1, node2):
-        return False
         _, (vars1, reduce1) = node1.group
         _, (vars2, reduce2) = node2.group
+
+        # Reduction is currently not supported
+        if node1.is_reduction() or node2.is_reduction():
+            return False
+
+        if not isinstance(node1, FusedSchedulerNode) and not isinstance(node2, FusedSchedulerNode):
+            # Different layout is not supported
+            if node1.node.layout.dtype != node2.node.layout.dtype:
+                return False
+
+            # Different size is not supported for non-template node
+            if  not node1.is_template() and (node1._sizes[0] != node2._sizes[0]):
+                return False
+
         if vars1 == vars2 and reduce1 == reduce2:
             return True
+        if reduce1 == () and vars1 == vars2 + reduce2:
+            return True
+
         #TODO: Temporary solution determining the fusion condition similar to CPP/OpenMP
         v1_total = math.prod(vars1) if len(vars1) else 0
         v2_total = math.prod(vars2) if len(vars2) else 0
@@ -40,8 +55,8 @@ def can_fuse_horizontal(self, node1, node2):
         r2_total = math.prod(reduce2) if len(reduce2) else 0
         if reduce1 == () \
             and v1_total == (v2_total + r2_total):
-            # and node1.node.layout.size == node2.node.layout.size:     #FIXME: Need to check layout too?
             return True
+
         return False
 
     def group_fn(self, sizes):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index f8e2b428..c45c5d29 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -8,13 +8,8 @@
 from typing import List, Optional
 from unittest.mock import patch
 
-from torch._inductor.codegen.common import KernelTemplate
-from torch._inductor.codegen.common import ChoiceCaller
-from torch._inductor.codegen.common import Kernel
-from torch._inductor.codegen.common import OpOverrides
-from torch._inductor.ir import Buffer
-from torch._inductor.ir import IRNode
-from torch._inductor.ir import TemplateBuffer
+from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides
+from torch._inductor.ir import Buffer, IRNode, TemplateBuffer
 from torch._inductor.select_algorithm import PartialRender
 from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
 from torch._inductor.autotune_process import TensorMeta
@@ -250,11 +245,14 @@ def render(self, template, kwargs):
         )
 
     def adjust_tile_size(self):
+        # Fixed tile size for template kernel
+        self.tile_desc.tile_layout = MLIRTile.TILE_COL_WISE
         self.tile_desc.n_row = self.render_options['TILE_M']
         self.tile_desc.n_col = self.render_options['TILE_N']
         return
 
     def load_epilogue(self, name: str, index: sympy.Expr):
+        indices = self.parse_indices(index)
         index = self.rename_indexing(index)
         var = self.args.input(name)
         dtype = V.graph.get_dtype(name)
@@ -263,23 +261,24 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         if name in self.buffer_names:
             buffer = self.buffer_names[name]
         else:
-            dram_mlir_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
+            # dram_mlir_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
             mvin3 = 14
             self.consts.add(mvin3)
-            self.consts.add(0)
             dram_tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
-            buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.loads, index)
+            buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.loads, indices, index)
             self.buffer_names[name] = buffer
-            line = f"affine.dma_start %{var}[%index2], %{buffer}[%c0, %c0], %tag[0], %c{mvin3}, %N, %c_set : {dram_mlir_shape}, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
+            # line = f"affine.dma_start %{var}[%index2], %{buffer}[%e_c0, %e_c0], %tag[0], %e_c{mvin3}, %N, %c_set : {dram_mlir_shape}, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
+            line = f"affine.dma_start %{var}[%index2], %{buffer}[%e_c0, %e_c0], %tag[0], %e_c{mvin3}, %N, %c_set : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
             self.cse.generate(self.loads, line, assignment = False)
 
         tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
         operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
         shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 else ""
-        line = f"{operation} %{buffer}[0, 0] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
+        line = f"{operation} %{buffer}[%e_c0, %e_c0] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
         out = self.cse.generate(self.loads, line)
         var_info = [tile_size_per_lane, mlir_common.DTYPE_TO_MLIR[dtype]]
         self.register_var_info(out, var_info)
+        self.consts.add(0)
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
@@ -292,7 +291,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
 
-        chunk_size = self.tile_desc.get_chunk_size()
+        chunk_size = 1  # Fixed for template kernel
         chunk = chunk_size << 1 | (self.tile_desc.tile_per_lane_layout == MLIRTile.TILE_PER_LANE_COL_WISE)
         self.consts.add(chunk)
 
@@ -306,14 +305,17 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
         operation = "affine.vector_store" if tile_size_per_lane > 1 else "affine.store"
         shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 else ""
-        line = f"{operation} %{value}, %{buffer}[0, 0] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
+        line = f"{operation} %{value}, %{buffer}[%e_c0, %e_c0] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
         self.cse.generate(self.stores, line, assignment = False)
 
         self.tags.add(f"{name}_tag")
         self.consts.add(0)
-        code = f"affine.dma_start %{buffer}[%c0, %c0], %{var}[%index2], %tag[0], %c_mvout, %N, %c{chunk} : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>, memref<{self.render_options['M'] * self.render_options['N']}x{type_name}>, memref<1xi32>" #FIXME: Using constant index and tag
+        code = f"affine.dma_start %{buffer}[%e_c0, %e_c0], %{var}[%index2], %tag[0], %c_mvout, %N, %e_c{chunk} : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>, memref<{self.render_options['M'] * self.render_options['N']}x{type_name}>, memref<1xi32>" #FIXME: Using constant index and tag
         self.cse.generate(self.stores, code, assignment = False)
 
+    def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index):
+        return super().get_scratchpad_buffer(dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index, True)
+
 class MLIRTemplateCaller(CUDATemplateCaller):
     def __str__(self):
         return f"MLIRTemplateCaller(source_file={self.bmreq.source_file})"
diff --git a/test_extension_backend.py b/test_extension_backend.py
index 6c5429c7..bee28729 100644
--- a/test_extension_backend.py
+++ b/test_extension_backend.py
@@ -50,5 +50,6 @@
 
     # # Fusion Test
     test_matmul_scalar(device)
+    test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="relu")
     test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid")
     test_addmm_residual(device)

From 4a8ea90f9e92cafe383f2dbb3e4e9d06352c612c Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Tue, 14 Jan 2025 07:01:30 +0000
Subject: [PATCH 010/432] [Frontend] remove unused code

---
 PyTorchSimFrontend/mlir/mlir_template.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index c45c5d29..2c1678a2 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -261,13 +261,11 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         if name in self.buffer_names:
             buffer = self.buffer_names[name]
         else:
-            # dram_mlir_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
             mvin3 = 14
             self.consts.add(mvin3)
             dram_tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
             buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.loads, indices, index)
             self.buffer_names[name] = buffer
-            # line = f"affine.dma_start %{var}[%index2], %{buffer}[%e_c0, %e_c0], %tag[0], %e_c{mvin3}, %N, %c_set : {dram_mlir_shape}, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
             line = f"affine.dma_start %{var}[%index2], %{buffer}[%e_c0, %e_c0], %tag[0], %e_c{mvin3}, %N, %c_set : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
             self.cse.generate(self.loads, line, assignment = False)
 

From ec55905487a73e35cd5aa85d22b4aa651e7e76d4 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 15 Jan 2025 06:04:04 +0000
Subject: [PATCH 011/432] [Frontend] block convolution fusion

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 7425728c..655d2944 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -30,6 +30,10 @@ def can_fuse_horizontal(self, node1, node2):
         _, (vars1, reduce1) = node1.group
         _, (vars2, reduce2) = node2.group
 
+        # Convolution is currently not supported
+        if node1.node.origin_node.target._name == 'aten::convolution' or node2.node.origin_node.target._name == 'aten::convolution':
+            return False
+
         # Reduction is currently not supported
         if node1.is_reduction() or node2.is_reduction():
             return False

From 8919704162539ee4a77e3cca69db50278e671628 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 15 Jan 2025 12:29:57 +0000
Subject: [PATCH 012/432] [Frontend] Extend to_dtype operation

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index b4e2d00b..d9d3d1da 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -226,11 +226,20 @@ def to_dtype(operand, dst_mlir_dtype, *args, var_info=None):
             raise NotImplementedError("floating point to integer conversion")
         if dst_mlir_dtype[0] == "f" and src_mlir_dtype[0] == "i":
             raise NotImplementedError("integer to floating point conversion")
-        else:
+        if dst_mlir_dtype[0] == "i":
+            if dst_bits > src_bits:
+                return f"arith.extui %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
+            elif dst_bits < src_bits:
+                return f"arith.trunc %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
+            return f"arith.maximumi %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype]
+        elif dst_mlir_dtype[0] == "f":
             if dst_bits > src_bits:
-                return f"arith.extui %{operand} : {src_shape} to {shape}"
+                return f"arith.extf %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
             elif dst_bits < src_bits:
-                return f"arith.trunc %{operand} : {src_shape} to {shape}"
+                return f"arith.trunf %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
+            return f"arith.maximumf %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype]
+        else:
+            raise NotImplementedError("Unsupported type for to_dtype ops")
 
     @staticmethod
     def constant(value, src_type, *args, var_info=None):
@@ -602,7 +611,7 @@ def broadcast(operand1, operand2, *args, var_info=None):
     "MVIN1": 2,
     "MVIN2": 1,
     "MVIN3": 14,
-    "MVOUT": 3,
+    "MVOUT1": 3,
 }
 
 class MLIRKernel(mlir_common.BaseMLIRKernel):

From c411add51693af57a59edf4fb01f46ffcda74722 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 15 Jan 2025 12:59:26 +0000
Subject: [PATCH 013/432] [Frontend] Add common local variable gen

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  |  1 +
 .../mlir/mlir_codegen_backend.py              | 98 ++++++++++---------
 PyTorchSimFrontend/mlir/mlir_common.py        |  4 -
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  1 +
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  1 +
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  1 +
 PyTorchSimFrontend/mlir/mlir_template.py      | 46 ++++++---
 7 files changed, 86 insertions(+), 66 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index cd99d52e..5a68947e 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -35,6 +35,7 @@
   %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
   %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>
+  {{- kernel.def_local_vars() }}
 
   affine.for %b=0 to {{ B }} {
     affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d9d3d1da..2f4deca3 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -620,6 +620,8 @@ class MLIRKernel(mlir_common.BaseMLIRKernel):
 
     def __init__(self):
         super().__init__(mlir_common.MLIRKernelArgs())
+        self.const_buffer = IndentedBuffer()
+        self.alloc_buffer = IndentedBuffer()
         self.reduction_prefix = IndentedBuffer()
         self.reduction_suffix = IndentedBuffer()
         self.body = IndentedBuffer()
@@ -632,16 +634,17 @@ def __init__(self):
         self.iterator_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="iter")
         self.init_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="init")
         self.init_vec_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="init_vec")
+        self.const_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="const")
+        self.alloc_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="alloc")
         self.map_cse = common.CSE("#", self.suffix, name_prefix="map")
-        self.consts = set()
-        self.tags = set()
+        self.consts = dict()
+        self.tags = dict()
         self.dma_cache = {}
         self.dma_counter = 1
         self.affine_yield = {}
         self.welford_reduce_out = None
         self.reduce_iterator = {}
         self.is_template_kernel = False
-
     def set_ranges(self, lengths, reduction_lengths, read_writes):
         ret = super().set_ranges(lengths, reduction_lengths, read_writes)
 
@@ -658,7 +661,12 @@ def get_padding_type(self):
                     return 1
         return 0
 
-    def parse_indices(self, expr):
+    def parse_indices(self, expr) -> common.CSEVariable:
+        # Constant case
+        if expr.is_number:
+            return self.get_const_cse(int(expr))
+
+        # Identity case
         if len(expr.args) == 0:
             return expr
 
@@ -670,9 +678,11 @@ def parse_indices(self, expr):
             indices[index] = None
         indices = list(indices.keys())
 
-        args = ", ".join(map(str, indices))
+        # Extract // pattern
         if "//" in expr_str:
             expr_str = expr_str.replace("//", " floordiv ")
+
+        # Extract modular pattern
         pattern = r"ModularIndexing\((.*?)\)"
         matches = re.search(pattern, expr_str)
         if matches:
@@ -681,6 +691,7 @@ def parse_indices(self, expr):
             replace_str = f"({args_list[0]} floordiv {args_list[1]}) mod {args_list[2]}"
             expr_str = re.sub(r"ModularIndexing\([^)]*\)", replace_str, expr_str)
 
+        args = ", ".join(map(str, indices))
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
         index = self.cse.generate(self.loads, f"affine.apply #{map_var}({args})")
@@ -691,9 +702,6 @@ def load(self, name: str, index: sympy.Expr):
         indices = self.parse_indices(index)
         padding = self.get_padding_type()
         prefix = self.newvar_prefix
-        if index.is_number:
-            prefix = prefix + "c"
-            self.consts.add(int(index))
         var = self.args.input(name)
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -710,13 +718,13 @@ def load(self, name: str, index: sympy.Expr):
             assert(self.dma_counter < 4)
             dmaType = DMA_TYPE[f"MVIN{self.dma_counter}"]
             self.dma_counter += 1
-            self.consts.add(dmaType)
-            self.consts.add(stride)
-            self.consts.add(chunk)
+            dmaType = self.get_const_cse(dmaType)
+            stride = self.get_const_cse(stride)
+            chunk = self.get_const_cse(chunk)
             self.dma_cache[dma_key] = dmaType, stride, chunk
-        self.tags.add(f"{name}_tag")
-        self.consts.add(0)
-        code = f"affine.dma_start %{var}[{prefix}{indices}], %{buffer}[%c0, %c0], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32> {{padding = {padding}}}"
+        tag_var = self.get_tag_cse(f"{name}_tag")
+        zero_var = self.get_const_cse(0)
+        code = f"affine.dma_start %{var}[{prefix}{indices}], %{buffer}[%{zero_var}, %{zero_var}], %{tag_var}[0], %{dmaType}, %{stride}, %{chunk} : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32> {{padding = {padding}}}"
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
@@ -731,9 +739,6 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         index = self.rename_indexing(index)
         indices = self.parse_indices(index)
         prefix = self.newvar_prefix
-        if index.is_number:
-            prefix = prefix + "c"
-            self.consts.add(int(index))
         var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -745,9 +750,9 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         # MVOUT Encoding
         dmaType = 3 # MVIN 2, MVIN2 1, MVIN3 14, MVOUT 3
-        self.consts.add(dmaType)
-        self.consts.add(stride)
-        self.consts.add(chunk)
+        dmaType = self.get_const_cse(dmaType)
+        stride = self.get_const_cse(stride)
+        chunk = self.get_const_cse(chunk)
 
         store_size, operand_type = self.var_info[value]
         operation = "affine.vector_store" if tile_size_per_lane > 1 and store_size > 1 else "affine.store"
@@ -757,9 +762,9 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         line = f"{operation} %{value}, %{buffer}[0, 0] : memref<{dram_tile_shape}x{type_name}, 1>{shape}"
         self.cse.generate(self.stores, line, assignment = False)
-        self.consts.add(0)
-        self.tags.add(f"{name}_tag")
-        code = f"affine.dma_start %{buffer}[%c0, %c0], %{var}[{prefix}{indices}], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{dram_tile_shape}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
+        tag_var = self.get_tag_cse(f"{name}_tag")
+        zero_var = self.get_const_cse(0)
+        code = f"affine.dma_start %{buffer}[%{zero_var}, %{zero_var}], %{var}[{prefix}{indices}], %{tag_var}[0], %{dmaType}, %{stride}, %{chunk} : memref<{dram_tile_shape}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
         self.cse.generate(self.stores, code, assignment = False)
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
@@ -842,10 +847,6 @@ def store_reduction(self, name, index, value):
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
         index = self.rename_indexing(index)
         indices = self.parse_indices(index)
-        prefix = self.newvar_prefix
-        if index.is_number:
-            prefix = prefix + "c"
-            self.consts.add(int(index))
         # Tile is always reuduced in inner loop
         tile_col = self.tile_desc.n_row
         tile_row = 1
@@ -895,25 +896,25 @@ def store_reduction(self, name, index, value):
         is_col_major = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
         chunk_size = self.tile_desc.get_rows_per_lane()
         chunk = chunk_size << 1 | (is_col_major == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
-        self.consts.add(dmaType)
-        self.consts.add(mm_stride)
-        self.consts.add(chunk)
-        self.tags.add(f"{name}_tag")
+        dmaType = self.get_const_cse(dmaType)
+        mm_stride = self.get_const_cse(mm_stride)
+        chunk = self.get_const_cse(chunk)
+
         # Change row, col
-        self.consts.add(0)
-        code = f"affine.dma_start %{buffer}[%c0, %c0], %{var}[{prefix}{indices}], %{name}_tag[0], %c{dmaType}, %c{mm_stride}, %c{chunk} : memref<{tile_row}x{tile_col}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
+        tag_var = self.get_tag_cse(f"{name}_tag")
+        zero_var = self.get_const_cse(0)
+        code = f"affine.dma_start %{buffer}[%{zero_var}, %{zero_var}], %{var}[%{indices}], %{tag_var}[0], %{dmaType}, %{mm_stride}, %{chunk} : memref<{tile_row}x{tile_col}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
     def codegen_body(self):
         def template_store(options):
             subtile_size = [self.vector_lane, self.vector_lane]
             async_flag = 1
-            self.consts.add(0)
-            line = f"affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set"\
+            zero_var = self.get_const_cse(0)
+            line = f"affine.dma_start %Y_buffer[%{zero_var}, %{zero_var}], %Y[%index2], %tag[0], %c_mvout, %N, %c_set"\
                    f": memref<{options['TILE_M']}x{options['TILE_N']}xf32, 1>,"\
                    f"memref<{options['M'] * options['N']}xf32>, memref<1xi32>" #FIXME: Using constant index
             self.cse.generate(self.stores, line, assignment = False)
-        self.body.splice(self.codegen_init('e_'))
         self.body.splice(self.loads)
         self.body.splice(self.compute)
         if len(self.stores._lines) == 0:
@@ -926,16 +927,6 @@ def template_store(options):
     def codegen_global_init(self):
         return self.global_vars
 
-    def codegen_init(self, prefix=""):
-        code = IndentedBuffer()
-        tags = sorted(self.tags)
-        consts = sorted(self.consts)
-        for tag in tags:
-            code.writeline(f"%{prefix}{tag} = memref.alloc() : memref<1xi32>")
-        for const in consts:
-            code.writeline(f"%{prefix}c{const} = arith.constant {const} : index")
-        return code
-
     def codegen_loops(self):
         code = mlir_common.ParallelLoopBuffer()
         # Loop body part
@@ -952,6 +943,9 @@ def codegen_loops(self):
             vars = ', '.join([f"%{name}" for name, _ in self.affine_yield.items()])
             reduced_shapes = ', '.join([f"{shape}" for _, shape in self.affine_yield.items()])
             self.stores.writeline(f"affine.yield {vars} : {reduced_shapes}")
+
+        code.splice(self.const_buffer)
+        code.splice(self.alloc_buffer)
         with contextlib.ExitStack() as stack:
             for loop in loops.loops:
                 loop_lines = loop.lines()
@@ -978,7 +972,7 @@ def codegen_loops(self):
     def codegen_nodes(self, nodes, kernel_name):
         src_code = super().codegen_nodes(nodes, kernel_name)
 
-        # Create extra header for simulatoors
+        # Create extra headers for simulators
         write_path = extension_codecache.get_write_path(src_code)
         if not os.path.exists(write_path):
             os.makedirs(write_path)
@@ -1175,6 +1169,16 @@ def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape
         buffer = self.cse.generate(code_buffer, f"memref.get_global @{new_name}_spad : memref<{dram_tile_shape}x{mlir_type}, 1>")
         return buffer, indices
 
+    def get_const_cse(self, value) -> common.CSEVariable:
+        if value not in self.consts:
+            self.consts[value] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : index")
+        return self.consts[value]
+
+    def get_tag_cse(self, value, shape="memref<1xi32>"):
+        if value not in self.tags:
+            self.tags[value] = self.alloc_cse.generate(self.alloc_buffer, f"memref.alloc() : {shape}")
+        return self.tags[value]
+
 @dataclasses.dataclass
 class LoopLevel:
     var: sympy.Expr
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 21612a4c..51399197 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -277,9 +277,6 @@ def codegen_global_init(self):
     def codegen_loops(self):
         raise NotImplementedError()
 
-    def codegen_init(self):
-        raise NotImplementedError()
-
     def call_kernel(self, kernel_name):
         wrapper = V.graph.wrapper_code
         _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
@@ -322,7 +319,6 @@ def _codegen_kernel(self, arg_defs, kernel_name):
             for old, new in self.kernel_group.args.aliases():
                 code.writeline(f"auto {old} = {new};")
             # Loop body part
-            code.splice(self.codegen_init())
             code.splice(self.codegen_loops())
         return code
 
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 3f52a61d..304474cc 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -35,6 +35,7 @@
   %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
+  {{- kernel.def_local_vars() }}
 
   affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 954059c0..c116ebf6 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -35,6 +35,7 @@
   %tag = memref.alloc() : memref<1xi32>{% if not Bias %}
   %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
   %c0 = arith.constant 0 : index
+  {{- kernel.def_local_vars() }}
 
   affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 655d2944..21e7a78c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -138,6 +138,7 @@ def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
                 else partial_code.finalize()
             )
             src_code = kernel.add_extra_global_vars(src_code)
+            src_code = kernel.add_extra_local_vars(src_code)
         return src_code
 
     def codegen_template(self, template_node, epilogue_nodes):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 2c1678a2..4303f7d4 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -8,12 +8,13 @@
 from typing import List, Optional
 from unittest.mock import patch
 
-from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides
+from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE
 from torch._inductor.ir import Buffer, IRNode, TemplateBuffer
 from torch._inductor.select_algorithm import PartialRender
 from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
 from torch._inductor.autotune_process import TensorMeta
 from torch._inductor.virtualized import V
+from torch._inductor.utils import IndentedBuffer
 
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
 from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo, MLIRTile
@@ -46,6 +47,9 @@ def __init__(self,
         self.tile_size = []
         self.loop_size = None
         self.is_template_kernel = True
+        self.map_cse = CSE("#", self.suffix, name_prefix="template_map")
+        self.const_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_const")
+        self.alloc_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_alloc")
 
         # Overwrite ops
         self.load = self.load_epilogue
@@ -237,6 +241,22 @@ def add_extra_global_vars(self, code):
         key = "<GLOBAL_VARS>"
         return code.replace(key, self.replace_global_vars())
 
+    def def_local_vars(self):
+        return "<LOCAL_VARS>"
+
+    def replace_local_vars(self):
+        code = IndentedBuffer()
+        code.tabwidth = 2
+        code.splice("\n")
+        with code.indent():
+            code.splice(self.const_buffer)
+            code.splice(self.alloc_buffer)
+        return code.getvalue()
+
+    def add_extra_local_vars(self, code):
+        key = "<LOCAL_VARS>"
+        return code.replace(key, self.replace_local_vars())
+
     def render(self, template, kwargs):
         # self.render_hooks = {}
         return PartialRender(
@@ -262,36 +282,33 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             buffer = self.buffer_names[name]
         else:
             mvin3 = 14
-            self.consts.add(mvin3)
+            mvin3 = self.get_const_cse(mvin3)
+            zero_cse = self.get_const_cse(0)
             dram_tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
             buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.loads, indices, index)
             self.buffer_names[name] = buffer
-            line = f"affine.dma_start %{var}[%index2], %{buffer}[%e_c0, %e_c0], %tag[0], %e_c{mvin3}, %N, %c_set : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
+            line = f"affine.dma_start %{var}[%index2], %{buffer}[%{zero_cse}, %{zero_cse}], %tag[0], %{mvin3}, %N, %c_set : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
             self.cse.generate(self.loads, line, assignment = False)
 
+        zero_cse = self.get_const_cse(0)
         tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
         operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
         shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 else ""
-        line = f"{operation} %{buffer}[%e_c0, %e_c0] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
+        line = f"{operation} %{buffer}[%{zero_cse}, %{zero_cse}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
         out = self.cse.generate(self.loads, line)
         var_info = [tile_size_per_lane, mlir_common.DTYPE_TO_MLIR[dtype]]
         self.register_var_info(out, var_info)
-        self.consts.add(0)
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         indices = self.parse_indices(index)
-        prefix = self.newvar_prefix
-        if index.is_number:
-            prefix = prefix + "c"
-            self.consts.add(int(index))
         var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
 
         chunk_size = 1  # Fixed for template kernel
         chunk = chunk_size << 1 | (self.tile_desc.tile_per_lane_layout == MLIRTile.TILE_PER_LANE_COL_WISE)
-        self.consts.add(chunk)
+        chunk = self.get_const_cse(chunk)
 
         if name in self.buffer_names:
             buffer = self.buffer_names[name]
@@ -300,15 +317,14 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.stores, indices, index)
             self.buffer_names[name] = buffer
 
+        zero_var = self.get_const_cse(0)
         tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
         operation = "affine.vector_store" if tile_size_per_lane > 1 else "affine.store"
         shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 else ""
-        line = f"{operation} %{value}, %{buffer}[%e_c0, %e_c0] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
+        line = f"{operation} %{value}, %{buffer}[%{zero_var}, %{zero_var}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
         self.cse.generate(self.stores, line, assignment = False)
-
-        self.tags.add(f"{name}_tag")
-        self.consts.add(0)
-        code = f"affine.dma_start %{buffer}[%e_c0, %e_c0], %{var}[%index2], %tag[0], %c_mvout, %N, %e_c{chunk} : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>, memref<{self.render_options['M'] * self.render_options['N']}x{type_name}>, memref<1xi32>" #FIXME: Using constant index and tag
+        tag_var = self.get_tag_cse(f"{name}_tag")
+        code = f"affine.dma_start %{buffer}[%{zero_var}, %{zero_var}], %{var}[%index2], %{tag_var}[0], %c_mvout, %N, %{chunk} : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>, memref<{self.render_options['M'] * self.render_options['N']}x{type_name}>, memref<1xi32>" #FIXME: Using constant index and tag
         self.cse.generate(self.stores, code, assignment = False)
 
     def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index):

From f37de2902ae070e591cb30b21c0764588ed55f4b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 15 Jan 2025 13:05:10 +0000
Subject: [PATCH 014/432] [Frontend] Introduce common get_dma_code method

---
 .../mlir/mlir_codegen_backend.py              | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 2f4deca3..ed0a30fe 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -641,6 +641,10 @@ def __init__(self):
         self.tags = dict()
         self.dma_cache = {}
         self.dma_counter = 1
+        self.dma_read_cache = {}
+        self.dma_write_cache = {}
+        self.dma_read_counter = 1
+        self.dma_write_counter = 1
         self.affine_yield = {}
         self.welford_reduce_out = None
         self.reduce_iterator = {}
@@ -1095,6 +1099,43 @@ def get_dma_info(self, name, index, dtype):
         chunk = chunk_size << 1 | (current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
         return mm_stride, chunk, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
 
+    def get_dma_code(self, dma_type_name, stride, chunk, mlir_dtype, dram_var, index_var, sram_var, tag_name, dram_shape, tile_shape):
+        """Return an `affine.dma_start` line: "MVIN" copies dram_var -> sram_var, any other type copies sram_var -> dram_var."""
+        dma_key = (stride, chunk, mlir_dtype)  # cache key uses the raw values, before they become constant variables
+        if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
+            dma_type, mm_stride, chunk = self.dma_read_cache[dma_key]
+        elif dma_type_name == "MVOUT" and dma_key in self.dma_write_cache:
+            dma_type, mm_stride, chunk = self.dma_write_cache[dma_key]  # was dma_read_cache: KeyError or wrong DMA type
+        else:
+            mm_stride = self.get_const_cse(stride)
+            chunk = self.get_const_cse(chunk)
+            if dma_type_name == "MVIN":
+                dma_type = self.get_const_cse(DMA_TYPE[f"{dma_type_name}{self.dma_read_counter}"])
+                self.dma_read_counter += 1
+                self.dma_read_cache[dma_key] = [dma_type, mm_stride, chunk]
+            else:
+                dma_type = self.get_const_cse(DMA_TYPE[f"{dma_type_name}{self.dma_write_counter}"])
+                # NOTE(review): dma_write_counter is not advanced; the only MVOUT entry visible in the DMA table is MVOUT1.
+                self.dma_write_cache[dma_key] = [dma_type, mm_stride, chunk]
+        tag = self.get_tag_cse(tag_name)
+        zero_cse = self.get_const_cse(0)
+        # Prepare operands and attributes
+        dram_operand = f"%{dram_var}[%{index_var}]"
+        sram_operand = f"%{sram_var}[%{zero_cse}, %{zero_cse}]"
+        tag_var = f"%{tag}[0]"
+        dma_attribute = f"%{dma_type}, %{mm_stride}, %{chunk}"
+        dram_shape = f"memref<{dram_shape}x{mlir_dtype}>"
+        sram_shape = f"memref<{tile_shape}x{mlir_dtype}, 1>"
+        tag_shape = "memref<1xi32>"
+        # Order source/destination by direction: MVIN reads from the dram operand, everything else writes to it.
+        if dma_type_name == "MVIN":
+            src_operand, dst_operand = dram_operand, sram_operand
+            src_shape, dst_shape = dram_shape, sram_shape
+        else:
+            src_operand, dst_operand = sram_operand, dram_operand
+            src_shape, dst_shape = sram_shape, dram_shape
+        return f"affine.dma_start {src_operand}, {dst_operand}, {tag_var}, {dma_attribute} : {src_shape}, {dst_shape}, {tag_shape}"
+
     def adjust_tile_size(self):
         if self.read_writes is not None:
             read_writes = list(self.read_writes.reads) + list(self.read_writes.writes)

From b6a431ecc7e297c981b6b81a58d9611fbc948fa8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 15 Jan 2025 13:14:04 +0000
Subject: [PATCH 015/432] [Frontend] use get_dma_code method

---
 .../mlir/mlir_codegen_backend.py              | 94 +++++++------------
 PyTorchSimFrontend/mlir/mlir_template.py      | 79 ++++++++--------
 2 files changed, 76 insertions(+), 97 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index ed0a30fe..0111d57a 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -639,8 +639,6 @@ def __init__(self):
         self.map_cse = common.CSE("#", self.suffix, name_prefix="map")
         self.consts = dict()
         self.tags = dict()
-        self.dma_cache = {}
-        self.dma_counter = 1
         self.dma_read_cache = {}
         self.dma_write_cache = {}
         self.dma_read_counter = 1
@@ -703,72 +701,52 @@ def parse_indices(self, expr) -> common.CSEVariable:
 
     def load(self, name: str, index: sympy.Expr):
         index = self.rename_indexing(index)
-        indices = self.parse_indices(index)
         padding = self.get_padding_type()
-        prefix = self.newvar_prefix
-        var = self.args.input(name)
+        index_var = self.parse_indices(index)
+        dram_var = self.args.input(name)
         dtype = V.graph.get_dtype(name)
-        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype)
-        dram_tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
+        tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
         # Define scratch pad buffer
-        buffer, indices = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, dram_tile_shape, self.loads, indices, index)
+        sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, tile_shape, self.loads, index_var, index)
         # MVIN Encoding
-        dma_key = (stride, chunk, dtype)
-        if dma_key in self.dma_cache:
-            dmaType, stride, chunk = self.dma_cache[dma_key]
-        else:
-            assert(self.dma_counter < 4)
-            dmaType = DMA_TYPE[f"MVIN{self.dma_counter}"]
-            self.dma_counter += 1
-            dmaType = self.get_const_cse(dmaType)
-            stride = self.get_const_cse(stride)
-            chunk = self.get_const_cse(chunk)
-            self.dma_cache[dma_key] = dmaType, stride, chunk
-        tag_var = self.get_tag_cse(f"{name}_tag")
-        zero_var = self.get_const_cse(0)
-        code = f"affine.dma_start %{var}[{prefix}{indices}], %{buffer}[%{zero_var}, %{zero_var}], %{tag_var}[0], %{dmaType}, %{stride}, %{chunk} : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32> {{padding = {padding}}}"
+        code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], tile_shape)
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
+        # Generate vector load instruction
         operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
-        shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 else ""
-        line = f"{operation} %{buffer}[0, 0] : memref<{dram_tile_shape}x{type_name}, 1>{shape}"
+        shape = f", vector<{tile_size_per_lane}x{mlir_dtype}>" if tile_size_per_lane > 1 else ""
+        line = f"{operation} %{sram_var}[0, 0] : memref<{tile_shape}x{mlir_dtype}, 1>{shape}"
         out = self.cse.generate(self.loads, line)
-        var_info = [tile_size_per_lane, mlir_common.DTYPE_TO_MLIR[dtype]]
-        self.register_var_info(out, var_info)
+        self.register_var_info(out, [tile_size_per_lane, mlir_dtype])
         return out
 
     def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         index = self.rename_indexing(index)
-        indices = self.parse_indices(index)
-        prefix = self.newvar_prefix
-        var = self.args.output(name)
+        index_var = self.parse_indices(index)
+        dram_var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
-        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype)
-        dram_tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
+        tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
         # Define scratch pad buffer
-        buffer, indices = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, dram_tile_shape, self.stores, indices, index)
-
-        # MVOUT Encoding
-        dmaType = 3 # MVIN 2, MVIN2 1, MVIN3 14, MVOUT 3
-        dmaType = self.get_const_cse(dmaType)
-        stride = self.get_const_cse(stride)
-        chunk = self.get_const_cse(chunk)
+        sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, tile_shape, self.stores, index_var, index)
 
+        # Generate vector store instruction
         store_size, operand_type = self.var_info[value]
         operation = "affine.vector_store" if tile_size_per_lane > 1 and store_size > 1 else "affine.store"
-        shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 and store_size > 1 else ""
-        if type_name != operand_type:
-            value = ops.custom_cast(value, type_name, var_info=self.var_info)
+        shape = f", vector<{tile_size_per_lane}x{mlir_dtype}>" if tile_size_per_lane > 1 and store_size > 1 else ""
+        if mlir_dtype != operand_type:
+            value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
 
-        line = f"{operation} %{value}, %{buffer}[0, 0] : memref<{dram_tile_shape}x{type_name}, 1>{shape}"
+        line = f"{operation} %{value}, %{sram_var}[0, 0] : memref<{tile_shape}x{mlir_dtype}, 1>{shape}"
         self.cse.generate(self.stores, line, assignment = False)
-        tag_var = self.get_tag_cse(f"{name}_tag")
-        zero_var = self.get_const_cse(0)
-        code = f"affine.dma_start %{buffer}[%{zero_var}, %{zero_var}], %{var}[{prefix}{indices}], %{tag_var}[0], %{dmaType}, %{stride}, %{chunk} : memref<{dram_tile_shape}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
+
+        # Generate DMA instruction
+        code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], tile_shape)
         self.cse.generate(self.stores, code, assignment = False)
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
@@ -846,24 +824,25 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         return acc
 
     def store_reduction(self, name, index, value):
-        var = self.args.output(name)
+        dram_var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
-        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         index = self.rename_indexing(index)
-        indices = self.parse_indices(index)
+        index_var = self.parse_indices(index)
+
         # Tile is always reuduced in inner loop
         tile_col = self.tile_desc.n_row
         tile_row = 1
         dram_tile_shape = f"{tile_row}x{tile_col}"
-        buffer, indices = self.get_scratchpad_buffer(dtype, name, tile_row, tile_col, dram_tile_shape, self.reductions_suffix, indices, index)
+        sram_var, index_var = self.get_scratchpad_buffer(dtype, name, tile_row, tile_col, dram_tile_shape, self.reductions_suffix, index_var, index)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
-            shape = f"vector<{self.tile_desc.get_rows_per_lane()}x{type_name}>" if self.buffer_types[name][1] > 1 else type_name
+            shape = f"vector<{self.tile_desc.get_rows_per_lane()}x{mlir_dtype}>" if self.buffer_types[name][1] > 1 else mlir_dtype
             # mean
             divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.ranges[self.reduction_depth])} : f32")
             if self.buffer_types[name][1] > 1:
-                divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{type_name}>")
+                divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>")
             else:
                 divider_vec = f"f{self.buffer_types[name][1]}"
             mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sum}, %{divider_vec} : {shape}")
@@ -889,25 +868,20 @@ def store_reduction(self, name, index, value):
         if self.tile_desc.get_rows_per_lane() == 1:
             shape = ""
         else:
-            shape = f"vector<{self.tile_desc.get_rows_per_lane()}x{type_name}>"
+            shape = f"vector<{self.tile_desc.get_rows_per_lane()}x{mlir_dtype}>"
             shape = f", {shape}" if self.buffer_types[name][1] > 1 else ""
-        line = f"{operation} %{value}, %{buffer}[0, 0] : memref<{tile_row}x{tile_col}x{type_name}, 1>{shape}"
+        line = f"{operation} %{value}, %{sram_var}[0, 0] : memref<{tile_row}x{tile_col}x{mlir_dtype}, 1>{shape}"
         self.cse.generate(self.reductions_suffix, line, assignment = False)
 
         # MVOUT Encoding
-        dmaType = 3 # MVIN 2, MVIN2 1, MVIN3 14, MVOUT 3
         mm_stride = tile_col
         is_col_major = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
         chunk_size = self.tile_desc.get_rows_per_lane()
         chunk = chunk_size << 1 | (is_col_major == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
-        dmaType = self.get_const_cse(dmaType)
-        mm_stride = self.get_const_cse(mm_stride)
-        chunk = self.get_const_cse(chunk)
 
+        # Generate DMA instruction
         # Change row, col
-        tag_var = self.get_tag_cse(f"{name}_tag")
-        zero_var = self.get_const_cse(0)
-        code = f"affine.dma_start %{buffer}[%{zero_var}, %{zero_var}], %{var}[%{indices}], %{tag_var}[0], %{dmaType}, %{mm_stride}, %{chunk} : memref<{tile_row}x{tile_col}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
+        code = self.get_dma_code("MVOUT", mm_stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], f"{tile_row}x{tile_col}")
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
     def codegen_body(self):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 4303f7d4..2ee131b6 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -272,63 +272,68 @@ def adjust_tile_size(self):
         return
 
     def load_epilogue(self, name: str, index: sympy.Expr):
-        indices = self.parse_indices(index)
+        index_var = self.parse_indices(index)
+        index_var = "index2"
         index = self.rename_indexing(index)
-        var = self.args.input(name)
+        dram_var = self.args.input(name)
         dtype = V.graph.get_dtype(name)
-        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
-
-        if name in self.buffer_names:
-            buffer = self.buffer_names[name]
-        else:
-            mvin3 = 14
-            mvin3 = self.get_const_cse(mvin3)
-            zero_cse = self.get_const_cse(0)
-            dram_tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
-            buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.loads, indices, index)
-            self.buffer_names[name] = buffer
-            line = f"affine.dma_start %{var}[%index2], %{buffer}[%{zero_cse}, %{zero_cse}], %tag[0], %{mvin3}, %N, %c_set : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
-            self.cse.generate(self.loads, line, assignment = False)
-
-        zero_cse = self.get_const_cse(0)
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        if name not in self.buffer_names:
+            # Allocate sram buffer
+            tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
+            sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], tile_shape, self.loads, index_var, index)
+            self.buffer_names[name] = sram_var
+
+            # Generate DMA instruction
+            stride = self.render_options['N']   # FIXME. Is it okay?
+            chunk = 2                           # FIXME. Is it okay?
+            index_var = "index2"                # FIXME. Is it okay?
+            code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], tile_shape)
+            self.cse.generate(self.loads, code, assignment = False)
+
+        # Load vector from sram
+        sram_var = self.buffer_names[name]
         tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
         operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
-        shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 else ""
-        line = f"{operation} %{buffer}[%{zero_cse}, %{zero_cse}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
+        shape = f", vector<{tile_size_per_lane}x{mlir_dtype}>" if tile_size_per_lane > 1 else ""
+        zero_var = self.get_const_cse(0)
+        line = f"{operation} %{sram_var}[%{zero_var}, %{zero_var}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{mlir_dtype}, 1>{shape}"
         out = self.cse.generate(self.loads, line)
-        var_info = [tile_size_per_lane, mlir_common.DTYPE_TO_MLIR[dtype]]
-        self.register_var_info(out, var_info)
+        self.register_var_info(out, [tile_size_per_lane, mlir_dtype])
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
-        indices = self.parse_indices(index)
-        var = self.args.output(name)
+        index_var = self.parse_indices(index)
+        index_var = "index2"
+        dram_var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
-        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
         chunk_size = 1  # Fixed for template kernel
         chunk = chunk_size << 1 | (self.tile_desc.tile_per_lane_layout == MLIRTile.TILE_PER_LANE_COL_WISE)
-        chunk = self.get_const_cse(chunk)
 
-        if name in self.buffer_names:
-            buffer = self.buffer_names[name]
-        else:
+        if name not in self.buffer_names:
             dram_tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
-            buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.stores, indices, index)
-            self.buffer_names[name] = buffer
+            sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.stores, index_var, index)
+            self.buffer_names[name] = sram_var
+        sram_var = self.buffer_names[name]
 
-        zero_var = self.get_const_cse(0)
         tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
         operation = "affine.vector_store" if tile_size_per_lane > 1 else "affine.store"
-        shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 else ""
-        line = f"{operation} %{value}, %{buffer}[%{zero_var}, %{zero_var}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
+        shape = f", vector<{tile_size_per_lane}x{mlir_dtype}>" if tile_size_per_lane > 1 else ""
+        zero_var = self.get_const_cse(0)
+        line = f"{operation} %{value}, %{sram_var}[%{zero_var}, %{zero_var}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{mlir_dtype}, 1>{shape}"
         self.cse.generate(self.stores, line, assignment = False)
-        tag_var = self.get_tag_cse(f"{name}_tag")
-        code = f"affine.dma_start %{buffer}[%{zero_var}, %{zero_var}], %{var}[%index2], %{tag_var}[0], %c_mvout, %N, %{chunk} : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>, memref<{self.render_options['M'] * self.render_options['N']}x{type_name}>, memref<1xi32>" #FIXME: Using constant index and tag
+
+        stride = self.render_options['N']   # FIXME. Is it okay?
+        index_var = "index2"                # FIXME. Is it okay?
+        dram_shape = f"{self.render_options['M'] * self.render_options['N']}"
+        tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
+        code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
         self.cse.generate(self.stores, code, assignment = False)
 
-    def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index):
-        return super().get_scratchpad_buffer(dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index, True)
+    def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, index_var, raw_index):
+        return super().get_scratchpad_buffer(dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, index_var, raw_index, True)
 
 class MLIRTemplateCaller(CUDATemplateCaller):
     def __str__(self):

From 5bcc68b4fc7dc6533d0b30111d5417b17ada3cb6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 15 Jan 2025 13:24:54 +0000
Subject: [PATCH 016/432] [Frontend/schedule] Allow fusion only for GEMM

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 21e7a78c..c793dcf7 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -30,12 +30,15 @@ def can_fuse_horizontal(self, node1, node2):
         _, (vars1, reduce1) = node1.group
         _, (vars2, reduce2) = node2.group
 
+        # Reduction is currently not supported
+        if node1.is_reduction() or node2.is_reduction():
+            return False
+
         # Convolution is currently not supported
-        if node1.node.origin_node.target._name == 'aten::convolution' or node2.node.origin_node.target._name == 'aten::convolution':
+        if not isinstance(node1, FusedSchedulerNode) and node1.node.origin_node is not None and node1.node.origin_node.target._name == 'aten::convolution':
             return False
 
-        # Reduction is currently not supported
-        if node1.is_reduction() or node2.is_reduction():
+        if not isinstance(node2, FusedSchedulerNode) and node2.node.origin_node is not None and node2.node.origin_node.target._name == 'aten::convolution':
             return False
 
         if not isinstance(node1, FusedSchedulerNode) and not isinstance(node2, FusedSchedulerNode):

From d602b06e6d8d633406daeded3ee06681f2d319d1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 15 Jan 2025 13:25:15 +0000
Subject: [PATCH 017/432] [Frontend] Support padding in the get_dma_code

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 0111d57a..5060c50d 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -712,7 +712,7 @@ def load(self, name: str, index: sympy.Expr):
         # Define scratch pad buffer
         sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, tile_shape, self.loads, index_var, index)
         # MVIN Encoding
-        code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], tile_shape)
+        code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], tile_shape, padding)
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         # Generate vector load instruction
@@ -1073,7 +1073,7 @@ def get_dma_info(self, name, index, dtype):
         chunk = chunk_size << 1 | (current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
         return mm_stride, chunk, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
 
-    def get_dma_code(self, dma_type_name, stride, chunk, mlir_dtype, dram_var, index_var, sram_var, tag_name, dram_shape, tile_shape):
+    def get_dma_code(self, dma_type_name, stride, chunk, mlir_dtype, dram_var, index_var, sram_var, tag_name, dram_shape, tile_shape, padding_type=None):
         dma_key = (stride, chunk, mlir_dtype)
         if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
             dma_type, mm_stride, chunk = self.dma_read_cache[dma_key]
@@ -1108,7 +1108,11 @@ def get_dma_code(self, dma_type_name, stride, chunk, mlir_dtype, dram_var, index
         else:
             src_operand, dst_operand = sram_operand, dram_operand
             src_shape, dst_shape = sram_shape, dram_shape
-        return f"affine.dma_start {src_operand}, {dst_operand}, {tag_var}, {dma_attribute} : {src_shape}, {dst_shape}, {tag_shape}"
+
+        code = f"affine.dma_start {src_operand}, {dst_operand}, {tag_var}, {dma_attribute} : {src_shape}, {dst_shape}, {tag_shape}"
+        if padding_type is not None:
+            code = code + f" {{padding = {padding_type}}}"
+        return code
 
     def adjust_tile_size(self):
         if self.read_writes is not None:

From 29bc92b5658fff4f1f40eac5c04e4b8abd8da3a8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 15 Jan 2025 13:25:47 +0000
Subject: [PATCH 018/432] [Frontend/Fusion] Now use hardcoded index

---
 PyTorchSimFrontend/mlir/mlir_template.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 2ee131b6..930f4777 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -272,7 +272,7 @@ def adjust_tile_size(self):
         return
 
     def load_epilogue(self, name: str, index: sympy.Expr):
-        index_var = self.parse_indices(index)
+        #index_var = self.parse_indices(index)
         index_var = "index2"
         index = self.rename_indexing(index)
         dram_var = self.args.input(name)
@@ -303,7 +303,7 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
-        index_var = self.parse_indices(index)
+        #index_var = self.parse_indices(index)
         index_var = "index2"
         dram_var = self.args.output(name)
         dtype = V.graph.get_dtype(name)

From 795971b12a0da43dab10d66e31f0f072a9408b52 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 16 Jan 2025 03:31:12 +0000
Subject: [PATCH 019/432] [Frontend] Hotfix for wrong dma code generation

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 5060c50d..dbb336da 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1078,7 +1078,7 @@ def get_dma_code(self, dma_type_name, stride, chunk, mlir_dtype, dram_var, index
         if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
             dma_type, mm_stride, chunk = self.dma_read_cache[dma_key]
         elif dma_type_name == "MVOUT" and dma_key in self.dma_write_cache:
-            dma_type, mm_stride, chunk = self.dma_read_cache[dma_key]
+            dma_type, mm_stride, chunk = self.dma_write_cache[dma_key]
         else:
             mm_stride = self.get_const_cse(stride)
             chunk = self.get_const_cse(chunk)

From 4cbb641697e836a0c29a869bdab943718dee3285 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 16 Jan 2025 04:03:42 +0000
Subject: [PATCH 020/432] [Frontend] refactor fusion dependent code

---
 .../mlir/mlir_codegen_backend.py              | 18 ---------------
 PyTorchSimFrontend/mlir/mlir_template.py      | 22 +++++++++++++++++++
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index dbb336da..29e91dec 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -884,24 +884,6 @@ def store_reduction(self, name, index, value):
         code = self.get_dma_code("MVOUT", mm_stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], f"{tile_row}x{tile_col}")
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
-    def codegen_body(self):
-        def template_store(options):
-            subtile_size = [self.vector_lane, self.vector_lane]
-            async_flag = 1
-            zero_var = self.get_const_cse(0)
-            line = f"affine.dma_start %Y_buffer[%{zero_var}, %{zero_var}], %Y[%index2], %tag[0], %c_mvout, %N, %c_set"\
-                   f": memref<{options['TILE_M']}x{options['TILE_N']}xf32, 1>,"\
-                   f"memref<{options['M'] * options['N']}xf32>, memref<1xi32>" #FIXME: Using constant index
-            self.cse.generate(self.stores, line, assignment = False)
-        self.body.splice(self.loads)
-        self.body.splice(self.compute)
-        if len(self.stores._lines) == 0:
-            template_store(self.render_options)
-        self.body.splice(self.stores)
-        self.loads.clear()
-        self.compute.clear()
-        self.stores.clear()
-
     def codegen_global_init(self):
         return self.global_vars
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 930f4777..5ff9a9a2 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -161,6 +161,28 @@ def call_kernel(self, kernel_name):
             kernel_name if self.outer_func_name is None else self.outer_func_name,
             call_args, cuda=False)
 
+    def codegen_body(self):
+        def template_store(options):
+            sram_var = "Y_buffer"
+            dram_var = "Y"
+            index_var = "index2"
+            tag_var = "tag"
+            stride = options['N']
+            chunk = 2
+            mlir_dtype = "f32"
+            dram_shape = f"{options['M'] * options['N']}"
+            tile_shape = f"{options['TILE_M']}x{options['TILE_N']}"
+            code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, tag_var, dram_shape, tile_shape)
+            self.cse.generate(self.stores, code, assignment = False)
+        self.body.splice(self.loads)
+        self.body.splice(self.compute)
+        if len(self.stores._lines) == 0:
+            template_store(self.render_options)
+        self.body.splice(self.stores)
+        self.loads.clear()
+        self.compute.clear()
+        self.stores.clear()
+
     def def_kernel(
         self,
         inputs: List[IRNode],

From 3e15d407baf673a24de0ee99d60d3d228457ae1e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 16 Jan 2025 04:46:50 +0000
Subject: [PATCH 021/432] [Frontend/DMA4d] Export tensor size and stride to
 mlir code

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 12 +++++++-----
 PyTorchSimFrontend/mlir/mlir_common.py          | 17 ++++++++++++-----
 PyTorchSimFrontend/mlir/mlir_template.py        |  7 ++++---
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 29e91dec..e31412ed 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -707,12 +707,13 @@ def load(self, name: str, index: sympy.Expr):
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype)
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
         # Define scratch pad buffer
         sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, tile_shape, self.loads, index_var, index)
         # MVIN Encoding
-        code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], tile_shape, padding)
+        code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape, padding)
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         # Generate vector load instruction
@@ -730,6 +731,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype)
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
         # Define scratch pad buffer
@@ -746,7 +748,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.cse.generate(self.stores, line, assignment = False)
 
         # Generate DMA instruction
-        code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], tile_shape)
+        code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
         self.cse.generate(self.stores, code, assignment = False)
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
@@ -878,10 +880,10 @@ def store_reduction(self, name, index, value):
         is_col_major = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
         chunk_size = self.tile_desc.get_rows_per_lane()
         chunk = chunk_size << 1 | (is_col_major == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
-
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         # Generate DMA instruction
         # Change row, col
-        code = self.get_dma_code("MVOUT", mm_stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], f"{tile_row}x{tile_col}")
+        code = self.get_dma_code("MVOUT", mm_stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, f"{tile_row}x{tile_col}")
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
     def codegen_global_init(self):
@@ -1080,7 +1082,7 @@ def get_dma_code(self, dma_type_name, stride, chunk, mlir_dtype, dram_var, index
         sram_operand = f"%{sram_var}[%{zero_cse}, %{zero_cse}]"
         tag_var = f"%{tag}[0]"
         dma_attribute = f"%{dma_type}, %{mm_stride}, %{chunk}"
-        dram_shape = f"memref<{dram_shape}x{mlir_dtype}>"
+        #dram_shape = f"memref<{dram_shape}x{mlir_dtype}>"
         sram_shape = f"memref<{tile_shape}x{mlir_dtype}, 1>"
         tag_shape = "memref<1xi32>"
 
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 51399197..02d3da3c 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -99,28 +99,36 @@ def is_mlir_arg_out(value):
     def is_mlir_arg_inout(value):
         return MLIRKernelArgs.MLIR_ARGS_INOUT & value
 
+    @staticmethod
+    def get_mlir_shape(info):
+        tensor_shape = "x".join([str(i) for i in info[1]])
+        tensor_type = DTYPE_TO_MLIR[info[0]]
+        return f"memref<{tensor_shape}x{tensor_type}, strided<{info[2]}>>"
+
     def mlir_argdefs(self, extra_node=dict()):
         buffer_types = {}
         for x in V.graph.buffers:
             if not isinstance(x.layout, MultiOutputLayout): # FIXME: MultiOutputLayout should be handled
-                buffer_types[x.get_name()] = [x.get_dtype(), x.get_numel()]
+                buffer_types[x.get_name()] = [x.get_dtype(), x.get_size(), x.get_stride()]
         for name, val in V.graph.graph_inputs.items():
             if isinstance(val, sympy.Expr):
                 buffer_types[name] = [get_sympy_Expr_dtype(val), 1]
+                buffer_types[name] = [get_sympy_Expr_dtype(val), [1], [1]]
             else:
-                buffer_types[name] = [val.get_dtype(), val.get_numel()]
+                buffer_types[name] = [val.get_dtype(), val.get_size(), val.get_stride()]
         buffer_types.update(
             {name: val.dtype for name, val in V.graph.constants.items()}
         )
         buffer_types.update(
-            {name: [val.get_dtype(), val.get_numel()] for name, val in extra_node.items()}
+            {name: [val.get_dtype(), val.get_size(), val.get_stride()] for name, val in extra_node.items()}
         )
 
         call_args = []
         arg_defs = []
         arg_attributes = []
         def set_info(outer, inner, arg_type):
-            arg_defs.append(f"%{inner}: memref<{buffer_types[outer][1]}x{DTYPE_TO_MLIR[buffer_types[outer][0]]}>")
+            mlir_shape = self.get_mlir_shape(buffer_types[outer])
+            arg_defs.append(f"%{inner}: {mlir_shape}")
             call_args.append(outer)
             arg_attributes.append([outer] + [[arg_type] + buffer_types[outer]])
 
@@ -142,7 +150,6 @@ def set_info(outer, inner, arg_type):
             set_info(outer, inner, self.MLIR_ARGS_VAR)
         return arg_defs, call_args, arg_attributes, buffer_types
 
-
 class MLIRTile():
     TILE_ROW_WISE = 0
     TILE_COL_WISE = 1
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 5ff9a9a2..80e157be 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -170,7 +170,7 @@ def template_store(options):
             stride = options['N']
             chunk = 2
             mlir_dtype = "f32"
-            dram_shape = f"{options['M'] * options['N']}"
+            dram_shape = f"memref<{options['M']}x{options['N']}x{mlir_dtype}>"
             tile_shape = f"{options['TILE_M']}x{options['TILE_N']}"
             code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, tag_var, dram_shape, tile_shape)
             self.cse.generate(self.stores, code, assignment = False)
@@ -302,6 +302,7 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         if name not in self.buffer_names:
             # Allocate sram buffer
+            dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
             tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
             sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], tile_shape, self.loads, index_var, index)
             self.buffer_names[name] = sram_var
@@ -310,7 +311,7 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             stride = self.render_options['N']   # FIXME. Is it okay?
             chunk = 2                           # FIXME. Is it okay?
             index_var = "index2"                # FIXME. Is it okay?
-            code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", self.buffer_types[name][1], tile_shape)
+            code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
             self.cse.generate(self.loads, code, assignment = False)
 
         # Load vector from sram
@@ -349,7 +350,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         stride = self.render_options['N']   # FIXME. Is it okay?
         index_var = "index2"                # FIXME. Is it okay?
-        dram_shape = f"{self.render_options['M'] * self.render_options['N']}"
+        dram_shape = f"memref<{self.render_options['M']}x{self.render_options['N']}x{mlir_dtype}>"
         tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
         code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
         self.cse.generate(self.stores, code, assignment = False)

From 5c292901a1c9705a92228be18b699c1ae96cfeaa Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 16 Jan 2025 05:55:02 +0000
Subject: [PATCH 022/432] [WIP]

---
 .../mlir/mlir_codegen_backend.py              | 64 +++++++++----------
 PyTorchSimFrontend/mlir/mlir_common.py        |  8 +--
 PyTorchSimFrontend/mlir/mlir_template.py      | 19 +++---
 3 files changed, 46 insertions(+), 45 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index e31412ed..61c45207 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -706,14 +706,14 @@ def load(self, name: str, index: sympy.Expr):
         dram_var = self.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype)
+        vlane_split_axis, vlane_stride, tile_shape, tile_size_per_lane = self.get_dma_info(name, index)
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
         # Define scratch pad buffer
         sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, tile_shape, self.loads, index_var, index)
         # MVIN Encoding
-        code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape, padding)
+        code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape, padding)
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         # Generate vector load instruction
@@ -730,7 +730,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dram_var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype)
+        vlane_split_axis, vlane_stride, tile_shape, tile_size_per_lane = self.get_dma_info(name, index)
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
@@ -748,7 +748,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.cse.generate(self.stores, line, assignment = False)
 
         # Generate DMA instruction
-        code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
+        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
         self.cse.generate(self.stores, code, assignment = False)
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
@@ -877,13 +877,13 @@ def store_reduction(self, name, index, value):
 
         # MVOUT Encoding
         mm_stride = tile_col
-        is_col_major = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
-        chunk_size = self.tile_desc.get_rows_per_lane()
-        chunk = chunk_size << 1 | (is_col_major == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
+        vlane_split_axis = 0 #FIXME.
+        vlane_stride = self.tile_desc.get_rows_per_lane()
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
+
         # Generate DMA instruction
         # Change row, col
-        code = self.get_dma_code("MVOUT", mm_stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, f"{tile_row}x{tile_col}")
+        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, f"{tile_row}x{tile_col}")
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
     def codegen_global_init(self):
@@ -946,7 +946,7 @@ def codegen_nodes(self, nodes, kernel_name):
             write_atomic(gem5_write_path, self.gem5_header.getvalue())
         return src_code
 
-    def get_dma_info(self, name, index, dtype):
+    def get_dma_info(self, name, index):
         current_tile = mlir_common.MLIRTile(self.tile_desc.n_row, self.tile_desc.n_col, self.tile_desc.vector_lane, self.tile_desc.used_vector_lane)
         cv = self.get_constant_vector(index)
         cv2 = self.get_constant_vector2(index)
@@ -966,14 +966,14 @@ def get_dma_info(self, name, index, dtype):
             current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
             current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
             mm_stride, tile_size_per_lane = 1, 1
-            chunk_size = current_tile.get_chunk_size()
+            vlane_stride = current_tile.get_vlane_stride()
         # Case 1. Tile is 1-D vector type
         elif len(cv) == 1 and len(cv) <= self.reduction_depth:
             current_tile.n_row = 1
             current_tile.n_col = self.tile_desc.get_tile_size()
             current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
             current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case
-            chunk_size = current_tile.get_chunk_size()
+            vlane_stride = current_tile.get_vlane_stride()
             mm_stride = current_tile.n_col
             if self.is_scalar(name): # scalar to vector broadcasting
                 mm_stride = 0
@@ -986,7 +986,7 @@ def get_dma_info(self, name, index, dtype):
             current_tile.n_row = 1
             current_tile.n_col = self.tile_desc.get_tile_size()
             current_tile.used_vector_lane = 1
-            chunk_size = current_tile.get_chunk_size()
+            vlane_stride = current_tile.get_vlane_stride()
             mm_stride = 0 # don't care
             tile_size_per_lane = current_tile.get_tile_size_per_lane()
             if self.is_scalar(name): # scalar to vector broadcasting
@@ -1008,20 +1008,20 @@ def get_dma_info(self, name, index, dtype):
                 if is_reduction and is_transposed:
                     current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE
                     current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
-                    chunk_size = current_tile.get_chunk_size()
+                    vlane_stride = current_tile.get_vlane_stride()
                 elif is_reduction and not is_transposed:
                     current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
                     current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE
-                    chunk_size = current_tile.get_chunk_size()
+                    vlane_stride = current_tile.get_vlane_stride()
                 elif not is_reduction and is_transposed:
                     # Transposed case
                     current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE
                     current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE
-                    chunk_size = current_tile.get_chunk_size()
+                    vlane_stride = current_tile.get_vlane_stride()
                 else: # not is_reduction and not is_transpose
                     current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE if self.tile_desc.vector_lane_axis else mlir_common.MLIRTile.TILE_ROW_WISE
                     current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
-                    chunk_size = current_tile.get_chunk_size()
+                    vlane_stride = current_tile.get_vlane_stride()
             else:
                 # Broadcast pattern
                 current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
@@ -1030,14 +1030,14 @@ def get_dma_info(self, name, index, dtype):
                     current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE if self.tile_desc.vector_lane_axis else mlir_common.MLIRTile.TILE_ROW_WISE
                     current_tile.n_row = self.tile_desc.n_row
                     current_tile.n_col = self.tile_desc.n_col
-                    chunk_size = current_tile.get_chunk_size()
+                    vlane_stride = current_tile.get_vlane_stride()
                 else: # cv[1][0] == 0
                     current_tile.n_row = self.tile_desc.n_col
                     current_tile.n_col = self.tile_desc.n_row
-                    chunk_size = current_tile.get_cols_per_lane()
+                    vlane_stride = current_tile.get_cols_per_lane()
                     if not is_reduction:
                         current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE
-                        chunk_size = current_tile.n_col if self.tile_desc.vector_lane_axis else chunk_size
+                        vlane_stride = current_tile.n_col if self.tile_desc.vector_lane_axis else vlane_stride
         elif len(cv) == 3:
             current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case
             mm_stride = cv[-1][0]
@@ -1049,31 +1049,31 @@ def get_dma_info(self, name, index, dtype):
             else:
                 current_tile.n_row = self.tile_desc.get_tile_size()
                 current_tile.n_col = 1
-            chunk_size = current_tile.get_tile_size_per_lane()
+            vlane_stride = current_tile.get_tile_size_per_lane()
         else:
             raise NotImplementedError()
 
-        #assert(not (dtype==torch.bool and chunk_size < 8))
-        chunk = chunk_size << 1 | (current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
-        return mm_stride, chunk, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
+        #assert(not (dtype==torch.bool and vlane_stride < 8))
+        vlane_split_axis = int(current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
+        return vlane_split_axis, vlane_stride, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
 
-    def get_dma_code(self, dma_type_name, stride, chunk, mlir_dtype, dram_var, index_var, sram_var, tag_name, dram_shape, tile_shape, padding_type=None):
-        dma_key = (stride, chunk, mlir_dtype)
+    def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_var, index_var, sram_var, tag_name, dram_shape, tile_shape, padding_type=None):
+        dma_key = (attribute1, attribute2, mlir_dtype)
         if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
-            dma_type, mm_stride, chunk = self.dma_read_cache[dma_key]
+            dma_type, attribute1, attribute2 = self.dma_read_cache[dma_key]
         elif dma_type_name == "MVOUT" and dma_key in self.dma_write_cache:
-            dma_type, mm_stride, chunk = self.dma_write_cache[dma_key]
+            dma_type, attribute1, attribute2 = self.dma_write_cache[dma_key]
         else:
-            mm_stride = self.get_const_cse(stride)
-            chunk = self.get_const_cse(chunk)
+            attribute1 = self.get_const_cse(attribute1)
+            attribute2 = self.get_const_cse(attribute2)
             if dma_type_name == "MVIN":
                 dma_type = self.get_const_cse(DMA_TYPE[f"{dma_type_name}{self.dma_read_counter}"])
                 self.dma_read_counter += 1
-                self.dma_read_cache[dma_key] = [dma_type, mm_stride, chunk]
+                self.dma_read_cache[dma_key] = [dma_type, attribute1, attribute2]
             else:
                 dma_type = self.get_const_cse(DMA_TYPE[f"{dma_type_name}{self.dma_write_counter}"])
                 # self.dma_write_counter += 1 Is it okay?
-                self.dma_write_cache[dma_key] = [dma_type, mm_stride, chunk]
+                self.dma_write_cache[dma_key] = [dma_type, attribute1, attribute2]
         tag = self.get_tag_cse(tag_name)
         zero_cse = self.get_const_cse(0)
 
@@ -1081,7 +1081,7 @@ def get_dma_code(self, dma_type_name, stride, chunk, mlir_dtype, dram_var, index
         dram_operand = f"%{dram_var}[%{index_var}]"
         sram_operand = f"%{sram_var}[%{zero_cse}, %{zero_cse}]"
         tag_var = f"%{tag}[0]"
-        dma_attribute = f"%{dma_type}, %{mm_stride}, %{chunk}"
+        dma_attribute = f"%{dma_type}, %{attribute1}, %{attribute2}"
         #dram_shape = f"memref<{dram_shape}x{mlir_dtype}>"
         sram_shape = f"memref<{tile_shape}x{mlir_dtype}, 1>"
         tag_shape = "memref<1xi32>"
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 02d3da3c..032877bb 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -188,12 +188,12 @@ def get_tile_size_per_lane(self):
     def get_tile_shape(self):
         return f"{self.n_row}x{self.n_col}"
 
-    def get_chunk_size(self):
+    def get_vlane_stride(self):
         if self.tile_layout == self.TILE_ROW_WISE:
-            chunk_size = self.get_tile_size_per_lane()
+            vlane_stride = self.get_tile_size_per_lane()
         else:
-            chunk_size = self.get_cols_per_lane()
-        return chunk_size
+            vlane_stride = self.get_cols_per_lane()
+        return vlane_stride
 
     @staticmethod
     def div_round_up(size, round_val):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 80e157be..d0b07629 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -168,11 +168,12 @@ def template_store(options):
             index_var = "index2"
             tag_var = "tag"
             stride = options['N']
-            chunk = 2
+            vlane_split_axis = 0
+            vlane_stride = 1
             mlir_dtype = "f32"
             dram_shape = f"memref<{options['M']}x{options['N']}x{mlir_dtype}>"
             tile_shape = f"{options['TILE_M']}x{options['TILE_N']}"
-            code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, tag_var, dram_shape, tile_shape)
+            code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, tag_var, dram_shape, tile_shape)
             self.cse.generate(self.stores, code, assignment = False)
         self.body.splice(self.loads)
         self.body.splice(self.compute)
@@ -308,10 +309,10 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             self.buffer_names[name] = sram_var
 
             # Generate DMA instruction
-            stride = self.render_options['N']   # FIXME. Is it okay?
-            chunk = 2                           # FIXME. Is it okay?
+            vlane_split_axis = 0                # FIXME. Is it okay?
+            vlane_stride = 1                    # FIXME. Is it okay?
             index_var = "index2"                # FIXME. Is it okay?
-            code = self.get_dma_code("MVIN", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
+            code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
             self.cse.generate(self.loads, code, assignment = False)
 
         # Load vector from sram
@@ -332,8 +333,9 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
-        chunk_size = 1  # Fixed for template kernel
-        chunk = chunk_size << 1 | (self.tile_desc.tile_per_lane_layout == MLIRTile.TILE_PER_LANE_COL_WISE)
+        vlane_split_axis = 0
+        vlane_stride = 1  # Fixed for template kernel
+        #chunk = chunk_size << 1 | (self.tile_desc.tile_per_lane_layout == MLIRTile.TILE_PER_LANE_COL_WISE)
 
         if name not in self.buffer_names:
             dram_tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
@@ -348,11 +350,10 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         line = f"{operation} %{value}, %{sram_var}[%{zero_var}, %{zero_var}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{mlir_dtype}, 1>{shape}"
         self.cse.generate(self.stores, line, assignment = False)
 
-        stride = self.render_options['N']   # FIXME. Is it okay?
         index_var = "index2"                # FIXME. Is it okay?
         dram_shape = f"memref<{self.render_options['M']}x{self.render_options['N']}x{mlir_dtype}>"
         tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
-        code = self.get_dma_code("MVOUT", stride, chunk, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
+        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
         self.cse.generate(self.stores, code, assignment = False)
 
     def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, index_var, raw_index):

From e2863c8a25f20bd7290d25291df3f3e067dc69af Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 16 Jan 2025 12:08:47 +0000
Subject: [PATCH 023/432] [WIP2]

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 61c45207..1e309803 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -647,6 +647,7 @@ def __init__(self):
         self.welford_reduce_out = None
         self.reduce_iterator = {}
         self.is_template_kernel = False
+
     def set_ranges(self, lengths, reduction_lengths, read_writes):
         ret = super().set_ranges(lengths, reduction_lengths, read_writes)
 

From 2d5f3d35bb002be5073ef524d520bc5313deeaef Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 16 Jan 2025 12:17:09 +0000
Subject: [PATCH 024/432] [WIP3]

---
 PyTorchSimFrontend/mlir/mlir_common.py | 45 ++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 032877bb..f60374c1 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -101,26 +101,24 @@ def is_mlir_arg_inout(value):
 
     @staticmethod
     def get_mlir_shape(info):
-        tensor_shape = "x".join([str(i) for i in info[1]])
         tensor_type = DTYPE_TO_MLIR[info[0]]
-        return f"memref<{tensor_shape}x{tensor_type}, strided<{info[2]}>>"
+        return f"memref<{info[1]}x{tensor_type}>"
 
     def mlir_argdefs(self, extra_node=dict()):
         buffer_types = {}
         for x in V.graph.buffers:
             if not isinstance(x.layout, MultiOutputLayout): # FIXME: MultiOutputLayout should be handled
-                buffer_types[x.get_name()] = [x.get_dtype(), x.get_size(), x.get_stride()]
+                buffer_types[x.get_name()] = [x.get_dtype(), x.get_numel()]
         for name, val in V.graph.graph_inputs.items():
             if isinstance(val, sympy.Expr):
                 buffer_types[name] = [get_sympy_Expr_dtype(val), 1]
-                buffer_types[name] = [get_sympy_Expr_dtype(val), [1], [1]]
             else:
-                buffer_types[name] = [val.get_dtype(), val.get_size(), val.get_stride()]
+                buffer_types[name] = [val.get_dtype(), val.get_numel()]
         buffer_types.update(
             {name: val.dtype for name, val in V.graph.constants.items()}
         )
         buffer_types.update(
-            {name: [val.get_dtype(), val.get_size(), val.get_stride()] for name, val in extra_node.items()}
+            {name: [val.get_dtype(), val.get_numel()] for name, val in extra_node.items()}
         )
 
         call_args = []
@@ -150,6 +148,41 @@ def set_info(outer, inner, arg_type):
             set_info(outer, inner, self.MLIR_ARGS_VAR)
         return arg_defs, call_args, arg_attributes, buffer_types
 
+class MLIRMultiDimTile():
+    def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None):
+        self.tile_size = list(tile_size)
+        self.tile_order = list(range(len(self.tile_size)))[::-1]
+
+        # Vector lane mapping config
+        self.vector_lane = vector_lane
+        self.vlane_split_axis = vlane_split_axis
+        self.vlane_stride = vlane_stride
+
+    def get_tile_size(self):
+        """
+        Return size of multi-dimensional tile
+        """
+        size = 1
+        for dim_size in self.tile_size:
+            size *= dim_size
+        return size
+
+    def dim_size(self):
+        """
+        Return number of dimensions
+        """
+        return len(self.tile_size)
+
+    def get_used_vlane(self):
+        """
+        Return number of used vector lane
+        """
+        return self.div_round_up(self.tile_size[self.vlane_split_axis], self.vlane_stride)
+
+    @staticmethod
+    def div_round_up(size, round_val):
+        return (size + round_val - 1) // round_val
+
 class MLIRTile():
     TILE_ROW_WISE = 0
     TILE_COL_WISE = 1

From 50fa19ecc38b3f52c4b94bc6f0e1a24dbf7347a2 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 17 Jan 2025 02:39:18 +0000
Subject: [PATCH 025/432] [WIP4]

---
 .../mlir/mlir_codegen_backend.py              | 23 ++++++---
 PyTorchSimFrontend/mlir/mlir_common.py        | 47 +++++++++----------
 2 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 1e309803..9277cf41 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -648,13 +648,6 @@ def __init__(self):
         self.reduce_iterator = {}
         self.is_template_kernel = False
 
-    def set_ranges(self, lengths, reduction_lengths, read_writes):
-        ret = super().set_ranges(lengths, reduction_lengths, read_writes)
-
-        # Adjust time size when it is vector
-        self.adjust_tile_size()
-        return ret
-
     # padding type 0: zero-padding 1: negative-padding(-inf) ...
     def get_padding_type(self):
         ops = self.current_node.node.origins
@@ -731,6 +724,8 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dram_var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+
+        # Prepare dma instruction
         vlane_split_axis, vlane_stride, tile_shape, tile_size_per_lane = self.get_dma_info(name, index)
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
@@ -1058,6 +1053,20 @@ def get_dma_info(self, name, index):
         vlane_split_axis = int(current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
         return vlane_split_axis, vlane_stride, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
 
+    def get_dma_info(self, name, index): # Need more argument?
+        """
+        A tile descriptor exists that is configured on a kernel group
+        DMA desc should be adjusted according to buffer.
+        Therefore, this function shoulde determin DRAM, SRAM stride and
+        vectorlane mapping policy
+        """
+        # TODO.
+        kg_tile_desc = self.kernel_group.tile_desc
+        vlane_split_axis = 0
+        vlane_stride = 1
+        local_tile_desc = mlir_common.MLIRMultiDimTile()
+        return vlane_split_axis, vlane_stride, local_tile_desc
+
     def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_var, index_var, sram_var, tag_name, dram_shape, tile_shape, padding_type=None):
         dma_key = (attribute1, attribute2, mlir_dtype)
         if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index f60374c1..6ae5cfe5 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -151,7 +151,7 @@ def set_info(outer, inner, arg_type):
 class MLIRMultiDimTile():
     def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None):
         self.tile_size = list(tile_size)
-        self.tile_order = list(range(len(self.tile_size)))[::-1]
+        self.tile_stride = None # Todo.
 
         # Vector lane mapping config
         self.vector_lane = vector_lane
@@ -167,6 +167,9 @@ def get_tile_size(self):
             size *= dim_size
         return size
 
+    def get_tile_stride(self):
+        return self.tile_stride
+
     def dim_size(self):
         """
         Return number of dimensions
@@ -179,6 +182,9 @@ def get_used_vlane(self):
         """
         return self.div_round_up(self.tile_size[self.vlane_split_axis], self.vlane_stride)
 
+    def get_vlane_stride(self):
+        return self.vlane_stride
+
     @staticmethod
     def div_round_up(size, round_val):
         return (size + round_val - 1) // round_val
@@ -218,9 +224,6 @@ def get_tile_size_per_lane(self):
             print(f"[Warning] n_col({self.n_col}) % vector_lane({self.used_vector_lane}) != 0")
         return self.div_round_up(self.get_tile_size(), self.used_vector_lane)
 
-    def get_tile_shape(self):
-        return f"{self.n_row}x{self.n_col}"
-
     def get_vlane_stride(self):
         if self.tile_layout == self.TILE_ROW_WISE:
             vlane_stride = self.get_tile_size_per_lane()
@@ -236,6 +239,10 @@ class MLIRWrapperKenrelGroup(cpp.KernelGroup):
     def __init__(self):
         super().__init__()
         self.args = MLIRKernelArgs()
+        self.tile_desc : MLIRMultiDimTile = None
+
+    def set_tile_info(self, tile_desc : MLIRMultiDimTile):
+        self.tile_desc = tile_desc
 
 class BaseMLIRHardwareInfo():
     def __init__(self):
@@ -270,19 +277,12 @@ def __init__(self, args=None):
         self.reductions_suffix = IndentedBuffer()
         self.cse = common.CSE(self.newvar_prefix, self.suffix)
         # Tile size setting
-        tile_row = extension_config.CONFIG_TILE_ROW
-        if tile_row == -1:
-            tile_row = self.vlen * self.vector_lane
-        tile_col = extension_config.CONFIG_TILE_COL
-        if tile_col == -1:
-            tile_col = 8 # FIXME: tile_col is not always vector_lane * vlen
-        self.tile_desc = MLIRTile(tile_row, tile_col, self.vector_lane)
+        self.tile_desc : MLIRMultiDimTile = None
+        # MLIR SSA tracker
         self.var_info = {} # MLIR variable info
         self.buffer_types : dict = None
-        self.read_writes = None
 
-    def set_ranges(self, lengths, reduction_lengths, read_writes):
-        self.read_writes = read_writes
+    def set_ranges(self, lengths, reduction_lengths):
         if self.call_ranges:
             assert self.call_ranges == tuple(lengths) + tuple(
                 reduction_lengths
@@ -328,14 +328,17 @@ def codegen_nodes(self, nodes, kernel_name):
             nodes, key=lambda x: int(x.is_reduction())
         ).group
 
-        self.set_ranges(group, reduction_group, None)
+        # Set node range info
+        vars, reduction_vars = self.set_ranges(group, reduction_group)
+
+        # Select tile info.
+        # Note: Kernel Group have to share same tile desc for fusion
+        tile_desc = MLIRMultiDimTile([128, 128], self.vector_lane)
+        self.kernel_group.set_tile_info(tile_desc)
+
         with self as kernel:
             kernel.args = kernel.kernel_group.args
             for node in nodes:
-                vars, reduction_vars = kernel.set_ranges(group, reduction_group, node.read_writes)
-                kernel.args.tile_row = kernel.tile_desc.n_row
-                kernel.args.tile_col = kernel.tile_desc.n_col
-                _, _, _, kernel.buffer_types = kernel.args.mlir_argdefs()
                 node.run(vars, reduction_vars)
         src_code = self.codegen_kernel(kernel_name=kernel_name)
         self.meta_kernel()
@@ -343,10 +346,6 @@ def codegen_nodes(self, nodes, kernel_name):
 
     def codegen_kernel(self, kernel_name):
         arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs()
-        code = self._codegen_kernel(arg_defs, kernel_name)
-        return code.getvalue()
-
-    def _codegen_kernel(self, arg_defs, kernel_name):
         arg_defs = ",\n".ljust(25).join(arg_defs)
         code = common.BracesBuffer()
 
@@ -360,7 +359,7 @@ def _codegen_kernel(self, arg_defs, kernel_name):
                 code.writeline(f"auto {old} = {new};")
             # Loop body part
             code.splice(self.codegen_loops())
-        return code
+        return code.getvalue()
 
     def meta_kernel(self):
         wrapper = V.graph.wrapper_code

From c7f33327e69093a1ab2cfe63ff3a5499082f5d6a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 17 Jan 2025 05:14:37 +0000
Subject: [PATCH 026/432] [WIP5]

---
 .../mlir/mlir_caller_codegen.py               |   4 +-
 .../mlir/mlir_codegen_backend.py              | 145 ++++++++++++------
 PyTorchSimFrontend/mlir/mlir_common.py        |  39 +++--
 3 files changed, 126 insertions(+), 62 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
index 92f250df..12c2cb8a 100644
--- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
+++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
@@ -50,7 +50,7 @@ def generate_kernel_declare(self):
 
     def generate_args_define(self):
         name_set = set()
-        for arg_name, (_, arg_type, arg_size) in self.arg_attributes:
+        for arg_name, (_, arg_type, arg_size, arg_sizes, arg_stride) in self.arg_attributes:
             if not arg_name in name_set:
                 if self.validation:
                     self.writeline(f'{DTYPE_TO_C[arg_type]} c_{arg_name}[{arg_size}]{self.ending}')
@@ -77,7 +77,7 @@ def generate_main(self):
             else:
                 self.generate_args_define()
 
-            func_arguments = [f"c_{arg_name}, c_{arg_name}, 0, {arg_shape}, 1" if arg_type != torch.bool else f"c_{arg_name}, c_{arg_name}, 0, {(arg_shape + 7) // 8}, 1" for arg_name, (_, arg_type, arg_shape) in self.arg_attributes]
+            func_arguments = [f"c_{arg_name}, c_{arg_name}, 0, {arg_shape}, 1" if arg_type != torch.bool else f"c_{arg_name}, c_{arg_name}, 0, {(arg_shape + 7) // 8}, 1" for arg_name, (_, arg_type, arg_shape, _, _) in self.arg_attributes]
             self.writeline(f"wrapper_{self.kernel_name}({', '.join(func_arguments)}){self.ending}{self.newline}")
 
             if self.validation:
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 9277cf41..db318e3b 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -700,20 +700,26 @@ def load(self, name: str, index: sympy.Expr):
         dram_var = self.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        vlane_split_axis, vlane_stride, tile_shape, tile_size_per_lane = self.get_dma_info(name, index)
+        local_tile_desc = self.get_dma_info(name, index)
+        vlane_split_axis = local_tile_desc.vlane_split_axis
+        vlane_stride = local_tile_desc.vlane_stride
+        tile_size_per_lane = local_tile_desc.get_tile_size_per_lane()
+
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-        tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
+        tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
 
         # Define scratch pad buffer
-        sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, tile_shape, self.loads, index_var, index)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_size_per_lane, tile_shape, self.loads, index_var, index)
+
         # MVIN Encoding
-        code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape, padding)
+        code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                 f"{name}_tag", dram_shape, tile_shape, padding)
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         # Generate vector load instruction
         operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
         shape = f", vector<{tile_size_per_lane}x{mlir_dtype}>" if tile_size_per_lane > 1 else ""
-        line = f"{operation} %{sram_var}[0, 0] : memref<{tile_shape}x{mlir_dtype}, 1>{shape}"
+        line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
         out = self.cse.generate(self.loads, line)
         self.register_var_info(out, [tile_size_per_lane, mlir_dtype])
         return out
@@ -726,12 +732,16 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
         # Prepare dma instruction
-        vlane_split_axis, vlane_stride, tile_shape, tile_size_per_lane = self.get_dma_info(name, index)
+        local_tile_desc = self.get_dma_info(name, index)
+        vlane_split_axis = local_tile_desc.vlane_split_axis
+        vlane_stride = local_tile_desc.vlane_stride
+        tile_size_per_lane = local_tile_desc.get_tile_size_per_lane()
+
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-        tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
+        tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
 
         # Define scratch pad buffer
-        sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, tile_shape, self.stores, index_var, index)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_size_per_lane, tile_shape, self.stores, index_var, index)
 
         # Generate vector store instruction
         store_size, operand_type = self.var_info[value]
@@ -740,11 +750,12 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         if mlir_dtype != operand_type:
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
 
-        line = f"{operation} %{value}, %{sram_var}[0, 0] : memref<{tile_shape}x{mlir_dtype}, 1>{shape}"
+        line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
         self.cse.generate(self.stores, line, assignment = False)
 
         # Generate DMA instruction
-        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
+        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                 f"{name}_tag", dram_shape, tile_shape)
         self.cse.generate(self.stores, code, assignment = False)
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
@@ -829,10 +840,18 @@ def store_reduction(self, name, index, value):
         index_var = self.parse_indices(index)
 
         # Tile is always reuduced in inner loop
-        tile_col = self.tile_desc.n_row
-        tile_row = 1
-        dram_tile_shape = f"{tile_row}x{tile_col}"
-        sram_var, index_var = self.get_scratchpad_buffer(dtype, name, tile_row, tile_col, dram_tile_shape, self.reductions_suffix, index_var, index)
+        local_tile_desc = self.get_dma_info(name, index)
+        vlane_split_axis = local_tile_desc.vlane_split_axis
+        vlane_stride = local_tile_desc.vlane_stride
+        tile_size_per_lane = local_tile_desc.get_tile_size_per_lane()
+
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
+        tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
+        #tile_col = self.tile_desc.n_row
+        #tile_row = 1
+        #dram_tile_shape = f"{tile_row}x{tile_col}"
+
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_size_per_lane, tile_shape, self.reductions_suffix, index_var, index)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
@@ -856,30 +875,27 @@ def store_reduction(self, name, index, value):
                 value = m2
 
         # Select mlir store operaiton
-        if self.buffer_types[name][1] == 1 or self.tile_desc.get_rows_per_lane() == 1:
+        if self.buffer_types[name][1] == 1 or tile_size_per_lane == 1:
             operation = "affine.store"
             # raise NotImplementedError("Scalar store!")
         else:
             operation =  "affine.vector_store"
 
         # Select src type
-        if self.tile_desc.get_rows_per_lane() == 1:
+        if tile_size_per_lane == 1:
             shape = ""
         else:
-            shape = f"vector<{self.tile_desc.get_rows_per_lane()}x{mlir_dtype}>"
+            shape = f"vector<{tile_size_per_lane}x{mlir_dtype}>"
             shape = f", {shape}" if self.buffer_types[name][1] > 1 else ""
-        line = f"{operation} %{value}, %{sram_var}[0, 0] : memref<{tile_row}x{tile_col}x{mlir_dtype}, 1>{shape}"
+
+        line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
         self.cse.generate(self.reductions_suffix, line, assignment = False)
 
         # MVOUT Encoding
-        mm_stride = tile_col
-        vlane_split_axis = 0 #FIXME.
-        vlane_stride = self.tile_desc.get_rows_per_lane()
-        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
 
         # Generate DMA instruction
-        # Change row, col
-        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, f"{tile_row}x{tile_col}")
+        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                 f"{name}_tag", dram_shape, tile_shape)
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
     def codegen_global_init(self):
@@ -888,10 +904,10 @@ def codegen_global_init(self):
     def codegen_loops(self):
         code = mlir_common.ParallelLoopBuffer()
         # Loop body part
-        tile_row, tile_col = self.tile_desc.n_row, self.tile_desc.n_col
-        # FIXME.
-        tile_row = self.tile_desc.get_tile_size() if len(self.itervars) == 1 else tile_row
-        loops = [LoopLevel(var, size, idx-len(self.itervars), tile_row=tile_row, tile_col=tile_col) for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))]
+        tile_size = self.kernel_group.tile_desc.tile_size
+        # Apply paddings
+        tile_size = [1] * (len(self.itervars) - len(tile_size)) + tile_size
+        loops = [LoopLevel(var, size, idx-len(self.itervars), tile_size=tile_size) for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))]
         loops, reductions = [LoopNest(loops[: self.reduction_depth]),
                              LoopNest(loops[self.reduction_depth :])]
         if (self.reduction_depth==0):
@@ -942,7 +958,7 @@ def codegen_nodes(self, nodes, kernel_name):
             write_atomic(gem5_write_path, self.gem5_header.getvalue())
         return src_code
 
-    def get_dma_info(self, name, index):
+    def get_dma_info2(self, name, index):
         current_tile = mlir_common.MLIRTile(self.tile_desc.n_row, self.tile_desc.n_col, self.tile_desc.vector_lane, self.tile_desc.used_vector_lane)
         cv = self.get_constant_vector(index)
         cv2 = self.get_constant_vector2(index)
@@ -1062,12 +1078,37 @@ def get_dma_info(self, name, index): # Need more argument?
         """
         # TODO.
         kg_tile_desc = self.kernel_group.tile_desc
-        vlane_split_axis = 0
-        vlane_stride = 1
-        local_tile_desc = mlir_common.MLIRMultiDimTile()
-        return vlane_split_axis, vlane_stride, local_tile_desc
+        buffer_info = self.buffer_types[name]
+        # Note: index could contain symbols that represent dynamic axies
+        # Extract dimension of index(e.g, index0, index1)
+        dims = [i for i in index.free_symbols if "index" in str(i)]
+        local_tile_desc = mlir_common.MLIRMultiDimTile([1], self.vector_lane)
 
-    def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_var, index_var, sram_var, tag_name, dram_shape, tile_shape, padding_type=None):
+        # Case 0. Tile is 0-D scalar
+        if len(dims) == 0:
+            local_tile_desc.tile_size = [1]         # Broadcast needed?
+            local_tile_desc.tile_stride = [1]
+            local_tile_desc.vlane_split_axis = 0    # last axis
+            local_tile_desc.vlane_stride = 1
+        # Case 1. Tile is 1-D vector type
+        elif len(dims) == 1 and len(dims) <= self.reduction_depth:
+            local_tile_desc.tile_size = [kg_tile_desc.get_dim_size(dims[0])]
+            local_tile_desc.tile_stride = [1]
+            local_tile_desc.vlane_split_axis = 0    # last axis
+            local_tile_desc.vlane_stride = 1        # Need to choice best...
+        # Case 2. Tile is 1-D vector type with reduction
+        elif len(dims) == 1 and len(dims) == self.reduction_depth + 1:
+            local_tile_desc.tile_size = [1, kg_tile_desc.get_dim_size(dims[0])]
+            local_tile_desc.tile_stride = [0, 1]
+            local_tile_desc.vlane_split_axis = 1
+            local_tile_desc.vlane_stride = 1
+        else:
+            raise NotImplementedError("Currently not implemented... ;)")
+
+        return local_tile_desc
+
+    def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,
+                     tag_name, dram_shape, tile_shape, padding_type=None):
         dma_key = (attribute1, attribute2, mlir_dtype)
         if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
             dma_type, attribute1, attribute2 = self.dma_read_cache[dma_key]
@@ -1088,12 +1129,15 @@ def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_v
         zero_cse = self.get_const_cse(0)
 
         # Prepare opearnds and attributes
-        dram_operand = f"%{dram_var}[%{index_var}]"
-        sram_operand = f"%{sram_var}[%{zero_cse}, %{zero_cse}]"
+        sram_dims = len(tile_shape.split("x")) - 1
+        sram_index = [f"%{zero_cse}"] * sram_dims
+        dram_operand = f"%{dram_var}[%{dram_index_var}]"
+        sram_operand = f"%{sram_var}[{sram_index_var}]" # Use string
         tag_var = f"%{tag}[0]"
         dma_attribute = f"%{dma_type}, %{attribute1}, %{attribute2}"
         #dram_shape = f"memref<{dram_shape}x{mlir_dtype}>"
-        sram_shape = f"memref<{tile_shape}x{mlir_dtype}, 1>"
+        #sram_shape = f"memref<{tile_shape}x{mlir_dtype}, 1>"
+        sram_shape = tile_shape
         tag_shape = "memref<1xi32>"
 
         if dma_type_name == "MVIN":
@@ -1157,11 +1201,10 @@ def adjust_tile_size(self):
         if len(self.itervars) >= 3 and self.reduction_depth < len(self.itervars):
             raise NotImplementedError()
 
-    def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index, is_template=False):
+    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, code_buffer, indices, raw_index, is_template=False):
         c_type = mlir_common.DTYPE_TO_C[dtype]
-        mlir_type = mlir_common.DTYPE_TO_MLIR[dtype]
         # Make sure each lane's buffer has at least two element
-        tile_size = max(self.roundup_vectorlane(tile_row * tile_col), self.vector_lane * 2)
+        tile_size = max(tile_size_per_lane, 2) * self.vector_lane
 
         if dtype == torch.bool and not is_template:
             mapping = self.map_cse.generate(self.global_vars, f"affine_map<({indices}) -> ({indices} floordiv 8)>")
@@ -1175,12 +1218,17 @@ def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape
             # Add definition to header
             self.header.writeline(f"{c_type} {new_name}_spad[{tile_size // self.vector_lane}] __attribute__ ((section(\".spad\")));")
             self.gem5_header.writeline(f"{c_type} {new_name}_spad[{tile_size}];")
-            self.global_vars.writeline(f"memref.global @{new_name}_spad : memref<{dram_tile_shape}x{mlir_type}, 1>")
+            self.global_vars.writeline(f"memref.global @{new_name}_spad : {dram_tile_shape}")
             self.global_vars_dict[name].append(str(raw_index))
         else:
             new_name = f"{name}_{self.global_vars_dict[name].index(str(raw_index))}"
-        buffer = self.cse.generate(code_buffer, f"memref.get_global @{new_name}_spad : memref<{dram_tile_shape}x{mlir_type}, 1>")
-        return buffer, indices
+        buffer = self.cse.generate(code_buffer, f"memref.get_global @{new_name}_spad : {dram_tile_shape}")
+
+        zero_cse = self.get_const_cse(0)
+        sram_dims = len(dram_tile_shape.split("x")) - 1
+        sram_index_var = ",".join([f"%{zero_cse}"] * sram_dims)
+
+        return buffer, indices, sram_index_var
 
     def get_const_cse(self, value) -> common.CSEVariable:
         if value not in self.consts:
@@ -1198,16 +1246,13 @@ class LoopLevel:
     size: sympy.Expr
     idx: int
     start: int = 0
-    tile_row: int = 4
-    tile_col: int = 4
+    tile_size : List = None
     reduction_vars: Dict[str, str] = None
+    loop_nr : int = 0
 
     def lines(self):
-        step = 1
-        if self.idx == -2:
-            step = self.tile_row
-        elif self.idx == -1:
-            step = self.tile_col
+        step = self.tile_size[self.loop_nr]
+        self.loop_nr += 1
         if self.reduction_vars:
             acc = ', '.join([f"%{acc.name}" for acc in self.reduction_vars.keys()])
             args = ', '.join([f"%{iter.name} = %{init.name}" for (_, iter, init, _) in self.reduction_vars.values()])
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 6ae5cfe5..4d23a94b 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -108,17 +108,17 @@ def mlir_argdefs(self, extra_node=dict()):
         buffer_types = {}
         for x in V.graph.buffers:
             if not isinstance(x.layout, MultiOutputLayout): # FIXME: MultiOutputLayout should be handled
-                buffer_types[x.get_name()] = [x.get_dtype(), x.get_numel()]
+                buffer_types[x.get_name()] = [x.get_dtype(), x.get_numel(), x.get_size(), x.get_stride()]
         for name, val in V.graph.graph_inputs.items():
             if isinstance(val, sympy.Expr):
-                buffer_types[name] = [get_sympy_Expr_dtype(val), 1]
+                buffer_types[name] = [get_sympy_Expr_dtype(val), 1, [1], [1]]
             else:
-                buffer_types[name] = [val.get_dtype(), val.get_numel()]
+                buffer_types[name] = [val.get_dtype(), val.get_numel(), val.get_size(), val.get_stride()]
         buffer_types.update(
-            {name: val.dtype for name, val in V.graph.constants.items()}
+            {name: [val.dtype, 1, [1], [1]] for name, val in V.graph.constants.items()}
         )
         buffer_types.update(
-            {name: [val.get_dtype(), val.get_numel()] for name, val in extra_node.items()}
+            {name: [val.get_dtype(), val.get_numel(), val.get_size(), val.get_stride()] for name, val in extra_node.items()}
         )
 
         call_args = []
@@ -167,20 +167,40 @@ def get_tile_size(self):
             size *= dim_size
         return size
 
+    def get_tile_size_per_lane(self):
+        tile_size_per_lane = list(self.tile_size)
+        tile_size_per_lane[self.vlane_split_axis] = self.vlane_stride
+        size = 1
+        for dim_size in tile_size_per_lane:
+            size *= dim_size
+        return size
+
     def get_tile_stride(self):
         return self.tile_stride
 
-    def dim_size(self):
+    def get_nr_dim(self):
         """
         Return number of dimensions
         """
         return len(self.tile_size)
 
+    def get_dim_size(self, index):
+        if isinstance(index, int):
+            return self.tile_size[index]
+        elif "index" in str(index):
+            return self.tile_size[int(str(index)[5:])]
+        raise NotImplementedError("Unsupported format of index")
+
+    def get_mlir_shape(self, dtype):
+        str_tile_size = [str(dim) for dim in self.tile_size]
+        shape = "x".join(str_tile_size)
+        return f"memref<{shape}x{dtype}, 1>"
+
     def get_used_vlane(self):
         """
         Return number of used vector lane
         """
-        return self.div_round_up(self.tile_size[self.vlane_split_axis], self.vlane_stride)
+        return min(self.div_round_up(self.tile_size[self.vlane_split_axis], self.vlane_stride), self.vector_lane)
 
     def get_vlane_stride(self):
         return self.vlane_stride
@@ -276,11 +296,9 @@ def __init__(self, args=None):
         self.vector_compute = IndentedBuffer()
         self.reductions_suffix = IndentedBuffer()
         self.cse = common.CSE(self.newvar_prefix, self.suffix)
-        # Tile size setting
-        self.tile_desc : MLIRMultiDimTile = None
         # MLIR SSA tracker
         self.var_info = {} # MLIR variable info
-        self.buffer_types : dict = None
+        self.buffer_types : dict = None # format: dtype, numel, size, stride
 
     def set_ranges(self, lengths, reduction_lengths):
         if self.call_ranges:
@@ -335,6 +353,7 @@ def codegen_nodes(self, nodes, kernel_name):
         # Note: Kernel Group have to share same tile desc for fusion
         tile_desc = MLIRMultiDimTile([128, 128], self.vector_lane)
         self.kernel_group.set_tile_info(tile_desc)
+        _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()
 
         with self as kernel:
             kernel.args = kernel.kernel_group.args

From 184c254d20691eb8f8f56d0721924449260d49f1 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 17 Jan 2025 05:20:17 +0000
Subject: [PATCH 027/432] [WIP6]

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index db318e3b..6eec6f5f 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1133,8 +1133,8 @@ def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_v
         sram_index = [f"%{zero_cse}"] * sram_dims
         dram_operand = f"%{dram_var}[%{dram_index_var}]"
         sram_operand = f"%{sram_var}[{sram_index_var}]" # Use string
-        tag_var = f"%{tag}[0]"
-        dma_attribute = f"%{dma_type}, %{attribute1}, %{attribute2}"
+        tag_var = f"%{tag}[%{zero_cse}]"
+        dma_attribute = f"%{attribute1}, %{attribute2}"
         #dram_shape = f"memref<{dram_shape}x{mlir_dtype}>"
         #sram_shape = f"memref<{tile_shape}x{mlir_dtype}, 1>"
         sram_shape = tile_shape
@@ -1147,7 +1147,7 @@ def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_v
             src_operand, dst_operand = sram_operand, dram_operand
             src_shape, dst_shape = sram_shape, dram_shape
 
-        code = f"affine.dma_start {src_operand}, {dst_operand}, {tag_var}, {dma_attribute} : {src_shape}, {dst_shape}, {tag_shape}"
+        code = f"memref.dma_start {src_operand}, {dst_operand}, %{dma_type}, {tag_var}, {dma_attribute} : {src_shape}, {dst_shape}, {tag_shape}"
         if padding_type is not None:
             code = code + f" {{padding = {padding_type}}}"
         return code

From 9f1bcc539e3977e7db4b2de685006cf541d6a244 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 17 Jan 2025 06:59:14 +0000
Subject: [PATCH 028/432] [WIP6]

---
 .../mlir/mlir_codegen_backend.py              | 21 ++++++++++++-------
 PyTorchSimFrontend/mlir/mlir_common.py        | 12 ++++++++++-
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 6eec6f5f..95a804a2 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -847,15 +847,12 @@ def store_reduction(self, name, index, value):
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
-        #tile_col = self.tile_desc.n_row
-        #tile_row = 1
-        #dram_tile_shape = f"{tile_row}x{tile_col}"
 
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_size_per_lane, tile_shape, self.reductions_suffix, index_var, index)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
-            shape = f"vector<{self.tile_desc.get_rows_per_lane()}x{mlir_dtype}>" if self.buffer_types[name][1] > 1 else mlir_dtype
+            shape = f"vector<{tile_size_per_lane}x{mlir_dtype}>" if self.buffer_types[name][1] > 1 else mlir_dtype
             # mean
             divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.ranges[self.reduction_depth])} : f32")
             if self.buffer_types[name][1] > 1:
@@ -906,7 +903,6 @@ def codegen_loops(self):
         # Loop body part
         tile_size = self.kernel_group.tile_desc.tile_size
         # Apply paddings
-        tile_size = [1] * (len(self.itervars) - len(tile_size)) + tile_size
         loops = [LoopLevel(var, size, idx-len(self.itervars), tile_size=tile_size) for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))]
         loops, reductions = [LoopNest(loops[: self.reduction_depth]),
                              LoopNest(loops[self.reduction_depth :])]
@@ -1100,8 +1096,19 @@ def get_dma_info(self, name, index): # Need more argument?
         elif len(dims) == 1 and len(dims) == self.reduction_depth + 1:
             local_tile_desc.tile_size = [1, kg_tile_desc.get_dim_size(dims[0])]
             local_tile_desc.tile_stride = [0, 1]
-            local_tile_desc.vlane_split_axis = 1
+            local_tile_desc.vlane_split_axis = 0
             local_tile_desc.vlane_stride = 1
+        # Case 3. Tile is 2-D tile
+        elif len(dims) == 2:
+            is_reduction = self.reduction_depth == 1
+            local_tile_desc.tile_size = [kg_tile_desc.get_dim_size(dim) for dim in dims]
+            if is_reduction:
+                local_tile_desc.vlane_split_axis = 0
+                local_tile_desc.vlane_stride = 8
+            else:
+                local_tile_desc.vlane_split_axis= 1
+                local_tile_desc.vlane_stride = 1
+
         else:
             raise NotImplementedError("Currently not implemented... ;)")
 
@@ -1129,8 +1136,6 @@ def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_v
         zero_cse = self.get_const_cse(0)
 
         # Prepare opearnds and attributes
-        sram_dims = len(tile_shape.split("x")) - 1
-        sram_index = [f"%{zero_cse}"] * sram_dims
         dram_operand = f"%{dram_var}[%{dram_index_var}]"
         sram_operand = f"%{sram_var}[{sram_index_var}]" # Use string
         tag_var = f"%{tag}[%{zero_cse}]"
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 4d23a94b..4dcc8ee3 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -349,9 +349,19 @@ def codegen_nodes(self, nodes, kernel_name):
         # Set node range info
         vars, reduction_vars = self.set_ranges(group, reduction_group)
 
+        # Dummy tile size
+        tile_size = [1] * (len(vars) + len(reduction_vars))
+        if len(tile_size) >= 2:
+            tile_size[-1] = 128
+            tile_size[-2] = 128
+        elif len(tile_size) == 1:
+            tile_size[0] = 256
+        else:
+            raise NotImplementedError("dummy tile size fail!")
+
         # Select tile info.
         # Note: Kernel Group have to share same tile desc for fusion
-        tile_desc = MLIRMultiDimTile([128, 128], self.vector_lane)
+        tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane)
         self.kernel_group.set_tile_info(tile_desc)
         _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()
 

From e28ac4672285a0bf9741168fe0d99104bb57c1c7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 17 Jan 2025 09:58:22 +0000
Subject: [PATCH 029/432] [WIP7]

---
 .../mlir/mlir_codegen_backend.py              | 27 +++++-----
 PyTorchSimFrontend/mlir/mlir_common.py        | 50 +++++++++++++------
 PyTorchSimFrontend/mlir/mlir_template.py      | 25 +++++-----
 3 files changed, 61 insertions(+), 41 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 95a804a2..dcaffe1d 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -697,7 +697,7 @@ def load(self, name: str, index: sympy.Expr):
         index = self.rename_indexing(index)
         padding = self.get_padding_type()
         index_var = self.parse_indices(index)
-        dram_var = self.args.input(name)
+        dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         local_tile_desc = self.get_dma_info(name, index)
@@ -727,7 +727,7 @@ def load(self, name: str, index: sympy.Expr):
     def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         index = self.rename_indexing(index)
         index_var = self.parse_indices(index)
-        dram_var = self.args.output(name)
+        dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
@@ -833,7 +833,7 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         return acc
 
     def store_reduction(self, name, index, value):
-        dram_var = self.args.output(name)
+        dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         index = self.rename_indexing(index)
@@ -901,7 +901,7 @@ def codegen_global_init(self):
     def codegen_loops(self):
         code = mlir_common.ParallelLoopBuffer()
         # Loop body part
-        tile_size = self.kernel_group.tile_desc.tile_size
+        tile_size = self.kernel_group.tile_desc.get_tile_size()
         # Apply paddings
         loops = [LoopLevel(var, size, idx-len(self.itervars), tile_size=tile_size) for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))]
         loops, reductions = [LoopNest(loops[: self.reduction_depth]),
@@ -1082,33 +1082,30 @@ def get_dma_info(self, name, index): # Need more argument?
 
         # Case 0. Tile is 0-D scalar
         if len(dims) == 0:
-            local_tile_desc.tile_size = [1]         # Broadcast needed?
-            local_tile_desc.tile_stride = [1]
+            local_tile_desc.set_tile_size([1])         # Broadcast needed?
             local_tile_desc.vlane_split_axis = 0    # last axis
             local_tile_desc.vlane_stride = 1
         # Case 1. Tile is 1-D vector type
         elif len(dims) == 1 and len(dims) <= self.reduction_depth:
-            local_tile_desc.tile_size = [kg_tile_desc.get_dim_size(dims[0])]
-            local_tile_desc.tile_stride = [1]
+            local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dims[0])])
             local_tile_desc.vlane_split_axis = 0    # last axis
-            local_tile_desc.vlane_stride = 1        # Need to choice best...
+            local_tile_desc.vlane_stride = 1        # Need to choose best...
         # Case 2. Tile is 1-D vector type with reduction
         elif len(dims) == 1 and len(dims) == self.reduction_depth + 1:
-            local_tile_desc.tile_size = [1, kg_tile_desc.get_dim_size(dims[0])]
-            local_tile_desc.tile_stride = [0, 1]
+            local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(dims[0])])
             local_tile_desc.vlane_split_axis = 0
             local_tile_desc.vlane_stride = 1
         # Case 3. Tile is 2-D tile
         elif len(dims) == 2:
             is_reduction = self.reduction_depth == 1
-            local_tile_desc.tile_size = [kg_tile_desc.get_dim_size(dim) for dim in dims]
             if is_reduction:
+                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims], [1, 0])
                 local_tile_desc.vlane_split_axis = 0
-                local_tile_desc.vlane_stride = 8
+                local_tile_desc.vlane_stride = 2
             else:
-                local_tile_desc.vlane_split_axis= 1
+                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims])
+                local_tile_desc.vlane_split_axis = local_tile_desc.get_tile_size().index(max(local_tile_desc.get_tile_size()))
                 local_tile_desc.vlane_stride = 1
-
         else:
             raise NotImplementedError("Currently not implemented... ;)")
 
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 4dcc8ee3..72a98c22 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -150,57 +150,80 @@ def set_info(outer, inner, arg_type):
 
 class MLIRMultiDimTile():
     def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None):
-        self.tile_size = list(tile_size)
-        self.tile_stride = None # Todo.
+        self._tile_size = list(tile_size)
+        self.tile_axis_order = range(len(tile_size))
 
         # Vector lane mapping config
         self.vector_lane = vector_lane
         self.vlane_split_axis = vlane_split_axis
         self.vlane_stride = vlane_stride
 
+    def set_tile_size(self, tile_size, tile_axis_order=None):
+        self._tile_size = tile_size
+        if tile_axis_order is None:
+            self.tile_axis_order = range(len(tile_size))
+        else:
+            self.tile_axis_order = tile_axis_order
+
     def get_tile_size(self):
+        return self._tile_size
+
+    def get_numel(self):
         """
         Return size of multi-dimensional tile
         """
         size = 1
-        for dim_size in self.tile_size:
+        for dim_size in self._tile_size:
             size *= dim_size
         return size
 
     def get_tile_size_per_lane(self):
-        tile_size_per_lane = list(self.tile_size)
-        tile_size_per_lane[self.vlane_split_axis] = self.vlane_stride
+        tile_size_per_lane = list(self._tile_size)
+        used_vlane = self.get_used_vlane()
+        tile_size_per_lane[self.vlane_split_axis] = \
+            self.div_round_up(tile_size_per_lane[self.vlane_split_axis], used_vlane)
         size = 1
         for dim_size in tile_size_per_lane:
             size *= dim_size
         return size
 
     def get_tile_stride(self):
-        return self.tile_stride
+        strides = [1] * len(self._tile_size)
+        init = 1
+
+        original_indices = list(range(len(self.tile_axis_order)))
+        sorted_pairs = sorted(
+            zip(self.tile_axis_order, self._tile_size, original_indices),
+            key=lambda x: x[0], reverse=True
+        )
+        for _, size, original_indices in sorted_pairs:
+            strides[original_indices] = init
+            init *= size
+        return strides
 
     def get_nr_dim(self):
         """
         Return number of dimensions
         """
-        return len(self.tile_size)
+        return len(self._tile_size)
 
     def get_dim_size(self, index):
         if isinstance(index, int):
-            return self.tile_size[index]
+            return self._tile_size[index]
         elif "index" in str(index):
-            return self.tile_size[int(str(index)[5:])]
+            return self._tile_size[int(str(index)[5:])]
         raise NotImplementedError("Unsupported format of index")
 
     def get_mlir_shape(self, dtype):
-        str_tile_size = [str(dim) for dim in self.tile_size]
+        str_tile_size = [str(dim) for dim in self._tile_size]
         shape = "x".join(str_tile_size)
-        return f"memref<{shape}x{dtype}, 1>"
+        return f"memref<{shape}x{dtype}, strided<{self.get_tile_stride()}>, 1>"
 
     def get_used_vlane(self):
         """
         Return number of used vector lane
         """
-        return min(self.div_round_up(self.tile_size[self.vlane_split_axis], self.vlane_stride), self.vector_lane)
+        return min(self.div_round_up(self._tile_size[self.vlane_split_axis], self.vlane_stride), self.vector_lane)
 
     def get_vlane_stride(self):
         return self.vlane_stride
@@ -366,7 +389,6 @@ def codegen_nodes(self, nodes, kernel_name):
         _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()
 
         with self as kernel:
-            kernel.args = kernel.kernel_group.args
             for node in nodes:
                 node.run(vars, reduction_vars)
         src_code = self.codegen_kernel(kernel_name=kernel_name)
@@ -450,7 +472,7 @@ def rename_indexing(self, index) -> sympy.Expr:
         index = V.graph.sizevars.simplify(index)
         sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name)
         replacements = {
-            x: self.args.size(x)
+            x: self.kernel_group.args.size(x)
             for x in sorted_symbols
             if x.name.startswith("s") or x.name.startswith("ps")
         }
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index d0b07629..65ccf93a 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -143,7 +143,7 @@ def meta_kernel(self):
         wrapper = V.graph.wrapper_code
         arg_attributes = self.kernel_arg_attributes
         if arg_attributes is None:
-            _, _, arg_attributes, _ = self.args.mlir_argdefs()
+            _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
         wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')')
         wrapper.add_import_once(f'\nfrom PyTorchSimFrontend.extension_codecache import CustomAsyncCompile')
         wrapper.add_import_once(f'\ncustom_async_compile = CustomAsyncCompile()')
@@ -155,7 +155,7 @@ def meta_kernel(self):
 
     def call_kernel(self, kernel_name):
         wrapper = V.graph.wrapper_code
-        _, call_args, _, _ = self.args.mlir_argdefs()
+        _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
         # generate the code to call this
         wrapper.generate_kernel_call(
             kernel_name if self.outer_func_name is None else self.outer_func_name,
@@ -167,7 +167,6 @@ def template_store(options):
             dram_var = "Y"
             index_var = "index2"
             tag_var = "tag"
-            stride = options['N']
             vlane_split_axis = 0
             vlane_stride = 1
             mlir_dtype = "f32"
@@ -207,19 +206,19 @@ def def_kernel(
             node = inputs[idx]
             if node is not None:
                 self.named_nodes[name] = node
-                self.args.input_buffers[node.get_name()] = name
+                self.kernel_group.args.input_buffers[node.get_name()] = name
 
         extra_node = {}
         for name, node in zip(names[len(inputs) : len(inputs) + len(outputs)], outputs):
             if node is not None:
                 self.named_nodes[name] = node
-                self.args.output_buffers[node.get_name()] = name
+                self.kernel_group.args.output_buffers[node.get_name()] = name
                 self.store_buffer_names.add(node.get_name())    #TODO: Is this enough not calling store() in mlir_common.py?
                 extra_node[node.get_name()] = node
                 self.buffer_names[node.name] = 'Y_buffer'   #TODO: Buffer name fixed
 
         def hook():
-            arg_defs, *_ = self.args.mlir_argdefs(extra_node=extra_node)
+            arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node)
             return f"({', '.join(arg_defs)})"
 
         assert "<DEF_KERNEL>" not in self.render_hooks
@@ -229,7 +228,7 @@ def hook():
     def output_name(self):
         # Cannot know the output name from the template, so we need to hook it
         def hook():
-            arg_defs, *_ = self.args.mlir_argdefs()
+            arg_defs, *_ = self.kernel_group.args.mlir_argdefs()
             output = arg_defs[3]    #FIXME: Constant index used
             pattern = r"%(\w+):"
             output = re.search(pattern, output).group(1)
@@ -248,7 +247,7 @@ def hook():
         return "<STORE_OUTPUT>"
 
     def def_function(self):
-        _, call_args, _ = self.args.python_argdefs()
+        _, call_args, _ = self.kernel_group.args.python_argdefs()
         if self.outer_func_render is not None:
             return self.outer_func_render(input_args=call_args)
         else:
@@ -295,10 +294,11 @@ def adjust_tile_size(self):
         return
 
     def load_epilogue(self, name: str, index: sympy.Expr):
+        raise NotImplementedError("Not implemented!")
         #index_var = self.parse_indices(index)
         index_var = "index2"
         index = self.rename_indexing(index)
-        dram_var = self.args.input(name)
+        dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         if name not in self.buffer_names:
@@ -327,9 +327,10 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
+        raise NotImplementedError("Not implemented!")
         #index_var = self.parse_indices(index)
         index_var = "index2"
-        dram_var = self.args.output(name)
+        dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
@@ -356,8 +357,8 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
         self.cse.generate(self.stores, code, assignment = False)
 
-    def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, index_var, raw_index):
-        return super().get_scratchpad_buffer(dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, index_var, raw_index, True)
+    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, code_buffer, index_var, raw_index):
+        return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, code_buffer, index_var, raw_index, True)
 
 class MLIRTemplateCaller(CUDATemplateCaller):
     def __str__(self):

From e2d6bbce940f1fd11cfd73d27b8ae35b15cd946a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 17 Jan 2025 13:58:47 +0000
Subject: [PATCH 030/432] [Frontend/dma4d] Provide sram stride as an attribute
 of dma_start op

---
 .../mlir/mlir_codegen_backend.py              | 83 ++++++++++---------
 PyTorchSimFrontend/mlir/mlir_common.py        | 28 ++++---
 2 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index dcaffe1d..7d833e1d 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -703,25 +703,26 @@ def load(self, name: str, index: sympy.Expr):
         local_tile_desc = self.get_dma_info(name, index)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
-        tile_size_per_lane = local_tile_desc.get_tile_size_per_lane()
+        tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
+        tile_stride = local_tile_desc.get_tile_stride()
 
         # Define scratch pad buffer
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_size_per_lane, tile_shape, self.loads, index_var, index)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.loads, index_var, index)
 
         # MVIN Encoding
         code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, padding)
+                                 f"{name}_tag", dram_shape, tile_shape, tile_stride, padding)
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         # Generate vector load instruction
-        operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
-        shape = f", vector<{tile_size_per_lane}x{mlir_dtype}>" if tile_size_per_lane > 1 else ""
+        operation = "affine.vector_load" if tile_numel_per_lane > 1 else "affine.load"
+        shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
         line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
         out = self.cse.generate(self.loads, line)
-        self.register_var_info(out, [tile_size_per_lane, mlir_dtype])
+        self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
         return out
 
     def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
@@ -735,18 +736,19 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         local_tile_desc = self.get_dma_info(name, index)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
-        tile_size_per_lane = local_tile_desc.get_tile_size_per_lane()
+        tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
+        tile_stride = local_tile_desc.get_tile_stride()
 
         # Define scratch pad buffer
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_size_per_lane, tile_shape, self.stores, index_var, index)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.stores, index_var, index)
 
         # Generate vector store instruction
         store_size, operand_type = self.var_info[value]
-        operation = "affine.vector_store" if tile_size_per_lane > 1 and store_size > 1 else "affine.store"
-        shape = f", vector<{tile_size_per_lane}x{mlir_dtype}>" if tile_size_per_lane > 1 and store_size > 1 else ""
+        operation = "affine.vector_store" if tile_numel_per_lane > 1 and store_size > 1 else "affine.store"
+        shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 and store_size > 1 else ""
         if mlir_dtype != operand_type:
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
 
@@ -755,7 +757,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape)
+                                 f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.cse.generate(self.stores, code, assignment = False)
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
@@ -789,20 +791,18 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
             )
             type_name = mlir_common.DTYPE_TO_MLIR[dtype]
             acc_var = init
-            acc_shape = type_name
-            shape = f"vector<{self.tile_desc.get_tile_size()}x{type_name}>"
             reduced_shape = type_name
             init = self.cse.generate(self.reduction_prefix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
             if len(self.ranges) == 1: # 1-D vector to scalar
                 axis = "0"
                 acc_var = init
-                shape = f"vector<{self.tile_desc.get_tile_size()}x{type_name}>" # use single vector lane
+                shape = f"vector<{self.var_info[value][0]*self.vector_lane}x{type_name}>" # use single vector lane
             elif len(self.ranges) == 2:
-                vec_len = self.tile_desc.get_rows_per_lane()
-                flattened_size = f"vector<{self.tile_desc.get_tile_size_per_lane()}x{type_name}>"
+                vec_len = self.kernel_group.tile_desc.get_vlane_stride()
+                flattened_size = f"vector<{self.var_info[value][0]}x{type_name}>"
 
                 # It is column majored per lane tile
-                expaned_size = f"vector<{self.tile_desc.get_tile_size_per_lane()//vec_len}x{vec_len}x{type_name}>"
+                expaned_size = f"vector<{self.var_info[value][0]//vec_len}x{vec_len}x{type_name}>"
                 value = self.cse.generate(self.compute, f"vector.shape_cast %{value} : {flattened_size} to {expaned_size}")
                 shape = expaned_size
 
@@ -843,16 +843,17 @@ def store_reduction(self, name, index, value):
         local_tile_desc = self.get_dma_info(name, index)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
-        tile_size_per_lane = local_tile_desc.get_tile_size_per_lane()
+        tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
+        tile_stride = local_tile_desc.get_tile_stride()
 
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_size_per_lane, tile_shape, self.reductions_suffix, index_var, index)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.reductions_suffix, index_var, index)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
-            shape = f"vector<{tile_size_per_lane}x{mlir_dtype}>" if self.buffer_types[name][1] > 1 else mlir_dtype
+            shape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>" if self.buffer_types[name][1] > 1 else mlir_dtype
             # mean
             divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.ranges[self.reduction_depth])} : f32")
             if self.buffer_types[name][1] > 1:
@@ -872,17 +873,17 @@ def store_reduction(self, name, index, value):
                 value = m2
 
         # Select mlir store operaiton
-        if self.buffer_types[name][1] == 1 or tile_size_per_lane == 1:
+        if self.buffer_types[name][1] == 1 or tile_numel_per_lane == 1:
             operation = "affine.store"
             # raise NotImplementedError("Scalar store!")
         else:
             operation =  "affine.vector_store"
 
         # Select src type
-        if tile_size_per_lane == 1:
+        if tile_numel_per_lane == 1:
             shape = ""
         else:
-            shape = f"vector<{tile_size_per_lane}x{mlir_dtype}>"
+            shape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>"
             shape = f", {shape}" if self.buffer_types[name][1] > 1 else ""
 
         line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
@@ -892,7 +893,7 @@ def store_reduction(self, name, index, value):
 
         # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape)
+                                 f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
     def codegen_global_init(self):
@@ -1077,42 +1078,47 @@ def get_dma_info(self, name, index): # Need more argument?
         buffer_info = self.buffer_types[name]
         # Note: index could contain symbols that represent dynamic axies
         # Extract dimension of index(e.g, index0, index1)
-        dims = [i for i in index.free_symbols if "index" in str(i)]
+        dims = [int(str(i)[5:]) for i in index.free_symbols if "index" in str(i)]
         local_tile_desc = mlir_common.MLIRMultiDimTile([1], self.vector_lane)
 
+        if kg_tile_desc.vlane_split_axis in dims:
+            local_vlane_split_axis = dims.index(kg_tile_desc.vlane_split_axis)
+        else:
+            local_vlane_split_axis = len(dims) - 1
+
         # Case 0. Tile is 0-D scalar
         if len(dims) == 0:
             local_tile_desc.set_tile_size([1])         # Broadcast needed?
-            local_tile_desc.vlane_split_axis = 0    # last axis
-            local_tile_desc.vlane_stride = 1
+            local_tile_desc.vlane_split_axis = local_vlane_split_axis    # last axis
+            local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 1. Tile is 1-D vector type
         elif len(dims) == 1 and len(dims) <= self.reduction_depth:
             local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dims[0])])
-            local_tile_desc.vlane_split_axis = 0    # last axis
-            local_tile_desc.vlane_stride = 1        # Need to choose best...
+            local_tile_desc.vlane_split_axis = local_vlane_split_axis
+            local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 2. Tile is 1-D vector type with reduction
         elif len(dims) == 1 and len(dims) == self.reduction_depth + 1:
             local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(dims[0])])
-            local_tile_desc.vlane_split_axis = 0
-            local_tile_desc.vlane_stride = 1
+            local_tile_desc.vlane_split_axis = local_vlane_split_axis
+            local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 3. Tile is 2-D tile
         elif len(dims) == 2:
             is_reduction = self.reduction_depth == 1
             if is_reduction:
                 local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims], [1, 0])
-                local_tile_desc.vlane_split_axis = 0
-                local_tile_desc.vlane_stride = 2
+                local_tile_desc.vlane_split_axis = local_vlane_split_axis
+                local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
             else:
                 local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims])
-                local_tile_desc.vlane_split_axis = local_tile_desc.get_tile_size().index(max(local_tile_desc.get_tile_size()))
-                local_tile_desc.vlane_stride = 1
+                local_tile_desc.vlane_split_axis = local_vlane_split_axis
+                local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         else:
             raise NotImplementedError("Currently not implemented... ;)")
 
         return local_tile_desc
 
     def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,
-                     tag_name, dram_shape, tile_shape, padding_type=None):
+                     tag_name, dram_shape, tile_shape, tile_stride, padding_type=0, ):
         dma_key = (attribute1, attribute2, mlir_dtype)
         if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
             dma_type, attribute1, attribute2 = self.dma_read_cache[dma_key]
@@ -1137,8 +1143,6 @@ def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_v
         sram_operand = f"%{sram_var}[{sram_index_var}]" # Use string
         tag_var = f"%{tag}[%{zero_cse}]"
         dma_attribute = f"%{attribute1}, %{attribute2}"
-        #dram_shape = f"memref<{dram_shape}x{mlir_dtype}>"
-        #sram_shape = f"memref<{tile_shape}x{mlir_dtype}, 1>"
         sram_shape = tile_shape
         tag_shape = "memref<1xi32>"
 
@@ -1150,8 +1154,7 @@ def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_v
             src_shape, dst_shape = sram_shape, dram_shape
 
         code = f"memref.dma_start {src_operand}, {dst_operand}, %{dma_type}, {tag_var}, {dma_attribute} : {src_shape}, {dst_shape}, {tag_shape}"
-        if padding_type is not None:
-            code = code + f" {{padding = {padding_type}}}"
+        code = code + f" {{padding={padding_type}, sram_stride={tile_stride}}}"
         return code
 
     def adjust_tile_size(self):
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 72a98c22..960e188a 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -151,7 +151,7 @@ def set_info(outer, inner, arg_type):
 class MLIRMultiDimTile():
     def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None):
         self._tile_size = list(tile_size)
-        self.tile_axis_order = range(len(tile_size))
+        self.tile_axis_order = list(range(len(tile_size)-1))
 
         # Vector lane mapping config
         self.vector_lane = vector_lane
@@ -161,7 +161,7 @@ def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=N
     def set_tile_size(self, tile_size, tile_axis_order=None):
         self._tile_size = tile_size
         if tile_axis_order is None:
-            self.tile_axis_order = range(len(tile_size))
+            self.tile_axis_order = list(range(len(tile_size)-1))
         else:
             self.tile_axis_order = tile_axis_order
 
@@ -176,12 +176,9 @@ def get_numel(self):
         for dim_size in self._tile_size:
             size *= dim_size
         return size
-
-    def get_tile_size_per_lane(self):
-        tile_size_per_lane = list(self._tile_size)
-        used_vlane = self.get_used_vlane()
-        tile_size_per_lane[self.vlane_split_axis] = \
-            self.div_round_up(tile_size_per_lane[self.vlane_split_axis], used_vlane)
+ 
+    def get_numel_per_lane(self):
+        tile_size_per_lane = self.get_tile_size_per_lane()
         size = 1
         for dim_size in tile_size_per_lane:
             size *= dim_size
@@ -191,7 +188,7 @@ def get_tile_stride(self):
         strides = [1] * len(self._tile_size)
         init = 1
 
-        original_indices = list(range(len(self.tile_axis_order)))
+        original_indices = list(range(len(self.tile_axis_order)-1))
         sorted_pairs = sorted(
             zip(self.tile_axis_order, self._tile_size, original_indices),
             key=lambda x: x[0], reverse=True
@@ -201,6 +198,13 @@ def get_tile_stride(self):
             init *= size
         return strides
 
+    def get_tile_size_per_lane(self):
+        tile_size_per_lane = list(self._tile_size)
+        used_vlane = self.get_used_vlane()
+        tile_size_per_lane[self.vlane_split_axis] = \
+            self.div_round_up(tile_size_per_lane[self.vlane_split_axis], used_vlane)
+        return tile_size_per_lane
+
     def get_nr_dim(self):
         """
         Return number of dimensions
@@ -217,7 +221,7 @@ def get_dim_size(self, index):
     def get_mlir_shape(self, dtype):
         str_tile_size = [str(dim) for dim in self._tile_size]
         shape = "x".join(str_tile_size)
-        return f"memref<{shape}x{dtype}, strided<{self.get_tile_stride()}>, 1>"
+        return f"memref<{shape}x{dtype}, 1>"
 
     def get_used_vlane(self):
         """
@@ -385,9 +389,11 @@ def codegen_nodes(self, nodes, kernel_name):
         # Select tile info.
         # Note: Kernel Group have to share same tile desc for fusion
         tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane)
+        tile_desc.vlane_split_axis = len(vars) - 1
+        tile_desc.vlane_stride = 2
         self.kernel_group.set_tile_info(tile_desc)
-        _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()
 
+        _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()
         with self as kernel:
             for node in nodes:
                 node.run(vars, reduction_vars)

From 355c3a0b17d881513898f368a2b12b0b95c4078d Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sat, 18 Jan 2025 05:02:53 +0000
Subject: [PATCH 031/432] [Fix] lowering pass ordering

---
 PyTorchSimFrontend/extension_codecache.py | 4 ++--
 PyTorchSimFrontend/mlir/mlir_common.py    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index a3aa1e28..b695d48e 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -51,11 +51,11 @@ def mlir_compile_command(filename, vectorlane_size, tile_size, vlen=256):
             -test-loop-padding \
             -dma-fine-grained='systolic-array-size={vectorlane_size} tile-size={tile_size[0]},{tile_size[1]},{tile_size[2]}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
+            -test-memref-to-gemmini="vectorlane={vectorlane_size}" \
             -lower-affine \
+            -finalize-memref-to-llvm \
             -lower-vector-multi-reduction \
             -convert-vector-to-llvm \
-            -test-memref-to-gemmini="vectorlane={vectorlane_size}" \
-            -finalize-memref-to-llvm \
             -convert-arith-to-llvm \
             -convert-math-to-llvm \
             -convert-scf-to-cf \
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 960e188a..2a12cd4e 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -382,7 +382,7 @@ def codegen_nodes(self, nodes, kernel_name):
             tile_size[-1] = 128
             tile_size[-2] = 128
         elif len(tile_size) == 1:
-            tile_size[0] = 256
+            tile_size[0] = 512
         else:
             raise NotImplementedError("dummy tile size fail!")
 

From a38d20b54b8d3b93d74605c7b4f8727f2004caf3 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 21 Jan 2025 02:34:21 +0000
Subject: [PATCH 032/432] Fix mlir template codegen

---
 PyTorchSimFrontend/mlir/mlir_common.py   |  2 +-
 PyTorchSimFrontend/mlir/mlir_template.py | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 2a12cd4e..bbeea492 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -313,7 +313,7 @@ class BaseMLIRKernel(common.Kernel, BaseMLIRHardwareInfo):
 
     def __init__(self, args=None):
         super().__init__(args)
-        self.kernel_group : MLIRWrapperKenrelGroup = None
+        self.kernel_group : MLIRWrapperKenrelGroup = MLIRWrapperKenrelGroup()
         # Kernel iteration range info
         self.call_ranges = None
         self.ranges = None
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 65ccf93a..971d98da 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -170,9 +170,13 @@ def template_store(options):
             vlane_split_axis = 0
             vlane_stride = 1
             mlir_dtype = "f32"
-            dram_shape = f"memref<{options['M']}x{options['N']}x{mlir_dtype}>"
-            tile_shape = f"{options['TILE_M']}x{options['TILE_N']}"
-            code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, tag_var, dram_shape, tile_shape)
+            dram_shape = f"memref<{options['M']*options['N']}x{mlir_dtype}>"
+            tile_shape = f"memref<{options['TILE_M']}x{options['TILE_N']}x{mlir_dtype}, 1>"
+            zero_cse = self.get_const_cse(0)
+            sram_index_var = ",".join([f"%{zero_cse}"] * 2)
+            tile_stride = [options['N'], 1]
+            code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                 tag_var, dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.stores, code, assignment = False)
         self.body.splice(self.loads)
         self.body.splice(self.compute)

From 2d710d136f89cd1ff8ce7608feca2f803c54ec76 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 21 Jan 2025 09:38:33 +0000
Subject: [PATCH 033/432] [Fix] small GEMM template

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 27 ++++++++-----------
 PyTorchSimFrontend/mlir/mlir_template.py      |  4 +--
 tests/test_matmul.py                          |  5 ++--
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index c116ebf6..3e9c9eed 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -24,11 +24,8 @@
   %c_mvin3 = arith.constant 14 : index{% endif %}
   %c_mvout = arith.constant 3 : index
   %c_set = arith.constant 2 : index
-  %x_chunk = arith.constant {% if X_transposed %} {{ kernel.vector_lane * 2 + 0 }} {% else %} {{ 2 }} {% endif %} : index
-  %w_chunk = arith.constant {% if W_transposed %} {{ TILE_K * 2 + 0 }} {% else %} {{ 2 }} {% endif %} : index
-  %M = arith.constant {{ M }} : index
-  %N = arith.constant {{ N }} : index
-  %K = arith.constant {{ K }} : index
+  %vstride = arith.constant 1 : index
+  %axis = arith.constant 1 : index
   %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
   %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
@@ -41,25 +38,23 @@
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
       %index2 = affine.apply #map2(%t_m, %t_n)
       {% if Bias -%}
-      affine.dma_start %Bias[
+      memref.dma_start %Bias[
         {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-        ], %Y_buffer[%c0, %c0], %tag[0], %c_mvin3, %
-        {%- if Bias_rank == 2 -%} N {%- else -%} c0 {%- endif -%}
-        , %c_set : memref<
+        ], %Y_buffer[%c0, %c0], %c_mvin3, %tag[%c0], %
+        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
+        , %vstride : memref<
         {%- if Bias_rank == 2 -%}  {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-        xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
+        xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ TILE_N }}, 1] }
       {%- else -%}
       affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>
       {%- endif %}
       affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
         %index0 = affine.apply #map0(%t_m, %t_k)
         %index1 = affine.apply #map1(%t_k, %t_n)
-        affine.dma_start %X[%index0], %X_buffer[%c0, %c0], %tag[0], %c_mvin,
-        {%- if X_transposed -%} %M, %x_chunk {%- else -%} %K, %x_chunk {%- endif -%}
-           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1{% if X_transposed %}, transpose=1{% endif %} }
-        affine.dma_start %W[%index1], %W_buffer[%c0, %c0], %tag[0], %c_mvin2,
-        {%- if W_transposed -%} %K, %w_chunk {%- else -%} %N, %w_chunk {%- endif -%}
-           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1{% if W_transposed %}, transpose=1{% endif %} }
+        memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride
+           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_K }}, 1]}
+        memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag[%c0], %axis, %vstride
+           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ TILE_N }}, 1]}
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 971d98da..f496e56c 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -167,14 +167,14 @@ def template_store(options):
             dram_var = "Y"
             index_var = "index2"
             tag_var = "tag"
-            vlane_split_axis = 0
+            vlane_split_axis = 1
             vlane_stride = 1
             mlir_dtype = "f32"
             dram_shape = f"memref<{options['M']*options['N']}x{mlir_dtype}>"
             tile_shape = f"memref<{options['TILE_M']}x{options['TILE_N']}x{mlir_dtype}, 1>"
             zero_cse = self.get_const_cse(0)
             sram_index_var = ",".join([f"%{zero_cse}"] * 2)
-            tile_stride = [options['N'], 1]
+            tile_stride = [options['TILE_N'], 1]
             code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  tag_var, dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.stores, code, assignment = False)
diff --git a/tests/test_matmul.py b/tests/test_matmul.py
index 3913df5b..460ab989 100644
--- a/tests/test_matmul.py
+++ b/tests/test_matmul.py
@@ -29,13 +29,13 @@ def custom_matmul(a, b):
     y = custom_matmul(x2, w2)
     test_result("Matmul Forward", res, y)
 
-def test_addmm(device, input_size=128, hidden_size=128, output_size=128):
+def test_addmm(device, input_size=128, hidden_size=128, output_size=128, bias_rank=1):
     def custom_matmul(bias, a, b):
         return torch.addmm(bias, a, b)
     torch.manual_seed(0)
     input = torch.randn(input_size, hidden_size)
     weight = torch.randn(hidden_size, output_size)
-    bias = torch.randn(output_size)
+    bias = torch.randn(output_size) if bias_rank == 1 else torch.randn(input_size, output_size)
     x1 = input.to(device=device)
     w1 = weight.to(device=device)
     b1 = bias.to(device=device)
@@ -60,4 +60,5 @@ def custom_matmul(bias, a, b):
     test_matmul(device, 512, 512, 512)
     test_matmul(device, 129, 61, 56)
     test_addmm(device, 128, 128, 128)
+    test_addmm(device, 128, 128, 128, bias_rank=2)
     test_addmm(device, 129, 61, 56)

From 81daf30f2a4031b8bfb4e0d80855f44488be4c29 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 21 Jan 2025 09:42:05 +0000
Subject: [PATCH 034/432] [Fix] index order & sram stride

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_common.py          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 7d833e1d..b7389aa9 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1080,7 +1080,7 @@ def get_dma_info(self, name, index): # Need more argument?
         # Extract dimension of index(e.g, index0, index1)
         dims = [int(str(i)[5:]) for i in index.free_symbols if "index" in str(i)]
         local_tile_desc = mlir_common.MLIRMultiDimTile([1], self.vector_lane)
-
+        dims.sort() # Assume that smaller index is placed in the outer loop
         if kg_tile_desc.vlane_split_axis in dims:
             local_vlane_split_axis = dims.index(kg_tile_desc.vlane_split_axis)
         else:
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index bbeea492..08649832 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -188,7 +188,7 @@ def get_tile_stride(self):
         strides = [1] * len(self._tile_size)
         init = 1
 
-        original_indices = list(range(len(self.tile_axis_order)-1))
+        original_indices = list(range(len(self.tile_axis_order)))
         sorted_pairs = sorted(
             zip(self.tile_axis_order, self._tile_size, original_indices),
             key=lambda x: x[0], reverse=True

From 9abe5bbc8ff5aa736e76ef8521c9083e17a595f3 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 21 Jan 2025 10:40:58 +0000
Subject: [PATCH 035/432] [Frontend/dma4d] Fix setting tile_axis_order list

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 12 ++++++++++++
 PyTorchSimFrontend/mlir/mlir_common.py          | 10 +++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index b7389aa9..44a2a8ce 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1112,6 +1112,18 @@ def get_dma_info(self, name, index): # Need more argument?
                 local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims])
                 local_tile_desc.vlane_split_axis = local_vlane_split_axis
                 local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
+        # Case 3. Tile is 3-D tile
+        elif len(dims) == 3:
+            is_reduction = self.reduction_depth < 3
+            if is_reduction:
+                #local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims], [1, 0])
+                #local_tile_desc.vlane_split_axis = local_vlane_split_axis
+                #local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
+                raise NotImplementedError("Currently not implemented... ;)")
+            else:
+                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims])
+                local_tile_desc.vlane_split_axis = local_vlane_split_axis
+                local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         else:
             raise NotImplementedError("Currently not implemented... ;)")
 
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 08649832..f7a4dc96 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -151,7 +151,7 @@ def set_info(outer, inner, arg_type):
 class MLIRMultiDimTile():
     def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None):
         self._tile_size = list(tile_size)
-        self.tile_axis_order = list(range(len(tile_size)-1))
+        self.tile_axis_order = list(range(len(tile_size)))
 
         # Vector lane mapping config
         self.vector_lane = vector_lane
@@ -161,7 +161,7 @@ def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=N
     def set_tile_size(self, tile_size, tile_axis_order=None):
         self._tile_size = tile_size
         if tile_axis_order is None:
-            self.tile_axis_order = list(range(len(tile_size)-1))
+            self.tile_axis_order = list(range(len(tile_size)))
         else:
             self.tile_axis_order = tile_axis_order
 
@@ -378,11 +378,15 @@ def codegen_nodes(self, nodes, kernel_name):
 
         # Dummy tile size
         tile_size = [1] * (len(vars) + len(reduction_vars))
-        if len(tile_size) >= 2:
+        if len(tile_size) == 2:
             tile_size[-1] = 128
             tile_size[-2] = 128
         elif len(tile_size) == 1:
             tile_size[0] = 512
+        elif len(tile_size) == 3:
+            tile_size[-1] = 128
+            tile_size[-2] = 128
+            tile_size[-2] = 128
         else:
             raise NotImplementedError("dummy tile size fail!")
 

From 7f2a3f0ca2957cbd722927c3c227a59114d47abf Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 22 Jan 2025 03:53:25 +0000
Subject: [PATCH 036/432] [Tests] Add vector scalar test

---
 test_extension_backend.py |  3 ++-
 tests/test_add.py         | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/test_extension_backend.py b/test_extension_backend.py
index bee28729..170849a4 100644
--- a/test_extension_backend.py
+++ b/test_extension_backend.py
@@ -1,6 +1,6 @@
 import torch._dynamo
 import torch.utils.cpp_extension
-from tests.test_add import test_vectoradd
+from tests.test_add import test_vectoradd, test_vector_scalar_add
 from tests.test_reduce import test_reduce_sum
 from tests.test_transpose2D import test_Transpose2D, test_Transpose2D_2
 from tests.test_transpose3D import test_Transpose3D_1, test_Transpose3D_2, test_Transpose3D_3
@@ -26,6 +26,7 @@
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
     test_vectoradd(device, (47, 10))
+    test_vector_scalar_add(device, (10, 10))
     test_reduce_sum(device, (29, 47), 1, keepdim=True)
     test_reduce_sum(device, (17, 68), 0, keepdim=True)
     test_Transpose2D(device, [64, 156])
diff --git a/tests/test_add.py b/tests/test_add.py
index a3c2be9a..9e1b154a 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -23,6 +23,16 @@ def vectoradd(a, b):
     out = vectoradd(x.cpu(), y.cpu())
     test_result("VectorAdd", res, out)
 
+def test_vector_scalar_add(device, size=(128, 128)):
+    def vectoradd(a, b):
+        return a + b
+    x = torch.randn(size).to(device=device)
+    y = torch.randn([1]).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(vectoradd)
+    res = opt_fn(x, y)
+    out = vectoradd(x.cpu(), y.cpu())
+    test_result("VectorScalarAdd", res, out)
+
 
 if __name__ == "__main__":
     import os

From 0428ee172d1c1b390f1de4d7e19950c934583667 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 22 Jan 2025 03:54:27 +0000
Subject: [PATCH 037/432] [Frontend] Support vector to vector broadcast

---
 .../mlir/mlir_codegen_backend.py              | 38 +++++++++++++++----
 PyTorchSimFrontend/mlir/mlir_common.py        |  4 +-
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 44a2a8ce..c5e01166 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -592,14 +592,32 @@ def index_cast(operand, target_type, *args, var_info=None, **kwrags):
         des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type
         return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type]
 
+    @staticmethod
+    def broadcast_unflat(operand1, operand2, *args, var_info=None):
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
+        src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>"# if op_type1[0] > 1 else op_type1[1]
+        des_shape = f"vector<{op_type2[0]//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"# if op_type2[0] > 1 else op_type1[1] # Use tile size only
+
+        expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}"
+        return expand, [op_type2[0], op_type1[1]]
 
     @staticmethod
     def broadcast(operand1, operand2, *args, var_info=None):
         op_type1 = var_info[operand1]
         op_type2 = var_info[operand2]
-        src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>" if op_type1[0] > 1 else op_type1[1]
-        des_shape = f"vector<{op_type2[0]}x{op_type1[1]}>" if op_type2[0] > 1 else op_type1[1] # Use tile size only
-        expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}"
+        src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>"# if op_type1[0] > 1 else op_type1[1]
+        des_shape = f"vector<{op_type2[0]}x{op_type1[1]}>"# if op_type2[0] > 1 else op_type1[1] # Use tile size only
+
+        # Special case for length 2 vector. We used this vector to avoid scalar operations...
+        if op_type1[0] != 1 and op_type2[0] % op_type1[0] == 0:
+            unflat_operand = ops.broadcast_unflat(operand1, operand2)
+            unflat_shape = f"vector<{op_type2[0]//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"
+            expand = f"vector.shape_cast %{unflat_operand} : {unflat_shape} to {des_shape}"
+        elif op_type1[0] == 1:
+            expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}"
+        else:
+            raise NotImplementedError("Not supporting broadcast type...")
         return expand, [op_type2[0], op_type1[1]]
 
 RTYPE_TO_MLIR = {
@@ -1088,7 +1106,7 @@ def get_dma_info(self, name, index): # Need more argument?
 
         # Case 0. Tile is 0-D scalar
         if len(dims) == 0:
-            local_tile_desc.set_tile_size([1])         # Broadcast needed?
+            local_tile_desc.set_tile_size([2])         # Force it to use vector instruction.
             local_tile_desc.vlane_split_axis = local_vlane_split_axis    # last axis
             local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 1. Tile is 1-D vector type
@@ -1247,10 +1265,16 @@ def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape
 
         return buffer, indices, sram_index_var
 
-    def get_const_cse(self, value) -> common.CSEVariable:
+    def get_const_cse(self, value, dtype="index") -> common.CSEVariable:
+        # Type convert
+        if dtype[0] == "f":
+            value = float(value)
+        else:
+            value = int(value)
+
         if value not in self.consts:
-            self.consts[value] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : index")
-        return self.consts[value]
+            self.consts[str(value)+dtype] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : {dtype}")
+        return self.consts[str(value)+dtype]
 
     def get_tag_cse(self, value, shape="memref<1xi32>"):
         if value not in self.tags:
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index f7a4dc96..926673c7 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -386,14 +386,14 @@ def codegen_nodes(self, nodes, kernel_name):
         elif len(tile_size) == 3:
             tile_size[-1] = 128
             tile_size[-2] = 128
-            tile_size[-2] = 128
+            tile_size[-3] = 128
         else:
             raise NotImplementedError("dummy tile size fail!")
 
         # Select tile info.
         # Note: Kernel Group have to share same tile desc for fusion
         tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane)
-        tile_desc.vlane_split_axis = len(vars) - 1
+        tile_desc.vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
         tile_desc.vlane_stride = 2
         self.kernel_group.set_tile_info(tile_desc)
 

From 92b4a8c8a79cc93204a7a823c18031c64674f8fa Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 22 Jan 2025 04:30:21 +0000
Subject: [PATCH 038/432] [Frontend/DMA4D] add fake apply map for scalar

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index c5e01166..5f8821a6 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -678,7 +678,9 @@ def get_padding_type(self):
     def parse_indices(self, expr) -> common.CSEVariable:
         # Constant case
         if expr.is_number:
-            return self.get_const_cse(int(expr))
+            map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0) -> ({str(expr)}*d0)>")
+            fake_dim = self.get_const_cse(1)
+            return self.cse.generate(self.loads, f"affine.apply #{map_var}(%{fake_dim})")
 
         # Identity case
         if len(expr.args) == 0:

From f881b92ac2141ece14425154e8cc5d74f72bdd3c Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Wed, 22 Jan 2025 05:44:18 +0000
Subject: [PATCH 039/432] [Frontend] lowering order

---
 PyTorchSimFrontend/extension_codecache.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index b695d48e..15c05d84 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -83,13 +83,13 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding='timing_mode=1' \
             -dma-fine-grained='systolic-array-size={vectorlane_size} tile-size={tile_size[0]},{tile_size[1]},{tile_size[2]}' \
-            -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen=256' \
             -test-tile-operation-graph='vectorlane={vectorlane_size}' \
+            -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
+            -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
             -lower-affine \
+            -finalize-memref-to-llvm \
             -lower-vector-multi-reduction \
             -convert-vector-to-llvm \
-            -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
-            -finalize-memref-to-llvm \
             -convert-arith-to-llvm \
             -convert-math-to-llvm \
             -convert-scf-to-cf \

From 1e5e61d67f034739d2f8fb2e293f925b3f7710eb Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Wed, 22 Jan 2025 06:52:10 +0000
Subject: [PATCH 040/432] [Fix] GEMM sram strides

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 4 ++--
 PyTorchSimFrontend/mlir/mlir_template.py      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 3e9c9eed..52051066 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -52,9 +52,9 @@
         %index0 = affine.apply #map0(%t_m, %t_k)
         %index1 = affine.apply #map1(%t_k, %t_n)
         memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride
-           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_K }}, 1]}
+           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_K }}]}
         memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag[%c0], %axis, %vstride
-           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ TILE_N }}, 1]}
+           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, {{ TILE_N }}]}
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index f496e56c..f0b71ade 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -174,7 +174,7 @@ def template_store(options):
             tile_shape = f"memref<{options['TILE_M']}x{options['TILE_N']}x{mlir_dtype}, 1>"
             zero_cse = self.get_const_cse(0)
             sram_index_var = ",".join([f"%{zero_cse}"] * 2)
-            tile_stride = [options['TILE_N'], 1]
+            tile_stride = [1, options['TILE_N']]
             code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  tag_var, dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.stores, code, assignment = False)

From 3bf6221980b33ca624a423b9a05541c728ebaf38 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Wed, 22 Jan 2025 10:10:31 +0000
Subject: [PATCH 041/432] [Frontend] BMM template revised

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 49 ++++++++-----------
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  1 -
 2 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 5a68947e..1515a5c4 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -22,47 +22,40 @@
   %c_mvin2 = arith.constant 1 : index{% if Bias %}
   %c_mvin3 = arith.constant 14 : index{% endif %}
   %c_mvout = arith.constant 3 : index
-  %c_set = arith.constant 2 : index
-  %c{{ TILE_K * 2 + 0}} = arith.constant {{ TILE_K * 2 + 0}} : index
-  %c0 = arith.constant 0 : index{% if X_transposed %}
-  %x_chunk = arith.constant {{ kernel.vector_lane * 2 + 0 }} : index{% endif %}{% if W_transposed %}
-  %w_chunk = arith.constant {{ TILE_K * 2 + 0 }} : index{% endif %}
-  %M = arith.constant {{ M }} : index
-  %N = arith.constant {{ N }} : index
-  %K = arith.constant {{ K }} : index
+  %vstride = arith.constant 1 : index
+  %axis = arith.constant 2 : index
   %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
   %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>
+  %tag = memref.alloc() : memref<1xi32>{% if not Bias %}
+  %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
+  %c0 = arith.constant 0 : index
   {{- kernel.def_local_vars() }}
-
   affine.for %b=0 to {{ B }} {
     affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
       affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
-        %index2 = affine.apply #map2(%b, %t_m, %t_n){% if Bias %}
-        affine.dma_start %Bias[
+        %index2 = affine.apply #map2(%b, %t_m, %t_n)
+        {% if Bias -%}
+        memref.dma_start %Bias[
         {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-          ], %Y_buffer[0, 0], %tag[0], %c_mvin3,
-        %{%- if Bias_rank == 2 -%} N {%- else -%} c0 {%- endif -%}
-          , %c_set : memref<
+          ], %Y_buffer[0, 0], %c_mvin3, %tag[%c0], %
+        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
+          , %vstride : memref<
         {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>
-        {%- else %}
+          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ TILE_N }}, 1] }
+        {%- else -%}
         affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
         affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
           %index0 = affine.apply #map0(%b, %t_m, %t_k)
           %index1 = affine.apply #map1(%b, %t_k, %t_n)
-          affine.dma_start %X[%index0], %X_buffer[%c0, %c0], %tag[0], %c_mvin,
-          {%- if X_transposed -%} %M, %x_chunk {%- else -%} %K, %c_set {%- endif -%}
-             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1{% if X_transposed %}, transpose=1{% endif %} }
-          affine.dma_start %W[%index1], %W_buffer[%c0, %c0], %tag[0], %c_mvin2,
-          {%- if W_transposed -%} %K, %w_chunk {%- else -%} %N, %c_set {%- endif -%}
-             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1{% if W_transposed %}, transpose=1{% endif %} }
+          memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride
+             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+          memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag[%c0], %axis, %vstride
+             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, {{ TILE_N }}]}
           linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
-        affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ B * M * N }}xf32>, memref<1xi32> { async=1 }
+       {{kernel.store_output()}}
       } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
@@ -106,7 +99,7 @@ def render(self,
         W_transposed = self.is_transposed(W)
         X_transposed = self.is_transposed(X)
 
-        options = dict(
+        kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
             B=X.get_size()[0],
@@ -127,8 +120,8 @@ def render(self,
             X_transposed = X_transposed,
             input_reorder = self.input_reorder
         )
-        code = self._template_from_string(BMM_TEMPLATE).render(**options)
-        kernel.add_loop_info([options["M"], options["N"], options["K"]], [options["TILE_M"], options["TILE_N"], options["TILE_K"]])
+        code = self._template_from_string(BMM_TEMPLATE).render(**kernel.render_options)
+        kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
 
         self.header = f"float X_spad[{TILE_M * TILE_K // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
         self.header += f"float W_spad[{TILE_K * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 52051066..2032012e 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -23,7 +23,6 @@
   %c_mvin2 = arith.constant 1 : index{% if Bias %}
   %c_mvin3 = arith.constant 14 : index{% endif %}
   %c_mvout = arith.constant 3 : index
-  %c_set = arith.constant 2 : index
   %vstride = arith.constant 1 : index
   %axis = arith.constant 1 : index
   %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>

From 411671b3b2118feca254fd22eb55592ff7a47e20 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 22 Jan 2025 12:19:05 +0000
Subject: [PATCH 042/432] [Frontend/DMA4d] loop codegen for multi-dim tile

---
 .../mlir/mlir_codegen_backend.py               | 18 +++++++-----------
 PyTorchSimFrontend/mlir/mlir_common.py         |  6 +++++-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 5f8821a6..ebfa9930 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -678,7 +678,7 @@ def get_padding_type(self):
     def parse_indices(self, expr) -> common.CSEVariable:
         # Constant case
         if expr.is_number:
-            map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0) -> ({str(expr)}*d0)>")
+            map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0) -> ({str(expr)}*d0 + 0)>")
             fake_dim = self.get_const_cse(1)
             return self.cse.generate(self.loads, f"affine.apply #{map_var}(%{fake_dim})")
 
@@ -924,7 +924,7 @@ def codegen_loops(self):
         # Loop body part
         tile_size = self.kernel_group.tile_desc.get_tile_size()
         # Apply paddings
-        loops = [LoopLevel(var, size, idx-len(self.itervars), tile_size=tile_size) for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))]
+        loops = [LoopLevel(var, size, step=step) for idx, (var, size, step) in enumerate(zip(self.itervars, self.ranges, tile_size))]
         loops, reductions = [LoopNest(loops[: self.reduction_depth]),
                              LoopNest(loops[self.reduction_depth :])]
         if (self.reduction_depth==0):
@@ -1104,11 +1104,11 @@ def get_dma_info(self, name, index): # Need more argument?
         if kg_tile_desc.vlane_split_axis in dims:
             local_vlane_split_axis = dims.index(kg_tile_desc.vlane_split_axis)
         else:
-            local_vlane_split_axis = len(dims) - 1
+            local_vlane_split_axis = max(len(dims) - 1, 0)
 
         # Case 0. Tile is 0-D scalar
         if len(dims) == 0:
-            local_tile_desc.set_tile_size([2])         # Force it to use vector instruction.
+            local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vlane_stride])         # Force it to use vector instruction.
             local_tile_desc.vlane_split_axis = local_vlane_split_axis    # last axis
             local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 1. Tile is 1-D vector type
@@ -1287,22 +1287,18 @@ def get_tag_cse(self, value, shape="memref<1xi32>"):
 class LoopLevel:
     var: sympy.Expr
     size: sympy.Expr
-    idx: int
     start: int = 0
-    tile_size : List = None
+    step: int = 1
     reduction_vars: Dict[str, str] = None
-    loop_nr : int = 0
 
     def lines(self):
-        step = self.tile_size[self.loop_nr]
-        self.loop_nr += 1
         if self.reduction_vars:
             acc = ', '.join([f"%{acc.name}" for acc in self.reduction_vars.keys()])
             args = ', '.join([f"%{iter.name} = %{init.name}" for (_, iter, init, _) in self.reduction_vars.values()])
             dtype = ', '.join([f"{dtype}" for (_, _, _, dtype) in self.reduction_vars.values()])
-            line = f"{acc} = affine.for %{self.var} = {self.start} to {self.size} step {step} iter_args({args}) -> ({dtype})"
+            line = f"{acc} = affine.for %{self.var} = {self.start} to {self.size} step {self.step} iter_args({args}) -> ({dtype})"
         else:
-            line = f"affine.for %{self.var} = {self.start} to {self.size} step {step}"
+            line = f"affine.for %{self.var} = {self.start} to {self.size} step {self.step}"
 
         return [line]
 
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 926673c7..dd8220d9 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -386,10 +386,14 @@ def codegen_nodes(self, nodes, kernel_name):
         elif len(tile_size) == 3:
             tile_size[-1] = 128
             tile_size[-2] = 128
-            tile_size[-3] = 128
+            tile_size[-3] = 2
         else:
             raise NotImplementedError("dummy tile size fail!")
 
+        for i in range(1, len(tile_size)+1):
+            if tile_size[-i] > self.ranges[-i]:
+                tile_size[-i] = self.ranges[-i]
+
         # Select tile info.
         # Note: Kernel Group have to share same tile desc for fusion
         tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane)

From ccbc5b275ca7e269111dab869e48ef8f66126577 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 24 Jan 2025 03:23:16 +0000
Subject: [PATCH 043/432] [Frontend/DMA4d] Support broadcast pattern codegen

---
 .../mlir/mlir_codegen_backend.py              | 62 +++++++++++--------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index ebfa9930..ab91d74c 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -606,8 +606,8 @@ def broadcast_unflat(operand1, operand2, *args, var_info=None):
     def broadcast(operand1, operand2, *args, var_info=None):
         op_type1 = var_info[operand1]
         op_type2 = var_info[operand2]
-        src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>"# if op_type1[0] > 1 else op_type1[1]
-        des_shape = f"vector<{op_type2[0]}x{op_type1[1]}>"# if op_type2[0] > 1 else op_type1[1] # Use tile size only
+        src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>" if op_type1[0] > 1 else op_type1[1]
+        des_shape = f"vector<{op_type2[0]}x{op_type1[1]}>" # if op_type2[0] > 1 else op_type1[1] # Use tile size only
 
         # Special case for length 2 vector. We used this vector to avoid scalar operations...
         if op_type1[0] != 1 and op_type2[0] % op_type1[0] == 0:
@@ -678,9 +678,7 @@ def get_padding_type(self):
     def parse_indices(self, expr) -> common.CSEVariable:
         # Constant case
         if expr.is_number:
-            map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0) -> ({str(expr)}*d0 + 0)>")
-            fake_dim = self.get_const_cse(1)
-            return self.cse.generate(self.loads, f"affine.apply #{map_var}(%{fake_dim})")
+            return self.get_const_cse(int(expr))
 
         # Identity case
         if len(expr.args) == 0:
@@ -720,7 +718,7 @@ def load(self, name: str, index: sympy.Expr):
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        local_tile_desc = self.get_dma_info(name, index)
+        local_tile_desc, index_var = self.get_dma_info(name, index, index_var)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -753,7 +751,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
         # Prepare dma instruction
-        local_tile_desc = self.get_dma_info(name, index)
+        local_tile_desc, index_var = self.get_dma_info(name, index, index_var)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -860,7 +858,7 @@ def store_reduction(self, name, index, value):
         index_var = self.parse_indices(index)
 
         # Tile is always reuduced in inner loop
-        local_tile_desc = self.get_dma_info(name, index)
+        local_tile_desc, index_var = self.get_dma_info(name, index, index_var)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -1086,7 +1084,7 @@ def get_dma_info2(self, name, index):
         vlane_split_axis = int(current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
         return vlane_split_axis, vlane_stride, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
 
-    def get_dma_info(self, name, index): # Need more argument?
+    def get_dma_info(self, name, index, index_var): # Need more argument?
         """
         A tile descriptor exists that is configured on a kernel group
         DMA desc should be adjusted according to buffer.
@@ -1098,42 +1096,56 @@ def get_dma_info(self, name, index): # Need more argument?
         buffer_info = self.buffer_types[name]
         # Note: index could contain symbols that represent dynamic axies
         # Extract dimension of index(e.g, index0, index1)
-        dims = [int(str(i)[5:]) for i in index.free_symbols if "index" in str(i)]
+        local_dims = [int(str(i)[5:]) for i in index.free_symbols if "index" in str(i)]
+        total_dims =  [int(str(i)[5:]) for i in self.itervars]
         local_tile_desc = mlir_common.MLIRMultiDimTile([1], self.vector_lane)
-        dims.sort() # Assume that smaller index is placed in the outer loop
-        if kg_tile_desc.vlane_split_axis in dims:
-            local_vlane_split_axis = dims.index(kg_tile_desc.vlane_split_axis)
+        local_dims.sort() # Assume that smaller index is placed in the outer loop
+
+        # Reduction can have two type of tile size
+        if total_dims != local_dims and total_dims[:self.reduction_depth] != local_dims:
+            # We have to create custom apply map to provide dram stride
+            # ex) (d0, d1, ... dn, dn+1, dn+2, dk) -> (s0*d0 + s1*d1 + ... dn*0+ dn+1*0 + ... dk*0 + const)
+            fake_dim = self.get_const_cse(0)
+            input_expr = ",".join(["d"+str(i) for i in total_dims])
+            output_expr = str(index).replace('index', 'd')
+            input_argument = ",".join(["%index" + str(i) if i in local_dims else f"%{fake_dim}" for i in total_dims])
+            map_var = self.map_cse.generate(self.global_vars, f"affine_map<({input_expr}) -> ({output_expr})>")
+            index_var = self.cse.generate(self.loads, f"affine.apply #{map_var}({input_argument})")
+            local_dims = total_dims # Brodatcast tile shape
+
+        if kg_tile_desc.vlane_split_axis in local_dims:
+            local_vlane_split_axis = local_dims.index(kg_tile_desc.vlane_split_axis)
         else:
-            local_vlane_split_axis = max(len(dims) - 1, 0)
+            local_vlane_split_axis = max(len(local_dims) - 1, 0)
 
         # Case 0. Tile is 0-D scalar
-        if len(dims) == 0:
+        if len(local_dims) == 0:
             local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vlane_stride])         # Force it to use vector instruction.
             local_tile_desc.vlane_split_axis = local_vlane_split_axis    # last axis
             local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 1. Tile is 1-D vector type
-        elif len(dims) == 1 and len(dims) <= self.reduction_depth:
-            local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dims[0])])
+        elif len(local_dims) == 1 and len(local_dims) <= self.reduction_depth:
+            local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(local_dims[0])])
             local_tile_desc.vlane_split_axis = local_vlane_split_axis
             local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 2. Tile is 1-D vector type with reduction
-        elif len(dims) == 1 and len(dims) == self.reduction_depth + 1:
-            local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(dims[0])])
+        elif len(local_dims) == 1 and len(local_dims) == self.reduction_depth + 1:
+            local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(local_dims[0])])
             local_tile_desc.vlane_split_axis = local_vlane_split_axis
             local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 3. Tile is 2-D tile
-        elif len(dims) == 2:
+        elif len(local_dims) == 2:
             is_reduction = self.reduction_depth == 1
             if is_reduction:
-                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims], [1, 0])
+                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [1, 0])
                 local_tile_desc.vlane_split_axis = local_vlane_split_axis
                 local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
             else:
-                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims])
+                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims])
                 local_tile_desc.vlane_split_axis = local_vlane_split_axis
                 local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 3. Tile is 3-D tile
-        elif len(dims) == 3:
+        elif len(local_dims) == 3:
             is_reduction = self.reduction_depth < 3
             if is_reduction:
                 #local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims], [1, 0])
@@ -1141,13 +1153,13 @@ def get_dma_info(self, name, index): # Need more argument?
                 #local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
                 raise NotImplementedError("Currently not implemented... ;)")
             else:
-                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims])
+                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims])
                 local_tile_desc.vlane_split_axis = local_vlane_split_axis
                 local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         else:
             raise NotImplementedError("Currently not implemented... ;)")
 
-        return local_tile_desc
+        return local_tile_desc, index_var
 
     def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,
                      tag_name, dram_shape, tile_shape, tile_stride, padding_type=0, ):

From 45ab4be71bfa998234f921f3b3b6af2b4fc886fc Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 24 Jan 2025 05:21:30 +0000
Subject: [PATCH 044/432] [Frontend/DMA4d] Disallow broadcast for
 store_reduction

---
 .../mlir/mlir_codegen_backend.py              |  8 ++---
 tests/test_exponent.py                        | 33 +++++++++++++++++++
 2 files changed, 37 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_exponent.py

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index ab91d74c..d885ba18 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -858,7 +858,7 @@ def store_reduction(self, name, index, value):
         index_var = self.parse_indices(index)
 
         # Tile is always reuduced in inner loop
-        local_tile_desc, index_var = self.get_dma_info(name, index, index_var)
+        local_tile_desc, index_var = self.get_dma_info(name, index, index_var, broadcast=False)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -1084,7 +1084,7 @@ def get_dma_info2(self, name, index):
         vlane_split_axis = int(current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
         return vlane_split_axis, vlane_stride, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
 
-    def get_dma_info(self, name, index, index_var): # Need more argument?
+    def get_dma_info(self, name, index, index_var, broadcast=True): # Need more argument?
         """
         A tile descriptor exists that is configured on a kernel group
         DMA desc should be adjusted according to buffer.
@@ -1102,7 +1102,7 @@ def get_dma_info(self, name, index, index_var): # Need more argument?
         local_dims.sort() # Assume that smaller index is placed in the outer loop
 
         # Reduction can have two type of tile size
-        if total_dims != local_dims and total_dims[:self.reduction_depth] != local_dims:
+        if broadcast and (total_dims != local_dims or total_dims[:self.reduction_depth] == local_dims):
             # We have to create custom apply map to provide dram stride
             # ex) (d0, d1, ... dn, dn+1, dn+2, dk) -> (s0*d0 + s1*d1 + ... dn*0+ dn+1*0 + ... dk*0 + const)
             fake_dim = self.get_const_cse(0)
@@ -1162,7 +1162,7 @@ def get_dma_info(self, name, index, index_var): # Need more argument?
         return local_tile_desc, index_var
 
     def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,
-                     tag_name, dram_shape, tile_shape, tile_stride, padding_type=0, ):
+                     tag_name, dram_shape, tile_shape, tile_stride, padding_type=0):
         dma_key = (attribute1, attribute2, mlir_dtype)
         if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
             dma_type, attribute1, attribute2 = self.dma_read_cache[dma_key]
diff --git a/tests/test_exponent.py b/tests/test_exponent.py
new file mode 100644
index 00000000..a0bd6c8b
--- /dev/null
+++ b/tests/test_exponent.py
@@ -0,0 +1,33 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_exponent(device, size=(128, 128)):
+    def exponent(a):
+        return a.exp()
+    x = torch.randn(size).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(exponent)
+    res = opt_fn(x)
+    out = exponent(x.cpu())
+    test_result("exponent", res, out)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_exponent(device, size=(512, 512))

From bd7454a4b885f6bb2c12bf37c6953e726c60efa3 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Fri, 24 Jan 2025 10:15:41 +0000
Subject: [PATCH 045/432] [Fix] GEMM template sram stride

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 11 ++++++-----
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  9 +++++----
 PyTorchSimFrontend/mlir/mlir_template.py      | 13 ++++++-------
 tests/test_bmm.py                             | 18 +++++++++++++++++-
 tests/test_matmul.py                          |  6 ++++--
 5 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 1515a5c4..207745d7 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -49,13 +49,13 @@
           %index0 = affine.apply #map0(%b, %t_m, %t_k)
           %index1 = affine.apply #map1(%b, %t_k, %t_n)
           memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride
-             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
           memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag[%c0], %axis, %vstride
-             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, {{ TILE_N }}]}
+             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
           linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
-       {{kernel.store_output()}}
+       {{kernel.store_output(vlane_split_axis=2)}}
       } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
@@ -91,7 +91,7 @@ def render(self,
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
-        M, N, K = X.get_size()[1], W.get_size()[2], X.get_size()[2]
+        B, M, N, K = X.get_size()[0], X.get_size()[1], W.get_size()[2], X.get_size()[2]
         TILE_M, TILE_N, TILE_K = kernel.gemmini_gemm_mapping(M, N, K)
         kernel.tile_size = [TILE_M, TILE_N, TILE_K]
         kernel.loop_size = [M, N, K]
@@ -102,7 +102,7 @@ def render(self,
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
-            B=X.get_size()[0],
+            B=B,
             M=M,
             N=N,
             K=K,
@@ -118,6 +118,7 @@ def render(self,
             Bias_rank = len(Bias.data.get_size()) if Bias is not None else 0,
             W_transposed = W_transposed,
             X_transposed = X_transposed,
+            Y_numel = B * M * N,
             input_reorder = self.input_reorder
         )
         code = self._template_from_string(BMM_TEMPLATE).render(**kernel.render_options)
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 2032012e..9fe404bf 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -36,7 +36,7 @@
   affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
       %index2 = affine.apply #map2(%t_m, %t_n)
-      {% if Bias -%}
+      {%- if Bias -%}
       memref.dma_start %Bias[
         {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
         ], %Y_buffer[%c0, %c0], %c_mvin3, %tag[%c0], %
@@ -51,13 +51,13 @@
         %index0 = affine.apply #map0(%t_m, %t_k)
         %index1 = affine.apply #map1(%t_k, %t_n)
         memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride
-           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
         memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag[%c0], %axis, %vstride
-           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, {{ TILE_N }}]}
+           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
-      {{kernel.store_output()}}
+      {{kernel.store_output(vlane_split_axis=1)}}
     } { outer_loop=true }
   } { outer_loop=true }
   return
@@ -131,6 +131,7 @@ def render(self,
             Bias_rank = len(Bias.data.get_size()) if Bias is not None else 0,
             W_transposed = W_transposed,
             X_transposed = X_transposed,
+            Y_numel = M * N,
             epilogue_nodes = epilogue_nodes,
             input_reorder = self.input_reorder
         )
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index f0b71ade..188815bb 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -161,20 +161,19 @@ def call_kernel(self, kernel_name):
             kernel_name if self.outer_func_name is None else self.outer_func_name,
             call_args, cuda=False)
 
-    def codegen_body(self):
+    def codegen_body(self, vlane_split_axis):
         def template_store(options):
             sram_var = "Y_buffer"
             dram_var = "Y"
             index_var = "index2"
             tag_var = "tag"
-            vlane_split_axis = 1
             vlane_stride = 1
             mlir_dtype = "f32"
-            dram_shape = f"memref<{options['M']*options['N']}x{mlir_dtype}>"
+            dram_shape = f"memref<{options['Y_numel']}x{mlir_dtype}>"
             tile_shape = f"memref<{options['TILE_M']}x{options['TILE_N']}x{mlir_dtype}, 1>"
             zero_cse = self.get_const_cse(0)
             sram_index_var = ",".join([f"%{zero_cse}"] * 2)
-            tile_stride = [1, options['TILE_N']]
+            tile_stride = [1, options['TILE_M']]
             code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  tag_var, dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.stores, code, assignment = False)
@@ -219,7 +218,7 @@ def def_kernel(
                 self.kernel_group.args.output_buffers[node.get_name()] = name
                 self.store_buffer_names.add(node.get_name())    #TODO: Is this enough not calling store() in mlir_common.py?
                 extra_node[node.get_name()] = node
-                self.buffer_names[node.name] = 'Y_buffer'   #TODO: Buffer name fixed
+                self.buffer_names[node.get_name()] = 'Y_buffer'   #TODO: Buffer name fixed
 
         def hook():
             arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node)
@@ -241,9 +240,9 @@ def hook():
         self.render_hooks["<OUPUT>"] = hook
         return "<OUPUT>"
 
-    def store_output(self):
+    def store_output(self, vlane_split_axis=1):
         def hook():
-            self.codegen_body()
+            self.codegen_body(vlane_split_axis)
             return textwrap.indent(self.body.getvalue(), "      ").strip()  #TODO: First line is not indented
 
         assert "<STORE_OUTPUT>" not in self.render_hooks
diff --git a/tests/test_bmm.py b/tests/test_bmm.py
index 483980d8..73831c5c 100644
--- a/tests/test_bmm.py
+++ b/tests/test_bmm.py
@@ -24,6 +24,19 @@ def bmm(a, b):
     out = bmm(a.cpu(), b.cpu())
     test_result("BMM Forward", res, out)
 
+def test_addBMM(device, batch_size=1, m=32, n=16, k=64, bias_rank=1):#TODO: Fusion should be implemented for this test
+    def bmm(a, b, bias):
+        return torch.bmm(a, b.transpose(1, 2)) + bias
+    torch.manual_seed(0)
+    a = torch.randn(batch_size, m, k).to(device=device)
+    b = torch.randn(batch_size, n, k).to(device=device)
+    bias = torch.randn(batch_size, n) if bias_rank == 1 else torch.randn(batch_size, m, n)
+    bias = bias.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(bmm)
+    res = opt_fn(a, b, bias)
+    out = bmm(a.cpu(), b.cpu(), bias.cpu())
+    test_result("BMM Forward", res, out)
+
 if __name__ == "__main__":
     import os
     import sys
@@ -33,4 +46,7 @@ def bmm(a, b):
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
     test_BMM(device)
-    test_BMM(device, 2, 512, 512, 512)
+    test_BMM(device, 2, 256, 128, 256)
+    test_BMM(device, 2, 128, 256, 256)
+    test_BMM(device, 2, 256, 256, 128)
+    test_BMM(device, 4, 256, 256, 256)
\ No newline at end of file
diff --git a/tests/test_matmul.py b/tests/test_matmul.py
index 460ab989..a32a38bb 100644
--- a/tests/test_matmul.py
+++ b/tests/test_matmul.py
@@ -19,7 +19,6 @@ def custom_matmul(a, b):
     torch.manual_seed(0)
     input = torch.randn(input_size, hidden_size)
     weight = torch.randn(hidden_size, output_size)
-    bias = torch.randn(output_size)
     x1 = input.to(device=device)
     w1 = weight.to(device=device)
     x2 = input.to("cpu")
@@ -57,7 +56,10 @@ def custom_matmul(bias, a, b):
     device = module.custom_device()
     test_matmul(device, 32, 32, 32)
     test_matmul(device, 128, 128, 128)
-    test_matmul(device, 512, 512, 512)
+    test_matmul(device, 256, 256, 256)
+    test_matmul(device, 256, 128, 256)
+    test_matmul(device, 256, 256, 128)
+    test_matmul(device, 128, 256, 256)
     test_matmul(device, 129, 61, 56)
     test_addmm(device, 128, 128, 128)
     test_addmm(device, 128, 128, 128, bias_rank=2)

From 86d32f3fcc87072f24382bdb47c4ab5cc22243c6 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Fri, 24 Jan 2025 10:16:52 +0000
Subject: [PATCH 046/432] [Fix] codegen func & test case

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py |  2 +-
 tests/test_add.py                          | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index c793dcf7..5c5c1fb0 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -133,7 +133,7 @@ def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
             partial_code = render()
             for node in epilogue_nodes:
                 ranges = node.get_ranges()
-                node.codegen(kernel.set_ranges(ranges[0], ranges[1], None))
+                node.codegen(kernel.set_ranges(ranges[0], ranges[1]))
         with V.set_kernel_handler(kernel):
             src_code = (
                 partial_code
diff --git a/tests/test_add.py b/tests/test_add.py
index 9e1b154a..708fa9f0 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -33,6 +33,15 @@ def vectoradd(a, b):
     out = vectoradd(x.cpu(), y.cpu())
     test_result("VectorScalarAdd", res, out)
 
+def test_vector_tensor_add(device, size=(128, 128)):
+    def vectoradd(a, b):
+        return a + b
+    x = torch.randn(size).to(device=device)
+    y = torch.randn(size[0]).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(vectoradd)
+    res = opt_fn(x, y)
+    out = vectoradd(x.cpu(), y.cpu())
+    test_result("VectorTensorAdd", res, out)
 
 if __name__ == "__main__":
     import os
@@ -45,3 +54,4 @@ def vectoradd(a, b):
     test_vectoradd(device, (47, 10))
     test_vectoradd(device, (128, 128))
     test_vectoradd(device, (4071, 429))
+    test_vector_tensor_add(device, (128, 128))

From ca4c4af8e6069a5796d143e69430be9daeca6657 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 31 Jan 2025 04:41:25 +0000
Subject: [PATCH 047/432] [Frontend] Fix reduction tiling policy to avoid
 conflict with broadcast

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 13 ++++++++-----
 PyTorchSimFrontend/mlir/mlir_common.py          |  8 ++++++--
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d885ba18..f464ca0c 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -814,7 +814,10 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
             if len(self.ranges) == 1: # 1-D vector to scalar
                 axis = "0"
                 acc_var = init
-                shape = f"vector<{self.var_info[value][0]*self.vector_lane}x{type_name}>" # use single vector lane
+                vec_len = self.kernel_group.tile_desc.get_vlane_stride()
+                shape = f"vector<{self.var_info[value][0]}x{type_name}>"
+                var_info = [vec_len, mlir_common.DTYPE_TO_MLIR[dtype]]
+                self.register_var_info(acc, var_info)
             elif len(self.ranges) == 2:
                 vec_len = self.kernel_group.tile_desc.get_vlane_stride()
                 flattened_size = f"vector<{self.var_info[value][0]}x{type_name}>"
@@ -926,7 +929,7 @@ def codegen_loops(self):
         loops, reductions = [LoopNest(loops[: self.reduction_depth]),
                              LoopNest(loops[self.reduction_depth :])]
         if (self.reduction_depth==0):
-            loops = LoopNest([LoopLevel("dummy", 1, 1, 0)])
+            loops = LoopNest([LoopLevel("dummy", 1)])
         reductions.mark_reduction(self.reduction_vars)
         if len(self.affine_yield) > 0:
             vars = ', '.join([f"%{name}" for name, _ in self.affine_yield.items()])
@@ -1130,9 +1133,9 @@ def get_dma_info(self, name, index, index_var, broadcast=True): # Need more argu
             local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 2. Tile is 1-D vector type with reduction
         elif len(local_dims) == 1 and len(local_dims) == self.reduction_depth + 1:
-            local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(local_dims[0])])
-            local_tile_desc.vlane_split_axis = local_vlane_split_axis
-            local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
+            local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(local_dims[0])])
+            local_tile_desc.vlane_split_axis = 0
+            local_tile_desc.vlane_stride = kg_tile_desc.get_dim_size(local_dims[0])
         # Case 3. Tile is 2-D tile
         elif len(local_dims) == 2:
             is_reduction = self.reduction_depth == 1
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index dd8220d9..29b9ad33 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -338,7 +338,6 @@ def set_ranges(self, lengths, reduction_lengths):
             self.ranges = [self.rename_indexing(x) for x in self.call_ranges]
             self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))]
             self.reduction_depth = len(lengths)
-
         return (
             self.itervars[: self.reduction_depth],
             self.itervars[self.reduction_depth :],
@@ -390,15 +389,20 @@ def codegen_nodes(self, nodes, kernel_name):
         else:
             raise NotImplementedError("dummy tile size fail!")
 
+        vlane_stride = 2
+        # Adjust tile size to avoid too much paddings
         for i in range(1, len(tile_size)+1):
             if tile_size[-i] > self.ranges[-i]:
+                remains = (self.ranges[-i] % vlane_stride)
                 tile_size[-i] = self.ranges[-i]
+                if remains:
+                    tile_size[-i] += vlane_stride - remains
 
         # Select tile info.
         # Note: Kernel Group have to share same tile desc for fusion
         tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane)
         tile_desc.vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
-        tile_desc.vlane_stride = 2
+        tile_desc.vlane_stride = vlane_stride
         self.kernel_group.set_tile_info(tile_desc)
 
         _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()

From cca7f1ddec5e1b102753ed4cc52368d725d09dab Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Fri, 31 Jan 2025 04:58:30 +0000
Subject: [PATCH 048/432] [Fix] bias GEMM sram stride

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 2 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 6 +++---
 tests/test_matmul.py                          | 6 ++----
 tests/test_softmax.py                         | 2 ++
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 207745d7..38593f70 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -42,7 +42,7 @@
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
           , %vstride : memref<
         {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ TILE_N }}, 1] }
+          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ kernel.vector_lane }}, 1] }
         {%- else -%}
         affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
         affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 9fe404bf..ffc0816d 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -36,15 +36,15 @@
   affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
       %index2 = affine.apply #map2(%t_m, %t_n)
-      {%- if Bias -%}
+      {%- if Bias %}
       memref.dma_start %Bias[
         {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
         ], %Y_buffer[%c0, %c0], %c_mvin3, %tag[%c0], %
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
         , %vstride : memref<
         {%- if Bias_rank == 2 -%}  {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-        xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ TILE_N }}, 1] }
-      {%- else -%}
+        xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ kernel.vector_lane }}, 1] }
+      {%- else %}
       affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>
       {%- endif %}
       affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
diff --git a/tests/test_matmul.py b/tests/test_matmul.py
index a32a38bb..232eb5b4 100644
--- a/tests/test_matmul.py
+++ b/tests/test_matmul.py
@@ -57,10 +57,8 @@ def custom_matmul(bias, a, b):
     test_matmul(device, 32, 32, 32)
     test_matmul(device, 128, 128, 128)
     test_matmul(device, 256, 256, 256)
-    test_matmul(device, 256, 128, 256)
-    test_matmul(device, 256, 256, 128)
     test_matmul(device, 128, 256, 256)
     test_matmul(device, 129, 61, 56)
-    test_addmm(device, 128, 128, 128)
-    test_addmm(device, 128, 128, 128, bias_rank=2)
+    test_addmm(device, 128, 256, 512)
+    test_addmm(device, 128, 256, 512, bias_rank=2)
     test_addmm(device, 129, 61, 56)
diff --git a/tests/test_softmax.py b/tests/test_softmax.py
index d68638f8..48c236ac 100644
--- a/tests/test_softmax.py
+++ b/tests/test_softmax.py
@@ -55,5 +55,7 @@ def test_softmax(device, size=(128, 128), dim=1):
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
     test_softmax(device, size=(64, 128))
+    test_softmax(device, size=(64, 128), dim=0)
     test_softmax(device, size=(256, 128))
+    test_softmax(device, size=(256, 128), dim=0)
     test_softmax(device, size=(1, 16))

From 9ac72487d5bdc30450a26892e561dc88dcb9f5cd Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Fri, 31 Jan 2025 05:40:01 +0000
Subject: [PATCH 049/432] [Frontend] scalar tile size

---
 PyTorchSimFrontend/mlir/mlir_common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 29b9ad33..7609eadb 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -380,6 +380,9 @@ def codegen_nodes(self, nodes, kernel_name):
         if len(tile_size) == 2:
             tile_size[-1] = 128
             tile_size[-2] = 128
+        elif len(tile_size) == 0: # Scalar
+            tile_size = [1]
+            self.ranges = [1]
         elif len(tile_size) == 1:
             tile_size[0] = 512
         elif len(tile_size) == 3:

From 80cba3643a662c0764d915e882e7712f5da2f9e3 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 31 Jan 2025 08:36:28 +0000
Subject: [PATCH 050/432] [Frontend/dma4d] Support index_expression operation
 on single vlane

---
 .../mlir/mlir_codegen_backend.py              | 131 ++++++++++++------
 PyTorchSimFrontend/mlir/mlir_common.py        |  10 +-
 test_extension_backend.py                     |  54 ++++----
 tests/test_exponent.py                        |   2 +-
 tests/test_pool.py                            |   9 +-
 tests/test_softmax.py                         |  41 +++---
 6 files changed, 147 insertions(+), 100 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index f464ca0c..bb453739 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -103,9 +103,10 @@ def write_header(self):
         )
 
 class ExtensionOverrides(common.OpOverrides):
+    index_set = set()
     # Binary element wise operations
     @staticmethod
-    def custom_cast(operand, target_type, *args, var_info=None):
+    def custom_cast(operand, target_type, *args, var_info=None, **kwargs):
         dtype = var_info[operand][1]
         if dtype == "index":
             ret = ops.index_cast(operand, target_type, var_info=var_info)
@@ -153,28 +154,28 @@ def binary_elementwise_common(operand1, operand2, var_info):
         return tile_size, ret_type, operand1, operand2
 
     @staticmethod
-    def add(operand1, operand2, *args, var_info=None):
+    def add(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         opcode = f'arith.add{ret_type[0]}'
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def sub(operand1, operand2, *args, var_info=None):
+    def sub(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         opcode = f'arith.sub{ret_type[0]}'
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def mul(operand1, operand2, *args, var_info=None):
+    def mul(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         opcode = f'arith.mul{ret_type[0]}'
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def div(operand1, operand2, *args, var_info=None):
+    def div(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         if ret_type[0] == "f":
@@ -184,7 +185,7 @@ def div(operand1, operand2, *args, var_info=None):
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def truediv(operand1, operand2, *args, var_info=None):
+    def truediv(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         if ret_type[0] == "f":
@@ -194,7 +195,7 @@ def truediv(operand1, operand2, *args, var_info=None):
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def minimum(operand1, operand2, *args, var_info=None):
+    def minimum(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         if ret_type[0] == "f":
@@ -204,7 +205,7 @@ def minimum(operand1, operand2, *args, var_info=None):
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def maximum(operand1, operand2, *args, var_info=None):
+    def maximum(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         if ret_type[0] == "f":
@@ -214,7 +215,7 @@ def maximum(operand1, operand2, *args, var_info=None):
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def to_dtype(operand, dst_mlir_dtype, *args, var_info=None):
+    def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs):
         src_mlir_dtype = var_info[operand][1]
         tile_size = var_info[operand][0]
 
@@ -242,7 +243,7 @@ def to_dtype(operand, dst_mlir_dtype, *args, var_info=None):
             raise NotImplementedError("Unsupported type for to_dtype ops")
 
     @staticmethod
-    def constant(value, src_type, *args, var_info=None):
+    def constant(value, src_type, *args, var_info=None, **kwargs):
         if isinstance(src_type, torch.dtype):
             src_type = mlir_common.DTYPE_TO_MLIR[src_type]
 
@@ -255,9 +256,13 @@ def constant(value, src_type, *args, var_info=None):
             value = int(value)
         return f'arith.constant {value} : {src_type}', [1, src_type]
 
+    @staticmethod
+    def alloc(size, src_type, *args, var_info=None, **kwargs):
+        return f"memref.alloc() : memref<{size}x{src_type}>", [size, src_type]
+
     # transcendental functions
     @staticmethod
-    def exp(operand, *args, var_info=None):
+    def exp(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
@@ -266,7 +271,7 @@ def exp(operand, *args, var_info=None):
         return f'math.exp %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def sqrt(operand, *args, var_info=None):
+    def sqrt(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
@@ -280,7 +285,7 @@ def sqrt(operand, *args, var_info=None):
         return f'math.sqrt %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def rsqrt(operand, *args, var_info=None):
+    def rsqrt(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
@@ -294,7 +299,7 @@ def rsqrt(operand, *args, var_info=None):
         return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def pow(operand1, operand2, *args, var_info=None):
+    def pow(operand1, operand2, *args, var_info=None, **kwargs):
         op_type1 = var_info[operand1]
         op_type2 = var_info[operand2]
 
@@ -316,7 +321,7 @@ def pow(operand1, operand2, *args, var_info=None):
         return f"math.pow{dtype[0]} %{operand1}, %{operand2} : {shape}", []
 
     @staticmethod
-    def log(operand, *args, var_info=None):
+    def log(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
@@ -344,7 +349,7 @@ def reciprocal(operand, *args, var_info):
 
     # Logical operations
     @staticmethod
-    def neg(operand, *args, var_info=None):
+    def neg(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
@@ -358,7 +363,7 @@ def neg(operand, *args, var_info=None):
         return f'arith.negf %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def eq(operand1, operand2, *args, var_info=None):
+    def eq(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
@@ -373,7 +378,7 @@ def eq(operand1, operand2, *args, var_info=None):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def ne(operand1, operand2, *args, var_info=None):
+    def ne(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
@@ -388,7 +393,7 @@ def ne(operand1, operand2, *args, var_info=None):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def lt(operand1, operand2, *args, var_info=None):
+    def lt(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
@@ -403,7 +408,7 @@ def lt(operand1, operand2, *args, var_info=None):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def gt(operand1, operand2, *args, var_info=None):
+    def gt(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
@@ -418,7 +423,7 @@ def gt(operand1, operand2, *args, var_info=None):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def le(operand1, operand2, *args, var_info=None):
+    def le(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
@@ -433,7 +438,7 @@ def le(operand1, operand2, *args, var_info=None):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def ge(operand1, operand2, *args, var_info=None):
+    def ge(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
@@ -448,7 +453,7 @@ def ge(operand1, operand2, *args, var_info=None):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def and_(operand1, operand2, *args, var_info=None):
+    def and_(operand1, operand2, *args, var_info=None, **kwargs):
         op_type1 = var_info[operand1]
         op_type2 = var_info[operand2]
 
@@ -469,7 +474,7 @@ def and_(operand1, operand2, *args, var_info=None):
         return f'arith.andi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def or_(operand1, operand2, *args, var_info=None):
+    def or_(operand1, operand2, *args, var_info=None, **kwargs):
         op_type1 = var_info[operand1]
         op_type2 = var_info[operand2]
 
@@ -490,7 +495,7 @@ def or_(operand1, operand2, *args, var_info=None):
         return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def xor(operand1, operand2, *args, var_info=None):
+    def xor(operand1, operand2, *args, var_info=None, **kwargs):
         op_type1 = var_info[operand1]
         op_type2 = var_info[operand2]
 
@@ -512,30 +517,30 @@ def xor(operand1, operand2, *args, var_info=None):
 
 
     @staticmethod
-    def logical_and(operand, *args, var_info=None):
+    def logical_and(operand, *args, var_info=None, **kwargs):
         raise NotImplementedError("logical_and")
 
     @staticmethod
-    def logical_not(operand, *args, var_info=None):
+    def logical_not(operand, *args, var_info=None, **kwargs):
         raise NotImplementedError("logical_not")
 
     @staticmethod
-    def logical_or(operand, *args, var_info=None):
+    def logical_or(operand, *args, var_info=None, **kwargs):
         raise NotImplementedError("logical_not")
 
     @staticmethod
-    def logical_xor(operand, *args, var_info=None):
+    def logical_xor(operand, *args, var_info=None, **kwargs):
         raise NotImplementedError("logical_not")
 
     @staticmethod
-    def relu(operand, *args, var_info=None):
+    def relu(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
         tile_size = op_type[0]
         ret_type = "f32"
         return ops.maximum(operand, ops.constant(0.0, "f32")), [tile_size, ret_type]
 
     @staticmethod
-    def sigmoid(operand, *args, var_info=None):
+    def sigmoid(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
         tile_size = op_type[0]
         ret_type = "f32"
@@ -544,7 +549,7 @@ def sigmoid(operand, *args, var_info=None):
 
     # Special operaitons
     @staticmethod
-    def where(condition, operand1, operand2, *args, var_info=None):
+    def where(condition, operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         cond_type = var_info[condition]
         if cond_type[0] < tile_size:
@@ -560,29 +565,63 @@ def where(condition, operand1, operand2, *args, var_info=None):
 
 
     @staticmethod
-    def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False):
+    def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs):
         result = body()
-        val = ops.constant(0.0, "f32")
+        val = ops.constant(0.0, "f32", *args, **kwargs)
         result = ops.where(mask, result, val)
         return result, var_info[result]
 
     @staticmethod
-    def _index_expr(operand, *args, var_info=None, **kwargs):
-        symbols = sorted([str(i) for i in operand.free_symbols])
-        renamed_symbols = {symbol: sympy.Symbol(f"d{i}") for i, symbol in enumerate(symbols)}
+    def _index_expr(tile_size, buffer, renamed_expression, vec_size, *args, var_info=None, **kwargs):
+        strides = [1] * len(tile_size)
+        for i in range(len(tile_size) - 2, -1, -1):
+            strides[i] = strides[i + 1] * tile_size[i + 1]
 
-        renamed_expression = operand.subs(renamed_symbols)
+        linear_expression = []
+        for i, stride in enumerate(strides):
+            linear_expression.append(f"d{i}*{stride}")
 
-        affine_map_str = "(" + ", ".join([f"d{i}" for i in range(len(symbols))]) + ") -> ("
+        dim = ["%d"+str(i) for i in range(len(tile_size))]
+        sym_dim = ["d"+str(i) for i in range(len(tile_size))]
+        start_dim = [str(0) for i in tile_size]
+        end_dim = [str(i) for i in tile_size]
+
+        affine_map_str = "(" + ", ".join(sym_dim) + ") -> ("
         affine_map_str += sympy.printing.ccode(renamed_expression) + ")"
 
-        map_operands = [f"%{str(symbol)}" for symbol in symbols]
-        return f"affine.apply affine_map<{affine_map_str}>({', '.join(map_operands)})", [1, "index"]
+        affine_map_str2 = "(" + ", ".join(sym_dim) + ") -> ("
+        affine_map_str2 += "+".join(linear_expression) + ")"
+
+        apply_map_var = f"%index_var = affine.apply affine_map<{affine_map_str}>({', '.join(dim)})\n"
+        linear_index_var = f"%buffer_index_var = affine.apply affine_map<{affine_map_str2}>({', '.join(dim)})\n"
+        affine_store_var = f"affine.store %index_var, %{buffer}[%buffer_index_var] : memref<{vec_size}xindex>\n"
+
+        result = f"affine.parallel ({','.join(dim)}) = ({','.join(start_dim)}) to ({','.join(end_dim)}) {{\n" + \
+            apply_map_var + linear_index_var + affine_store_var + f"}}"
+        return result, [None, None]
 
     @staticmethod
-    def index_expr(operand, *args, var_info=None, **kwargs):
-        result = ops._index_expr(operand)
-        ret_type = [1, "index"]
+    def index_expr(operand, *args, var_info=None, tile_desc=None, **kwargs):
+        # Todo. To support index_expr, we have to custom instructions
+        tile_size = tile_desc.get_tile_size()
+        if tile_desc.get_used_vlane() != 1:
+            raise NotImplementedError("Currently index operation is only executable on single vectorlane configuration")
+
+        vec_size = 1
+        for ds in tile_size:
+            vec_size *= ds
+
+        buffer = ops.alloc(vec_size, "index")
+        ret_type = [vec_size, "index"]
+
+        renamed_symbols = {symbol: "d"+str(symbol)[5:] for symbol in operand.free_symbols}
+        renamed_expression = operand.subs(renamed_symbols)
+        if operand not in ExtensionOverrides.index_set:
+            # Register this operand
+            ExtensionOverrides.index_set.add(operand)
+            ops._index_expr(tile_size, buffer, renamed_expression, vec_size)
+
+        result = f"affine.vector_load %{buffer}[0] : memref<{vec_size}xindex>, vector<{vec_size}xindex> // {renamed_expression}"
         return result, ret_type
 
     @staticmethod
@@ -593,7 +632,7 @@ def index_cast(operand, target_type, *args, var_info=None, **kwrags):
         return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type]
 
     @staticmethod
-    def broadcast_unflat(operand1, operand2, *args, var_info=None):
+    def broadcast_unflat(operand1, operand2, *args, var_info=None, **kwargs):
         op_type1 = var_info[operand1]
         op_type2 = var_info[operand2]
         src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>"# if op_type1[0] > 1 else op_type1[1]
@@ -603,7 +642,7 @@ def broadcast_unflat(operand1, operand2, *args, var_info=None):
         return expand, [op_type2[0], op_type1[1]]
 
     @staticmethod
-    def broadcast(operand1, operand2, *args, var_info=None):
+    def broadcast(operand1, operand2, *args, var_info=None, **kwargs):
         op_type1 = var_info[operand1]
         op_type2 = var_info[operand2]
         src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>" if op_type1[0] > 1 else op_type1[1]
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 7609eadb..30523ece 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -384,7 +384,7 @@ def codegen_nodes(self, nodes, kernel_name):
             tile_size = [1]
             self.ranges = [1]
         elif len(tile_size) == 1:
-            tile_size[0] = 512
+            tile_size[0] = 128*128*2
         elif len(tile_size) == 3:
             tile_size[-1] = 128
             tile_size[-2] = 128
@@ -514,14 +514,16 @@ def inner(*args, **kwargs):
                         buf_bounds = self.node_to_bounds.get(
                             fx_node, ValueRanges.unknown()
                         )
-                    code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info)
+                    code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info, tile_desc=self.kernel_group.tile_desc)
                     csevar = self.cse.generate(
                         self.compute,
                         code,
                         bounds=buf_bounds,
+                        assignment=(ret_info[0] is not None)
                     )
-                    self.register_var_info(csevar, ret_info)
-                    csevar.update_on_args(name, args, kwargs)
+                    if ret_info[0] is not None:
+                        self.register_var_info(csevar, ret_info)
+                        csevar.update_on_args(name, args, kwargs)
                     return csevar
 
                 return inner
diff --git a/test_extension_backend.py b/test_extension_backend.py
index 170849a4..10bc9854 100644
--- a/test_extension_backend.py
+++ b/test_extension_backend.py
@@ -25,32 +25,34 @@
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_vectoradd(device, (47, 10))
-    test_vector_scalar_add(device, (10, 10))
-    test_reduce_sum(device, (29, 47), 1, keepdim=True)
-    test_reduce_sum(device, (17, 68), 0, keepdim=True)
-    test_Transpose2D(device, [64, 156])
-    test_Transpose2D_2(device, [16, 64])
-    test_Transpose3D_1(device, [62, 34, 44])
-    test_Transpose3D_2(device, [62, 34, 44])
-    test_Transpose3D_3(device, [62, 34, 44])
-    test_view3D_2D(device)
+    #test_vectoradd(device, (47, 10))
+    #test_vector_scalar_add(device, (10, 10))
+    #test_reduce_sum(device, (32, 32), 1, keepdim=True)
+    #test_reduce_sum(device, (32, 32), 0, keepdim=True)
+    #test_reduce_sum(device, (512, 512), 1, keepdim=True)
+    #test_reduce_sum(device, (512, 512), 0, keepdim=True)
+    #test_Transpose2D(device, [64, 156])
+    #test_Transpose2D_2(device, [16, 64])
+    #test_Transpose3D_1(device, [62, 34, 256])
+    #test_Transpose3D_2(device, [62, 34, 256])
+    #test_Transpose3D_3(device, [62, 34, 256])
+    #test_view3D_2D(device)
     test_maxpool(device)
-    test_avgpool(device)
-    test_softmax(device, (64, 128), dim=1)
-    test_BatchNorm(device)
-    test_LayerNorm(device, (64, 128))
-    test_conv2d(device)
-    test_matmul(device, 33, 45, 68)
-    test_BMM(device)
-    test_CNN(device)
-    test_DecoderBlock(device)
-    test_resnet(device)
-    test_mlp(device)
-    test_mlp_inf(device, batch_size=64, input_size=256, hidden_size=512, output_size=256, sparsity=0.97)
+    #test_avgpool(device)
+    #test_softmax(device, (256, 256), dim=1)
+    #test_BatchNorm(device)
+    #test_LayerNorm(device, (64, 128))
+    #test_conv2d(device)
+    #test_matmul(device, 33, 45, 68)
+    #test_BMM(device)
+    #test_CNN(device)
+    #test_DecoderBlock(device)
+    #test_resnet(device)
+    #test_mlp(device)
+    #test_mlp_inf(device, batch_size=64, input_size=256, hidden_size=512, output_size=256, sparsity=0.97)
 
     # # Fusion Test
-    test_matmul_scalar(device)
-    test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="relu")
-    test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid")
-    test_addmm_residual(device)
+    #test_matmul_scalar(device)
+    #test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="relu")
+    #test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid")
+    #test_addmm_residual(device)
diff --git a/tests/test_exponent.py b/tests/test_exponent.py
index a0bd6c8b..536bef13 100644
--- a/tests/test_exponent.py
+++ b/tests/test_exponent.py
@@ -30,4 +30,4 @@ def exponent(a):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_exponent(device, size=(512, 512))
+    test_exponent(device, size=(32, 32))
diff --git a/tests/test_pool.py b/tests/test_pool.py
index f28becac..7abcb3e6 100644
--- a/tests/test_pool.py
+++ b/tests/test_pool.py
@@ -15,16 +15,19 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
 
 def test_maxpool(device):
     torch.manual_seed(0)
-    model = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1).eval()
+    model = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=1).eval()
     model.to(device=device)
-    input = torch.randn(1, 8, 64, 64).to(device=device)
+    input = torch.randn(1, 2, 5, 2).to(device=device)
+    input = torch.arange(2*5*2, 0, -1, dtype=torch.float32)
+    input = input.reshape(1,2,5,2)
+    input = input.to(device=device)
     x1 = input.to(device=device)
     x2 = input.to("cpu")
     opt_fn = torch.compile(dynamic=False)(model)
     res = opt_fn(x1)
     model.to("cpu")
     out = model(x2)
-    # test_result("Maxpool Forward", res, out) # TODO: MaxPool Functionality is not working
+    test_result("Maxpool Forward", res, out) # TODO: MaxPool Functionality is not working
 
 def test_avgpool(device):
     def avgpool(a):
diff --git a/tests/test_softmax.py b/tests/test_softmax.py
index 48c236ac..02201e78 100644
--- a/tests/test_softmax.py
+++ b/tests/test_softmax.py
@@ -20,26 +20,27 @@ def test_softmax(device, size=(128, 128), dim=1):
     x2 = input.to("cpu")
 
     # split softmax into 3 steps
-    # def softmax1(x): # find max
-    #     return x.max(dim=dim, keepdim=True).values
-    # def softmax2(x, max):
-    #     return (x - max).exp().sum(dim=dim, keepdim=True)
-    # def softmax3(x, max, sum):
-    #     return (x - max).exp().div(sum)
-
-    # opt_fn1 = torch.compile(dynamic=False)(softmax1)
-    # opt_fn2 = torch.compile(dynamic=False)(softmax2)
-    # opt_fn3 = torch.compile(dynamic=False)(softmax3)
-
-    # max = opt_fn1(x1)
-    # cpu_max = softmax1(x2)
-    # test_result("Softmax Max", max, cpu_max)
-    # sum = opt_fn2(x1, max)
-    # cpu_sum = softmax2(x2, cpu_max)
-    # test_result("Softmax Sum", sum, cpu_sum)
-    # y = opt_fn3(x1, max, sum)
-    # cpu_y = softmax3(x2, cpu_max, cpu_sum)
-    # test_result("Softmax", y, cpu_y)
+    #def softmax1(x): # find max
+    #    return x.max(dim=dim, keepdim=True).values
+    #def softmax2(x, max):
+    #    return (x - max).exp().sum(dim=dim, keepdim=True)
+    #def softmax3(x, max, sum):
+    #    return (x - max).exp().div(sum)
+
+    #opt_fn1 = torch.compile(dynamic=False)(softmax1)
+    #opt_fn2 = torch.compile(dynamic=False)(softmax2)
+    #opt_fn3 = torch.compile(dynamic=False)(softmax3)
+
+    #max = opt_fn1(x1)
+    #cpu_max = softmax1(x2)
+    #test_result("Softmax Max", max, cpu_max)
+    #sum = opt_fn2(x1, max)
+    #cpu_sum = softmax2(x2, cpu_max)
+    #test_result("Softmax Sum", sum, cpu_sum)
+
+    #y = opt_fn3(x1, max, sum)
+    #cpu_y = softmax3(x2, cpu_max, cpu_sum)
+    #test_result("Softmax", y, cpu_y)
 
     opt_fn = torch.compile(dynamic=False)(torch.nn.functional.softmax)
     y = opt_fn(x1, dim=dim)

From cf45f8ede551d0fcfb2c692b90fdf32229c35454 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 31 Jan 2025 10:03:03 +0000
Subject: [PATCH 051/432] [Frontend/dma4d] Use vector op for index_operation

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index bb453739..1c9331b1 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -594,10 +594,11 @@ def _index_expr(tile_size, buffer, renamed_expression, vec_size, *args, var_info
 
         apply_map_var = f"%index_var = affine.apply affine_map<{affine_map_str}>({', '.join(dim)})\n"
         linear_index_var = f"%buffer_index_var = affine.apply affine_map<{affine_map_str2}>({', '.join(dim)})\n"
-        affine_store_var = f"affine.store %index_var, %{buffer}[%buffer_index_var] : memref<{vec_size}xindex>\n"
+        broadcast_var = f"%broadcast_var = vector.broadcast %index_var : index to vector<2xindex>\n"
+        affine_store_var = f"affine.vector_store %broadcast_var, %{buffer}[%buffer_index_var] : memref<{vec_size}xindex>, vector<2xindex>\n"
 
         result = f"affine.parallel ({','.join(dim)}) = ({','.join(start_dim)}) to ({','.join(end_dim)}) {{\n" + \
-            apply_map_var + linear_index_var + affine_store_var + f"}}"
+            apply_map_var + linear_index_var + broadcast_var + affine_store_var + f"}}"
         return result, [None, None]
 
     @staticmethod

From d827a824555513e91aec8514a0af6b7bb34fb01a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 2 Feb 2025 10:34:17 +0000
Subject: [PATCH 052/432] [Frontend/dma4d] View operator fix

---
 .../mlir/mlir_codegen_backend.py              | 70 +++++++++++++------
 PyTorchSimFrontend/mlir/mlir_common.py        | 67 +++++++++++++++++-
 2 files changed, 114 insertions(+), 23 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 1c9331b1..82ef5143 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -19,6 +19,7 @@
     IndentedBuffer,
     is_welford_reduction,
 )
+from torch.utils._sympy.functions import ModularIndexing
 import PyTorchSimFrontend.extension_codecache as extension_codecache
 
 from . import mlir_common
@@ -715,6 +716,29 @@ def get_padding_type(self):
                     return 1
         return 0
 
+    def convert_index(self, expr):
+        if len(expr.free_symbols) != 1:
+            raise NotImplementedError("Not supporting this view operation...!")
+
+        if expr.is_symbol:
+            return expr
+
+        expr_str = str(expr)
+        if isinstance(expr, ModularIndexing):
+            replace_str = f"({expr.args[0]} floordiv {expr.args[1]}) mod {expr.args[2]}"
+            expr_str = re.sub(r"ModularIndexing\([^)]*\)", replace_str, expr_str)
+        elif "//" in expr_str:
+            expr_str = expr_str.replace("//", " floordiv ")
+        else:
+            raise NotImplementedError("What is this case?")
+
+        indices = [expr.args[0]]
+        args = ", ".join(map(str, indices))
+        map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
+        args = ", ".join([f"%{i}" for i in indices])
+        index = self.cse.generate(self.loads, f"affine.apply #{map_var}({args})")
+        return index
+
     def parse_indices(self, expr) -> common.CSEVariable:
         # Constant case
         if expr.is_number:
@@ -724,27 +748,19 @@ def parse_indices(self, expr) -> common.CSEVariable:
         if len(expr.args) == 0:
             return expr
 
+        indices = []
+        for arg in expr.args:
+            if arg.is_Mul and arg.args[0].is_number:
+                new_arg = sympy.Symbol(str(self.convert_index(arg.args[1])))
+                expr = expr.replace(arg.args[1], new_arg)
+            else:
+                new_arg = sympy.Symbol(str(self.convert_index(arg)))
+                expr = expr.replace(arg, new_arg)
+            indices.append(str(new_arg))
+        indices.sort()
+
         # Extract index var
         expr_str = str(expr)
-        pattern = r'index\d+'
-        indices = OrderedDict()
-        for index in re.findall(pattern, expr_str):
-            indices[index] = None
-        indices = list(indices.keys())
-
-        # Extract // pattern
-        if "//" in expr_str:
-            expr_str = expr_str.replace("//", " floordiv ")
-
-        # Extract modular pattern
-        pattern = r"ModularIndexing\((.*?)\)"
-        matches = re.search(pattern, expr_str)
-        if matches:
-            mod_args = matches.group(1)
-            args_list = mod_args.split(", ")
-            replace_str = f"({args_list[0]} floordiv {args_list[1]}) mod {args_list[2]}"
-            expr_str = re.sub(r"ModularIndexing\([^)]*\)", replace_str, expr_str)
-
         args = ", ".join(map(str, indices))
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
@@ -1140,12 +1156,13 @@ def get_dma_info(self, name, index, index_var, broadcast=True): # Need more argu
         # Note: index could contain symbols that represent dynamic axies
         # Extract dimension of index(e.g, index0, index1)
         local_dims = [int(str(i)[5:]) for i in index.free_symbols if "index" in str(i)]
+        implicit_local_dims = list(index.args)
         total_dims =  [int(str(i)[5:]) for i in self.itervars]
         local_tile_desc = mlir_common.MLIRMultiDimTile([1], self.vector_lane)
         local_dims.sort() # Assume that smaller index is placed in the outer loop
 
         # Reduction can have two type of tile size
-        if broadcast and (total_dims != local_dims or total_dims[:self.reduction_depth] == local_dims):
+        if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)):
             # We have to create custom apply map to provide dram stride
             # ex) (d0, d1, ... dn, dn+1, dn+2, dk) -> (s0*d0 + s1*d1 + ... dn*0+ dn+1*0 + ... dk*0 + const)
             fake_dim = self.get_const_cse(0)
@@ -1202,6 +1219,19 @@ def get_dma_info(self, name, index, index_var, broadcast=True): # Need more argu
         else:
             raise NotImplementedError("Currently not implemented... ;)")
 
+        if len(implicit_local_dims)!=0 and len(local_dims) != len(implicit_local_dims):
+            tile_size = local_tile_desc.get_tile_size()
+            new_tile_size = []
+            new_vlane_split_axis = local_tile_desc.vlane_split_axis
+            implicit_dim_size = list(kg_tile_desc.implicit_dim_size.values())
+            for i, target_dim_size in enumerate(implicit_dim_size):
+                new_tile_size += [1]*(len(target_dim_size)-1) + tile_size[i:i+1]
+                if local_tile_desc.vlane_split_axis >= i:
+                    new_vlane_split_axis += len(target_dim_size)-1
+            # Update
+            local_tile_desc.set_tile_size(new_tile_size)
+            local_tile_desc.vlane_split_axis = new_vlane_split_axis
+
         return local_tile_desc, index_var
 
     def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 30523ece..90f1aa73 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -1,9 +1,14 @@
 import os
+from collections import defaultdict
+from functools import reduce
+from operator import mul
 import torch
 from torch._inductor.codegen import common
 from torch._inductor.codegen import cpp
 from torch._inductor.virtualized import V
 from torch._inductor.ir import MultiOutputLayout
+from torch._inductor.dependencies import MemoryDep
+from torch.utils._sympy.functions import ModularIndexing
 import sympy
 import contextlib
 
@@ -157,6 +162,7 @@ def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=N
         self.vector_lane = vector_lane
         self.vlane_split_axis = vlane_split_axis
         self.vlane_stride = vlane_stride
+        self.implicit_dim_size = None
 
     def set_tile_size(self, tile_size, tile_axis_order=None):
         self._tile_size = tile_size
@@ -375,6 +381,56 @@ def codegen_nodes(self, nodes, kernel_name):
         # Set node range info
         vars, reduction_vars = self.set_ranges(group, reduction_group)
 
+        # Handle implict dims. Input operand could have larger dimension space.
+        implicit_ranges = False
+        target_operand : MemoryDep = None
+        implicit_dim_size = defaultdict(list)
+        for read_operand in nodes[0].read_writes.reads:
+            read_operand : MemoryDep
+            read_index = read_operand.index
+            for arg in read_index.args:
+                if "ModularIndexing" in str(arg) or "//" in str(arg):
+                    implicit_ranges = True
+                    target_operand = read_operand
+                    break
+
+        if implicit_ranges:
+            print("This operation contina implicit dimension space!")
+            linearized_stride = [1] * len(target_operand.var_names)
+            for i in range(len(target_operand[3])-2, -1, -1):
+                linearized_stride[i] = linearized_stride[i+1] * target_operand[3][i+1]
+
+            linearized_index = sympy.Integer(0)
+            for dim, stride in zip(target_operand[2], linearized_stride):
+                linearized_index += stride * dim
+
+            new_dim_expression = []
+            new_dim_size = []
+            for arg in target_operand.index.args:
+                if len(arg.free_symbols) != 1:
+                    raise NotImplementedError("Not supporting this view operation...!")
+
+                if arg.is_Mul and arg.args[0].is_number:
+                    arg = arg.args[1]
+
+                if isinstance(arg, ModularIndexing):
+                    modular_expr = ModularIndexing(arg.args[0], arg.args[1], arg.args[2])
+                elif arg.is_symbol:
+                    modular_expr = ModularIndexing(arg, 1, target_operand.ranges[arg])
+                elif "//" in str(arg):
+                    modular_expr = ModularIndexing(arg.args[0], arg.args[1], target_operand.ranges[arg.args[0]]//arg.args[1])
+                else:
+                    raise NotImplementedError("What is this case?")
+                new_dim_expression.append(modular_expr)
+                new_dim_size.append(modular_expr.args[2])
+                implicit_dim_size[int(str(modular_expr.args[0])[1:])].append(int(modular_expr.args[2]))
+
+            # Sanity check
+            for dim, sub_dims in implicit_dim_size.items():
+                sz = reduce(mul, sub_dims, 1)
+                if sz != target_operand[3][dim]:
+                    raise NotImplementedError("Not supporting type...")
+
         # Dummy tile size
         tile_size = [1] * (len(vars) + len(reduction_vars))
         if len(tile_size) == 2:
@@ -395,9 +451,13 @@ def codegen_nodes(self, nodes, kernel_name):
         vlane_stride = 2
         # Adjust tile size to avoid too much paddings
         for i in range(1, len(tile_size)+1):
-            if tile_size[-i] > self.ranges[-i]:
-                remains = (self.ranges[-i] % vlane_stride)
-                tile_size[-i] = self.ranges[-i]
+            target_range = self.ranges[-i]
+            if implicit_ranges:
+                target_range = implicit_dim_size[len(tile_size)-i][-1]
+
+            if tile_size[-i] > target_range:
+                remains = (target_range % vlane_stride)
+                tile_size[-i] = target_range
                 if remains:
                     tile_size[-i] += vlane_stride - remains
 
@@ -406,6 +466,7 @@ def codegen_nodes(self, nodes, kernel_name):
         tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane)
         tile_desc.vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
         tile_desc.vlane_stride = vlane_stride
+        tile_desc.implicit_dim_size = implicit_dim_size
         self.kernel_group.set_tile_info(tile_desc)
 
         _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()

From a754a0fc87fd8aa5b68e36c4ee0e961f4aa7ee93 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 2 Feb 2025 10:49:10 +0000
Subject: [PATCH 053/432] [Frontend] Add kwargs for reciprocal op

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 82ef5143..d9460500 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -336,7 +336,7 @@ def log(operand, *args, var_info=None, **kwargs):
         return f'math.log %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def reciprocal(operand, *args, var_info):
+    def reciprocal(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]

From 167b1bdbc1f740c5108f4b7116842840c65b1b6e Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Mon, 3 Feb 2025 08:31:53 +0000
Subject: [PATCH 054/432] [Frontend] dma4d fusion

---
 PyTorchSimFrontend/mlir/mlir_common.py     | 19 ++++----
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 13 ++++--
 PyTorchSimFrontend/mlir/mlir_template.py   | 53 ++++++++++------------
 3 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 90f1aa73..415358d8 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -373,14 +373,7 @@ def call_kernel(self, kernel_name):
        # generate the code to call this
         wrapper.generate_kernel_call(kernel_name, call_args, cuda=False)
 
-    def codegen_nodes(self, nodes, kernel_name):
-        _, (group, reduction_group) = max(
-            nodes, key=lambda x: int(x.is_reduction())
-        ).group
-
-        # Set node range info
-        vars, reduction_vars = self.set_ranges(group, reduction_group)
-
+    def compute_tile_size(self, nodes, vars, reduction_vars):
         # Handle implict dims. Input operand could have larger dimension space.
         implicit_ranges = False
         target_operand : MemoryDep = None
@@ -467,6 +460,16 @@ def codegen_nodes(self, nodes, kernel_name):
         tile_desc.vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
         tile_desc.vlane_stride = vlane_stride
         tile_desc.implicit_dim_size = implicit_dim_size
+        return tile_desc
+
+    def codegen_nodes(self, nodes, kernel_name):
+        _, (group, reduction_group) = max(
+            nodes, key=lambda x: int(x.is_reduction())
+        ).group
+
+        # Set node range info
+        vars, reduction_vars = self.set_ranges(group, reduction_group)
+        tile_desc = self.compute_tile_size(nodes, vars, reduction_vars)
         self.kernel_group.set_tile_info(tile_desc)
 
         _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 5c5c1fb0..718bbbfd 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -129,11 +129,18 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, tile_size
     def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
         with kernel:
             for node in [template_node, *epilogue_nodes]:
-                    node.mark_run()
+                node.mark_run()
             partial_code = render()
+            if epilogue_nodes:
+                _, (group, reduction_group) = max(
+                    epilogue_nodes, key=lambda x: int(x.is_reduction())
+                ).group
+                vars, reduction_vars = kernel.set_ranges(group, reduction_group)
+                tile_desc = kernel.compute_tile_size(epilogue_nodes, vars, reduction_vars)
+                kernel.kernel_group.set_tile_info(tile_desc)
+                kernel.adjust_tile_size()
             for node in epilogue_nodes:
-                ranges = node.get_ranges()
-                node.codegen(kernel.set_ranges(ranges[0], ranges[1]))
+                node.codegen((vars, reduction_vars))
         with V.set_kernel_handler(kernel):
             src_code = (
                 partial_code
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 188815bb..e5b279d9 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -291,73 +291,70 @@ def render(self, template, kwargs):
 
     def adjust_tile_size(self):
         # Fixed tile size for template kernel
-        self.tile_desc.tile_layout = MLIRTile.TILE_COL_WISE
-        self.tile_desc.n_row = self.render_options['TILE_M']
-        self.tile_desc.n_col = self.render_options['TILE_N']
+        self.kernel_group.tile_desc.set_tile_size((self.render_options['TILE_M'], self.render_options['TILE_N']))
+        self.kernel_group.tile_desc.vlane_split_axis = 0 # FIXME: Fixed
+        self.kernel_group.tile_desc.vlane_stride = 1 # FIXME: Fixed
         return
 
     def load_epilogue(self, name: str, index: sympy.Expr):
-        raise NotImplementedError("Not implemented!")
         #index_var = self.parse_indices(index)
         index_var = "index2"
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
+        vlane_stride = self.kernel_group.tile_desc.vlane_stride
+        tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
         if name not in self.buffer_names:
             # Allocate sram buffer
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-            tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
-            sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], tile_shape, self.loads, index_var, index)
+            tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
+            tile_stride = self.kernel_group.tile_desc.get_tile_stride()
+            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.loads, index_var, index)
             self.buffer_names[name] = sram_var
-
-            # Generate DMA instruction
-            vlane_split_axis = 0                # FIXME. Is it okay?
-            vlane_stride = 1                    # FIXME. Is it okay?
-            index_var = "index2"                # FIXME. Is it okay?
-            code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
+            code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                     f"{name}_tag", dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.loads, code, assignment = False)
 
         # Load vector from sram
         sram_var = self.buffer_names[name]
-        tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
-        operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
-        shape = f", vector<{tile_size_per_lane}x{mlir_dtype}>" if tile_size_per_lane > 1 else ""
+        operation = "affine.vector_load" if tile_numel_per_lane > 1 else "affine.load"
+        shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
         zero_var = self.get_const_cse(0)
         line = f"{operation} %{sram_var}[%{zero_var}, %{zero_var}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{mlir_dtype}, 1>{shape}"
         out = self.cse.generate(self.loads, line)
-        self.register_var_info(out, [tile_size_per_lane, mlir_dtype])
+        self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
-        raise NotImplementedError("Not implemented!")
         #index_var = self.parse_indices(index)
         index_var = "index2"
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
+        vlane_stride = self.kernel_group.tile_desc.vlane_stride
+        tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
 
-        vlane_split_axis = 0
-        vlane_stride = 1  # Fixed for template kernel
-        #chunk = chunk_size << 1 | (self.tile_desc.tile_per_lane_layout == MLIRTile.TILE_PER_LANE_COL_WISE)
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
+        tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
+        tile_stride = self.kernel_group.tile_desc.get_tile_stride()
 
         if name not in self.buffer_names:
-            dram_tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
-            sram_var, index_var = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.stores, index_var, index)
+            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.stores, index_var, index)
             self.buffer_names[name] = sram_var
         sram_var = self.buffer_names[name]
 
-        tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
-        operation = "affine.vector_store" if tile_size_per_lane > 1 else "affine.store"
-        shape = f", vector<{tile_size_per_lane}x{mlir_dtype}>" if tile_size_per_lane > 1 else ""
+        operation = "affine.vector_store" if tile_numel_per_lane > 1 else "affine.store"
+        shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
         zero_var = self.get_const_cse(0)
         line = f"{operation} %{value}, %{sram_var}[%{zero_var}, %{zero_var}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{mlir_dtype}, 1>{shape}"
         self.cse.generate(self.stores, line, assignment = False)
 
         index_var = "index2"                # FIXME. Is it okay?
-        dram_shape = f"memref<{self.render_options['M']}x{self.render_options['N']}x{mlir_dtype}>"
-        tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
-        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, f"{name}_tag", dram_shape, tile_shape)
+        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                 f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.cse.generate(self.stores, code, assignment = False)
 
     def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, code_buffer, index_var, raw_index):

From c95ae5ae5d5567d013360758ff327bce87aac713 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Mon, 3 Feb 2025 13:22:58 +0000
Subject: [PATCH 055/432] [MLIR] lowering option

---
 PyTorchSimFrontend/extension_codecache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 15c05d84..cc3a15d7 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -49,7 +49,7 @@ def mlir_compile_command(filename, vectorlane_size, tile_size, vlen=256):
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding \
-            -dma-fine-grained='systolic-array-size={vectorlane_size} tile-size={tile_size[0]},{tile_size[1]},{tile_size[2]}' \
+            -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size}" \
             -lower-affine \
@@ -82,7 +82,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding='timing_mode=1' \
-            -dma-fine-grained='systolic-array-size={vectorlane_size} tile-size={tile_size[0]},{tile_size[1]},{tile_size[2]}' \
+            -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -test-tile-operation-graph='vectorlane={vectorlane_size}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \

From 1e0e3dd80adb0e9fd64be6e413a5d0c5875cd383 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Tue, 4 Feb 2025 01:15:23 +0000
Subject: [PATCH 056/432] [Fix] kernel_group fixed for fusion

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py |  4 ++--
 PyTorchSimFrontend/mlir/mlir_common.py          |  8 ++++----
 PyTorchSimFrontend/mlir/mlir_scheduling.py      |  9 ++++-----
 PyTorchSimFrontend/mlir/mlir_template.py        | 15 ++++++++++-----
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d9460500..61a4d40c 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -677,8 +677,8 @@ class MLIRKernel(mlir_common.BaseMLIRKernel):
     overrides = ExtensionOverrides
     newvar_prefix = "%"
 
-    def __init__(self):
-        super().__init__(mlir_common.MLIRKernelArgs())
+    def __init__(self, kernel_group):
+        super().__init__(kernel_group)
         self.const_buffer = IndentedBuffer()
         self.alloc_buffer = IndentedBuffer()
         self.reduction_prefix = IndentedBuffer()
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 415358d8..494f6e10 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -182,7 +182,7 @@ def get_numel(self):
         for dim_size in self._tile_size:
             size *= dim_size
         return size
- 
+
     def get_numel_per_lane(self):
         tile_size_per_lane = self.get_tile_size_per_lane()
         size = 1
@@ -317,9 +317,9 @@ class BaseMLIRKernel(common.Kernel, BaseMLIRHardwareInfo):
     load_format = None
     store_format = None
 
-    def __init__(self, args=None):
-        super().__init__(args)
-        self.kernel_group : MLIRWrapperKenrelGroup = MLIRWrapperKenrelGroup()
+    def __init__(self, kernel_group):
+        super().__init__(kernel_group.args)
+        self.kernel_group = kernel_group
         # Kernel iteration range info
         self.call_ranges = None
         self.ranges = None
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 718bbbfd..8a85bfc1 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -73,7 +73,7 @@ def codegen_nodes(self, nodes):
         _, (group, reduction_group) = max(
             nodes, key=lambda x: int(x.is_reduction())
         ).group
-        ex_kernel = self.target_kernel()
+        ex_kernel = self.target_kernel(kernel_group=self.kernel_group)
         ex_kernel.kernel_group = self.kernel_group
 
         kernel_name = f"extension_kernel_{MLIRScheduling.count}"
@@ -154,8 +154,8 @@ def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
     def codegen_template(self, template_node, epilogue_nodes):
         _, (numel, rnumel) = template_node.group
         template_buffer = template_node.node
-        kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes)
-        _, _, _, kernel.buffer_types = kernel.args.mlir_argdefs()
+        kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group)
+        _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
 
         src_code = self.codegen_src_code(kernel, render, template_node, epilogue_nodes)
         wrapper = V.graph.wrapper_code
@@ -167,7 +167,6 @@ def codegen_template(self, template_node, epilogue_nodes):
 
         with V.set_kernel_handler(kernel):
             codegen_header(src_code, (kernel.header.getvalue(), kernel.gem5_header.getvalue()))
-            # node_schedule = [template_node, *epilogue_nodes]
             kernel.meta_kernel()
             kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
                                              kernel.tile_size, kernel.loop_size, origins={str(i) for i in template_node.node.origins})
@@ -175,7 +174,7 @@ def codegen_template(self, template_node, epilogue_nodes):
 
         kernel.call_kernel(kernel_name)
         V.graph.removed_buffers |= kernel.removed_buffers
-        _, args, _, _ = kernel.args.mlir_argdefs()
+        _, args, _, _ = self.kernel_group.args.mlir_argdefs()
         args = ", ".join(args)
         if (extension_config.CONFIG_BACKENDSIM_EAGER_MODE):
             target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index e5b279d9..d29f665a 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -27,10 +27,11 @@ def __init__(self,
                  kernel_name,
                  input_nodes,
                  call_size,
+                 kernel_group = None,
                  outer_func_name=None,
                  outer_func_render=None,
                  kernel_arg_attributes=None) -> None:
-        super().__init__()
+        super().__init__(kernel_group if kernel_group is not None else mlir_common.MLIRWrapperKenrelGroup())
         self.kernel_name = kernel_name
         self.input_nodes = input_nodes
         self.call_size = call_size
@@ -292,7 +293,7 @@ def render(self, template, kwargs):
     def adjust_tile_size(self):
         # Fixed tile size for template kernel
         self.kernel_group.tile_desc.set_tile_size((self.render_options['TILE_M'], self.render_options['TILE_N']))
-        self.kernel_group.tile_desc.vlane_split_axis = 0 # FIXME: Fixed
+        self.kernel_group.tile_desc.vlane_split_axis = 1 # FIXME: Fixed
         self.kernel_group.tile_desc.vlane_stride = 1 # FIXME: Fixed
         return
 
@@ -310,7 +311,8 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             # Allocate sram buffer
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
             tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-            tile_stride = self.kernel_group.tile_desc.get_tile_stride()
+            # tile_stride = self.kernel_group.tile_desc.get_tile_stride()
+            tile_stride = [1, self.render_options['TILE_M']] # FIXME: Fixed
             sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.loads, index_var, index)
             self.buffer_names[name] = sram_var
             code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
@@ -339,7 +341,8 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-        tile_stride = self.kernel_group.tile_desc.get_tile_stride()
+        # tile_stride = self.kernel_group.tile_desc.get_tile_stride()
+        tile_stride = [1, self.render_options['TILE_M']] # FIXME: Fixed
 
         if name not in self.buffer_names:
             sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.stores, index_var, index)
@@ -390,7 +393,7 @@ def __init__(self, name, input_nodes, layout, input_reorder = None):
     def generate(self, **kwargs) -> ChoiceCaller:
         kernel_name = f"mlir_{self.name}"
         with patch.object(V.graph, "get_dtype", self._fake_get_dtype(self.output_node)):
-            kernel  = MLIRTemplateKernel(kernel_name=kernel_name, input_nodes=self.input_nodes, call_size=self.layout.size,
+            kernel  = MLIRTemplateKernel(kernel_name=kernel_name, input_nodes=self.input_nodes, call_size=self.layout.size, kernel_group=None,
                                          outer_func_name=self.function_name if hasattr(self, 'function_name') else None,
                                          outer_func_render=self.outer_func_render if hasattr(self, 'outer_func_render') else None,
                                          kernel_arg_attributes=self.get_arg_attributes() if hasattr(self, 'get_arg_attributes') else None)
@@ -411,11 +414,13 @@ def make_kernel_render(
             template_node: TemplateBuffer,
             epilogue_nodes: Optional[List[IRNode]] = None,
             kernel_name: str = kernel_hash_name,
+            kernel_group: Optional[mlir_common.MLIRWrapperKenrelGroup] = None
         ):
             kernel = MLIRTemplateKernel(
                 kernel_name=kernel_name,
                 input_nodes=self.input_nodes,
                 call_size=self.layout.size,
+                kernel_group=kernel_group,
                 outer_func_name=self.function_name if hasattr(self, 'function_name') else None,
                 outer_func_render=functools.partial(
                     self.outer_func_render,

From 47ed86931738f21472895b68e3401b1f6e64e73f Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Tue, 4 Feb 2025 04:05:25 +0000
Subject: [PATCH 057/432] [Fix] remove deprecated code

---
 PyTorchSimFrontend/extension_codecache.py        | 10 +++++-----
 PyTorchSimFrontend/mlir/mlir_bmm_template.py     |  1 -
 PyTorchSimFrontend/mlir/mlir_conv_template.py    |  1 -
 PyTorchSimFrontend/mlir/mlir_gemm_template.py    |  1 -
 PyTorchSimFrontend/mlir/mlir_maxpool_template.py |  1 -
 PyTorchSimFrontend/mlir/mlir_scheduling.py       |  5 ++---
 6 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index cc3a15d7..91029329 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -44,7 +44,7 @@ def llvm_compile_command(input, output):
         """,
     ).strip()]
 
-def mlir_compile_command(filename, vectorlane_size, tile_size, vlen=256):
+def mlir_compile_command(filename, vectorlane_size, vlen=256):
     return [re.sub(r"[ \n]+", " ",
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
@@ -77,7 +77,7 @@ def mlir_compile_command(filename, vectorlane_size, tile_size, vlen=256):
         """,
     ).strip()]
 
-def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_size, tile_size, vlen=256):
+def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_size, vlen=256):
     return [re.sub(r"[ \n]+", " ",
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
@@ -125,14 +125,14 @@ def load(cls, source_code,
              validation_binary_name="validation_bin",
              cycle_wrapper_name="cycle_wrapper",
              cycle_binary_name="cycle_bin",
-             arg_attributes=[], vectorlane_size=16, tile_size=[],
+             arg_attributes=[], vectorlane_size=16,
              spad_info=None, origins=None, **kwargs):
         write_path = get_write_path(source_code)
         key, input_path = write(source_code, "mlir", specified_dir=write_path)
         new_input_path = os.path.splitext(input_path)[0]
         raw_tog_path = new_input_path + "_tog.py"
         sample_mlir_path = new_input_path + "_sample"
-        gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size, tile_size)
+        gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
 
         from filelock import FileLock
         lock_dir = get_lock_dir()
@@ -144,7 +144,7 @@ def load(cls, source_code,
             link_option = ""
         # Generate LLVM kernel calller and binary for validation
         if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE:
-            cmds = mlir_compile_command(new_input_path, vectorlane_size, tile_size, vlen=256)
+            cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=256)
             opt_cmd = shlex.split(cmds[0])
             translate_cmd = shlex.split(cmds[1])
             llc_cmd = shlex.split(cmds[2])
diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 38593f70..26d30266 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -93,7 +93,6 @@ def render(self,
 
         B, M, N, K = X.get_size()[0], X.get_size()[1], W.get_size()[2], X.get_size()[2]
         TILE_M, TILE_N, TILE_K = kernel.gemmini_gemm_mapping(M, N, K)
-        kernel.tile_size = [TILE_M, TILE_N, TILE_K]
         kernel.loop_size = [M, N, K]
 
         W_transposed = self.is_transposed(W)
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 304474cc..c0331d8e 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -177,7 +177,6 @@ def render(self,
         N = self.gemm_weight_shape[0]
         K = self.gemm_weight_shape[1]
         TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
-        kernel.tile_size = [TILE_M, TILE_N, TILE_K]
         kernel.loop_size = [M, N, K]
 
         W_transposed = self.is_transposed(W)
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index ffc0816d..c8824ba4 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -107,7 +107,6 @@ def render(self,
         else:
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
             template = GEMM_TEMPLATE
-        kernel.tile_size = [TILE_M, TILE_N, TILE_K]
         kernel.loop_size =[M, N, K]
 
         W_transposed = self.is_transposed(W)
diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
index 1f93f82a..f8c58b8d 100644
--- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
@@ -62,7 +62,6 @@ def render(self,
         H = Y.get_size()[2]
         W = Y.get_size()[3]
         BCH = B * C * H
-        kernel.tile_size = [1, 1, 1] # Dummy Tile
         kernel.loop_size = None
         options = {
           "KERNEL_NAME" : self.name,
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 8a85bfc1..6bc3facf 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -108,7 +108,7 @@ def define_function(self, kernel):
             wrapper.header.writeline(code)
             self.outer_function.add(function_name)
 
-    def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, tile_size=[1, 1, 1], loop_size=None, origins={}):
+    def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size=None, origins={}):
         wrapper = V.graph.wrapper_code
         if src_code in wrapper.src_to_kernel:
             kernel_name = wrapper.src_to_kernel[src_code]
@@ -118,7 +118,6 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, tile_size
             codecache_def = IndentedBuffer()
             codecache_def.writeline(f"custom_async_compile.mlir('''{src_code}''', ")
             codecache_def.writeline(f"vectorlane_size={vector_lane},")
-            codecache_def.writeline(f"tile_size={tile_size},")
             codecache_def.writeline(f"loop_size={loop_size},")
             codecache_def.writeline(f"spad_info={spad_info},")
             codecache_def.writeline(f"origins={origins},")
@@ -169,7 +168,7 @@ def codegen_template(self, template_node, epilogue_nodes):
             codegen_header(src_code, (kernel.header.getvalue(), kernel.gem5_header.getvalue()))
             kernel.meta_kernel()
             kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
-                                             kernel.tile_size, kernel.loop_size, origins={str(i) for i in template_node.node.origins})
+                                             kernel.loop_size, origins={str(i) for i in template_node.node.origins})
             self.define_function(kernel)
 
         kernel.call_kernel(kernel_name)

From 0e029b0dda18309b795410bbc1d439778178f62d Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Tue, 4 Feb 2025 07:39:26 +0000
Subject: [PATCH 058/432] [Frontend] constant idx & test case

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 7 ++++---
 PyTorchSimFrontend/mlir/mlir_common.py          | 3 +++
 tests/MoE/test_moe.py                           | 4 ++--
 tests/test_softmax.py                           | 1 +
 tests/test_view3D_2D.py                         | 7 ++++---
 5 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 61a4d40c..d95aa2a2 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -753,10 +753,11 @@ def parse_indices(self, expr) -> common.CSEVariable:
             if arg.is_Mul and arg.args[0].is_number:
                 new_arg = sympy.Symbol(str(self.convert_index(arg.args[1])))
                 expr = expr.replace(arg.args[1], new_arg)
-            else:
+                indices.append(str(new_arg))
+            elif not arg.is_number:
                 new_arg = sympy.Symbol(str(self.convert_index(arg)))
                 expr = expr.replace(arg, new_arg)
-            indices.append(str(new_arg))
+                indices.append(str(new_arg))
         indices.sort()
 
         # Extract index var
@@ -1219,7 +1220,7 @@ def get_dma_info(self, name, index, index_var, broadcast=True): # Need more argu
         else:
             raise NotImplementedError("Currently not implemented... ;)")
 
-        if len(implicit_local_dims)!=0 and len(local_dims) != len(implicit_local_dims):
+        if len(implicit_local_dims)!=0 and len(local_dims) != len(implicit_local_dims) and self.is_modular_indexing(index):
             tile_size = local_tile_desc.get_tile_size()
             new_tile_size = []
             new_vlane_split_axis = local_tile_desc.vlane_split_axis
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 494f6e10..dc25b0a1 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -373,6 +373,9 @@ def call_kernel(self, kernel_name):
        # generate the code to call this
         wrapper.generate_kernel_call(kernel_name, call_args, cuda=False)
 
+    def is_modular_indexing(self, expr):
+        return "ModularIndexing" in str(expr)
+
     def compute_tile_size(self, nodes, vars, reduction_vars):
         # Handle implict dims. Input operand could have larger dimension space.
         implicit_ranges = False
diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py
index d14bf5c6..cf2f37f4 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/MoE/test_moe.py
@@ -341,7 +341,7 @@ def forward(self, x, loss_coef=1e-2):
         expert_inputs = dispatcher.dispatch(x)
         gates = dispatcher.expert_to_gates()
         expert_outputs = [self.experts[i](expert_inputs[i]) for i in range(self.num_experts)]
-        y = dispatcher.combine(expert_outputs, multiply_by_gates=False)
+        y = dispatcher.combine(expert_outputs, multiply_by_gates=True)
         return y, loss
 
     @torch.compiler.disable(recursive=True)
@@ -514,7 +514,7 @@ def weight_update(a, b, lr):
     # model.eval()
     model_device = model.to(device=device)
     opt_model = torch.compile(model_device, dynamic=False)
-    opt_w = torch.compile()(weight_update, dynamic=False)
+    # opt_w = torch.compile()(weight_update, dynamic=False)
     y_hat, aux_loss = opt_model(x1)
     print("MoE Custom Device Done!")
 
diff --git a/tests/test_softmax.py b/tests/test_softmax.py
index 02201e78..7f5d75ac 100644
--- a/tests/test_softmax.py
+++ b/tests/test_softmax.py
@@ -60,3 +60,4 @@ def test_softmax(device, size=(128, 128), dim=1):
     test_softmax(device, size=(256, 128))
     test_softmax(device, size=(256, 128), dim=0)
     test_softmax(device, size=(1, 16))
+    test_softmax(device, size=(5, 8))
diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py
index 60575ada..7b754131 100644
--- a/tests/test_view3D_2D.py
+++ b/tests/test_view3D_2D.py
@@ -13,11 +13,11 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-def test_view3D_2D(device):
+def test_view3D_2D(device, size=(16, 8, 16), t_x=0, t_y=1):
     def view3D_2D(a):
-        return a.view(16, 128).contiguous()
+        return a.transpose(t_x, t_y).contiguous().view(-1, size[0] * size[2])
     torch.manual_seed(0)
-    cpu_input = torch.randn(16, 8, 16)
+    cpu_input = torch.randn(size)
     input = cpu_input.clone().to(device=device)
     opt_fn = torch.compile(dynamic=False)(view3D_2D)
     res = opt_fn(input)
@@ -33,4 +33,5 @@ def view3D_2D(a):
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
     test_view3D_2D(device)
+    test_view3D_2D(device, [12, 512, 64])
 

From b8270f655f48a00fecce37bbdeef862f732b19d7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 4 Feb 2025 03:15:50 +0000
Subject: [PATCH 059/432] [Frontend] Support 1D tag for tog generation

---
 AsmParser/onnx_utility.py                 | 2 ++
 AsmParser/tog_generator.py                | 6 ++++--
 PyTorchSimFrontend/extension_codecache.py | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/AsmParser/onnx_utility.py b/AsmParser/onnx_utility.py
index d6eab9b2..9b9929ad 100644
--- a/AsmParser/onnx_utility.py
+++ b/AsmParser/onnx_utility.py
@@ -70,6 +70,7 @@ def __init__(self, tile_info, inst_list=list(), node_id=0):
         self.torchsim_tile_size = tile_info["tile_size"]
         self.torchsim_element_size = tile_info["element_size"]
         self.torchsim_tag_idx_list = tile_info["tag_idx_list"]
+        self.torchsim_tag_stride_list = tile_info["tag_stride_list"]
         self.torchsim_loop_idx_list = tile_info["loop_idx_list"]
         self.torchsim_is_async = tile_info["is_async"]
 
@@ -83,6 +84,7 @@ class memory_wait_node(node):
     def __init__(self, tile_info, inst_list=list(), node_id=0):
         super().__init__(node_id)
         self.torchsim_tag_idx_list = tile_info["tag_idx_list"]
+        self.torchsim_tag_stride_list = tile_info["tag_stride_list"]
         self.torchsim_base_addr = tile_info["base_addr"]
 
 class compute_node(node):
diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 1b5971e2..44332b43 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -31,7 +31,7 @@ class tog_generator:
     LoopNodeKind = 2
     DMANodeKind = 3
     DMAWaitNodeKind = 4
-    def __init__(self, origins=None) -> None:
+    def __init__(self, origins="Unknown") -> None:
         self.module_name = "tile_operation_graph"
         self.module = None
         self.raw_graph = {}
@@ -89,6 +89,7 @@ def _create_node(self, dump_data):
             tile_info["tile_size"] = dump_data["tile_size"]
             tile_info["element_size"] = dump_data["element_size"]
             tile_info["tag_idx_list"] = dump_data["tag_idx_list"]
+            tile_info["tag_stride_list"] = dump_data["tag_stride_list"]
             tile_info["loop_idx_list"] = dump_data["loop_idx_list"]
             tile_info["is_async"] = dump_data["is_async"]
             is_write = dump_data["is_write"]
@@ -99,6 +100,7 @@ def _create_node(self, dump_data):
         elif node_type == self.DMAWaitNodeKind:
             tile_info = {}
             tile_info["tag_idx_list"] = dump_data["tag_idx_list"]
+            tile_info["tag_stride_list"] = dump_data["tag_stride_list"]
             tile_info["base_addr"] = dump_data["base_address"]
             new_node = memory_wait_node(tile_info, node_id=node_id)
         else:
@@ -214,4 +216,4 @@ def generate_tile_graph(self, name="tile_graph", cycle_list=list, offset=int, ve
 if __name__ == "__main__":
     t = tog_generator()
     t.load_file("/workspace/llvm-project/build/tile_operation_graph.py")
-    t.parse_graph()
\ No newline at end of file
+    t.generate_tile_graph("./tile_graph.onnx", cycle_list=[1,1,1,1,1], offset=0, vector_lane=128)
\ No newline at end of file
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 91029329..8839887c 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -83,8 +83,8 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding='timing_mode=1' \
             -dma-fine-grained='systolic-array-size={vectorlane_size}' \
-            -test-tile-operation-graph='vectorlane={vectorlane_size}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
+            -test-tile-operation-graph='vectorlane={vectorlane_size}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
             -lower-affine \
             -finalize-memref-to-llvm \

From cf06510fff7be4f0ee3395e122fc8d875f461bff Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 4 Feb 2025 12:13:41 +0000
Subject: [PATCH 060/432] [Backend] Support multi-Dim dma

---
 PyTorchSimBackend/src/Core.cc            | 73 +++++++++++++++---------
 PyTorchSimBackend/src/Core.h             |  1 +
 PyTorchSimBackend/src/Instruction.cc     |  6 +-
 PyTorchSimBackend/src/Instruction.h      | 37 ++++++++++--
 PyTorchSimBackend/src/TMA.cc             |  4 +-
 PyTorchSimBackend/src/TMA.h              | 29 +++++++---
 PyTorchSimBackend/src/TileGraphParser.cc | 64 +++++++++------------
 PyTorchSimBackend/src/TileGraphParser.h  |  4 ++
 8 files changed, 135 insertions(+), 83 deletions(-)

diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index a253540e..5209c2cc 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -75,16 +75,16 @@ void Core::dma_cycle() {
 
     /* Set tag table of async dma load */
     if (instruction->is_dma_read() && instruction->is_async_dma()) {
-      std::ostringstream oss;
-      auto key = std::make_pair(instruction->get_addr_name(), instruction->get_tag_idx_list());
+      auto key = std::make_pair(instruction->get_addr_name(), instruction->get_tag_id());
       assert(!_tma.get_tag_finish(instruction->subgraph_id, key));
       _tma.set_tag_finish(instruction->subgraph_id, key);
-      for (const auto& idx : instruction->get_tag_idx_list())
-        oss << idx << ", ";
-      spdlog::trace("[Core {}][{}] {} ASYNC FINISHED, Used sram: {}, Release sram: {}, subgraph_id: {} addr_name: {} tag_idx_list: {}",
+      spdlog::trace("[Core {}][{}] {} ASYNC FINISHED, Used sram: {}, Release sram: {}, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}",
                     _id, _core_cycle, opcode_to_string(instruction->get_opcode()),
                     _used_sram_size, instruction->get_free_sram_size(),
-                    instruction->subgraph_id, instruction->get_addr_name(), oss.str());
+                    instruction->subgraph_id, instruction->get_addr_name(),
+                    instruction->get_tag_id(),
+                    fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")),
+                    fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", ")));
       for (auto & wait_inst : _tma.get_tag_waiter(instruction->subgraph_id, key)) {
         finish_instruction(wait_inst);
       }
@@ -105,17 +105,17 @@ void Core::dma_cycle() {
       } else if (finished_inst->is_dma_read() && finished_inst->is_async_dma()) {
         /* Register tag table for async dma load */
         _tma.register_tag(finished_inst->subgraph_id,
-                          std::make_pair(finished_inst->get_addr_name(), finished_inst->get_tag_idx_list()));
+                          std::make_pair(finished_inst->get_addr_name(), finished_inst->get_tag_id()));
         finish_instruction(finished_inst);
       } else if(!finished_inst->is_dma_read()) {
         spdlog::error("[Core {}][{}] TMA instruction in not valid", _id, _core_cycle);
         exit(EXIT_FAILURE);
       } else if (finished_inst->get_opcode() == Opcode::BAR) {
-        std::ostringstream oss;
-        for (const auto& idx : finished_inst->get_tag_idx_list())
-          oss << idx << ", ";
-        spdlog::trace("[Core {}][{}] {} FINISHED, addr_name: {} tag_list: {}", _id, _core_cycle,
-                      opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(), oss.str());
+        spdlog::trace("[Core {}][{}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle,
+                      opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(),
+                      finished_inst->get_tag_id(),
+                      fmt::format("[{}]", fmt::join(finished_inst->get_tag_idx_list(), ", ")),
+                      fmt::format("[{}]", fmt::join(finished_inst->get_tag_stride_list(), ", ")));
       }
       /*Pass to waiting queue */
       _dma_waiting_queue.push_back(std::move(finished_inst));
@@ -170,16 +170,34 @@ void Core::cycle() {
       switch (inst->get_opcode()) {
         case Opcode::MOVIN:
           {
-            std::ostringstream oss;
-            for (const auto& idx : inst->get_tag_idx_list())
-              oss << idx << ", ";
-            spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_idx_list: {}", _id, _core_cycle,
-                          opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(),
-                          inst->get_addr_name(), oss.str());
-            _ld_inst_queue.push(inst);
-            issued = true;
+            /* Check another MOVIN with same tag is issued */
+            auto key = std::make_pair(inst->get_addr_name(), inst->get_tag_id());
+            if (inst->is_async_dma() && _tma.tag_key_exist(inst->subgraph_id, key)) {
+              bool finished = _tma.get_tag_finish(inst->subgraph_id, key);
+              if (finished)
+                finish_instruction(inst);
+              else
+                _tma.register_tag_waiter(inst->subgraph_id, key, inst);
+              spdlog::trace("[Core {}][{}] {} SKIPPED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle,
+                            opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(),
+                            inst->get_addr_name(),
+                            inst->get_tag_id(),
+                            fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
+                            fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
+              issued = true;
+              break;
+            } else {
+              spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle,
+                            opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(),
+                            inst->get_addr_name(),
+                            inst->get_tag_id(),
+                            fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
+                            fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
+              _ld_inst_queue.push(inst);
+              issued = true;
+              break;
+            }
           }
-          break;
         case Opcode::MOVOUT:
           spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {}", _id, _core_cycle,
                         opcode_to_string(inst->get_opcode()), inst->get_free_sram_size());
@@ -205,8 +223,7 @@ void Core::cycle() {
           break;
         case Opcode::BAR:
           {
-            std::ostringstream oss;
-            auto key = std::make_pair(inst->get_addr_name(), inst->get_tag_idx_list());
+            auto key = std::make_pair(inst->get_addr_name(), inst->get_tag_id());
             bool finished = _tma.get_tag_finish(inst->subgraph_id, key);
             if (finished) {
               finish_instruction(inst);
@@ -262,12 +279,12 @@ void Core::finish_instruction(std::shared_ptr<Instruction>& inst) {
       _id, _core_cycle, opcode_to_string(inst->get_opcode()), inst->get_compute_type(),
       _used_sram_size, inst->get_free_sram_size());
   } else if (inst->get_opcode() != Opcode::BAR && inst->is_async_dma()){
-    std::ostringstream oss;
-    for (const auto& idx : inst->get_tag_idx_list())
-      oss << idx << ", ";
-    spdlog::trace("[Core {}][{}] {} ASYNC REGISTERED, Used sram: {}, Release sram: {} subgraph_id: {} addr_name: {} tag_idx_list: {}",
+    spdlog::trace("[Core {}][{}] {} ASYNC REGISTERED, Used sram: {}, Release sram: {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}",
       _id, _core_cycle, opcode_to_string(inst->get_opcode()), _used_sram_size,
-      inst->get_free_sram_size(), inst->subgraph_id, inst->get_addr_name(), oss.str());
+      inst->get_free_sram_size(), inst->subgraph_id, inst->get_addr_name(),
+      inst->get_tag_id(),
+      fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
+      fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
   } else if ((inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) && !inst->is_async_dma()) {
     spdlog::trace("[Core {}][{}] {} FINISHED, free_sram_size: {} addr_name: {}", _id, _core_cycle,
       opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(),
diff --git a/PyTorchSimBackend/src/Core.h b/PyTorchSimBackend/src/Core.h
index 77f13fec..f23ad739 100644
--- a/PyTorchSimBackend/src/Core.h
+++ b/PyTorchSimBackend/src/Core.h
@@ -3,6 +3,7 @@
 
 #include <memory>
 #include <vector>
+#include <fmt/core.h>
 
 #include "Dram.h"
 #include "Tile.h"
diff --git a/PyTorchSimBackend/src/Instruction.cc b/PyTorchSimBackend/src/Instruction.cc
index 9039f3d2..e78d429c 100644
--- a/PyTorchSimBackend/src/Instruction.cc
+++ b/PyTorchSimBackend/src/Instruction.cc
@@ -12,10 +12,12 @@ std::string opcode_to_string(Opcode opcode) {
 
 Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents,
             addr_type dram_addr, std::vector<size_t> tile_size, size_t precision,
-            std::vector<int>& idx_list, std::vector<int>& stride_list, std::vector<int> tag_idx_list, std::vector<int> loop_size_list)
+            std::vector<int>& idx_list, std::vector<int>& stride_list,
+            std::vector<int> tag_idx_list, std::vector<int> tag_stride_list, std::vector<int> loop_size_list)
   : opcode(opcode), compute_cycle(compute_cycle), ready_counter(num_parents), dram_addr(dram_addr),
     tile_size(tile_size), _precision(precision), _idx_list(idx_list),
-    _stride_list(stride_list), _tag_idx_list(tag_idx_list), _loop_size_list(loop_size_list) {
+    _stride_list(stride_list), _tag_idx_list(tag_idx_list), _tag_stride_list(tag_stride_list), _loop_size_list(loop_size_list) {
+  assert(_tag_idx_list.size()==_tag_stride_list.size());
   _tile_numel = 1;
   for (auto dim : tile_size)
     _tile_numel *= dim;
diff --git a/PyTorchSimBackend/src/Instruction.h b/PyTorchSimBackend/src/Instruction.h
index ad469b9c..b18f1b3a 100644
--- a/PyTorchSimBackend/src/Instruction.h
+++ b/PyTorchSimBackend/src/Instruction.h
@@ -22,7 +22,8 @@ class Instruction {
  public:
   Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents, addr_type dram_addr,
               std::vector<size_t> tile_size, size_t precision, std::vector<int> &idx_list,
-              std::vector<int> &stride_list,  std::vector<int> tag_idx_list, std::vector<int> loop_size_list);
+              std::vector<int> &stride_list,  std::vector<int> tag_idx_list, std::vector<int> tag_stride_list,
+              std::vector<int> loop_size_list);
   void finish_instruction();
   void add_child(std::shared_ptr<Instruction> child);
   bool check_ready() { return ready_counter == 0; }
@@ -49,10 +50,27 @@ class Instruction {
   void print();
   std::set<addr_type> get_dram_address(addr_type dram_req_size) {
     std::set<addr_type> address_set;
-    for (int row=0; row<tile_size.at(0); row++) {
-      for (int col=0; col<tile_size.at(1); col++) {
-        addr_type address = dram_addr + (row* _stride_list[_stride_list.size()-2] + col* _stride_list[_stride_list.size()-1]) * _precision;
-        address_set.insert(address - (address & dram_req_size-1));
+
+    /* Set 4D shape*/
+    while (tile_size.size() < 4)
+      tile_size.insert(tile_size.begin(), 1);
+
+    while (_stride_list.size() < 4)
+      _stride_list.insert(_stride_list.begin(), 1);
+
+    /* Iterate tile_size */
+    for (int dim0=0; dim0<tile_size.at(0); dim0++) {
+      for (int dim1=0; dim1<tile_size.at(1); dim1++) {
+        for (int dim2=0; dim2<tile_size.at(2); dim2++) {
+          for (int dim3=0; dim3<tile_size.at(3); dim3++) {
+            addr_type address = dim0*_stride_list.at(_stride_list.size() - 4) + \
+                                dim1*_stride_list.at(_stride_list.size() - 3) + \
+                                dim2*_stride_list.at(_stride_list.size() - 2) + \
+                                dim3*_stride_list.at(_stride_list.size() - 1);
+            address = dram_addr + address * _precision;
+            address_set.insert(address - (address & dram_req_size-1));
+          }
+        }
       }
     }
     return address_set;
@@ -71,6 +89,14 @@ class Instruction {
   uint32_t get_numa_id() { return _numa_id; }
   std::vector<int>& get_idx_list() { return _idx_list; }
   std::vector<int>& get_tag_idx_list() { return _tag_idx_list; }
+  std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
+  int get_tag_id() {
+    assert(_tag_idx_list.size()==_tag_stride_list.size());
+    int ret = 0;
+    for (int i=0; i<_tag_idx_list.size(); i++)
+      ret += _tag_idx_list.at(i) * _tag_stride_list.at(i);
+    return ret;
+  }
   void set_addr_name(std::string name) { _addr_name = name; }
   std::string get_addr_name() { return _addr_name; }
   void set_nr_inner_loop(int nr) { _nr_inner_loop = nr; }
@@ -101,6 +127,7 @@ class Instruction {
   std::vector<int> _idx_list;
   std::vector<int> _stride_list;
   std::vector<int> _tag_idx_list;
+  std::vector<int> _tag_stride_list;
   std::vector<int> _loop_size_list;
   std::string _addr_name;
   int _nr_inner_loop = 0;
diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc
index 03d88ce6..a6232dc6 100644
--- a/PyTorchSimBackend/src/TMA.cc
+++ b/PyTorchSimBackend/src/TMA.cc
@@ -11,8 +11,8 @@ TMA::TMA(uint32_t id, uint32_t dram_req_size) {
 void TMA::issue_tile(std::shared_ptr<Instruction> inst) {
   _current_inst = std::move(inst);
   std::vector<size_t>& tile_size = _current_inst->get_tile_size();
-  if (tile_size.size() != 2) {
-    spdlog::error("[TMA {}] issued tile is not [y,x] format..", _id);
+  if (tile_size.size() <= 0 || tile_size.size() > get_max_dim()) {
+    spdlog::error("[TMA {}] issued tile is not supported format..", _id);
     exit(EXIT_FAILURE);
   }
   _finished = false;
diff --git a/PyTorchSimBackend/src/TMA.h b/PyTorchSimBackend/src/TMA.h
index decd3c60..1bc209f9 100644
--- a/PyTorchSimBackend/src/TMA.h
+++ b/PyTorchSimBackend/src/TMA.h
@@ -36,21 +36,30 @@ class TMA {
   void issue_tile(std::shared_ptr<Instruction> inst);
   bool is_finished() { return _finished; }
   bool empty() { return _current_inst==nullptr; }
-  void register_tag(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  void register_tag(int subgraph_id, const std::pair<std::string, int> key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
-      tag_table[subgraph_id] = std::map<std::pair<std::string, std::vector<int>>, bool>();
-      waiters[subgraph_id] = std::map<std::pair<std::string, std::vector<int>>, std::vector<std::shared_ptr<Instruction>>>();
+      tag_table[subgraph_id] = std::map<std::pair<std::string, int>, bool>();
+      waiters[subgraph_id] = std::map<std::pair<std::string, int>, std::vector<std::shared_ptr<Instruction>>>();
     }
     tag_table[subgraph_id][key] = false;
     waiters[subgraph_id][key] = std::vector<std::shared_ptr<Instruction>>();
   }
-  void set_tag_finish(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  void set_tag_finish(int subgraph_id, const std::pair<std::string, int> key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
       throw std::runtime_error("Subgraph does not exist in tag_table");
     }
     tag_table[subgraph_id][key] = true;
   }
-  bool get_tag_finish(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  bool tag_key_exist(int subgraph_id, const std::pair<std::string, int> key) {
+    auto subgraph_it = tag_table.find(subgraph_id);
+    if (subgraph_it == tag_table.end())
+      return false;
+
+    auto& key_map = subgraph_it->second;
+    auto key_it = key_map.find(key);
+    return key_it != key_map.end();
+  }
+  bool get_tag_finish(int subgraph_id, const std::pair<std::string, int> key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -67,7 +76,7 @@ class TMA {
     tag_table.erase(subgraph_id);
     waiters.erase(subgraph_id);
   }
-  void register_tag_waiter(int subgraph_id, const std::pair<std::string, std::vector<int>>& key, std::shared_ptr<Instruction> inst) {
+  void register_tag_waiter(int subgraph_id, const std::pair<std::string, int> key, std::shared_ptr<Instruction> inst) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -76,7 +85,7 @@ class TMA {
     }
     waiters[subgraph_id][key].push_back(inst);
   }
-  std::vector<std::shared_ptr<Instruction>>& get_tag_waiter(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  std::vector<std::shared_ptr<Instruction>>& get_tag_waiter(int subgraph_id, const std::pair<std::string, int> key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -89,9 +98,11 @@ class TMA {
   std::shared_ptr<Instruction>& get_current_inst() { return _current_inst; }
   std::vector<MemoryAccess*> get_memory_access();
   uint32_t generate_mem_access_id();
+  const uint32_t get_max_dim() { return _max_dim; }
 
  protected:
   uint32_t _id;
+  const uint32_t _max_dim = 4;
   std::shared_ptr<Instruction> _current_inst;
   uint32_t _dram_req_size;
   uint32_t _tile_size_x=0;
@@ -99,7 +110,7 @@ class TMA {
   size_t _tile_idx_stride=1;
   uint32_t _tile_idx;
   bool _finished=true;
-  std::map<int, std::map<std::pair<std::string, std::vector<int>>, bool>> tag_table;
-  std::map<int, std::map<std::pair<std::string, std::vector<int>>, std::vector<std::shared_ptr<Instruction>>>> waiters;
+  std::map<int, std::map<std::pair<std::string, int>, bool>> tag_table;
+  std::map<int, std::map<std::pair<std::string, int>, std::vector<std::shared_ptr<Instruction>>>> waiters;
 };
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index b9ea2b08..8f543573 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -185,6 +185,9 @@ TileMemoryNode::TileMemoryNode(onnx::NodeProto& node) : TileNode(node) {
     } else if (attribute.name() == "torchsim_tag_idx_list") {
       for (int i = 0; i < attribute.strings_size(); i++)
         _tag_idx_list.push_back(attribute.strings(i));
+    } else if (attribute.name() == "torchsim_tag_stride_list") {
+      for (int i = 0; i < attribute.ints_size(); i++)
+        _tag_stride_list.push_back(attribute.ints(i));
     } else if (attribute.name() == "torchsim_loop_idx_list") {
       for (int i = 0; i < attribute.strings_size(); i++)
         _loop_idx_list.push_back(attribute.strings(i));
@@ -202,6 +205,7 @@ void TileMemoryNode::print_node() {
   spdlog::debug("{} stride_list: {} ", spaces, _stride_list);
   spdlog::debug("{} tile_size: {} ", spaces, _tile_size);
   spdlog::debug("{} tag_list: {}", spaces, fmt::join(_tag_idx_list, ", "));
+  spdlog::debug("{} tag_stride_list: {}", spaces, fmt::join(_tag_stride_list, ", "));
   spdlog::debug("{} index_list: {}", spaces, fmt::join(_loop_idx_list, ", "));
 }
 
@@ -210,6 +214,9 @@ TileMemoryWaitNode::TileMemoryWaitNode(onnx::NodeProto& node) : TileNode(node) {
     if (attribute.name() == "torchsim_tag_idx_list") {
       for (int i = 0; i < attribute.strings_size(); i++)
         _tag_idx_list.push_back(attribute.strings(i));
+    } else if (attribute.name() == "torchsim_tag_stride_list") {
+      for (int i = 0; i < attribute.ints_size(); i++)
+        _tag_stride_list.push_back(attribute.ints(i));
     } else if (attribute.name() == "torchsim_base_addr") {
       _base_addr_name = attribute.s();
     }
@@ -219,7 +226,8 @@ TileMemoryWaitNode::TileMemoryWaitNode(onnx::NodeProto& node) : TileNode(node) {
 void TileMemoryWaitNode::print_node() {
   TileNode::print_node();
   std::string spaces(get_depth(), '\t');
-  spdlog::debug("{} tag_list: {}", spaces, fmt::join(_tag_idx_list, ", "));
+  spdlog::debug("{} tag_idx_list: {}", spaces, fmt::join(_tag_idx_list, ", "));
+  spdlog::debug("{} tag_stride_list: {}", spaces, fmt::join(_tag_stride_list, ", "));
 }
 
 TileLoopNode::TileLoopNode(onnx::NodeProto& node) : TileNode(node) {
@@ -256,31 +264,9 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       std::shared_ptr<TileMemoryNode> mem_node = std::static_pointer_cast<TileMemoryNode>(tile_node);
       auto base_addr_name = mem_node->get_base_addr_name();
       std::vector<std::string>& tag_idx_list = mem_node->get_tag_idx_list();
+      std::vector<int>& tag_stride_list = mem_node->get_tag_stride_list();
       std::vector<int> skip_idx_list;
       std::vector<int> values;
-      bool skip = false;
-      /* Find axis */
-      if (mem_node->is_async_node()) {
-        for (int i=0;i<tag_idx_list.size();i++) {
-          if (tag_idx_list.at(i) == "0")
-            skip_idx_list.push_back(i);
-        }
-
-        /* Extract iter values */
-        std::transform(iter.begin(), iter.end(), std::back_inserter(values),
-                    [](const std::pair<std::string, int>& pair) { return pair.second; });
-
-        for (auto axis : skip_idx_list) {
-          if (values.at(iter.size() - tag_idx_list.size() + axis) != 0) {
-            skip = true;
-            break;
-          }
-        }
-
-        /* Skip this node */
-        if (skip)
-          continue;
-      }
 
       /* Lookup given name's address */
       addr_type base_addr = tog_parser->lookup(base_addr_name);
@@ -299,14 +285,14 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
           nr_inner_loop++;
       }
       /* Add accumulation loop info to tag list */
-      for (auto loop_idx = loop_idx_list.begin();
-            loop_idx != loop_idx_list.end() - nr_inner_loop; ++loop_idx) {
-        // Check loop type and process
-        if (tog_parser->get_loop_type(*loop_idx)==LoopType::ACCUMULATION_LOOP) {
-          auto iter_value = getLoopIndexValue(iter, *loop_idx);
-          tag_list.push_back(iter_value);
-        }
-      }
+      //for (auto loop_idx = loop_idx_list.begin();
+      //      loop_idx != loop_idx_list.end() - nr_inner_loop; ++loop_idx) {
+      //  // Check loop type and process
+      //  if (tog_parser->get_loop_type(*loop_idx)==LoopType::ACCUMULATION_LOOP) {
+      //    auto iter_value = getLoopIndexValue(iter, *loop_idx);
+      //    tag_list.push_back(iter_value);
+      //  }
+      //}
 
       for (auto loop_idx = loop_idx_list.begin();
             loop_idx != loop_idx_list.end(); ++loop_idx) {
@@ -318,7 +304,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         }
       }
 
-      for (auto loop_idx: mem_node->get_tag_idx_list()) {
+      for (auto loop_idx: tag_idx_list) {
         if (iter.find(loop_idx) == iter.end())
           tag_list.push_back(0);
         else {
@@ -341,7 +327,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         Opcode::MOVIN, 0,
         0, base_addr,
         mem_node->get_tile_size(), mem_node->get_precision(), iter_list,
-        mem_node->get_stride_list(), tag_list, loop_size_list
+        mem_node->get_stride_list(), tag_list, tag_stride_list, loop_size_list
       );
       inst->set_addr_name(base_addr_name);
       inst->set_nr_inner_loop(nr_inner_loop);
@@ -355,6 +341,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       auto base_addr_name = mem_node->get_base_addr_name();
       /* Lookup given name's address */
       addr_type base_addr = tog_parser->lookup(base_addr_name);
+      std::vector<int>& tag_stride_list = mem_node->get_tag_stride_list();
       std::vector<int> iter_list;
       std::vector<int> loop_size_list;
       std::vector<uint32_t> outer_loop_idx;
@@ -389,7 +376,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         Opcode::MOVOUT, 0,
         0, base_addr,
         mem_node->get_tile_size(), mem_node->get_precision(), iter_list,
-        mem_node->get_stride_list(), std::vector<int>(), loop_size_list
+        mem_node->get_stride_list(), std::vector<int>(1), tag_stride_list, loop_size_list
       );
       inst->set_addr_name(base_addr_name);
       inst->set_nr_inner_loop(nr_inner_loop);
@@ -406,6 +393,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       /* Lookup given name's address */
       std::vector<int> iter_list;
       std::vector<int> tag_list;
+      std::vector<int>& tag_stride_list = wait_node->get_tag_stride_list();
       auto& wait_tag_list = wait_node->get_tag_idx_list();
       int inner_step = std::stoi(tog_parser->getMetaByName("systolic_size"));
       /* Add accumulation loop info to tag list */
@@ -429,7 +417,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         Opcode::BAR, 0,
         0, base_addr,
         std::vector<size_t>(), 0, iter_list,
-        iter_list, tag_list, std::vector<int>()
+        iter_list, tag_list, tag_stride_list, std::vector<int>()
       );
       inst->set_addr_name(base_addr_name);
       link_map[tile_node] = inst;
@@ -438,11 +426,13 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       printIndexMap("[TOGParser] Compute Node ", iter);
       std::shared_ptr<TileComputeNode> compute_node = std::static_pointer_cast<TileComputeNode>(tile_node);
       std::vector<int> iter_list;
+      std::vector<int> tag_list = {0};
+      std::vector<int> tag_stride_list = {1};
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::COMP, compute_node->get_cycle(),
         0, 0,
         std::vector<size_t>(), 0, iter_list, iter_list,
-        std::vector<int>(), std::vector<int>()
+        tag_list, tag_stride_list, std::vector<int>()
       );
       inst->set_overlapping_cycle(compute_node->get_overlapping_cycle());
       inst->set_compute_type(compute_node->get_compute_type());
diff --git a/PyTorchSimBackend/src/TileGraphParser.h b/PyTorchSimBackend/src/TileGraphParser.h
index 36ec8091..0fba4d06 100644
--- a/PyTorchSimBackend/src/TileGraphParser.h
+++ b/PyTorchSimBackend/src/TileGraphParser.h
@@ -116,6 +116,7 @@ class TileMemoryNode : public TileNode {
   std::vector<size_t> get_tile_size() { return _tile_size; }
   std::vector<int>& get_stride_list () { return _stride_list; }
   std::vector<std::string>& get_tag_idx_list() { return _tag_idx_list; }
+  std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
   std::vector<std::string>& get_loop_idx_list() { return _loop_idx_list; }
   bool is_async_node() { return _is_async; }
   void print_node() override;
@@ -127,6 +128,7 @@ class TileMemoryNode : public TileNode {
   bool _is_async;
   std::string _base_addr_name;
   std::vector<std::string> _tag_idx_list;
+  std::vector<int> _tag_stride_list;
   std::vector<std::string> _loop_idx_list;
 };
 
@@ -135,10 +137,12 @@ class TileMemoryWaitNode : public TileNode {
   TileMemoryWaitNode(onnx::NodeProto& node);
   std::string get_base_addr_name() { return _base_addr_name; }
   std::vector<std::string>& get_tag_idx_list() { return _tag_idx_list; }
+  std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
   void print_node() override;
 
  private:
   std::vector<std::string> _tag_idx_list;
+  std::vector<int> _tag_stride_list;
   std::string _base_addr_name;
 };
 

From d9a904d893b824ea648b3aee33ee3c848304dece Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 4 Feb 2025 13:02:07 +0000
Subject: [PATCH 061/432] [Backend] Avoid UAF bug

---
 PyTorchSimBackend/src/TMA.cc             | 2 +-
 PyTorchSimBackend/src/Tile.h             | 6 +++---
 PyTorchSimBackend/src/TileGraph.cc       | 1 -
 PyTorchSimBackend/src/TileGraphParser.cc | 1 +
 PyTorchSimBackend/src/main.cc            | 1 +
 5 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc
index a6232dc6..048cedfd 100644
--- a/PyTorchSimBackend/src/TMA.cc
+++ b/PyTorchSimBackend/src/TMA.cc
@@ -22,7 +22,7 @@ std::vector<MemoryAccess*> TMA::get_memory_access() {
   std::set<addr_type> addr_set = _current_inst->get_dram_address(_dram_req_size);
   std::vector<MemoryAccess *> access_vec;
   Tile* owner = (Tile*)_current_inst->get_owner();
-  TileSubGraph* owner_subgraph = owner->get_owner();
+  std::shared_ptr<TileSubGraph> owner_subgraph = owner->get_owner();
   spdlog::trace("[NUMA Trace] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
     owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write());
   for (auto addr: addr_set) {
diff --git a/PyTorchSimBackend/src/Tile.h b/PyTorchSimBackend/src/Tile.h
index c62329c3..36da1f1e 100644
--- a/PyTorchSimBackend/src/Tile.h
+++ b/PyTorchSimBackend/src/Tile.h
@@ -17,8 +17,8 @@ class Tile {
   };
 
   Tile(Status status);
-  TileSubGraph* get_owner() { return _onwer_graph; }
-  void set_ownwer(TileSubGraph* graph) { _onwer_graph = graph; }
+  std::shared_ptr<TileSubGraph> get_owner() { return _onwer_graph; }
+  void set_owner(std::shared_ptr<TileSubGraph> graph) { _onwer_graph = graph; }
   Status get_status() { return _status; }
   void set_status(Status status) { _status=status; }
   size_t get_ready_counter() { return _ready_counter; }
@@ -42,7 +42,7 @@ class Tile {
   bool all_insts_finshed() { return _nr_insts == _nr_finished_insts; }
   
  protected:
-  TileSubGraph* _onwer_graph;
+  std::shared_ptr<TileSubGraph> _onwer_graph;
   Status _status = Status::EMPTY;
   size_t _required_sram_size=0;
   size_t _ready_counter=0;
diff --git a/PyTorchSimBackend/src/TileGraph.cc b/PyTorchSimBackend/src/TileGraph.cc
index 48d76990..2a36b78d 100644
--- a/PyTorchSimBackend/src/TileGraph.cc
+++ b/PyTorchSimBackend/src/TileGraph.cc
@@ -5,7 +5,6 @@ TileSubGraph::TileSubGraph() : _ready_tile_queue(), _tile_set(), _id(_next_id++)
 }
 
 void TileSubGraph::add_tile(std::shared_ptr<Tile> tile) {
-  tile->set_ownwer(this);
   for (auto& inst : tile->get_instructions())
     inst->subgraph_id = _id;
   if (tile->get_ready_counter() == 0) {
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 8f543573..fed9cfb7 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -662,6 +662,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
       /* insert tiles to subgraph */
       for (const auto& sub_tile: sub_tiles){
         subgraph->add_tile(sub_tile);
+        sub_tile->set_owner(subgraph);
       }
     }
     /* insert subgraph to graph */
diff --git a/PyTorchSimBackend/src/main.cc b/PyTorchSimBackend/src/main.cc
index 67f19d6d..c7d9684b 100644
--- a/PyTorchSimBackend/src/main.cc
+++ b/PyTorchSimBackend/src/main.cc
@@ -157,6 +157,7 @@ int main(int argc, char** argv) {
     /* Get onnx_path, attribute from user input, request_time */
     interactive_mode(simulator);
   }
+  delete simulator;
 
   /* Simulation time measurement */
   auto end = std::chrono::high_resolution_clock::now();

From 3584fe2efc8d229781bea6804f6379b48daac343 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 5 Feb 2025 02:16:36 +0000
Subject: [PATCH 062/432] [Backendsim] Rework accumulation tag mechanism

---
 PyTorchSimBackend/src/Core.cc            |  8 ++--
 PyTorchSimBackend/src/Instruction.cc     | 13 +++++-
 PyTorchSimBackend/src/Instruction.h      | 12 ++---
 PyTorchSimBackend/src/TMA.h              | 20 ++++-----
 PyTorchSimBackend/src/TileGraphParser.cc | 57 ++++++++++++++----------
 5 files changed, 62 insertions(+), 48 deletions(-)

diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 5209c2cc..92b955aa 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -82,7 +82,7 @@ void Core::dma_cycle() {
                     _id, _core_cycle, opcode_to_string(instruction->get_opcode()),
                     _used_sram_size, instruction->get_free_sram_size(),
                     instruction->subgraph_id, instruction->get_addr_name(),
-                    instruction->get_tag_id(),
+                    fmt::format("[{}]", fmt::join(instruction->get_tag_id(), ", ")),
                     fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")),
                     fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", ")));
       for (auto & wait_inst : _tma.get_tag_waiter(instruction->subgraph_id, key)) {
@@ -113,7 +113,7 @@ void Core::dma_cycle() {
       } else if (finished_inst->get_opcode() == Opcode::BAR) {
         spdlog::trace("[Core {}][{}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle,
                       opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(),
-                      finished_inst->get_tag_id(),
+                      fmt::format("[{}]", fmt::join(finished_inst->get_tag_id(), ", ")),
                       fmt::format("[{}]", fmt::join(finished_inst->get_tag_idx_list(), ", ")),
                       fmt::format("[{}]", fmt::join(finished_inst->get_tag_stride_list(), ", ")));
       }
@@ -181,7 +181,7 @@ void Core::cycle() {
               spdlog::trace("[Core {}][{}] {} SKIPPED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle,
                             opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(),
                             inst->get_addr_name(),
-                            inst->get_tag_id(),
+                            fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")),
                             fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
                             fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
               issued = true;
@@ -190,7 +190,7 @@ void Core::cycle() {
               spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle,
                             opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(),
                             inst->get_addr_name(),
-                            inst->get_tag_id(),
+                            fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")),
                             fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
                             fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
               _ld_inst_queue.push(inst);
diff --git a/PyTorchSimBackend/src/Instruction.cc b/PyTorchSimBackend/src/Instruction.cc
index e78d429c..ac79775b 100644
--- a/PyTorchSimBackend/src/Instruction.cc
+++ b/PyTorchSimBackend/src/Instruction.cc
@@ -13,10 +13,11 @@ std::string opcode_to_string(Opcode opcode) {
 Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents,
             addr_type dram_addr, std::vector<size_t> tile_size, size_t precision,
             std::vector<int>& idx_list, std::vector<int>& stride_list,
-            std::vector<int> tag_idx_list, std::vector<int> tag_stride_list, std::vector<int> loop_size_list)
+            std::vector<int> tag_idx_list, std::vector<int> tag_stride_list, std::vector<int> accum_tag_idx_list, std::vector<int> loop_size_list)
   : opcode(opcode), compute_cycle(compute_cycle), ready_counter(num_parents), dram_addr(dram_addr),
     tile_size(tile_size), _precision(precision), _idx_list(idx_list),
-    _stride_list(stride_list), _tag_idx_list(tag_idx_list), _tag_stride_list(tag_stride_list), _loop_size_list(loop_size_list) {
+    _stride_list(stride_list), _tag_idx_list(tag_idx_list), _tag_stride_list(tag_stride_list),
+    _accum_tag_idx_list(accum_tag_idx_list), _loop_size_list(loop_size_list) {
   assert(_tag_idx_list.size()==_tag_stride_list.size());
   _tile_numel = 1;
   for (auto dim : tile_size)
@@ -26,6 +27,14 @@ Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_par
   if (_stride_list.size() == 1) {
     _stride_list.push_back(1);
   }
+
+  /* Calculate tag key */
+  int key_offset = 0;
+  for (int i=0; i<_tag_idx_list.size(); i++)
+    key_offset += _tag_idx_list.at(i) * _tag_stride_list.at(i);
+  for (auto accum_dim : accum_tag_idx_list)
+    _tag_key.push_back(accum_dim);
+  _tag_key.push_back(key_offset);
 }
 
 void Instruction::finish_instruction() {
diff --git a/PyTorchSimBackend/src/Instruction.h b/PyTorchSimBackend/src/Instruction.h
index b18f1b3a..014ff41e 100644
--- a/PyTorchSimBackend/src/Instruction.h
+++ b/PyTorchSimBackend/src/Instruction.h
@@ -23,7 +23,7 @@ class Instruction {
   Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents, addr_type dram_addr,
               std::vector<size_t> tile_size, size_t precision, std::vector<int> &idx_list,
               std::vector<int> &stride_list,  std::vector<int> tag_idx_list, std::vector<int> tag_stride_list,
-              std::vector<int> loop_size_list);
+              std::vector<int> accum_tag_idx_list, std::vector<int> loop_size_list);
   void finish_instruction();
   void add_child(std::shared_ptr<Instruction> child);
   bool check_ready() { return ready_counter == 0; }
@@ -90,13 +90,7 @@ class Instruction {
   std::vector<int>& get_idx_list() { return _idx_list; }
   std::vector<int>& get_tag_idx_list() { return _tag_idx_list; }
   std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
-  int get_tag_id() {
-    assert(_tag_idx_list.size()==_tag_stride_list.size());
-    int ret = 0;
-    for (int i=0; i<_tag_idx_list.size(); i++)
-      ret += _tag_idx_list.at(i) * _tag_stride_list.at(i);
-    return ret;
-  }
+  std::vector<int>& get_tag_id() { return _tag_key; }
   void set_addr_name(std::string name) { _addr_name = name; }
   std::string get_addr_name() { return _addr_name; }
   void set_nr_inner_loop(int nr) { _nr_inner_loop = nr; }
@@ -128,6 +122,8 @@ class Instruction {
   std::vector<int> _stride_list;
   std::vector<int> _tag_idx_list;
   std::vector<int> _tag_stride_list;
+  std::vector<int> _tag_key;
+  std::vector<int> _accum_tag_idx_list;
   std::vector<int> _loop_size_list;
   std::string _addr_name;
   int _nr_inner_loop = 0;
diff --git a/PyTorchSimBackend/src/TMA.h b/PyTorchSimBackend/src/TMA.h
index 1bc209f9..f9a9cbcc 100644
--- a/PyTorchSimBackend/src/TMA.h
+++ b/PyTorchSimBackend/src/TMA.h
@@ -36,21 +36,21 @@ class TMA {
   void issue_tile(std::shared_ptr<Instruction> inst);
   bool is_finished() { return _finished; }
   bool empty() { return _current_inst==nullptr; }
-  void register_tag(int subgraph_id, const std::pair<std::string, int> key) {
+  void register_tag(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
-      tag_table[subgraph_id] = std::map<std::pair<std::string, int>, bool>();
-      waiters[subgraph_id] = std::map<std::pair<std::string, int>, std::vector<std::shared_ptr<Instruction>>>();
+      tag_table[subgraph_id] = std::map<std::pair<std::string, std::vector<int>>, bool>();
+      waiters[subgraph_id] = std::map<std::pair<std::string, std::vector<int>>, std::vector<std::shared_ptr<Instruction>>>();
     }
     tag_table[subgraph_id][key] = false;
     waiters[subgraph_id][key] = std::vector<std::shared_ptr<Instruction>>();
   }
-  void set_tag_finish(int subgraph_id, const std::pair<std::string, int> key) {
+  void set_tag_finish(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
       throw std::runtime_error("Subgraph does not exist in tag_table");
     }
     tag_table[subgraph_id][key] = true;
   }
-  bool tag_key_exist(int subgraph_id, const std::pair<std::string, int> key) {
+  bool tag_key_exist(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     if (subgraph_it == tag_table.end())
       return false;
@@ -59,7 +59,7 @@ class TMA {
     auto key_it = key_map.find(key);
     return key_it != key_map.end();
   }
-  bool get_tag_finish(int subgraph_id, const std::pair<std::string, int> key) {
+  bool get_tag_finish(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -76,7 +76,7 @@ class TMA {
     tag_table.erase(subgraph_id);
     waiters.erase(subgraph_id);
   }
-  void register_tag_waiter(int subgraph_id, const std::pair<std::string, int> key, std::shared_ptr<Instruction> inst) {
+  void register_tag_waiter(int subgraph_id, const std::pair<std::string, std::vector<int>>& key, std::shared_ptr<Instruction> inst) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -85,7 +85,7 @@ class TMA {
     }
     waiters[subgraph_id][key].push_back(inst);
   }
-  std::vector<std::shared_ptr<Instruction>>& get_tag_waiter(int subgraph_id, const std::pair<std::string, int> key) {
+  std::vector<std::shared_ptr<Instruction>>& get_tag_waiter(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -110,7 +110,7 @@ class TMA {
   size_t _tile_idx_stride=1;
   uint32_t _tile_idx;
   bool _finished=true;
-  std::map<int, std::map<std::pair<std::string, int>, bool>> tag_table;
-  std::map<int, std::map<std::pair<std::string, int>, std::vector<std::shared_ptr<Instruction>>>> waiters;
+  std::map<int, std::map<std::pair<std::string, std::vector<int>>, bool>> tag_table;
+  std::map<int, std::map<std::pair<std::string, std::vector<int>>, std::vector<std::shared_ptr<Instruction>>>> waiters;
 };
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index fed9cfb7..0fb25b94 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -272,6 +272,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       addr_type base_addr = tog_parser->lookup(base_addr_name);
       std::vector<int> iter_list;
       std::vector<int> tag_list;
+      std::vector<int> accum_tag_list;
       std::vector<int> loop_size_list;
       std::vector<uint32_t> outer_loop_idx;
       std::vector<uint32_t> outer_loop_size;
@@ -284,15 +285,15 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         if (tog_parser->get_loop_type(loop_idx)==LoopType::INNER_LOOP)
           nr_inner_loop++;
       }
-      /* Add accumulation loop info to tag list */
-      //for (auto loop_idx = loop_idx_list.begin();
-      //      loop_idx != loop_idx_list.end() - nr_inner_loop; ++loop_idx) {
-      //  // Check loop type and process
-      //  if (tog_parser->get_loop_type(*loop_idx)==LoopType::ACCUMULATION_LOOP) {
-      //    auto iter_value = getLoopIndexValue(iter, *loop_idx);
-      //    tag_list.push_back(iter_value);
-      //  }
-      //}
+      /* Add accumulation loop info to accum_tag list */
+      for (auto loop_idx = loop_idx_list.begin();
+            loop_idx != loop_idx_list.end() - nr_inner_loop; ++loop_idx) {
+        // Check loop type and process
+        if (tog_parser->get_loop_type(*loop_idx)==LoopType::ACCUMULATION_LOOP) {
+          auto iter_value = getLoopIndexValue(iter, *loop_idx);
+          accum_tag_list.push_back(iter_value);
+        }
+      }
 
       for (auto loop_idx = loop_idx_list.begin();
             loop_idx != loop_idx_list.end(); ++loop_idx) {
@@ -327,7 +328,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         Opcode::MOVIN, 0,
         0, base_addr,
         mem_node->get_tile_size(), mem_node->get_precision(), iter_list,
-        mem_node->get_stride_list(), tag_list, tag_stride_list, loop_size_list
+        mem_node->get_stride_list(), tag_list, tag_stride_list, accum_tag_list, loop_size_list
       );
       inst->set_addr_name(base_addr_name);
       inst->set_nr_inner_loop(nr_inner_loop);
@@ -342,6 +343,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       /* Lookup given name's address */
       addr_type base_addr = tog_parser->lookup(base_addr_name);
       std::vector<int>& tag_stride_list = mem_node->get_tag_stride_list();
+      std::vector<int> accum_tag_list;
       std::vector<int> iter_list;
       std::vector<int> loop_size_list;
       std::vector<uint32_t> outer_loop_idx;
@@ -376,7 +378,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         Opcode::MOVOUT, 0,
         0, base_addr,
         mem_node->get_tile_size(), mem_node->get_precision(), iter_list,
-        mem_node->get_stride_list(), std::vector<int>(1), tag_stride_list, loop_size_list
+        mem_node->get_stride_list(), std::vector<int>(1), tag_stride_list, accum_tag_list, loop_size_list
       );
       inst->set_addr_name(base_addr_name);
       inst->set_nr_inner_loop(nr_inner_loop);
@@ -394,30 +396,36 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       std::vector<int> iter_list;
       std::vector<int> tag_list;
       std::vector<int>& tag_stride_list = wait_node->get_tag_stride_list();
+      std::vector<int> new_tag_stride_list;
+      std::vector<int> accum_tag_list;
       auto& wait_tag_list = wait_node->get_tag_idx_list();
-      int inner_step = std::stoi(tog_parser->getMetaByName("systolic_size"));
-      /* Add accumulation loop info to tag list */
-      for (auto loop_idx = iter.begin(); loop_idx != iter.end(); ++loop_idx) {
-        /* FIXME. Used heuristic that wait_tag_size has 2d dim */
-        if (tog_parser->get_loop_type(loop_idx->first)==LoopType::ACCUMULATION_LOOP && wait_tag_list.size() != 2) {
-          tag_list.push_back(loop_idx->second);
-        }
-      }
 
       for (auto loop_idx: wait_tag_list) {
-        if (iter.find(loop_idx) == iter.end())
+        if (iter.find(loop_idx) == iter.end()) {
           tag_list.push_back(0);
-        else {
-          auto iter_value = getLoopIndexValue(iter, loop_idx) * inner_step;
+          continue;
+        }
+
+        if (tog_parser->get_loop_type(loop_idx)==LoopType::ACCUMULATION_LOOP) {
+          auto iter_value = getLoopIndexValue(iter, loop_idx);
+          accum_tag_list.push_back(iter_value);
+        } else {
+          auto iter_value = getLoopIndexValue(iter, loop_idx);
           tag_list.push_back(iter_value);
         }
       }
 
+      /* Skip accum stride */
+      for (auto i : tag_stride_list) {
+        if (i!=-1)
+          new_tag_stride_list.push_back(i);
+      }
+
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::BAR, 0,
         0, base_addr,
         std::vector<size_t>(), 0, iter_list,
-        iter_list, tag_list, tag_stride_list, std::vector<int>()
+        iter_list, tag_list, new_tag_stride_list, accum_tag_list, std::vector<int>()
       );
       inst->set_addr_name(base_addr_name);
       link_map[tile_node] = inst;
@@ -428,11 +436,12 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       std::vector<int> iter_list;
       std::vector<int> tag_list = {0};
       std::vector<int> tag_stride_list = {1};
+      std::vector<int> accum_tag_list;
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::COMP, compute_node->get_cycle(),
         0, 0,
         std::vector<size_t>(), 0, iter_list, iter_list,
-        tag_list, tag_stride_list, std::vector<int>()
+        tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
       );
       inst->set_overlapping_cycle(compute_node->get_overlapping_cycle());
       inst->set_compute_type(compute_node->get_compute_type());

From 2a723e37b4058ada922f274c68d2a131130f5558 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 5 Feb 2025 02:21:43 +0000
Subject: [PATCH 063/432] [Frontend] Use separate tags in the template

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 11 +++++++----
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 26d30266..85520658 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -27,7 +27,10 @@
   %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
   %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>{% if not Bias %}
+  %tag = memref.alloc() : memref<1xi32>
+  %tag0 = memref.alloc() : memref<1xi32>
+  %tag1 = memref.alloc() : memref<1xi32>
+  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
   %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
   %c0 = arith.constant 0 : index
   {{- kernel.def_local_vars() }}
@@ -38,7 +41,7 @@
         {% if Bias -%}
         memref.dma_start %Bias[
         {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-          ], %Y_buffer[0, 0], %c_mvin3, %tag[%c0], %
+          ], %Y_buffer[0, 0], %c_mvin3, %tag0[%c0], %
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
           , %vstride : memref<
         {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
@@ -48,9 +51,9 @@
         affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
           %index0 = affine.apply #map0(%b, %t_m, %t_k)
           %index1 = affine.apply #map1(%b, %t_k, %t_n)
-          memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride
+          memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
              : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
-          memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag[%c0], %axis, %vstride
+          memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
              : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
           linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index c8824ba4..8e2d9c65 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -28,7 +28,10 @@
   %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
   %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>{% if not Bias %}
+  %tag = memref.alloc() : memref<1xi32>
+  %tag0 = memref.alloc() : memref<1xi32>
+  %tag1 = memref.alloc() : memref<1xi32>
+  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
   %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
   %c0 = arith.constant 0 : index
   {{- kernel.def_local_vars() }}
@@ -39,7 +42,7 @@
       {%- if Bias %}
       memref.dma_start %Bias[
         {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-        ], %Y_buffer[%c0, %c0], %c_mvin3, %tag[%c0], %
+        ], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
         , %vstride : memref<
         {%- if Bias_rank == 2 -%}  {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
@@ -50,9 +53,9 @@
       affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
         %index0 = affine.apply #map0(%t_m, %t_k)
         %index1 = affine.apply #map1(%t_k, %t_n)
-        memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride
+        memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
            : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
-        memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag[%c0], %axis, %vstride
+        memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
            : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)

From e8d3aebbf7149641f9967c4040e69f2125834c19 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Wed, 5 Feb 2025 07:32:02 +0000
Subject: [PATCH 064/432] [Frontend] GeLU codegen

---
 .../mlir/mlir_codegen_backend.py              | 22 ++++++++++++
 tests/test_gelu.py                            | 36 +++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 tests/test_gelu.py

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d95aa2a2..52a9a835 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -271,6 +271,28 @@ def exp(operand, *args, var_info=None, **kwargs):
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
         return f'math.exp %{operand} : {shape}', [tile_size, dtype]
 
+    @staticmethod
+    def erf(x, *args, var_info=None, **kwargs):
+        op_type = var_info[x]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.erf %{x} : {shape}', [tile_size, dtype] # TODO: erf lowering pass is not implemented
+
+    @staticmethod
+    def tanh(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype[0] != "f":
+            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            var_info[operand] = dtype
+
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.tanh %{operand} : {shape}', [tile_size, dtype]
+
     @staticmethod
     def sqrt(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
diff --git a/tests/test_gelu.py b/tests/test_gelu.py
new file mode 100644
index 00000000..8639677b
--- /dev/null
+++ b/tests/test_gelu.py
@@ -0,0 +1,36 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_GeLU(device, size=(128, 128), approximate='none'):
+    torch.manual_seed(0)
+    input = torch.randn(size)
+    x1 = input.to(device=device)
+    x2 = input.to("cpu")
+    GeLU = torch.nn.GELU(approximate=approximate)
+    opt_fn = torch.compile(dynamic=False)(GeLU)
+    y = opt_fn(x1)
+    cpu_y = GeLU(x2)
+    test_result("GeLU", y, cpu_y)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    # test_GeLU(device, (128, 128))
+    # test_GeLU(device, (128, 128), approximate='tanh')

From ff248919b06dc2128582544a98a7ca6b608194eb Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Wed, 5 Feb 2025 07:36:01 +0000
Subject: [PATCH 065/432] [Test] optimizer & testcase

---
 PyTorchSimFrontend/extension_device.cpp       |  2 --
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  2 +-
 tests/test_mlp.py                             |  5 ++++-
 tests/test_transformer.py                     | 12 ++++++++++++
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimFrontend/extension_device.cpp
index 6bceb8ae..4d33db08 100644
--- a/PyTorchSimFrontend/extension_device.cpp
+++ b/PyTorchSimFrontend/extension_device.cpp
@@ -307,8 +307,6 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
   m.impl("_foreach_addcdiv_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("_foreach_add_.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("cat.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  // m.impl("addmm.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); // TODO: only for optimizer test
-  // m.impl("mm.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); // TODO: only for optimizer test
 }
 
 // This basic implementation doesn't bother dealing with different device indices
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 8e2d9c65..ed643f71 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -56,7 +56,7 @@
         memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
            : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
         memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
-           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
+           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, {{ TILE_K }}]}
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
diff --git a/tests/test_mlp.py b/tests/test_mlp.py
index 0582ce74..2787499c 100644
--- a/tests/test_mlp.py
+++ b/tests/test_mlp.py
@@ -79,14 +79,16 @@ def test_mlp_inf(device, batch_size=64, input_size=64, hidden_size=32, output_si
 def test_optimizer(device):
     torch.manual_seed(0)
     model = MLP(input_size=16, hidden_size=16, output_size=16).to(device=device)
+    model.requires_grad = True
     cpu_model = copy.deepcopy(model).to("cpu")
+    opt_model = torch.compile(dynamic=False)(model)
     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
     cpu_optimizer = torch.optim.Adam(cpu_model.parameters(), lr=0.001)
     opt_step = torch.compile(dynamic=False)(optimizer.step)
     input = torch.randn(16, 16)
     x1 = copy.deepcopy(input).to(device=device)
     x2 = copy.deepcopy(input).to("cpu")
-    y = model(x1)
+    y = opt_model(x1)
     cpu_y = cpu_model(x2)
     loss = y.sum()
     cpu_loss = cpu_y.sum()
@@ -110,3 +112,4 @@ def test_optimizer(device):
     test_mlp_inf(device, batch_size=1, input_size=256, hidden_size=512, output_size=256)
     test_mlp_inf(device, batch_size=8, input_size=256, hidden_size=512, output_size=256)
     test_mlp_inf(device, batch_size=64, input_size=256, hidden_size=512, output_size=256)
+    test_optimizer(device)
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index 44ffe5b8..1cb1fd50 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -112,6 +112,18 @@ def attention(query, key, value):
     cpu_res, cpu_p_attn = attention(query.cpu(), key.cpu(), value.cpu())
     test_result("Attention Forward", res, cpu_res)
 
+def test_MHA(device, num_heads=12, embed_dim=768):
+    MHA = my_MultiheadAttention(num_heads, embed_dim)
+    cpu_query = torch.randn(512, 768)
+    cpu_res = MHA(cpu_query, cpu_query, cpu_query)
+
+    query = cpu_query.clone().to(device=device)
+    MHA.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(MHA)
+    res = opt_fn(query, query, query)
+
+    test_result("MHA Forward", res, cpu_res)
+
 if __name__ == "__main__":
     import os
     import sys

From 41606a25ec26ebebd5f555a8b04ef015967375d0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 6 Feb 2025 04:05:41 +0000
Subject: [PATCH 066/432] [Frontend] Add extra pass for lowering realloc

---
 PyTorchSimFrontend/extension_codecache.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 8839887c..3107ae73 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -49,10 +49,12 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding \
+            -expand-realloc \
             -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size}" \
             -lower-affine \
+            -expand-strided-metadata \
             -finalize-memref-to-llvm \
             -lower-vector-multi-reduction \
             -convert-vector-to-llvm \
@@ -82,11 +84,13 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding='timing_mode=1' \
+            -expand-realloc \
             -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-tile-operation-graph='vectorlane={vectorlane_size}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
             -lower-affine \
+            -expand-strided-metadata \
             -finalize-memref-to-llvm \
             -lower-vector-multi-reduction \
             -convert-vector-to-llvm \

From 1e732e0bcb98a79e18c0a56e3363d33ceb68e38a Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Thu, 6 Feb 2025 04:09:02 +0000
Subject: [PATCH 067/432] [Fix] MoE operation debug

---
 .../mlir/mlir_codegen_backend.py              | 24 +++++++------------
 PyTorchSimFrontend/mlir/mlir_common.py        |  6 +++--
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  4 ++--
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 52a9a835..3fd94fb1 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -323,25 +323,19 @@ def rsqrt(operand, *args, var_info=None, **kwargs):
 
     @staticmethod
     def pow(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
-
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         # Type check & auto cast
-        if op_type1[1][0] != "f":
-            operand1, dtype = ops.to_dtype(operand1, "f32", var_info=var_info)
-            var_info[operand1] = dtype
+        if ret_type[0] != "f":
+            operand1, ret_type = ops.to_dtype(operand1, "f32", var_info=var_info)
+            var_info[operand1] = ret_type
 
         # Type check & auto cast
-        if op_type2[1][0] != "f":
-            operand2, dtype = ops.to_dtype(operand2, "f32", var_info=var_info)
-            var_info[operand2] = dtype
+        if ret_type[0] != "f":
+            operand2, ret_type = ops.to_dtype(operand2, "f32", var_info=var_info)
+            var_info[operand2] = ret_type
 
-        op_type1 = var_info[operand1]
-        tile_size = op_type1[0]
-        dtype = op_type1[1]
-
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f"math.pow{dtype[0]} %{operand1}, %{operand2} : {shape}", []
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f"math.pow{ret_type[0]} %{operand1}, %{operand2} : {shape}", [tile_size, ret_type]
 
     @staticmethod
     def log(operand, *args, var_info=None, **kwargs):
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index dc25b0a1..c88a51dd 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -7,7 +7,7 @@
 from torch._inductor.codegen import cpp
 from torch._inductor.virtualized import V
 from torch._inductor.ir import MultiOutputLayout
-from torch._inductor.dependencies import MemoryDep
+from torch._inductor.dependencies import MemoryDep, StarDep, WeakDep
 from torch.utils._sympy.functions import ModularIndexing
 import sympy
 import contextlib
@@ -383,6 +383,8 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         implicit_dim_size = defaultdict(list)
         for read_operand in nodes[0].read_writes.reads:
             read_operand : MemoryDep
+            if isinstance(read_operand, StarDep) or isinstance(read_operand, WeakDep): # FIXME: WeakDep & StarDep are not supported (MoE case)
+                continue
             read_index = read_operand.index
             for arg in read_index.args:
                 if "ModularIndexing" in str(arg) or "//" in str(arg):
@@ -444,7 +446,7 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         else:
             raise NotImplementedError("dummy tile size fail!")
 
-        vlane_stride = 2
+        vlane_stride = 8 # TODO: VCIX widening is not implemented
         # Adjust tile size to avoid too much paddings
         for i in range(1, len(tile_size)+1):
             target_range = self.ranges[-i]
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 6bc3facf..5c7ec252 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -35,10 +35,10 @@ def can_fuse_horizontal(self, node1, node2):
             return False
 
         # Convolution is currently not supported
-        if not isinstance(node1, FusedSchedulerNode) and node1.node.origin_node is not None and node1.node.origin_node.target._name == 'aten::convolution':
+        if not isinstance(node1, FusedSchedulerNode) and node1.node.origin_node is not None and hasattr(node1.node.origin_node.target, "_name") and node1.node.origin_node.target._name == 'aten::convolution':
             return False
 
-        if not isinstance(node2, FusedSchedulerNode) and node2.node.origin_node is not None and node2.node.origin_node.target._name == 'aten::convolution':
+        if not isinstance(node2, FusedSchedulerNode) and node2.node.origin_node is not None and hasattr(node2.node.origin_node.target, "_name") and node2.node.origin_node.target._name == 'aten::convolution':
             return False
 
         if not isinstance(node1, FusedSchedulerNode) and not isinstance(node2, FusedSchedulerNode):

From 6c9a80be8b08ee82d7bc1f541b8fa38b03ff8e84 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 6 Feb 2025 07:33:01 +0000
Subject: [PATCH 068/432] [Frontend] Change lowering pass for linalg.fill

---
 PyTorchSimFrontend/extension_codecache.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 3107ae73..13e11094 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -49,12 +49,11 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding \
-            -expand-realloc \
             -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size}" \
+            -convert-linalg-to-loops \
             -lower-affine \
-            -expand-strided-metadata \
             -finalize-memref-to-llvm \
             -lower-vector-multi-reduction \
             -convert-vector-to-llvm \
@@ -84,13 +83,12 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding='timing_mode=1' \
-            -expand-realloc \
             -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-tile-operation-graph='vectorlane={vectorlane_size}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
+            -convert-linalg-to-loops \
             -lower-affine \
-            -expand-strided-metadata \
             -finalize-memref-to-llvm \
             -lower-vector-multi-reduction \
             -convert-vector-to-llvm \

From d1608e3bd30b44ccd345589564de77a2bdfbd5b4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 6 Feb 2025 09:56:52 +0000
Subject: [PATCH 069/432] [Frontend] Reserve scratchpad section size to match
 real spad size

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 3 ++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py      | 3 ++-
 Simulator/simulator.py                          | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 3fd94fb1..8eac59bf 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1044,7 +1044,8 @@ def codegen_nodes(self, nodes, kernel_name):
         spike_write_path = os.path.join(write_path, "global_var.h")
         gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
         if not os.path.exists(spike_write_path):
-            write_atomic(spike_write_path, self.header.getvalue())
+            spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
+            write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol)
         if not os.path.exists(gem5_write_path):
             write_atomic(gem5_write_path, self.gem5_header.getvalue())
         return src_code
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 5c7ec252..dca37f42 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -165,7 +165,8 @@ def codegen_template(self, template_node, epilogue_nodes):
             src_code = self.codegen_src_code(kernel, render, template_node, epilogue_nodes)
 
         with V.set_kernel_handler(kernel):
-            codegen_header(src_code, (kernel.header.getvalue(), kernel.gem5_header.getvalue()))
+            spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\"), aligned({kernel.spad_info['spad_size']*kernel.vector_lane})));"
+            codegen_header(src_code, (kernel.header.getvalue()+spad_end_symbol, kernel.gem5_header.getvalue()))
             kernel.meta_kernel()
             kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
                                              kernel.loop_size, origins={str(i) for i in template_node.node.origins})
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index c4a778a5..7b123f4e 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -108,7 +108,7 @@ def run_spike(self, args, arg_attributes, path, binary, intermediate_op=None, ve
         vectorlane_option = f"--vectorlane-size={vectorlane_size}"
         kernel_address = f"--kernel-addr={kernel_start_addr}:{kernel_end_addr}"
         base_addr = f"--base-path={path}"
-        run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_addr} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
+        run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 -m102400 {vectorlane_option} {spad_option} {kernel_address} {base_addr} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
 
         print("[SpikeSimulator] cmd> ", run)
         run_cmd = shlex.split(run)

From 5b263398b4bf142b66ab79c74bb3618682566497 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Thu, 6 Feb 2025 10:34:04 +0000
Subject: [PATCH 070/432] [Frontend] SiLU, SwiGLU test case

---
 tests/test_activation.py | 91 ++++++++++++++++++++++++++++++++++++++++
 tests/test_gelu.py       | 36 ----------------
 tests/test_relu.py       | 36 ----------------
 3 files changed, 91 insertions(+), 72 deletions(-)
 create mode 100644 tests/test_activation.py
 delete mode 100644 tests/test_gelu.py
 delete mode 100644 tests/test_relu.py

diff --git a/tests/test_activation.py b/tests/test_activation.py
new file mode 100644
index 00000000..97b77cac
--- /dev/null
+++ b/tests/test_activation.py
@@ -0,0 +1,91 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+import torch.nn.functional as F
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_ReLU(device, size=(128, 128)):
+    torch.manual_seed(0)
+    input = torch.randn(size)
+    x1 = input.to(device=device)
+    x2 = input.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(torch.nn.functional.relu)
+    y = opt_fn(x1)
+    cpu_y = torch.nn.functional.relu(x2)
+    test_result("ReLU", y, cpu_y)
+
+def test_GeLU(device, size=(128, 128), approximate='none'):
+    torch.manual_seed(0)
+    input = torch.randn(size)
+    x1 = input.to(device=device)
+    x2 = input.to("cpu")
+    GeLU = torch.nn.GELU(approximate=approximate)
+    opt_fn = torch.compile(dynamic=False)(GeLU)
+    y = opt_fn(x1)
+    cpu_y = GeLU(x2)
+    test_result("GeLU", y, cpu_y)
+
+def test_sigmoid(device, size=(128, 128)):
+    torch.manual_seed(0)
+    input = torch.randn(size)
+    x1 = input.to(device=device)
+    x2 = input.to("cpu")
+    Sigmoid = torch.nn.Sigmoid()
+    opt_fn = torch.compile(dynamic=False)(Sigmoid)
+    y = opt_fn(x1)
+    cpu_y = Sigmoid(x2)
+    test_result("Sigmoid", y, cpu_y)
+
+def test_SiLU(device, size=(128, 128)):
+    torch.manual_seed(0)
+    input = torch.randn(size)
+    x1 = input.to(device=device)
+    x2 = input.to("cpu")
+    SiLU = torch.nn.SiLU()
+    opt_fn = torch.compile(dynamic=False)(SiLU)
+    y = opt_fn(x1)
+    cpu_y = SiLU(x2)
+    test_result("SiLU", y, cpu_y)
+
+class SwiGLU(torch.nn.Module):
+    def forward(self, x):
+        x, gate = x.chunk(2, dim=-1)
+        return F.silu(gate) * x
+
+def test_SwiGLU(device, size=(128, 128)):
+    torch.manual_seed(0)
+    input = torch.randn(size)
+    x1 = input.to(device=device)
+    x2 = input.to("cpu")
+    SwiGLU_fn = SwiGLU()
+    opt_fn = torch.compile(dynamic=False)(SwiGLU_fn)
+    y = opt_fn(x1)
+    cpu_y = SwiGLU_fn(x2)
+    test_result("SwiGLU", y, cpu_y)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_ReLU(device, (47, 10))
+    test_ReLU(device, (128, 128))
+    test_ReLU(device, (4071, 429))
+    test_sigmoid(device, (128, 128))
+    test_SiLU(device, (128, 128))
+    test_SwiGLU(device, (128, 128))
+    # test_GeLU(device, (128, 128))
+    # test_GeLU(device, (128, 128), approximate='tanh')
diff --git a/tests/test_gelu.py b/tests/test_gelu.py
deleted file mode 100644
index 8639677b..00000000
--- a/tests/test_gelu.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
-
-def test_GeLU(device, size=(128, 128), approximate='none'):
-    torch.manual_seed(0)
-    input = torch.randn(size)
-    x1 = input.to(device=device)
-    x2 = input.to("cpu")
-    GeLU = torch.nn.GELU(approximate=approximate)
-    opt_fn = torch.compile(dynamic=False)(GeLU)
-    y = opt_fn(x1)
-    cpu_y = GeLU(x2)
-    test_result("GeLU", y, cpu_y)
-
-if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
-    from Scheduler.scheduler import ExecutionEngine
-    module = ExecutionEngine.setup_device()
-    device = module.custom_device()
-    # test_GeLU(device, (128, 128))
-    # test_GeLU(device, (128, 128), approximate='tanh')
diff --git a/tests/test_relu.py b/tests/test_relu.py
deleted file mode 100644
index 3c3915d7..00000000
--- a/tests/test_relu.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
-
-def test_ReLU(device, size=(128, 128)):
-    torch.manual_seed(0)
-    input = torch.randn(size)
-    x1 = input.to(device=device)
-    x2 = input.to("cpu")
-    opt_fn = torch.compile(dynamic=False)(torch.nn.functional.relu)
-    y = opt_fn(x1)
-    cpu_y = torch.nn.functional.relu(x2)
-    test_result("ReLU", y, cpu_y)
-
-if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
-    from Scheduler.scheduler import ExecutionEngine
-    module = ExecutionEngine.setup_device()
-    device = module.custom_device()
-    test_ReLU(device, (47, 10))
-    test_ReLU(device, (128, 128))
-    test_ReLU(device, (4071, 429))

From 6bbb277775aeab46897434556f3881bc334fc555 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Tue, 7 Jan 2025 10:28:24 +0000
Subject: [PATCH 071/432] [Convolution] Setting debug env new conv template

---
 PyTorchSimFrontend/mlir/mlir_lowering.py      |   1 +
 .../mlir/mlir_new_conv_template.py            | 183 ++++++++++++++++++
 tests/test_conv2d.py                          |   2 +-
 3 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 PyTorchSimFrontend/mlir/mlir_new_conv_template.py

diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index e7ca37eb..a8787b02 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -9,6 +9,7 @@
 from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
 from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
 from PyTorchSimFrontend.mlir.mlir_conv_template import MLIRConvTemplate
+from PyTorchSimFrontend.mlir.mlir_new_conv_template import MLIRConvTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
 
 aten = torch.ops.aten
diff --git a/PyTorchSimFrontend/mlir/mlir_new_conv_template.py b/PyTorchSimFrontend/mlir/mlir_new_conv_template.py
new file mode 100644
index 00000000..f1b3baf2
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_new_conv_template.py
@@ -0,0 +1,183 @@
+import os
+import math
+from typing import List, Optional, cast
+
+from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
+from torch._inductor.ir import Buffer
+from torch._inductor.ir import IRNode
+from torch._inductor.ir import ReinterpretView
+from torch._inductor.codecache import write_atomic
+import PyTorchSimFrontend.extension_codecache as extension_codecache
+from torch._inductor.codecache import get_hash
+from PyTorchSimFrontend import extension_config
+
+GEMM_TEMPLATE = r"""
+func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
+  return
+}
+"""
+
+
+CONV2D_FUNC_TEMPLATE = r"""
+def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}, {{ OUT }}):
+    {{ KERNEL_NAME }}({{ INPUT }}, {{ WEIGHT }}, {{ OUT }})
+    print("Print OUTPUT ")
+    print("out > ")
+    print({{ OUT }}.shape)
+    print({{ OUT }}.cpu())
+"""
+
+
+class MLIRConvTemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.stride = kwargs["stride"]
+        self.padding = kwargs["padding"]
+        self.dilation = kwargs["dilation"]
+        weight_shape = [str(i) for i in input_nodes[1].layout.size]
+        self.function_name = "Conv2D_" + "_".join(weight_shape)+ "_" \
+            + "_".join([str(i) for i in self.stride]) \
+            + "_" + "_".join([str(i) for i in self.padding]) \
+            + "_" + "_".join([str(i) for i in self.dilation])
+        self.gemm_args = ['input', 'weight', 'output']
+
+        self.calculate_gemm_shape()
+
+    def is_transposed(self, node):
+        if isinstance(node, ReinterpretView):
+            if node.layout.stride != node.data.layout.stride:
+                if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
+                    return True
+                else:
+                  raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
+        return False
+
+    def calculate_gemm_shape(self):
+        input_shape = self.input_nodes[0].get_size()
+        weight_shape = self.input_nodes[1].get_size()
+        gemm_h = int((input_shape[2] + 2*self.padding[0] - (weight_shape[2]-1) - 1) / self.stride[0]) + 1
+        gemm_w = int((input_shape[3] + 2*self.padding[1] - (weight_shape[3]-1) - 1) / self.stride[1]) + 1
+
+        self.gemm_input_shape = [input_shape[0],input_shape[1],gemm_h, gemm_w]
+        self.gemm_weight_shape = [weight_shape[0],weight_shape[1],1,1]
+        self.gemm_output_shape = [self.gemm_input_shape[2]*self.gemm_input_shape[3], self.gemm_weight_shape[0]] # Consider Batch size 1
+
+    def def_kernel(self) ->str:
+        X, W = self.input_nodes[0], self.input_nodes[1]
+        Y = self.output_node
+
+        def flatten(shape):
+            r = 1
+            for i in shape:
+                r *= i
+            return r
+
+        input_size = flatten(X.layout.size)
+        weight_size = flatten(W.layout.size)
+        output_size = flatten(Y.layout.size)
+        return f"%X: memref<{input_size}xf32>, %W: memref<{weight_size}xf32>, %Y: memref<{output_size}xf32>"
+
+    def render(self,
+               kernel: MLIRTemplateKernel,
+               template_buffer_node = None,
+               epilogue_nodes: Optional[List[IRNode]] = None,
+               **kwargs):
+        if template_buffer_node is not None:
+            self.output_node = template_buffer_node
+        if epilogue_nodes is not None and len(epilogue_nodes) > 0:
+            self.output_node = cast(Buffer, epilogue_nodes[-1])
+            self.function_name += f"_fused_{epilogue_nodes[0].node.origin_node.name}"
+
+        X, W = self.input_nodes[0], self.input_nodes[1]
+        Y = self.output_node
+        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
+
+        M = self.gemm_input_shape[2] * self.gemm_input_shape[3]
+        N = self.gemm_weight_shape[0]
+        K = self.gemm_weight_shape[1]
+        TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
+        kernel.tile_size = [TILE_M, TILE_N, TILE_K]
+        kernel.loop_size = [M, N, K]
+
+        W_transposed = self.is_transposed(W)
+        X_transposed = self.is_transposed(X)
+
+        options = dict(
+            KERNEL_NAME=self.name,
+            KERNEL_DEF=self.def_kernel(),
+            kernel=kernel,
+            M=M,
+            N=N,
+            K=K,
+            TILE_M=TILE_M,
+            TILE_N=TILE_N,
+            TILE_K=TILE_K,
+            DATA_STYPE="f32",
+            DATA_SIZE=4,
+        )
+        code = self._template_from_string(GEMM_TEMPLATE).render(**options)
+
+        self.header = f"float X_spad[{TILE_M * TILE_K // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float W_spad[{TILE_K * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float Y_spad[{TILE_M * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
+
+        kernel.add_loop_info([options["M"], options["N"], options["K"]], [options["TILE_M"], options["TILE_N"], options["TILE_K"]])
+        kernel.def_kernel(inputs=[X, W], outputs=[Y], names_str="X, W, Y", input_reorder=self.input_reorder)
+
+        return code
+
+    def outer_func_render(self, kernel_name, input_args):
+        options = dict(
+            KERNEL_NAME=kernel_name,
+            FUNC_NAME=self.function_name,
+            INPUT=input_args[0],
+            WEIGHT=input_args[1],
+            OUT=input_args[3] if len(input_args) == 4 else input_args[2],
+            PADDING_H=self.padding[0],
+            PADDING_W=self.padding[1],
+            STRIDE_H=self.stride[0],
+            STRIDE_W=self.stride[1],
+            DILATION_H=self.dilation[0],
+            DILATION_W=self.dilation[1],
+            VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
+            BACKENDSIM_EAGER_MODE=extension_config.CONFIG_BACKENDSIM_EAGER_MODE,
+            HASH_VALUE=self.hash_value
+        )
+        code = self._template_from_string(CONV2D_FUNC_TEMPLATE).render(**options)
+        return code, self.function_name
+
+    def get_arg_attributes(self):
+        arg_attributes = []
+
+        input_shape = self.input_nodes[0].get_size()
+        weight_shape = self.input_nodes[1].get_size()
+        gemm_h = int((input_shape[2] + 2*self.padding[0] - (weight_shape[2]-1) - 1) / self.stride[0]) + 1
+        gemm_w = int((input_shape[3] + 2*self.padding[1] - (weight_shape[3]-1) - 1) / self.stride[1]) + 1
+
+        gemm_input_shape = [input_shape[0],input_shape[1],gemm_h, gemm_w]
+        gemm_weight_shape = [weight_shape[0],weight_shape[1],1,1]
+        gemm_output_shape = [gemm_input_shape[2]*gemm_input_shape[3], gemm_weight_shape[0]] # Consider Batch size 1
+
+        arg_attributes.append([self.gemm_args[0], [MLIRKernelArgs.MLIR_ARGS_IN, self.input_nodes[0].layout.dtype, math.prod(gemm_input_shape)]])
+        arg_attributes.append([self.gemm_args[1], [MLIRKernelArgs.MLIR_ARGS_IN, self.input_nodes[1].layout.dtype, math.prod(gemm_weight_shape)]])
+        # arg_attributes.append([self.gemm_args[2], [MLIRKernelArgs.MLIR_ARGS_IN, self.input_nodes[0].layout.dtype, math.prod(gemm_output_shape)]])
+        arg_attributes.append([self.gemm_args[2], [MLIRKernelArgs.MLIR_ARGS_OUT, self.input_nodes[0].layout.dtype, math.prod(gemm_output_shape)]])
+
+        return arg_attributes
+
+    def codegen_header(self, code, extra_headers):
+        write_path = extension_codecache.get_write_path(code)
+        if not os.path.exists(write_path):
+            os.makedirs(write_path)
+        spike_write_path = os.path.join(write_path, "global_var.h")
+        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
+        if not os.path.exists(spike_write_path):
+            write_atomic(spike_write_path, self.header+extra_headers[0])
+        if not os.path.exists(gem5_write_path):
+            write_atomic(gem5_write_path, self.gem5_header+extra_headers[1])
+        self.hash_value = get_hash(code.strip())
\ No newline at end of file
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 29924156..92ac959c 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -17,7 +17,7 @@ def test_conv2d(device, batch_size=1, in_channels=8, out_channels=16, input_size
     def custom_conv2d(a, b, bias):
         i_c = a.shape[1]
         o_c = b.shape[0]
-        conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1)
+        conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=False)
         conv2d.weight = torch.nn.Parameter(b)
         conv2d.bias = torch.nn.Parameter(bias)
         return conv2d(a)

From 151eaf245541a9443d4df70868d8035bc0e2679f Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Fri, 17 Jan 2025 05:12:27 +0000
Subject: [PATCH 072/432] [Frontend] convolution tiling option

---
 .../mlir/mlir_new_conv_template.py            | 89 ++++++++++++++++---
 1 file changed, 78 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_new_conv_template.py b/PyTorchSimFrontend/mlir/mlir_new_conv_template.py
index f1b3baf2..89f31668 100644
--- a/PyTorchSimFrontend/mlir/mlir_new_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_new_conv_template.py
@@ -1,5 +1,6 @@
 import os
 import math
+from sympy import divisors, Range
 from typing import List, Optional, cast
 
 from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
@@ -14,15 +15,49 @@
 from PyTorchSimFrontend import extension_config
 
 GEMM_TEMPLATE = r"""
+%map1 = affine_map<(d0, d1, d2, d3) -> ()>
+memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+
 func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
+  %c_mvin = arith.constant 2 : index
+  %c_mvin2 = arith.constant 1 : index
+  %c_mvin3 = arith.constant 14 : index
+  %c_mvout = arith.constant 3 : index
+  %c_set = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+
+  // 1x1 convolution loop
+  affine.for %k_h = 0 to {{ K_H }} {
+    affine.for %k_w = 0 to {{ K_W }} {
+      // 1x1 convolution tiling loop
+        affine.for %tile_m = 0 to {{ M * N }} step {{ TILE_M }} {
+          affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
+            %index2 = affine.apply #map1(%k_h, %k_w, %tile_m, %tile_n)
+            affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
+            
+            }
+          }
+        }
+      }
+  } { outer_loop=true }
   return
 }
 """
 
-
 CONV2D_FUNC_TEMPLATE = r"""
 def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}, {{ OUT }}):
-    {{ KERNEL_NAME }}({{ INPUT }}, {{ WEIGHT }}, {{ OUT }})
+    # Tanspose tensors
+    t_{{ INPUT }} = {{ INPUT }}.permute(0, 2, 3, 1)
+    t_{{ WEIGHT }} = {{ WEIGHT }}.permute(0, 2, 3, 1)
+    t_{{ OUT }} = {{ OUT }}.permute(0, 2, 3, 1)
+
+    {{ KERNEL_NAME }}(t_{{ INPUT }}, t_{{ WEIGHT }}, t_{{ OUT }})
+
+    # Transpose back
+    {{ OUT }} = t_{{ OUT }}.permute(0, 3, 1, 2)
+
     print("Print OUTPUT ")
     print("out > ")
     print({{ OUT }}.shape)
@@ -79,6 +114,20 @@ def flatten(shape):
         output_size = flatten(Y.layout.size)
         return f"%X: memref<{input_size}xf32>, %W: memref<{weight_size}xf32>, %Y: memref<{output_size}xf32>"
 
+    def get_tile_option(self):
+        I_C, H, W = self.input_nodes[0].layout.size[1], self.input_nodes[0].layout.size[2], self.input_nodes[0].layout.size[3]
+        O_C = self.input_nodes[1].layout.size[0]
+        
+        tile_k_options = divisors(I_C)
+        tile_n_options = divisors(O_C)
+
+        H_divisors = divisors(H)
+        H_multiples = list(Range(H, H * W, H))
+        tile_m_options = sorted(set(H_divisors) | set(H_multiples))
+        
+        return tile_m_options, tile_n_options, tile_k_options
+        
+
     def render(self,
                kernel: MLIRTemplateKernel,
                template_buffer_node = None,
@@ -97,23 +146,47 @@ def render(self,
         M = self.gemm_input_shape[2] * self.gemm_input_shape[3]
         N = self.gemm_weight_shape[0]
         K = self.gemm_weight_shape[1]
-        TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
+        # TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
+        # kernel.tile_size = [TILE_M, TILE_N, TILE_K]
+        # kernel.loop_size = [M, N, K]
+
+        tile_m_options, tile_n_options, tile_k_options = self.get_tile_option()
+        print("tile_m_options: ", tile_m_options)
+        print("tile_n_options: ", tile_n_options)
+        print("tile_k_options: ", tile_k_options)
+
+        TILE_M = tile_m_options[3]
+        TILE_N = tile_n_options[3]
+        TILE_K = tile_k_options[3]
+
         kernel.tile_size = [TILE_M, TILE_N, TILE_K]
         kernel.loop_size = [M, N, K]
 
-        W_transposed = self.is_transposed(W)
-        X_transposed = self.is_transposed(X)
+        # W_transposed = self.is_transposed(W)
+        # X_transposed = self.is_transposed(X)
 
         options = dict(
             KERNEL_NAME=self.name,
             KERNEL_DEF=self.def_kernel(),
             kernel=kernel,
+            I_C=X.layout.size[1],
+            I_H=X.layout.size[2],
+            I_W=X.layout.size[3],
+            O_C=W.layout.size[0],
+            K_H=W.layout.size[2],
+            K_W=W.layout.size[3],
             M=M,
             N=N,
             K=K,
             TILE_M=TILE_M,
             TILE_N=TILE_N,
             TILE_K=TILE_K,
+            PADDING_H=self.padding[0],
+            PADDING_W=self.padding[1],
+            STRIDE_H=self.stride[0],
+            STRIDE_W=self.stride[1],
+            DILATION_H=self.dilation[0],
+            DILATION_W=self.dilation[1],
             DATA_STYPE="f32",
             DATA_SIZE=4,
         )
@@ -138,12 +211,6 @@ def outer_func_render(self, kernel_name, input_args):
             INPUT=input_args[0],
             WEIGHT=input_args[1],
             OUT=input_args[3] if len(input_args) == 4 else input_args[2],
-            PADDING_H=self.padding[0],
-            PADDING_W=self.padding[1],
-            STRIDE_H=self.stride[0],
-            STRIDE_W=self.stride[1],
-            DILATION_H=self.dilation[0],
-            DILATION_W=self.dilation[1],
             VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
             BACKENDSIM_EAGER_MODE=extension_config.CONFIG_BACKENDSIM_EAGER_MODE,
             HASH_VALUE=self.hash_value

From 6f55d50bc4782a30e78f2db0626cca11d7cb4f14 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Fri, 24 Jan 2025 02:06:59 +0000
Subject: [PATCH 073/432] [Template] Convolution rework

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 368 ++++++++++++------
 PyTorchSimFrontend/mlir/mlir_lowering.py      |   1 -
 .../mlir/mlir_new_conv_template.py            | 250 ------------
 3 files changed, 239 insertions(+), 380 deletions(-)
 delete mode 100644 PyTorchSimFrontend/mlir/mlir_new_conv_template.py

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index c0331d8e..7f9bf296 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -1,5 +1,6 @@
 import os
 import math
+from sympy import divisors, Range
 from typing import List, Optional, cast
 
 from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
@@ -13,111 +14,147 @@
 from torch._inductor.codecache import get_hash
 from PyTorchSimFrontend import extension_config
 
+
+# %tmp_i_h = arith.muli %o_h, %stride_h : index
+# %index_i_h = arith.addi %tmp_i_h, %k_h : index
+# %tmp_i_w = arith.muli %o_w, %stride_w : index
+# %index_i_w = arith.addi %tmp_i_w, %k_w : index
+# %tmp_k = arith.muli %k_h, %K_W : index
+# %index_k_hw = arith.addi %tmp_k, %k_w : index
+
 GEMM_TEMPLATE = r"""
-#map0 = affine_map<(d0, d1) -> (d0 * {{ K }} + d1)>
-#map1 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
-memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @B_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+// Conv2D kernel
+// BATCH = {{ BATCH }}
+// I_C = {{ I_C }}
+// I_H = {{ I_H }}
+// I_W = {{ I_W }}
+// O_C = {{ O_C }}
+// K_H = {{ K_H }}
+// K_W = {{ K_W }}
+// O_H = {{ O_H }}
+// O_W = {{ O_W }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// TILE_M_PADDING = {{ TILE_M_PADDING }}
+// TILE_N_PADDING = {{ TILE_N_PADDING }}
+// TILE_K_PADDING = {{ TILE_K_PADDING }}
+// PADDING_H = {{ PADDING_H }}
+// PADDING_W = {{ PADDING_W }}
+// STRIDE_H = {{ STRIDE_H }}
+// STRIDE_W = {{ STRIDE_W }}
+// DILATION_H = {{ DILATION_H }}
+// DILATION_W = {{ DILATION_W }}
+// DATA_STYPE = {{ DATA_STYPE }}
+// DATA_SIZE = {{ DATA_SIZE }}
+
+#map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ I_W * BATCH * I_C }} + d1 * {{ BATCH * I_C }} + d2 * {{ I_C }} + d3)> // input (I_H, I_W, BATCH, I_C)
+#map2 = affine_map<(d0, d1, d2) -> (d0 * {{ I_C * O_C }} + d1 * {{ O_C }} + d2)> // weight (K_H * K_W, I_C, O_C)
+#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
+#map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
+#map_K_HW = affine_map<(d0, d1) -> (d0 * {{ K_W }} + d1)>
+
+memref.global @X_spad : memref<{{TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>
+memref.global @W_spad : memref<{{ TILE_K_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>
+memref.global @Y_spad : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>
 
 func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
   %c_mvin = arith.constant 2 : index
   %c_mvin2 = arith.constant 1 : index
   %c_mvin3 = arith.constant 14 : index
   %c_mvout = arith.constant 3 : index
-  %c_set = arith.constant 2 : index
-  %c0 = arith.constant 0 : index
-
-  %N = arith.constant {{ N }} : index
-  %K = arith.constant {{ K }} : index
-  %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+  %vstride = arith.constant 1 : index
+  %input_axis = arith.constant 3 : index
+  %weight_axis = arith.constant 2 : index
+  %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>
+  %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>
+  %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
-  {{- kernel.def_local_vars() }}
-
-  affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
-    affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
-        %index2 = affine.apply #map1(%t_m, %t_n)
-        affine.dma_start %B[%index2], %Y_buffer[%c0, %c0], %tag[0], %c_mvin3, %N, %c_set : memref<{{ M * N }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
-        affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
-            %index0 = affine.apply #map0(%t_m, %t_k)
-            %index1 = affine.apply #map1(%t_k, %t_n)
-            affine.dma_start %X[%index0], %X_buffer[%c0, %c0], %tag[0], %c_mvin, %K, %c_set : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1 }
-            affine.dma_start %W[%index1], %W_buffer[%c0, %c0], %tag[0], %c_mvin2, %N, %c_set : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1 }
-            linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-                    outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-        } { accumulation_loop=true }
-        affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ M * N }}xf32>, memref<1xi32>  { async=1 }
+  %v0 = arith.constant dense<0.0> : vector<{{ TILE_N_PADDING }}xf32>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %K_W = arith.constant {{ K_W }} : index
+  %stride_h = arith.constant {{ STRIDE_H }} : index
+  %stride_w = arith.constant {{ STRIDE_W }} : index
+
+  // 1x1 convolution loop
+  affine.for %k_h = 0 to {{ K_H }} {
+    affine.for %k_w = 0 to {{ K_W }} {
+      // 1x1 convolution tiling loop
+      affine.for %o_h = 0 to {{ O_H }} {
+        affine.for %o_w = 0 to {{ O_W }} {
+          affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M_PADDING }} {
+            affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N_PADDING }} {
+              // Init output matrix
+              %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
+              %cond_h = arith.cmpi eq, %k_h, %c0 : index
+              %cond_w = arith.cmpi eq, %k_w, %c0 : index
+              %cond_hw = arith.andi %cond_h, %cond_w : i1
+              scf.if %cond_hw {
+                {%- if BIAS %}
+                memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag[%c0], %c0, %vstride
+                    : memref<{{ O_C }}xf32>, memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, memref<1xi32> { async=1, sram_stride=[1, {{ TILE_M_PADDING }}]}
+                {%- else %}
+                affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, vector<{{ TILE_N_PADDING }}xf32>
+                {%- endif %}
+              } else {
+                memref.dma_start %Y[%index0], %Y_buffer[%c0, %c0], %c_mvin, %tag[%c0], %input_axis, %vstride
+                    : memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, memref<1xi32> { padding=0, sram_stride=[1, {{ TILE_M_PADDING }}]}
+              }
+              affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K_PADDING }} {
+                %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
+                %index_i_w = affine.apply #map_I_W(%o_w, %k_w)
+                %index1 = affine.apply #map1(%index_i_h, %index_i_w, %tile_m, %tile_k) // input index
+                %index_k_hw = affine.apply #map_K_HW(%k_h, %k_w)
+                %index2 = affine.apply #map2(%index_k_hw, %tile_k, %tile_n) // weight index
+                // Load input matrix
+                memref.dma_start %X[%index1], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %input_axis, %vstride
+                    : memref<{{ BATCH * I_C * I_H * I_W }}xf32>, memref<{{ TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K_PADDING }}], async=1, sram_stride=[1, {{ TILE_M_PADDING }}]}
+                // Load kernel matrix
+                memref.dma_start %W[%index2], %W_buffer[%c0, %c0], %c_mvin, %tag[%c0], %weight_axis, %vstride
+                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K_PADDING }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
+                // matmul
+                linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>, memref<{{ TILE_K_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>)
+                      outs(%Y_buffer : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>)
+              } { accumulation_loop=true }
+              // Store output matrix
+              memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag[%c0], %input_axis, %vstride
+                  : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[1, {{ TILE_M_PADDING }}]}
+            } { outer_loop=true }
+          } { outer_loop=true }
+        } { outer_loop=true }
+      } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
   return
 }
 """
 
-
 CONV2D_FUNC_TEMPLATE = r"""
-def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }}{% endif %}, {{ OUT }}):
-    {{ INPUT }}_cpu = {{ INPUT }}.cpu()
-    {{ WEIGHT }}_cpu = {{ WEIGHT }}.cpu(){% if BIAS %}
-    {{ BIAS }}_cpu = {{ BIAS }}.cpu(){% endif %}
-    {{ OUT }}_cpu = {{ OUT }}.cpu()
-
-    # Torch support NCHW, so we need to transpose for now
-    {{ INPUT }}_cpu = {{ INPUT }}_cpu.permute(0, 2, 3, 1)
-    {{ WEIGHT }}_cpu = {{ WEIGHT }}_cpu.permute(0, 2, 3, 1)
-    {{ OUT }}_cpu = {{ OUT }}_cpu.permute(0, 2, 3, 1)
-    {{ OUT }}_cpu.zero_()
-
-    input_shape = {{ INPUT }}_cpu.shape
-    weight_shape = {{ WEIGHT }}_cpu.shape
-    output_shape = {{ OUT }}_cpu.shape
-    {{ OUT }}_cpu = {{ OUT }}_cpu.reshape(-1, output_shape[3]).contiguous()
-
-    input_pad_shape = (input_shape[0], input_shape[1]+2*{{ PADDING_H }}, input_shape[2]+2*{{ PADDING_W }}, input_shape[3])
-    input_pad = torch.zeros(input_pad_shape)
-
-    if {{ PADDING_H }} != 0 and {{ PADDING_W }} != 0:
-        input_pad[:, {{ PADDING_H }}:-{{ PADDING_H }}, {{ PADDING_W }}:-{{ PADDING_W }}, :] = {{ INPUT }}_cpu
-    elif {{ PADDING_H }} != 0:
-        input_pad[:, {{ PADDING_H }}:-{{ PADDING_H }}, :, :] = {{ INPUT }}_cpu
-    elif {{ PADDING_W }} != 0:
-        input_pad[:,:, {{ PADDING_W }}:-{{ PADDING_W }}, :] = {{ INPUT }}_cpu
-    else:
-        input_pad = {{ INPUT }}_cpu
-
-    {% if VALIDATION_MODE %}
-    {% endif %}
-
-    for kh in range(weight_shape[1]):
-        for kw in range(weight_shape[2]):
-            input_tile = input_pad[:, kh:input_pad_shape[1]-(weight_shape[1]-1)+kh, kw:input_pad_shape[2]-(weight_shape[2]-1)+kw, :]
-            input_tile = input_tile[:,::{{ STRIDE_H }},::{{ STRIDE_W }}, :]
-            kernel_tile = {{ WEIGHT }}_cpu[:, kh, kw, :].t()
-            input_tile = input_tile.reshape(-1, input_pad_shape[3])
-
-            {% if VALIDATION_MODE %}
-            if kh == 0 and kw == 0:
-                {{ KERNEL_NAME }}(input_tile, kernel_tile, {{ OUT }}_cpu, {{ OUT }}_cpu, intermediate_op=0b01)
-            elif kh == weight_shape[1]-1 and kw == weight_shape[2]-1:
-                {{ KERNEL_NAME }}(input_tile, kernel_tile, {{ OUT }}_cpu, {{ OUT }}_cpu, intermediate_op=0b10)
-            else:
-                {{ KERNEL_NAME }}(input_tile, kernel_tile, {{ OUT }}_cpu, {{ OUT }}_cpu, intermediate_op=0b11)
-            {% else %}
-            {{ KERNEL_NAME }}(input_tile, kernel_tile, {{ OUT }}_cpu, {{ OUT }}_cpu)  # input, weight, bias, out
-            {% endif %}
-            {% if BACKENDSIM_EAGER_MODE %}
-            yield ({{KERNEL_NAME}}, (input_tile, kernel_tile, {{ OUT }}_cpu, {{ OUT }}_cpu))
-            {% endif %}
-
-    {{ OUT }}_cpu = {{ OUT }}_cpu.reshape(output_shape)
-    {{ OUT }}_cpu = {{ OUT }}_cpu.permute(0, 3, 1, 2){% if BIAS %}
-    {{ OUT }}_cpu += {{ BIAS }}_cpu.reshape(-1, 1, 1) #TODO: BIAS should be added in the kernel{% endif %}
-    {{ OUT }}.copy_({{ OUT }}_cpu)
+def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, {{ OUT }}):
+    # Padding input
+    padded_shape = list({{ INPUT }}.shape)
+    padded_shape[2] += 2 * {{ PADDING_H }}
+    padded_shape[3] += 2 * {{ PADDING_W }}
+    {{ INPUT }}_padding = torch.zeros(padded_shape, device={{ INPUT }}.device)
+    {{ INPUT }}_padding[:, :, {{ PADDING_H }}:{{ INPUT }}.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:{{ INPUT }}.shape[3] + {{ PADDING_W }}] = {{ INPUT }}
+
+    print(f"input_padding")
+    print({{ INPUT }}_padding.cpu())
+
+    # Tanspose tensors
+    t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
+    t_{{ WEIGHT }} = {{ WEIGHT }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
+    t_{{ OUT }} = {{ OUT }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
+
+    {{ KERNEL_NAME }}(t_{{ INPUT }}, t_{{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, t_{{ OUT }})
+
+    # Transpose back
+    {{ OUT }}.copy_(t_{{ OUT }}.permute(2, 3, 0, 1).contiguous()) # (O_H, O_W, BATCH, O_C) -> (BATCH, O_C, O_H, O_W)
 """
 
-
 class MLIRConvTemplate(MLIRTemplate):
     def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
         super().__init__("kernel", input_nodes, layout, input_reorder)
@@ -129,7 +166,7 @@ def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
             + "_".join([str(i) for i in self.stride]) \
             + "_" + "_".join([str(i) for i in self.padding]) \
             + "_" + "_".join([str(i) for i in self.dilation])
-        self.gemm_args = ['input', 'weight', 'bias', 'output']
+        self.gemm_args = ['X', 'W', 'Bias', 'Y']
 
         self.calculate_gemm_shape()
 
@@ -142,6 +179,7 @@ def is_transposed(self, node):
                   raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
         return False
 
+    # Is this function necessary? (Find output node instead)
     def calculate_gemm_shape(self):
         input_shape = self.input_nodes[0].get_size()
         weight_shape = self.input_nodes[1].get_size()
@@ -152,11 +190,43 @@ def calculate_gemm_shape(self):
         self.gemm_weight_shape = [weight_shape[0],weight_shape[1],1,1]
         self.gemm_output_shape = [self.gemm_input_shape[2]*self.gemm_input_shape[3], self.gemm_weight_shape[0]] # Consider Batch size 1
 
+
+    # Can use math.multi ?
     def def_kernel(self) ->str:
-        input_size = self.gemm_input_shape[1]*self.gemm_input_shape[2]*self.gemm_input_shape[3]
-        weight_size = self.gemm_weight_shape[0]*self.gemm_weight_shape[1]
-        output_size = self.gemm_output_shape[0]*self.gemm_output_shape[1]
-        return f"%X: memref<{input_size}xf32>, %W: memref<{weight_size}xf32>, %B: memref<{output_size}xf32>, %Y: memref<{output_size}xf32>"
+        X, W = self.input_nodes[0], self.input_nodes[1]
+        if len(self.input_nodes) == 3:
+          Bias = self.input_nodes[2]
+        else:
+          Bias = None
+
+        Y = self.output_node
+
+        def flatten(shape):
+            r = 1
+            for i in shape:
+                r *= i
+            return r
+
+        input_size = flatten(X.layout.size)
+        weight_size = flatten(W.layout.size)
+        if Bias is not None:
+          bias_size = flatten(Bias.layout.size)
+        output_size = flatten(Y.layout.size)
+
+        if Bias is None:
+          return f"%{self.gemm_args[0]}: memref<{input_size}xf32>, %{self.gemm_args[1]}: memref<{weight_size}xf32>, %{self.gemm_args[3]}: memref<{output_size}xf32>"
+        else:
+          return f"%{self.gemm_args[0]}: memref<{input_size}xf32>, %{self.gemm_args[1]}: memref<{weight_size}xf32>, %{self.gemm_args[2]}: memref<{bias_size}xf32>, %{self.gemm_args[3]}: memref<{output_size}xf32>"
+
+    def get_tile_options(self):
+        BATCH, I_C = self.input_nodes[0].layout.size[0], self.input_nodes[0].layout.size[1]
+        O_C = self.input_nodes[1].layout.size[0]
+
+        tile_m_options = divisors(BATCH)
+        tile_n_options = divisors(O_C)
+        tile_k_options = divisors(I_C)
+
+        return tile_m_options, tile_n_options, tile_k_options
 
     def render(self,
                kernel: MLIRTemplateKernel,
@@ -173,41 +243,70 @@ def render(self,
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
-        M = self.gemm_input_shape[2] * self.gemm_input_shape[3]
-        N = self.gemm_weight_shape[0]
-        K = self.gemm_weight_shape[1]
-        TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
-        kernel.loop_size = [M, N, K]
+        O_H = self.gemm_input_shape[2]
+        O_W = self.gemm_input_shape[3]
 
-        W_transposed = self.is_transposed(W)
-        X_transposed = self.is_transposed(X)
+        tile_m_options, tile_n_options, tile_k_options = self.get_tile_options()
+
+        TILE_M = tile_m_options[0]
+        TILE_N = tile_n_options[0]
+        TILE_K = tile_k_options[0]
+
+        BATCH = X.layout.size[0]
+        I_C = X.layout.size[1]
+        O_C = W.layout.size[0]
+        K_H = W.layout.size[2]
+        K_W = W.layout.size[3]
+
+        TILE_M_PADDING= int((TILE_M + kernel.vector_lane - 1) // kernel.vector_lane * kernel.vector_lane)
+        TILE_N_PADDING= int((TILE_N + kernel.vector_lane - 1) // kernel.vector_lane * kernel.vector_lane)
+        TILE_K_PADDING= int((TILE_K + kernel.vector_lane - 1) // kernel.vector_lane * kernel.vector_lane)
+
+        kernel.tile_size = [TILE_M_PADDING, TILE_N_PADDING, TILE_K_PADDING]
+        kernel.loop_size = [K_H, K_W, O_H, O_W, BATCH, O_C, I_C]
+
+        # W_transposed = self.is_transposed(W)
+        # X_transposed = self.is_transposed(X)
 
         options = dict(
             KERNEL_NAME=self.name,
             KERNEL_DEF=self.def_kernel(),
             kernel=kernel,
-            M=M,
-            N=N,
-            K=K,
+            BATCH=X.layout.size[0],
+            I_C=X.layout.size[1],
+            I_H=X.layout.size[2],
+            I_W=X.layout.size[3],
+            O_C=W.layout.size[0],
+            K_H=W.layout.size[2],
+            K_W=W.layout.size[3],
+            O_H=O_H,
+            O_W=O_W,
             TILE_M=TILE_M,
             TILE_N=TILE_N,
             TILE_K=TILE_K,
+            TILE_M_PADDING=TILE_M_PADDING,
+            TILE_N_PADDING=TILE_N_PADDING,
+            TILE_K_PADDING=TILE_K_PADDING,
+            PADDING_H=self.padding[0],
+            PADDING_W=self.padding[1],
+            STRIDE_H=self.stride[0],
+            STRIDE_W=self.stride[1],
+            DILATION_H=self.dilation[0],
+            DILATION_W=self.dilation[1],
             DATA_STYPE="f32",
             DATA_SIZE=4,
+            BIAS=Bias
         )
         code = self._template_from_string(GEMM_TEMPLATE).render(**options)
 
-        self.header = f"float X_spad[{TILE_M * TILE_K // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{TILE_K * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{TILE_M * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
-        if Bias is not None:
-            self.header += f"float B_spad[{TILE_M * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-            self.gem5_header += f"float B_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
+        self.header = f"float X_spad[{TILE_M_PADDING * TILE_K_PADDING // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float W_spad[{TILE_K_PADDING * TILE_N_PADDING // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float Y_spad[{TILE_M_PADDING * TILE_N_PADDING // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header = f"float X_spad[{TILE_M_PADDING * TILE_K_PADDING}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float W_spad[{TILE_K_PADDING * TILE_N_PADDING}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float Y_spad[{TILE_M_PADDING * TILE_N_PADDING}] __attribute__ ((section(\".spad\")));\n"
 
-        kernel.add_loop_info([options["M"], options["N"], options["K"]], [options["TILE_M"], options["TILE_N"], options["TILE_K"]])
+        kernel.add_loop_info([options["K_H"], options["K_W"], options["O_H"], options["O_W"], options["BATCH"], options["O_C"], options["I_C"]], [options["TILE_M_PADDING"], options["TILE_N_PADDING"], options["TILE_K_PADDING"]])
         kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=self.input_reorder)
 
         return code
@@ -218,14 +317,10 @@ def outer_func_render(self, kernel_name, input_args):
             FUNC_NAME=self.function_name,
             INPUT=input_args[0],
             WEIGHT=input_args[1],
-            BIAS=input_args[2] if len(input_args) == 4 else None,
+            BIAS=0 if len(input_args) == 3 else input_args[2],
             OUT=input_args[3] if len(input_args) == 4 else input_args[2],
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
-            STRIDE_H=self.stride[0],
-            STRIDE_W=self.stride[1],
-            DILATION_H=self.dilation[0],
-            DILATION_W=self.dilation[1],
             VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
             BACKENDSIM_EAGER_MODE=extension_config.CONFIG_BACKENDSIM_EAGER_MODE,
             HASH_VALUE=self.hash_value
@@ -236,19 +331,34 @@ def outer_func_render(self, kernel_name, input_args):
     def get_arg_attributes(self):
         arg_attributes = []
 
-        input_shape = self.input_nodes[0].get_size()
-        weight_shape = self.input_nodes[1].get_size()
-        gemm_h = int((input_shape[2] + 2*self.padding[0] - (weight_shape[2]-1) - 1) / self.stride[0]) + 1
-        gemm_w = int((input_shape[3] + 2*self.padding[1] - (weight_shape[3]-1) - 1) / self.stride[1]) + 1
+        X, W = self.input_nodes[0], self.input_nodes[1]
+        Y = self.output_node
+        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
-        gemm_input_shape = [input_shape[0],input_shape[1],gemm_h, gemm_w]
-        gemm_weight_shape = [weight_shape[0],weight_shape[1],1,1]
-        gemm_output_shape = [gemm_input_shape[2]*gemm_input_shape[3], gemm_weight_shape[0]] # Consider Batch size 1
+        X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)]
+        W_shape = [W.get_size()[i] for i in (2, 3, 1, 0)]
+        Y_shape = [Y.get_size()[i] for i in (2, 3, 0, 1)]
 
-        arg_attributes.append([self.gemm_args[0], [MLIRKernelArgs.MLIR_ARGS_IN, self.input_nodes[0].layout.dtype, math.prod(gemm_input_shape)]])
-        arg_attributes.append([self.gemm_args[1], [MLIRKernelArgs.MLIR_ARGS_IN, self.input_nodes[1].layout.dtype, math.prod(gemm_weight_shape)]])
-        arg_attributes.append([self.gemm_args[2], [MLIRKernelArgs.MLIR_ARGS_IN, self.input_nodes[0].layout.dtype, math.prod(gemm_output_shape)]])
-        arg_attributes.append([self.gemm_args[3], [MLIRKernelArgs.MLIR_ARGS_OUT, self.input_nodes[0].layout.dtype, math.prod(gemm_output_shape)]])
+        if Bias is not None:
+          Bias_shape = [Bias.get_size()]
+
+        def compute_stride(shape):
+            stride = [1] * len(shape)
+            for i in range(len(shape)-2, -1, -1):
+                stride[i] = stride[i+1] * shape[i+1]
+            return stride
+
+        X_stride = compute_stride(X_shape)
+        W_stride = compute_stride(W_shape)
+        Y_stride = compute_stride(Y_shape)
+        if Bias is not None:
+          Bias_stride = compute_stride(Bias_shape)
+
+        arg_attributes.append([self.gemm_args[0], [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X.get_size()), X_shape, X_stride]])
+        arg_attributes.append([self.gemm_args[1], [MLIRKernelArgs.MLIR_ARGS_IN, W.layout.dtype, math.prod(W.get_size()), W_shape, W_stride]])
+        if Bias is not None:
+          arg_attributes.append([self.gemm_args[2], [MLIRKernelArgs.MLIR_ARGS_IN, Bias.layout.dtype, math.prod(Bias.get_size()), Bias_shape, Bias_stride]])
+        arg_attributes.append([self.gemm_args[3], [MLIRKernelArgs.MLIR_ARGS_OUT, Y.layout.dtype, math.prod(Y.get_size()), Y_shape, Y_stride]])
 
         return arg_attributes
 
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index a8787b02..e7ca37eb 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -9,7 +9,6 @@
 from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
 from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
 from PyTorchSimFrontend.mlir.mlir_conv_template import MLIRConvTemplate
-from PyTorchSimFrontend.mlir.mlir_new_conv_template import MLIRConvTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
 
 aten = torch.ops.aten
diff --git a/PyTorchSimFrontend/mlir/mlir_new_conv_template.py b/PyTorchSimFrontend/mlir/mlir_new_conv_template.py
deleted file mode 100644
index 89f31668..00000000
--- a/PyTorchSimFrontend/mlir/mlir_new_conv_template.py
+++ /dev/null
@@ -1,250 +0,0 @@
-import os
-import math
-from sympy import divisors, Range
-from typing import List, Optional, cast
-
-from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
-from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
-from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
-from torch._inductor.ir import Buffer
-from torch._inductor.ir import IRNode
-from torch._inductor.ir import ReinterpretView
-from torch._inductor.codecache import write_atomic
-import PyTorchSimFrontend.extension_codecache as extension_codecache
-from torch._inductor.codecache import get_hash
-from PyTorchSimFrontend import extension_config
-
-GEMM_TEMPLATE = r"""
-%map1 = affine_map<(d0, d1, d2, d3) -> ()>
-memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-
-func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index
-  %c_mvin3 = arith.constant 14 : index
-  %c_mvout = arith.constant 3 : index
-  %c_set = arith.constant 2 : index
-  %c0 = arith.constant 0 : index
-
-  // 1x1 convolution loop
-  affine.for %k_h = 0 to {{ K_H }} {
-    affine.for %k_w = 0 to {{ K_W }} {
-      // 1x1 convolution tiling loop
-        affine.for %tile_m = 0 to {{ M * N }} step {{ TILE_M }} {
-          affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
-            %index2 = affine.apply #map1(%k_h, %k_w, %tile_m, %tile_n)
-            affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
-            
-            }
-          }
-        }
-      }
-  } { outer_loop=true }
-  return
-}
-"""
-
-CONV2D_FUNC_TEMPLATE = r"""
-def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}, {{ OUT }}):
-    # Tanspose tensors
-    t_{{ INPUT }} = {{ INPUT }}.permute(0, 2, 3, 1)
-    t_{{ WEIGHT }} = {{ WEIGHT }}.permute(0, 2, 3, 1)
-    t_{{ OUT }} = {{ OUT }}.permute(0, 2, 3, 1)
-
-    {{ KERNEL_NAME }}(t_{{ INPUT }}, t_{{ WEIGHT }}, t_{{ OUT }})
-
-    # Transpose back
-    {{ OUT }} = t_{{ OUT }}.permute(0, 3, 1, 2)
-
-    print("Print OUTPUT ")
-    print("out > ")
-    print({{ OUT }}.shape)
-    print({{ OUT }}.cpu())
-"""
-
-
-class MLIRConvTemplate(MLIRTemplate):
-    def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
-        super().__init__("kernel", input_nodes, layout, input_reorder)
-        self.stride = kwargs["stride"]
-        self.padding = kwargs["padding"]
-        self.dilation = kwargs["dilation"]
-        weight_shape = [str(i) for i in input_nodes[1].layout.size]
-        self.function_name = "Conv2D_" + "_".join(weight_shape)+ "_" \
-            + "_".join([str(i) for i in self.stride]) \
-            + "_" + "_".join([str(i) for i in self.padding]) \
-            + "_" + "_".join([str(i) for i in self.dilation])
-        self.gemm_args = ['input', 'weight', 'output']
-
-        self.calculate_gemm_shape()
-
-    def is_transposed(self, node):
-        if isinstance(node, ReinterpretView):
-            if node.layout.stride != node.data.layout.stride:
-                if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
-                    return True
-                else:
-                  raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
-        return False
-
-    def calculate_gemm_shape(self):
-        input_shape = self.input_nodes[0].get_size()
-        weight_shape = self.input_nodes[1].get_size()
-        gemm_h = int((input_shape[2] + 2*self.padding[0] - (weight_shape[2]-1) - 1) / self.stride[0]) + 1
-        gemm_w = int((input_shape[3] + 2*self.padding[1] - (weight_shape[3]-1) - 1) / self.stride[1]) + 1
-
-        self.gemm_input_shape = [input_shape[0],input_shape[1],gemm_h, gemm_w]
-        self.gemm_weight_shape = [weight_shape[0],weight_shape[1],1,1]
-        self.gemm_output_shape = [self.gemm_input_shape[2]*self.gemm_input_shape[3], self.gemm_weight_shape[0]] # Consider Batch size 1
-
-    def def_kernel(self) ->str:
-        X, W = self.input_nodes[0], self.input_nodes[1]
-        Y = self.output_node
-
-        def flatten(shape):
-            r = 1
-            for i in shape:
-                r *= i
-            return r
-
-        input_size = flatten(X.layout.size)
-        weight_size = flatten(W.layout.size)
-        output_size = flatten(Y.layout.size)
-        return f"%X: memref<{input_size}xf32>, %W: memref<{weight_size}xf32>, %Y: memref<{output_size}xf32>"
-
-    def get_tile_option(self):
-        I_C, H, W = self.input_nodes[0].layout.size[1], self.input_nodes[0].layout.size[2], self.input_nodes[0].layout.size[3]
-        O_C = self.input_nodes[1].layout.size[0]
-        
-        tile_k_options = divisors(I_C)
-        tile_n_options = divisors(O_C)
-
-        H_divisors = divisors(H)
-        H_multiples = list(Range(H, H * W, H))
-        tile_m_options = sorted(set(H_divisors) | set(H_multiples))
-        
-        return tile_m_options, tile_n_options, tile_k_options
-        
-
-    def render(self,
-               kernel: MLIRTemplateKernel,
-               template_buffer_node = None,
-               epilogue_nodes: Optional[List[IRNode]] = None,
-               **kwargs):
-        if template_buffer_node is not None:
-            self.output_node = template_buffer_node
-        if epilogue_nodes is not None and len(epilogue_nodes) > 0:
-            self.output_node = cast(Buffer, epilogue_nodes[-1])
-            self.function_name += f"_fused_{epilogue_nodes[0].node.origin_node.name}"
-
-        X, W = self.input_nodes[0], self.input_nodes[1]
-        Y = self.output_node
-        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
-
-        M = self.gemm_input_shape[2] * self.gemm_input_shape[3]
-        N = self.gemm_weight_shape[0]
-        K = self.gemm_weight_shape[1]
-        # TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
-        # kernel.tile_size = [TILE_M, TILE_N, TILE_K]
-        # kernel.loop_size = [M, N, K]
-
-        tile_m_options, tile_n_options, tile_k_options = self.get_tile_option()
-        print("tile_m_options: ", tile_m_options)
-        print("tile_n_options: ", tile_n_options)
-        print("tile_k_options: ", tile_k_options)
-
-        TILE_M = tile_m_options[3]
-        TILE_N = tile_n_options[3]
-        TILE_K = tile_k_options[3]
-
-        kernel.tile_size = [TILE_M, TILE_N, TILE_K]
-        kernel.loop_size = [M, N, K]
-
-        # W_transposed = self.is_transposed(W)
-        # X_transposed = self.is_transposed(X)
-
-        options = dict(
-            KERNEL_NAME=self.name,
-            KERNEL_DEF=self.def_kernel(),
-            kernel=kernel,
-            I_C=X.layout.size[1],
-            I_H=X.layout.size[2],
-            I_W=X.layout.size[3],
-            O_C=W.layout.size[0],
-            K_H=W.layout.size[2],
-            K_W=W.layout.size[3],
-            M=M,
-            N=N,
-            K=K,
-            TILE_M=TILE_M,
-            TILE_N=TILE_N,
-            TILE_K=TILE_K,
-            PADDING_H=self.padding[0],
-            PADDING_W=self.padding[1],
-            STRIDE_H=self.stride[0],
-            STRIDE_W=self.stride[1],
-            DILATION_H=self.dilation[0],
-            DILATION_W=self.dilation[1],
-            DATA_STYPE="f32",
-            DATA_SIZE=4,
-        )
-        code = self._template_from_string(GEMM_TEMPLATE).render(**options)
-
-        self.header = f"float X_spad[{TILE_M * TILE_K // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{TILE_K * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{TILE_M * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
-
-        kernel.add_loop_info([options["M"], options["N"], options["K"]], [options["TILE_M"], options["TILE_N"], options["TILE_K"]])
-        kernel.def_kernel(inputs=[X, W], outputs=[Y], names_str="X, W, Y", input_reorder=self.input_reorder)
-
-        return code
-
-    def outer_func_render(self, kernel_name, input_args):
-        options = dict(
-            KERNEL_NAME=kernel_name,
-            FUNC_NAME=self.function_name,
-            INPUT=input_args[0],
-            WEIGHT=input_args[1],
-            OUT=input_args[3] if len(input_args) == 4 else input_args[2],
-            VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
-            BACKENDSIM_EAGER_MODE=extension_config.CONFIG_BACKENDSIM_EAGER_MODE,
-            HASH_VALUE=self.hash_value
-        )
-        code = self._template_from_string(CONV2D_FUNC_TEMPLATE).render(**options)
-        return code, self.function_name
-
-    def get_arg_attributes(self):
-        arg_attributes = []
-
-        input_shape = self.input_nodes[0].get_size()
-        weight_shape = self.input_nodes[1].get_size()
-        gemm_h = int((input_shape[2] + 2*self.padding[0] - (weight_shape[2]-1) - 1) / self.stride[0]) + 1
-        gemm_w = int((input_shape[3] + 2*self.padding[1] - (weight_shape[3]-1) - 1) / self.stride[1]) + 1
-
-        gemm_input_shape = [input_shape[0],input_shape[1],gemm_h, gemm_w]
-        gemm_weight_shape = [weight_shape[0],weight_shape[1],1,1]
-        gemm_output_shape = [gemm_input_shape[2]*gemm_input_shape[3], gemm_weight_shape[0]] # Consider Batch size 1
-
-        arg_attributes.append([self.gemm_args[0], [MLIRKernelArgs.MLIR_ARGS_IN, self.input_nodes[0].layout.dtype, math.prod(gemm_input_shape)]])
-        arg_attributes.append([self.gemm_args[1], [MLIRKernelArgs.MLIR_ARGS_IN, self.input_nodes[1].layout.dtype, math.prod(gemm_weight_shape)]])
-        # arg_attributes.append([self.gemm_args[2], [MLIRKernelArgs.MLIR_ARGS_IN, self.input_nodes[0].layout.dtype, math.prod(gemm_output_shape)]])
-        arg_attributes.append([self.gemm_args[2], [MLIRKernelArgs.MLIR_ARGS_OUT, self.input_nodes[0].layout.dtype, math.prod(gemm_output_shape)]])
-
-        return arg_attributes
-
-    def codegen_header(self, code, extra_headers):
-        write_path = extension_codecache.get_write_path(code)
-        if not os.path.exists(write_path):
-            os.makedirs(write_path)
-        spike_write_path = os.path.join(write_path, "global_var.h")
-        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
-        if not os.path.exists(spike_write_path):
-            write_atomic(spike_write_path, self.header+extra_headers[0])
-        if not os.path.exists(gem5_write_path):
-            write_atomic(gem5_write_path, self.gem5_header+extra_headers[1])
-        self.hash_value = get_hash(code.strip())
\ No newline at end of file

From 30e0cffad093ccc5278ffd70be0500deccb45e57 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 5 Feb 2025 08:44:58 +0000
Subject: [PATCH 074/432] [Template] Convolution padding implement

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 69 ++++++-------------
 1 file changed, 21 insertions(+), 48 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 7f9bf296..bbd17b28 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -14,14 +14,6 @@
 from torch._inductor.codecache import get_hash
 from PyTorchSimFrontend import extension_config
 
-
-# %tmp_i_h = arith.muli %o_h, %stride_h : index
-# %index_i_h = arith.addi %tmp_i_h, %k_h : index
-# %tmp_i_w = arith.muli %o_w, %stride_w : index
-# %index_i_w = arith.addi %tmp_i_w, %k_w : index
-# %tmp_k = arith.muli %k_h, %K_W : index
-# %index_k_hw = arith.addi %tmp_k, %k_w : index
-
 GEMM_TEMPLATE = r"""
 // Conv2D kernel
 // BATCH = {{ BATCH }}
@@ -49,7 +41,7 @@
 // DATA_SIZE = {{ DATA_SIZE }}
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ I_W * BATCH * I_C }} + d1 * {{ BATCH * I_C }} + d2 * {{ I_C }} + d3)> // input (I_H, I_W, BATCH, I_C)
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ BATCH * I_C }} + d2 * {{ I_C }} + d3)> // input (I_H, I_W, BATCH, I_C)
 #map2 = affine_map<(d0, d1, d2) -> (d0 * {{ I_C * O_C }} + d1 * {{ O_C }} + d2)> // weight (K_H * K_W, I_C, O_C)
 #map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
 #map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
@@ -111,7 +103,7 @@
                 %index2 = affine.apply #map2(%index_k_hw, %tile_k, %tile_n) // weight index
                 // Load input matrix
                 memref.dma_start %X[%index1], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * I_C * I_H * I_W }}xf32>, memref<{{ TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K_PADDING }}], async=1, sram_stride=[1, {{ TILE_M_PADDING }}]}
+                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K_PADDING }}], async=1, sram_stride=[1, {{ TILE_M_PADDING }}]}
                 // Load kernel matrix
                 memref.dma_start %W[%index2], %W_buffer[%c0, %c0], %c_mvin, %tag[%c0], %weight_axis, %vstride
                     : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K_PADDING }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
@@ -141,9 +133,6 @@ def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif
     {{ INPUT }}_padding = torch.zeros(padded_shape, device={{ INPUT }}.device)
     {{ INPUT }}_padding[:, :, {{ PADDING_H }}:{{ INPUT }}.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:{{ INPUT }}.shape[3] + {{ PADDING_W }}] = {{ INPUT }}
 
-    print(f"input_padding")
-    print({{ INPUT }}_padding.cpu())
-
     # Tanspose tensors
     t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
     t_{{ WEIGHT }} = {{ WEIGHT }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
@@ -166,9 +155,7 @@ def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
             + "_".join([str(i) for i in self.stride]) \
             + "_" + "_".join([str(i) for i in self.padding]) \
             + "_" + "_".join([str(i) for i in self.dilation])
-        self.gemm_args = ['X', 'W', 'Bias', 'Y']
-
-        self.calculate_gemm_shape()
+        self.kernel_args = ['X', 'W', 'Bias', 'Y']
 
     def is_transposed(self, node):
         if isinstance(node, ReinterpretView):
@@ -179,44 +166,28 @@ def is_transposed(self, node):
                   raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
         return False
 
-    # Is this function necessary? (Find output node instead)
-    def calculate_gemm_shape(self):
-        input_shape = self.input_nodes[0].get_size()
-        weight_shape = self.input_nodes[1].get_size()
-        gemm_h = int((input_shape[2] + 2*self.padding[0] - (weight_shape[2]-1) - 1) / self.stride[0]) + 1
-        gemm_w = int((input_shape[3] + 2*self.padding[1] - (weight_shape[3]-1) - 1) / self.stride[1]) + 1
-
-        self.gemm_input_shape = [input_shape[0],input_shape[1],gemm_h, gemm_w]
-        self.gemm_weight_shape = [weight_shape[0],weight_shape[1],1,1]
-        self.gemm_output_shape = [self.gemm_input_shape[2]*self.gemm_input_shape[3], self.gemm_weight_shape[0]] # Consider Batch size 1
-
-
     # Can use math.multi ?
     def def_kernel(self) ->str:
         X, W = self.input_nodes[0], self.input_nodes[1]
+        Y = self.output_node
         if len(self.input_nodes) == 3:
           Bias = self.input_nodes[2]
         else:
           Bias = None
 
-        Y = self.output_node
-
-        def flatten(shape):
-            r = 1
-            for i in shape:
-                r *= i
-            return r
-
-        input_size = flatten(X.layout.size)
-        weight_size = flatten(W.layout.size)
+        input_padded = list(X.layout.size)
+        input_padded[2] += 2 * self.padding[0]
+        input_padded[3] += 2 * self.padding[1]
+        input_size = math.prod(input_padded)
+        weight_size = math.prod(W.layout.size)
         if Bias is not None:
-          bias_size = flatten(Bias.layout.size)
-        output_size = flatten(Y.layout.size)
+          bias_size = math.prod(Bias.layout.size)
+        output_size = math.prod(Y.layout.size)
 
         if Bias is None:
-          return f"%{self.gemm_args[0]}: memref<{input_size}xf32>, %{self.gemm_args[1]}: memref<{weight_size}xf32>, %{self.gemm_args[3]}: memref<{output_size}xf32>"
+          return f"%{self.kernel_args[0]}: memref<{input_size}xf32>, %{self.kernel_args[1]}: memref<{weight_size}xf32>, %{self.kernel_args[3]}: memref<{output_size}xf32>"
         else:
-          return f"%{self.gemm_args[0]}: memref<{input_size}xf32>, %{self.gemm_args[1]}: memref<{weight_size}xf32>, %{self.gemm_args[2]}: memref<{bias_size}xf32>, %{self.gemm_args[3]}: memref<{output_size}xf32>"
+          return f"%{self.kernel_args[0]}: memref<{input_size}xf32>, %{self.kernel_args[1]}: memref<{weight_size}xf32>, %{self.kernel_args[2]}: memref<{bias_size}xf32>, %{self.kernel_args[3]}: memref<{output_size}xf32>"
 
     def get_tile_options(self):
         BATCH, I_C = self.input_nodes[0].layout.size[0], self.input_nodes[0].layout.size[1]
@@ -243,8 +214,8 @@ def render(self,
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
-        O_H = self.gemm_input_shape[2]
-        O_W = self.gemm_input_shape[3]
+        O_H = Y.layout.size[2]
+        O_W = Y.layout.size[3]
 
         tile_m_options, tile_n_options, tile_k_options = self.get_tile_options()
 
@@ -336,6 +307,8 @@ def get_arg_attributes(self):
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
         X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)]
+        X_shape[0] += 2 * self.padding[0]
+        X_shape[1] += 2 * self.padding[1]
         W_shape = [W.get_size()[i] for i in (2, 3, 1, 0)]
         Y_shape = [Y.get_size()[i] for i in (2, 3, 0, 1)]
 
@@ -354,11 +327,11 @@ def compute_stride(shape):
         if Bias is not None:
           Bias_stride = compute_stride(Bias_shape)
 
-        arg_attributes.append([self.gemm_args[0], [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X.get_size()), X_shape, X_stride]])
-        arg_attributes.append([self.gemm_args[1], [MLIRKernelArgs.MLIR_ARGS_IN, W.layout.dtype, math.prod(W.get_size()), W_shape, W_stride]])
+        arg_attributes.append([self.kernel_args[0], [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]])
+        arg_attributes.append([self.kernel_args[1], [MLIRKernelArgs.MLIR_ARGS_IN, W.layout.dtype, math.prod(W_shape), W_shape, W_stride]])
         if Bias is not None:
-          arg_attributes.append([self.gemm_args[2], [MLIRKernelArgs.MLIR_ARGS_IN, Bias.layout.dtype, math.prod(Bias.get_size()), Bias_shape, Bias_stride]])
-        arg_attributes.append([self.gemm_args[3], [MLIRKernelArgs.MLIR_ARGS_OUT, Y.layout.dtype, math.prod(Y.get_size()), Y_shape, Y_stride]])
+          arg_attributes.append([self.kernel_args[2], [MLIRKernelArgs.MLIR_ARGS_IN, Bias.layout.dtype, math.prod(Bias_shape), Bias_shape, Bias_stride]])
+        arg_attributes.append([self.kernel_args[3], [MLIRKernelArgs.MLIR_ARGS_OUT, Y.layout.dtype, math.prod(Y_shape), Y_shape, Y_stride]])
 
         return arg_attributes
 

From 8821ffe978f443578e30220c39e7d3a132b9600a Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 6 Feb 2025 03:03:10 +0000
Subject: [PATCH 075/432] [Template] convolution template cleanup

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 93 +++++++++----------
 1 file changed, 43 insertions(+), 50 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index bbd17b28..90214f19 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -14,7 +14,7 @@
 from torch._inductor.codecache import get_hash
 from PyTorchSimFrontend import extension_config
 
-GEMM_TEMPLATE = r"""
+CONV_TEMPLATE = r"""
 // Conv2D kernel
 // BATCH = {{ BATCH }}
 // I_C = {{ I_C }}
@@ -28,9 +28,9 @@
 // TILE_M = {{ TILE_M }}
 // TILE_N = {{ TILE_N }}
 // TILE_K = {{ TILE_K }}
-// TILE_M_PADDING = {{ TILE_M_PADDING }}
-// TILE_N_PADDING = {{ TILE_N_PADDING }}
-// TILE_K_PADDING = {{ TILE_K_PADDING }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
 // PADDING_H = {{ PADDING_H }}
 // PADDING_W = {{ PADDING_W }}
 // STRIDE_H = {{ STRIDE_H }}
@@ -47,9 +47,9 @@
 #map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
 #map_K_HW = affine_map<(d0, d1) -> (d0 * {{ K_W }} + d1)>
 
-memref.global @X_spad : memref<{{TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>
-memref.global @Y_spad : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>
+memref.global @X_spad : memref<{{TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 
 func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
   %c_mvin = arith.constant 2 : index
@@ -59,11 +59,11 @@
   %vstride = arith.constant 1 : index
   %input_axis = arith.constant 3 : index
   %weight_axis = arith.constant 2 : index
-  %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>
-  %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>
+  %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+  %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+  %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
-  %v0 = arith.constant dense<0.0> : vector<{{ TILE_N_PADDING }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ TILE_N }}xf32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -77,8 +77,8 @@
       // 1x1 convolution tiling loop
       affine.for %o_h = 0 to {{ O_H }} {
         affine.for %o_w = 0 to {{ O_W }} {
-          affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M_PADDING }} {
-            affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N_PADDING }} {
+          affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
+            affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
               // Init output matrix
               %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
               %cond_h = arith.cmpi eq, %k_h, %c0 : index
@@ -87,15 +87,15 @@
               scf.if %cond_hw {
                 {%- if BIAS %}
                 memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag[%c0], %c0, %vstride
-                    : memref<{{ O_C }}xf32>, memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, memref<1xi32> { async=1, sram_stride=[1, {{ TILE_M_PADDING }}]}
+                    : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { async=1, sram_stride=[1, {{ TILE_M }}]}
                 {%- else %}
-                affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, vector<{{ TILE_N_PADDING }}xf32>
+                affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_N }}xf32>
                 {%- endif %}
               } else {
                 memref.dma_start %Y[%index0], %Y_buffer[%c0, %c0], %c_mvin, %tag[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, memref<1xi32> { padding=0, sram_stride=[1, {{ TILE_M_PADDING }}]}
+                    : memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { padding=0, sram_stride=[1, {{ TILE_M }}]}
               }
-              affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K_PADDING }} {
+              affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
                 %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
                 %index_i_w = affine.apply #map_I_W(%o_w, %k_w)
                 %index1 = affine.apply #map1(%index_i_h, %index_i_w, %tile_m, %tile_k) // input index
@@ -103,17 +103,17 @@
                 %index2 = affine.apply #map2(%index_k_hw, %tile_k, %tile_n) // weight index
                 // Load input matrix
                 memref.dma_start %X[%index1], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K_PADDING }}], async=1, sram_stride=[1, {{ TILE_M_PADDING }}]}
+                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
                 // Load kernel matrix
                 memref.dma_start %W[%index2], %W_buffer[%c0, %c0], %c_mvin, %tag[%c0], %weight_axis, %vstride
-                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K_PADDING }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
+                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
                 // matmul
-                linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M_PADDING }}x{{ TILE_K_PADDING }}xf32, 1>, memref<{{ TILE_K_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>)
-                      outs(%Y_buffer : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>)
+                linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>)
+                      outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>)
               } { accumulation_loop=true }
               // Store output matrix
               memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag[%c0], %input_axis, %vstride
-                  : memref<{{ TILE_M_PADDING }}x{{ TILE_N_PADDING }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[1, {{ TILE_M_PADDING }}]}
+                  : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[1, {{ TILE_M }}]}
             } { outer_loop=true }
           } { outer_loop=true }
         } { outer_loop=true }
@@ -124,7 +124,7 @@
 }
 """
 
-CONV2D_FUNC_TEMPLATE = r"""
+WRAPPER_TEMPLATE = r"""
 def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, {{ OUT }}):
     # Padding input
     padded_shape = list({{ INPUT }}.shape)
@@ -190,12 +190,13 @@ def def_kernel(self) ->str:
           return f"%{self.kernel_args[0]}: memref<{input_size}xf32>, %{self.kernel_args[1]}: memref<{weight_size}xf32>, %{self.kernel_args[2]}: memref<{bias_size}xf32>, %{self.kernel_args[3]}: memref<{output_size}xf32>"
 
     def get_tile_options(self):
-        BATCH, I_C = self.input_nodes[0].layout.size[0], self.input_nodes[0].layout.size[1]
+        BATCH = self.input_nodes[0].layout.size[0]
+        I_C = self.input_nodes[0].layout.size[1]
         O_C = self.input_nodes[1].layout.size[0]
 
         tile_m_options = divisors(BATCH)
-        tile_n_options = divisors(O_C)
         tile_k_options = divisors(I_C)
+        tile_n_options = divisors(O_C)
 
         return tile_m_options, tile_n_options, tile_k_options
 
@@ -214,28 +215,23 @@ def render(self,
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
-        O_H = Y.layout.size[2]
-        O_W = Y.layout.size[3]
-
-        tile_m_options, tile_n_options, tile_k_options = self.get_tile_options()
-
-        TILE_M = tile_m_options[0]
-        TILE_N = tile_n_options[0]
-        TILE_K = tile_k_options[0]
-
         BATCH = X.layout.size[0]
         I_C = X.layout.size[1]
         O_C = W.layout.size[0]
         K_H = W.layout.size[2]
         K_W = W.layout.size[3]
+        O_H = Y.layout.size[2]
+        O_W = Y.layout.size[3]
 
-        TILE_M_PADDING= int((TILE_M + kernel.vector_lane - 1) // kernel.vector_lane * kernel.vector_lane)
-        TILE_N_PADDING= int((TILE_N + kernel.vector_lane - 1) // kernel.vector_lane * kernel.vector_lane)
-        TILE_K_PADDING= int((TILE_K + kernel.vector_lane - 1) // kernel.vector_lane * kernel.vector_lane)
+        # FIXME: fixed tile size
+        TILE_M = kernel.vector_lane
+        TILE_N = kernel.vector_lane
+        TILE_K = kernel.vector_lane
 
-        kernel.tile_size = [TILE_M_PADDING, TILE_N_PADDING, TILE_K_PADDING]
+        kernel.tile_size = [TILE_M, TILE_N, TILE_K]
         kernel.loop_size = [K_H, K_W, O_H, O_W, BATCH, O_C, I_C]
 
+        # FIXME: transposed inputs not supported
         # W_transposed = self.is_transposed(W)
         # X_transposed = self.is_transposed(X)
 
@@ -255,9 +251,6 @@ def render(self,
             TILE_M=TILE_M,
             TILE_N=TILE_N,
             TILE_K=TILE_K,
-            TILE_M_PADDING=TILE_M_PADDING,
-            TILE_N_PADDING=TILE_N_PADDING,
-            TILE_K_PADDING=TILE_K_PADDING,
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
             STRIDE_H=self.stride[0],
@@ -268,16 +261,16 @@ def render(self,
             DATA_SIZE=4,
             BIAS=Bias
         )
-        code = self._template_from_string(GEMM_TEMPLATE).render(**options)
+        code = self._template_from_string(CONV_TEMPLATE).render(**options)
 
-        self.header = f"float X_spad[{TILE_M_PADDING * TILE_K_PADDING // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{TILE_K_PADDING * TILE_N_PADDING // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{TILE_M_PADDING * TILE_N_PADDING // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header = f"float X_spad[{TILE_M_PADDING * TILE_K_PADDING}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float W_spad[{TILE_K_PADDING * TILE_N_PADDING}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float Y_spad[{TILE_M_PADDING * TILE_N_PADDING}] __attribute__ ((section(\".spad\")));\n"
+        self.header = f"float X_spad[{TILE_M * TILE_K // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float W_spad[{TILE_K * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float Y_spad[{TILE_M * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
 
-        kernel.add_loop_info([options["K_H"], options["K_W"], options["O_H"], options["O_W"], options["BATCH"], options["O_C"], options["I_C"]], [options["TILE_M_PADDING"], options["TILE_N_PADDING"], options["TILE_K_PADDING"]])
+        kernel.add_loop_info([options["K_H"], options["K_W"], options["O_H"], options["O_W"], options["BATCH"], options["O_C"], options["I_C"]], [options["TILE_M"], options["TILE_N"], options["TILE_K"]])
         kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=self.input_reorder)
 
         return code
@@ -296,7 +289,7 @@ def outer_func_render(self, kernel_name, input_args):
             BACKENDSIM_EAGER_MODE=extension_config.CONFIG_BACKENDSIM_EAGER_MODE,
             HASH_VALUE=self.hash_value
         )
-        code = self._template_from_string(CONV2D_FUNC_TEMPLATE).render(**options)
+        code = self._template_from_string(WRAPPER_TEMPLATE).render(**options)
         return code, self.function_name
 
     def get_arg_attributes(self):

From f7020dbcfdb218f47464ffbd3364939cb19ccfe8 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Thu, 6 Feb 2025 11:29:28 +0000
Subject: [PATCH 076/432] [Fix] Bias Shape

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 90214f19..b77b3303 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -306,7 +306,7 @@ def get_arg_attributes(self):
         Y_shape = [Y.get_size()[i] for i in (2, 3, 0, 1)]
 
         if Bias is not None:
-          Bias_shape = [Bias.get_size()]
+          Bias_shape = Bias.get_size()
 
         def compute_stride(shape):
             stride = [1] * len(shape)

From 23b9f3983dcae2b9549866d562469f4c2b658ba1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 6 Feb 2025 15:37:15 +0000
Subject: [PATCH 077/432] [Frontend] Use brk for spike and place spad space
 behind the DRAM

---
 PyTorchSimFrontend/extension_codecache.py       | 4 +++-
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 4 ++++
 PyTorchSimFrontend/mlir/mlir_common.py          | 2 +-
 Simulator/simulator.py                          | 7 ++++---
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 13e11094..d1e7671b 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -146,6 +146,8 @@ def load(cls, source_code,
             link_option = ""
         # Generate LLVM kernel calller and binary for validation
         if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE:
+            # Use custom malloc to avoid size error
+            new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
             cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=256)
             opt_cmd = shlex.split(cmds[0])
             translate_cmd = shlex.split(cmds[1])
@@ -163,7 +165,7 @@ def load(cls, source_code,
                 val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, arg_attributes)
                 val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name)
                 val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
-                                                   validation_binary_name, link_option)
+                                                   validation_binary_name, new_link_option)
         # Launch tile graph generator
         gem5_sample_cmd = shlex.split(gem5_cmds[0])
         gem5_translate_cmd = shlex.split(gem5_cmds[1])
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 8eac59bf..5deefeef 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -703,6 +703,10 @@ def __init__(self, kernel_group):
         self.global_vars = IndentedBuffer()
         self.global_vars_dict = dict()
         self.header = IndentedBuffer()
+        self.header.writeline("#include <unistd.h>")
+        self.header.writeline("#include <stdlib.h>")
+        self.header.writeline("void* __wrap_malloc(size_t size) { return sbrk(size); }")
+        self.header.writeline("void __wrap_free(void *ptr) { return; }")
         self.gem5_header = IndentedBuffer()
         self.reduction_vars = {}
         self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc")
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index c88a51dd..fc836e92 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -303,7 +303,7 @@ def __init__(self):
         self.vector_lane = 128
         self.spad_info = {
             "spad_vaddr" : 0xD0000000,
-            "spad_paddr" : 0xD0000000,
+            "spad_paddr" : 0x2000000000,
             "spad_size" : 128 << 10 # 128KB per Lane
         }
         self.precision = 4 # 32bit
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 7b123f4e..b6292f07 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -102,13 +102,14 @@ def run_spike(self, args, arg_attributes, path, binary, intermediate_op=None, ve
         file_path_str = ' '.join(file_path)
 
         # Set hardware information
-        spad_option = f"--scratchpad-base-paddr={spad_info['spad_paddr']} " + \
+        spad_option = f"-m0x{0x80000000:x}:0x{100<<30:x},0x{spad_info['spad_paddr']:x}:0x{spad_info['spad_size']*vectorlane_size:x} " + \
+            f"--scratchpad-base-paddr={spad_info['spad_paddr']} " + \
             f"--scratchpad-base-vaddr={spad_info['spad_vaddr']} " + \
-            f"--scratchpad-size={spad_info['spad_size']}"
+            f"--scratchpad-size={spad_info['spad_size']} "
         vectorlane_option = f"--vectorlane-size={vectorlane_size}"
         kernel_address = f"--kernel-addr={kernel_start_addr}:{kernel_end_addr}"
         base_addr = f"--base-path={path}"
-        run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 -m102400 {vectorlane_option} {spad_option} {kernel_address} {base_addr} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
+        run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_addr} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
 
         print("[SpikeSimulator] cmd> ", run)
         run_cmd = shlex.split(run)

From 9c06bed662682d1ede89e3ee28095ec817b36045 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Fri, 7 Feb 2025 06:06:39 +0000
Subject: [PATCH 078/432] [Frontend] Remove wasted padding

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 16 ++++++++++------
 PyTorchSimFrontend/mlir/mlir_template.py      |  6 +++---
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index ed643f71..dd5afe2e 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -46,7 +46,7 @@
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
         , %vstride : memref<
         {%- if Bias_rank == 2 -%}  {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-        xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ kernel.vector_lane }}, 1] }
+        xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
       {%- else %}
       affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>
       {%- endif %}
@@ -54,9 +54,9 @@
         %index0 = affine.apply #map0(%t_m, %t_k)
         %index1 = affine.apply #map1(%t_k, %t_n)
         memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
-           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
         memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
-           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
@@ -111,6 +111,8 @@ def render(self,
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
             template = GEMM_TEMPLATE
         kernel.loop_size =[M, N, K]
+        SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
 
         W_transposed = self.is_transposed(W)
         X_transposed = self.is_transposed(X)
@@ -124,6 +126,8 @@ def render(self,
             TILE_M=TILE_M,
             TILE_N=TILE_N,
             TILE_K=TILE_K,
+            SUB_TILE_M=SUB_TILE_M,
+            SUB_TILE_N=SUB_TILE_N,
             DATA_STYPE="f32",
             DATA_SIZE=4,
             X = X,
@@ -139,9 +143,9 @@ def render(self,
         )
         code = self._template_from_string(template).render(**kernel.render_options)
 
-        self.header = f"float X_spad[{TILE_M * TILE_K // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{TILE_K * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{TILE_M * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header = f"float X_spad[{TILE_M * ((TILE_K + kernel.vector_lane - 1) // kernel.vector_lane)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float W_spad[{TILE_K * ((TILE_N + kernel.vector_lane - 1) // kernel.vector_lane)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float Y_spad[{TILE_M * ((TILE_N + kernel.vector_lane - 1) // kernel.vector_lane)}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index d29f665a..5c156abc 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -118,9 +118,9 @@ def gemmini_gemm_mapping(self, M, N, K):
     def gemm_combination_mapping(self, M, N, K):
         spad_size = self.spad_info["spad_size"] * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
-        M_padded = ((M + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
-        N_padded = ((N + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
-        K_padded = ((K + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
+        M_padded = ((M + self.vector_lane - 1) // self.vector_lane) * self.vector_lane if M > self.vector_lane else M
+        N_padded = ((N + self.vector_lane - 1) // self.vector_lane) * self.vector_lane if N > self.vector_lane else N
+        K_padded = ((K + self.vector_lane - 1) // self.vector_lane) * self.vector_lane if K > self.vector_lane else K
 
         max_used_spad_size = 0
         mapping = (self.vector_lane, self.vector_lane, self.vector_lane)

From 6541340733cb80691be69dd5af0ccebfbcdb79a9 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Fri, 7 Feb 2025 07:15:11 +0000
Subject: [PATCH 079/432] [Template] Change convolution loop sequence to place
 K_H, K_W as accumulation loop

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 64 ++++++++-----------
 1 file changed, 27 insertions(+), 37 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index b77b3303..04bfcc53 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -71,30 +71,20 @@
   %stride_h = arith.constant {{ STRIDE_H }} : index
   %stride_w = arith.constant {{ STRIDE_W }} : index
 
-  // 1x1 convolution loop
-  affine.for %k_h = 0 to {{ K_H }} {
-    affine.for %k_w = 0 to {{ K_W }} {
-      // 1x1 convolution tiling loop
-      affine.for %o_h = 0 to {{ O_H }} {
-        affine.for %o_w = 0 to {{ O_W }} {
-          affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
-            affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
-              // Init output matrix
-              %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
-              %cond_h = arith.cmpi eq, %k_h, %c0 : index
-              %cond_w = arith.cmpi eq, %k_w, %c0 : index
-              %cond_hw = arith.andi %cond_h, %cond_w : i1
-              scf.if %cond_hw {
-                {%- if BIAS %}
-                memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag[%c0], %c0, %vstride
-                    : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { async=1, sram_stride=[1, {{ TILE_M }}]}
-                {%- else %}
-                affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_N }}xf32>
-                {%- endif %}
-              } else {
-                memref.dma_start %Y[%index0], %Y_buffer[%c0, %c0], %c_mvin, %tag[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { padding=0, sram_stride=[1, {{ TILE_M }}]}
-              }
+  affine.for %o_h = 0 to {{ O_H }} {
+    affine.for %o_w = 0 to {{ O_W }} {
+      affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
+        affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
+          %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
+          // Initialize output
+          {%- if BIAS %}
+          memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag[%c0], %c0, %vstride
+              : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { async=1, sram_stride=[1, {{ TILE_M }}]}
+          {%- else %}
+          affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_N }}xf32>
+          {%- endif %}
+          affine.for %k_h = 0 to {{ K_H }} {
+            affine.for %k_w = 0 to {{ K_W }} {
               affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
                 %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
                 %index_i_w = affine.apply #map_I_W(%o_w, %k_w)
@@ -111,11 +101,11 @@
                 linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>)
                       outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>)
               } { accumulation_loop=true }
-              // Store output matrix
-              memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag[%c0], %input_axis, %vstride
-                  : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[1, {{ TILE_M }}]}
-            } { outer_loop=true }
-          } { outer_loop=true }
+            } { accumulation_loop=true }
+          } { accumulation_loop=true }
+          // Store output matrix
+          memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag[%c0], %input_axis, %vstride
+              : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[1, {{ TILE_M }}]}
         } { outer_loop=true }
       } { outer_loop=true }
     } { outer_loop=true }
@@ -207,9 +197,9 @@ def render(self,
                **kwargs):
         if template_buffer_node is not None:
             self.output_node = template_buffer_node
-        if epilogue_nodes is not None and len(epilogue_nodes) > 0:
-            self.output_node = cast(Buffer, epilogue_nodes[-1])
-            self.function_name += f"_fused_{epilogue_nodes[0].node.origin_node.name}"
+        # if epilogue_nodes is not None and len(epilogue_nodes) > 0:
+        #     self.output_node = cast(Buffer, epilogue_nodes[-1])
+        #     self.function_name += f"_fused_{epilogue_nodes[0].node.origin_node.name}"
 
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
@@ -220,8 +210,8 @@ def render(self,
         O_C = W.layout.size[0]
         K_H = W.layout.size[2]
         K_W = W.layout.size[3]
-        O_H = Y.layout.size[2]
-        O_W = Y.layout.size[3]
+        O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2]
+        O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
 
         # FIXME: fixed tile size
         TILE_M = kernel.vector_lane
@@ -235,7 +225,7 @@ def render(self,
         # W_transposed = self.is_transposed(W)
         # X_transposed = self.is_transposed(X)
 
-        options = dict(
+        kernel.render_options = dict(
             KERNEL_NAME=self.name,
             KERNEL_DEF=self.def_kernel(),
             kernel=kernel,
@@ -261,7 +251,7 @@ def render(self,
             DATA_SIZE=4,
             BIAS=Bias
         )
-        code = self._template_from_string(CONV_TEMPLATE).render(**options)
+        code = self._template_from_string(CONV_TEMPLATE).render(**kernel.render_options)
 
         self.header = f"float X_spad[{TILE_M * TILE_K // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
         self.header += f"float W_spad[{TILE_K * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
@@ -270,7 +260,7 @@ def render(self,
         self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
 
-        kernel.add_loop_info([options["K_H"], options["K_W"], options["O_H"], options["O_W"], options["BATCH"], options["O_C"], options["I_C"]], [options["TILE_M"], options["TILE_N"], options["TILE_K"]])
+        kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
         kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=self.input_reorder)
 
         return code

From 863f0249ea353ffd391bb3b8e292c76e9038ed01 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 7 Feb 2025 10:40:51 +0000
Subject: [PATCH 080/432] [Frontend] Use separate tag for dependency tracking

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 04bfcc53..d98a2a15 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -63,6 +63,10 @@
   %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
+  %tag0 = memref.alloc() : memref<1xi32>
+  %tag1 = memref.alloc() : memref<1xi32>
+  %tag2 = memref.alloc() : memref<1xi32>
+  %tag3 = memref.alloc() : memref<1xi32>
   %v0 = arith.constant dense<0.0> : vector<{{ TILE_N }}xf32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -78,8 +82,8 @@
           %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
           // Initialize output
           {%- if BIAS %}
-          memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag[%c0], %c0, %vstride
-              : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { async=1, sram_stride=[1, {{ TILE_M }}]}
+          memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride
+              : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_M }}, {{ TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}]}
           {%- else %}
           affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_N }}xf32>
           {%- endif %}
@@ -92,10 +96,10 @@
                 %index_k_hw = affine.apply #map_K_HW(%k_h, %k_w)
                 %index2 = affine.apply #map2(%index_k_hw, %tile_k, %tile_n) // weight index
                 // Load input matrix
-                memref.dma_start %X[%index1], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %input_axis, %vstride
+                memref.dma_start %X[%index1], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
                     : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
                 // Load kernel matrix
-                memref.dma_start %W[%index2], %W_buffer[%c0, %c0], %c_mvin, %tag[%c0], %weight_axis, %vstride
+                memref.dma_start %W[%index2], %W_buffer[%c0, %c0], %c_mvin, %tag2[%c0], %weight_axis, %vstride
                     : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
                 // matmul
                 linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>)
@@ -104,7 +108,7 @@
             } { accumulation_loop=true }
           } { accumulation_loop=true }
           // Store output matrix
-          memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag[%c0], %input_axis, %vstride
+          memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
               : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[1, {{ TILE_M }}]}
         } { outer_loop=true }
       } { outer_loop=true }

From 9c5000cd9ee30d4401f1681597fd59ca15b5b5c5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 7 Feb 2025 11:34:20 +0000
Subject: [PATCH 081/432] [Backend] Apply tag step size

---
 PyTorchSimBackend/src/TileGraphParser.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 0fb25b94..0c38cd9b 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -305,11 +305,12 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         }
       }
 
+      uint32_t step = std::stoi(tog_parser->getMetaByName("systolic_size"));
       for (auto loop_idx: tag_idx_list) {
         if (iter.find(loop_idx) == iter.end())
           tag_list.push_back(0);
         else {
-          auto iter_value = getLoopIndexValue(iter, loop_idx);
+          auto iter_value = getLoopIndexValue(iter, loop_idx) / step;
           tag_list.push_back(iter_value);
         }
       }

From b62318411d560e0d9cd22b296cf312d21662f807 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Fri, 7 Feb 2025 12:12:19 +0000
Subject: [PATCH 082/432] [Frontend] CONV remove wasted padding

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 25 +++++++++++--------
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 10 ++++----
 PyTorchSimFrontend/mlir/mlir_template.py      |  5 ++++
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index d98a2a15..1e95e605 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -67,7 +67,7 @@
   %tag1 = memref.alloc() : memref<1xi32>
   %tag2 = memref.alloc() : memref<1xi32>
   %tag3 = memref.alloc() : memref<1xi32>
-  %v0 = arith.constant dense<0.0> : vector<{{ TILE_N }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -85,7 +85,7 @@
           memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride
               : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_M }}, {{ TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}]}
           {%- else %}
-          affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_N }}xf32>
+          affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
           {%- endif %}
           affine.for %k_h = 0 to {{ K_H }} {
             affine.for %k_w = 0 to {{ K_W }} {
@@ -97,10 +97,10 @@
                 %index2 = affine.apply #map2(%index_k_hw, %tile_k, %tile_n) // weight index
                 // Load input matrix
                 memref.dma_start %X[%index1], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
                 // Load kernel matrix
                 memref.dma_start %W[%index2], %W_buffer[%c0, %c0], %c_mvin, %tag2[%c0], %weight_axis, %vstride
-                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
+                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
                 // matmul
                 linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>)
                       outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>)
@@ -218,11 +218,12 @@ def render(self,
         O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
 
         # FIXME: fixed tile size
-        TILE_M = kernel.vector_lane
-        TILE_N = kernel.vector_lane
-        TILE_K = kernel.vector_lane
+        TILE_M = kernel.vector_lane if kernel.vector_lane < BATCH else BATCH
+        TILE_N = kernel.vector_lane if kernel.vector_lane < O_C else O_C
+        TILE_K = kernel.vector_lane if kernel.vector_lane < I_C else I_C
+        SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
 
-        kernel.tile_size = [TILE_M, TILE_N, TILE_K]
         kernel.loop_size = [K_H, K_W, O_H, O_W, BATCH, O_C, I_C]
 
         # FIXME: transposed inputs not supported
@@ -245,6 +246,8 @@ def render(self,
             TILE_M=TILE_M,
             TILE_N=TILE_N,
             TILE_K=TILE_K,
+            SUB_TILE_M=SUB_TILE_M,
+            SUB_TILE_N=SUB_TILE_N,
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
             STRIDE_H=self.stride[0],
@@ -257,9 +260,9 @@ def render(self,
         )
         code = self._template_from_string(CONV_TEMPLATE).render(**kernel.render_options)
 
-        self.header = f"float X_spad[{TILE_M * TILE_K // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{TILE_K * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{TILE_M * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float Y_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index dd5afe2e..9426a2e7 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -32,7 +32,7 @@
   %tag0 = memref.alloc() : memref<1xi32>
   %tag1 = memref.alloc() : memref<1xi32>
   %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
   %c0 = arith.constant 0 : index
   {{- kernel.def_local_vars() }}
 
@@ -48,7 +48,7 @@
         {%- if Bias_rank == 2 -%}  {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
         xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
       {%- else %}
-      affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>
+      affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
       {%- endif %}
       affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
         %index0 = affine.apply #map0(%t_m, %t_k)
@@ -143,9 +143,9 @@ def render(self,
         )
         code = self._template_from_string(template).render(**kernel.render_options)
 
-        self.header = f"float X_spad[{TILE_M * ((TILE_K + kernel.vector_lane - 1) // kernel.vector_lane)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{TILE_K * ((TILE_N + kernel.vector_lane - 1) // kernel.vector_lane)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{TILE_M * ((TILE_N + kernel.vector_lane - 1) // kernel.vector_lane)}] __attribute__ ((section(\".spad\")));\n"
+        self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float Y_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 5c156abc..645dfe96 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -290,6 +290,11 @@ def render(self, template, kwargs):
             self.render_hooks,
         )
 
+    def get_spad_size_per_lane(self, tile_m, tile_n):
+        size = tile_m * ((tile_n + self.vector_lane - 1) // self.vector_lane)
+        size = 2 if size == 1 else size # vector load/store
+        return size
+
     def adjust_tile_size(self):
         # Fixed tile size for template kernel
         self.kernel_group.tile_desc.set_tile_size((self.render_options['TILE_M'], self.render_options['TILE_N']))

From 2e97977c62f6bb72c71a226ef0be1793c3d6adb1 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Sat, 8 Feb 2025 12:32:45 +0000
Subject: [PATCH 083/432] [Frontend] small mapping fixed

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  7 ++-----
 PyTorchSimFrontend/mlir/mlir_template.py      | 19 +++++++++++--------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 1e95e605..c46b7fa1 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -83,7 +83,7 @@
           // Initialize output
           {%- if BIAS %}
           memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride
-              : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_M }}, {{ TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+              : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}]}
           {%- else %}
           affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
           {%- endif %}
@@ -217,10 +217,7 @@ def render(self,
         O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2]
         O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
 
-        # FIXME: fixed tile size
-        TILE_M = kernel.vector_lane if kernel.vector_lane < BATCH else BATCH
-        TILE_N = kernel.vector_lane if kernel.vector_lane < O_C else O_C
-        TILE_K = kernel.vector_lane if kernel.vector_lane < I_C else I_C
+        TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(BATCH, O_C, I_C)
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 645dfe96..5928b493 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -118,15 +118,19 @@ def gemmini_gemm_mapping(self, M, N, K):
     def gemm_combination_mapping(self, M, N, K):
         spad_size = self.spad_info["spad_size"] * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
-        M_padded = ((M + self.vector_lane - 1) // self.vector_lane) * self.vector_lane if M > self.vector_lane else M
-        N_padded = ((N + self.vector_lane - 1) // self.vector_lane) * self.vector_lane if N > self.vector_lane else N
-        K_padded = ((K + self.vector_lane - 1) // self.vector_lane) * self.vector_lane if K > self.vector_lane else K
+        M_padded = ((M + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
+        N_padded = ((N + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
+        K_padded = ((K + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
+        M = max(M, 2)
 
         max_used_spad_size = 0
         mapping = (self.vector_lane, self.vector_lane, self.vector_lane)
-        for tile_M in range(self.vector_lane, M_padded + 1, self.vector_lane):
-            for tile_N in range(self.vector_lane, N_padded + 1, self.vector_lane):
-                for tile_K in range(self.vector_lane, K_padded + 1, self.vector_lane):
+        tile_M_range = range(self.vector_lane, M_padded + 1, self.vector_lane) if M > self.vector_lane else [M]
+        tile_N_range = range(self.vector_lane, N_padded + 1, self.vector_lane) if N > self.vector_lane else [N]
+        tile_K_range = range(self.vector_lane, K_padded + 1, self.vector_lane) if K > self.vector_lane else [K]
+        for tile_M in tile_M_range:
+            for tile_N in tile_N_range:
+                for tile_K in tile_K_range:
                     used_spad_size = (tile_M * tile_K + tile_K * tile_N + tile_M * tile_N) * self.precision
                     if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size:
                         max_used_spad_size = used_spad_size
@@ -292,8 +296,7 @@ def render(self, template, kwargs):
 
     def get_spad_size_per_lane(self, tile_m, tile_n):
         size = tile_m * ((tile_n + self.vector_lane - 1) // self.vector_lane)
-        size = 2 if size == 1 else size # vector load/store
-        return size
+        return max(size, 2) # vector load/store
 
     def adjust_tile_size(self):
         # Fixed tile size for template kernel

From 33d4fdf9910ea2091cef8ef262c2d634745534ba Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Mon, 10 Feb 2025 04:23:11 +0000
Subject: [PATCH 084/432] [Frontend] Small Padding limit 8

---
 PyTorchSimFrontend/mlir/mlir_template.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 5928b493..9d7419b5 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -118,16 +118,18 @@ def gemmini_gemm_mapping(self, M, N, K):
     def gemm_combination_mapping(self, M, N, K):
         spad_size = self.spad_info["spad_size"] * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
-        M_padded = ((M + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
-        N_padded = ((N + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
-        K_padded = ((K + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
-        M = max(M, 2)
+        m_pad_factor = self.vector_lane if M > self.vector_lane else 8
+        n_pad_factor = self.vector_lane if N > self.vector_lane else 8
+        k_pad_factor = self.vector_lane if K > self.vector_lane else 8
+        M_padded = ((M + m_pad_factor - 1) // m_pad_factor) * m_pad_factor
+        N_padded = ((N + n_pad_factor - 1) // n_pad_factor) * n_pad_factor
+        K_padded = ((K + k_pad_factor - 1) // k_pad_factor) * k_pad_factor
 
         max_used_spad_size = 0
         mapping = (self.vector_lane, self.vector_lane, self.vector_lane)
-        tile_M_range = range(self.vector_lane, M_padded + 1, self.vector_lane) if M > self.vector_lane else [M]
-        tile_N_range = range(self.vector_lane, N_padded + 1, self.vector_lane) if N > self.vector_lane else [N]
-        tile_K_range = range(self.vector_lane, K_padded + 1, self.vector_lane) if K > self.vector_lane else [K]
+        tile_M_range = range(self.vector_lane, M_padded + 1, self.vector_lane) if M > self.vector_lane else [M_padded]
+        tile_N_range = range(self.vector_lane, N_padded + 1, self.vector_lane) if N > self.vector_lane else [N_padded]
+        tile_K_range = range(self.vector_lane, K_padded + 1, self.vector_lane) if K > self.vector_lane else [K_padded]
         for tile_M in tile_M_range:
             for tile_N in tile_N_range:
                 for tile_K in tile_K_range:

From a15d514f6308e8e0526e8d39cab347ad03fede46 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Mon, 10 Feb 2025 04:24:04 +0000
Subject: [PATCH 085/432] [Frontend] CONV tiling algorithm revised

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 102 ++++++++++--------
 1 file changed, 58 insertions(+), 44 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index c46b7fa1..c7bf8873 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -42,14 +42,16 @@
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ BATCH * I_C }} + d2 * {{ I_C }} + d3)> // input (I_H, I_W, BATCH, I_C)
-#map2 = affine_map<(d0, d1, d2) -> (d0 * {{ I_C * O_C }} + d1 * {{ O_C }} + d2)> // weight (K_H * K_W, I_C, O_C)
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C) 
 #map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
 #map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
-#map_K_HW = affine_map<(d0, d1) -> (d0 * {{ K_W }} + d1)>
+#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
+#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W * TILE_M, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
+#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
 
-memref.global @X_spad : memref<{{TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 
 func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
   %c_mvin = arith.constant 2 : index
@@ -59,15 +61,15 @@
   %vstride = arith.constant 1 : index
   %input_axis = arith.constant 3 : index
   %weight_axis = arith.constant 2 : index
-  %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> // FIXME: change the size
+  %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+  %output_buffer = memref.get_global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
   %tag0 = memref.alloc() : memref<1xi32>
   %tag1 = memref.alloc() : memref<1xi32>
   %tag2 = memref.alloc() : memref<1xi32>
   %tag3 = memref.alloc() : memref<1xi32>
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -75,8 +77,8 @@
   %stride_h = arith.constant {{ STRIDE_H }} : index
   %stride_w = arith.constant {{ STRIDE_W }} : index
 
-  affine.for %o_h = 0 to {{ O_H }} {
-    affine.for %o_w = 0 to {{ O_W }} {
+  affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
+    affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }}{
       affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
         affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
           %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
@@ -85,31 +87,45 @@
           memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride
               : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}]}
           {%- else %}
-          affine.vector_store %v0, %Y_buffer[%c0, %c0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
           {%- endif %}
-          affine.for %k_h = 0 to {{ K_H }} {
-            affine.for %k_w = 0 to {{ K_W }} {
+          affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
+            affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
               affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
                 %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
                 %index_i_w = affine.apply #map_I_W(%o_w, %k_w)
                 %index1 = affine.apply #map1(%index_i_h, %index_i_w, %tile_m, %tile_k) // input index
-                %index_k_hw = affine.apply #map_K_HW(%k_h, %k_w)
-                %index2 = affine.apply #map2(%index_k_hw, %tile_k, %tile_n) // weight index
+                %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
                 // Load input matrix
-                memref.dma_start %X[%index1], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+                memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
+                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_I_H }}, {{ TILE_I_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
                 // Load kernel matrix
-                memref.dma_start %W[%index2], %W_buffer[%c0, %c0], %c_mvin, %tag2[%c0], %weight_axis, %vstride
-                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
-                // matmul
-                linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>)
-                      outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>)
+                memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
+                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K_H }}, {{ TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+                affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
+                  affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
+                    affine.for %tile_k_h = 0 to {{ TILE_K_H }} {
+                      affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                        %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
+                        %tile_i_w = affine.apply #map_I_W(%tile_o_w, %tile_k_w)
+                        %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                        %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_i_w)
+                        %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
+                        %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                        %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                        %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                        linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                              outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                      }
+                    }
+                  }
+                }
               } { accumulation_loop=true }
             } { accumulation_loop=true }
           } { accumulation_loop=true }
           // Store output matrix
-          memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
-              : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[1, {{ TILE_M }}]}
+          memref.dma_start %output_buffer[%c0, %c0, %c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
+              : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
         } { outer_loop=true }
       } { outer_loop=true }
     } { outer_loop=true }
@@ -183,17 +199,6 @@ def def_kernel(self) ->str:
         else:
           return f"%{self.kernel_args[0]}: memref<{input_size}xf32>, %{self.kernel_args[1]}: memref<{weight_size}xf32>, %{self.kernel_args[2]}: memref<{bias_size}xf32>, %{self.kernel_args[3]}: memref<{output_size}xf32>"
 
-    def get_tile_options(self):
-        BATCH = self.input_nodes[0].layout.size[0]
-        I_C = self.input_nodes[0].layout.size[1]
-        O_C = self.input_nodes[1].layout.size[0]
-
-        tile_m_options = divisors(BATCH)
-        tile_k_options = divisors(I_C)
-        tile_n_options = divisors(O_C)
-
-        return tile_m_options, tile_n_options, tile_k_options
-
     def render(self,
                kernel: MLIRTemplateKernel,
                template_buffer_node = None,
@@ -218,8 +223,11 @@ def render(self,
         O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
 
         TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(BATCH, O_C, I_C)
-        SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_M = TILE_M #if TILE_M < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_N = TILE_N #if TILE_N < kernel.vector_lane else kernel.vector_lane
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W = K_H, K_W, O_H, O_W
+        TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
+        TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
 
         kernel.loop_size = [K_H, K_W, O_H, O_W, BATCH, O_C, I_C]
 
@@ -243,6 +251,12 @@ def render(self,
             TILE_M=TILE_M,
             TILE_N=TILE_N,
             TILE_K=TILE_K,
+            TILE_I_H=TILE_I_H,
+            TILE_I_W=TILE_I_W,
+            TILE_O_H=TILE_O_H,
+            TILE_O_W=TILE_O_W,
+            TILE_K_H=TILE_K_H,
+            TILE_K_W=TILE_K_W,
             SUB_TILE_M=SUB_TILE_M,
             SUB_TILE_N=SUB_TILE_N,
             PADDING_H=self.padding[0],
@@ -257,12 +271,12 @@ def render(self,
         )
         code = self._template_from_string(CONV_TEMPLATE).render(**kernel.render_options)
 
-        self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
+        self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K_W * TILE_K_H * TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float Y_spad[{kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header = f"float X_spad[{TILE_I_W * TILE_I_H * TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float W_spad[{TILE_K_W * TILE_K_H * TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float Y_spad[{TILE_O_H * TILE_O_W * TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
 
         kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
         kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=self.input_reorder)

From f2f7d621a0fc1126e3d7dcca79defd5574954a37 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 10 Feb 2025 08:37:20 +0000
Subject: [PATCH 086/432] [Front+Back] Support big subtile size

---
 AsmParser/onnx_utility.py                | 1 +
 AsmParser/tog_generator.py               | 1 +
 PyTorchSimBackend/src/TileGraphParser.cc | 9 +++++++--
 PyTorchSimBackend/src/TileGraphParser.h  | 2 ++
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/AsmParser/onnx_utility.py b/AsmParser/onnx_utility.py
index 9b9929ad..ac48001c 100644
--- a/AsmParser/onnx_utility.py
+++ b/AsmParser/onnx_utility.py
@@ -85,6 +85,7 @@ def __init__(self, tile_info, inst_list=list(), node_id=0):
         super().__init__(node_id)
         self.torchsim_tag_idx_list = tile_info["tag_idx_list"]
         self.torchsim_tag_stride_list = tile_info["tag_stride_list"]
+        self.torchsim_tag_divider_list = tile_info["tag_divider_list"]
         self.torchsim_base_addr = tile_info["base_addr"]
 
 class compute_node(node):
diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 44332b43..27fd4f92 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -101,6 +101,7 @@ def _create_node(self, dump_data):
             tile_info = {}
             tile_info["tag_idx_list"] = dump_data["tag_idx_list"]
             tile_info["tag_stride_list"] = dump_data["tag_stride_list"]
+            tile_info["tag_divider_list"] = dump_data["tag_divider_list"]
             tile_info["base_addr"] = dump_data["base_address"]
             new_node = memory_wait_node(tile_info, node_id=node_id)
         else:
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 0c38cd9b..0813243e 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -217,6 +217,9 @@ TileMemoryWaitNode::TileMemoryWaitNode(onnx::NodeProto& node) : TileNode(node) {
     } else if (attribute.name() == "torchsim_tag_stride_list") {
       for (int i = 0; i < attribute.ints_size(); i++)
         _tag_stride_list.push_back(attribute.ints(i));
+    } else if (attribute.name() == "torchsim_tag_divider_list") {
+      for (int i = 0; i < attribute.ints_size(); i++)
+        _tag_divider_list.push_back(attribute.ints(i));
     } else if (attribute.name() == "torchsim_base_addr") {
       _base_addr_name = attribute.s();
     }
@@ -397,11 +400,13 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       std::vector<int> iter_list;
       std::vector<int> tag_list;
       std::vector<int>& tag_stride_list = wait_node->get_tag_stride_list();
+      std::vector<int>& tag_divider_list = wait_node->get_tag_divider_list();
       std::vector<int> new_tag_stride_list;
       std::vector<int> accum_tag_list;
       auto& wait_tag_list = wait_node->get_tag_idx_list();
 
-      for (auto loop_idx: wait_tag_list) {
+      for (int i=0; i<wait_tag_list.size();i++) {
+        std::string loop_idx = wait_tag_list.at(i);
         if (iter.find(loop_idx) == iter.end()) {
           tag_list.push_back(0);
           continue;
@@ -411,7 +416,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
           auto iter_value = getLoopIndexValue(iter, loop_idx);
           accum_tag_list.push_back(iter_value);
         } else {
-          auto iter_value = getLoopIndexValue(iter, loop_idx);
+          auto iter_value = getLoopIndexValue(iter, loop_idx) / tag_divider_list.at(i);
           tag_list.push_back(iter_value);
         }
       }
diff --git a/PyTorchSimBackend/src/TileGraphParser.h b/PyTorchSimBackend/src/TileGraphParser.h
index 0fba4d06..b1f3a283 100644
--- a/PyTorchSimBackend/src/TileGraphParser.h
+++ b/PyTorchSimBackend/src/TileGraphParser.h
@@ -138,11 +138,13 @@ class TileMemoryWaitNode : public TileNode {
   std::string get_base_addr_name() { return _base_addr_name; }
   std::vector<std::string>& get_tag_idx_list() { return _tag_idx_list; }
   std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
+  std::vector<int>& get_tag_divider_list() { return _tag_divider_list; }
   void print_node() override;
 
  private:
   std::vector<std::string> _tag_idx_list;
   std::vector<int> _tag_stride_list;
+  std::vector<int> _tag_divider_list;
   std::string _base_addr_name;
 };
 

From 0d30f0b488363556e4d918d376706184292ce572 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Mon, 10 Feb 2025 14:08:30 +0000
Subject: [PATCH 087/432] [Frontend] CONV mapping & bias

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 23 ++++++------
 PyTorchSimFrontend/mlir/mlir_template.py      | 36 +++++++++++++++++++
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index c7bf8873..4e664a32 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -42,7 +42,7 @@
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ BATCH * I_C }} + d2 * {{ I_C }} + d3)> // input (I_H, I_W, BATCH, I_C)
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C) 
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
 #map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
 #map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
 #offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
@@ -61,7 +61,7 @@
   %vstride = arith.constant 1 : index
   %input_axis = arith.constant 3 : index
   %weight_axis = arith.constant 2 : index
-  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> // FIXME: change the size
+  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
   %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %output_buffer = memref.get_global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
@@ -84,8 +84,8 @@
           %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
           // Initialize output
           {%- if BIAS %}
-          memref.dma_start %Bias[%tile_n], %Y_buffer[%c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride
-              : memref<{{ O_C }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+          memref.dma_start %Bias[%tile_n], %output_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride
+              : memref<{{ O_C }}xf32>, memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_O_H }}, {{ TILE_O_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
           {%- else %}
           affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
           {%- endif %}
@@ -116,10 +116,10 @@
                         %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                               outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                      }
-                    }
-                  }
-                }
+                      } { inner_loop=true }
+                    } { inner_loop=true }
+                  } { inner_loop=true }
+                } { inner_loop=true }
               } { accumulation_loop=true }
             } { accumulation_loop=true }
           } { accumulation_loop=true }
@@ -222,10 +222,9 @@ def render(self,
         O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2]
         O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
 
-        TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(BATCH, O_C, I_C)
-        SUB_TILE_M = TILE_M #if TILE_M < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_N = TILE_N #if TILE_N < kernel.vector_lane else kernel.vector_lane
-        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W = K_H, K_W, O_H, O_W
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation)
+        SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
         TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
         TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 9d7419b5..24714d76 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -146,6 +146,42 @@ def gemm_combination_mapping(self, M, N, K):
         mapping = (M_padded // Outer_M, N_padded // Outer_N, K_padded // Outer_K)
         return mapping
 
+    def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation):
+        """Return (K_H, K_W, tile_O_H, tile_O_W, tile_M, tile_N, tile_K) for a convolution.
+
+        Keeps the candidate whose weight+input+output tiles use the most space strictly under half of spad_size * vector_lane.
+        """
+        max_spad_size = self.spad_info["spad_size"] * self.vector_lane // 2 # double buffer
+        m_pad_factor = self.vector_lane if M > self.vector_lane else 8
+        n_pad_factor = self.vector_lane if N > self.vector_lane else 8
+        k_pad_factor = self.vector_lane if K > self.vector_lane else 8
+        M_padded = ((M + m_pad_factor - 1) // m_pad_factor) * m_pad_factor
+        N_padded = ((N + n_pad_factor - 1) // n_pad_factor) * n_pad_factor
+        K_padded = ((K + k_pad_factor - 1) // k_pad_factor) * k_pad_factor
+        max_used_spad_size = 0
+        # Fallback when no candidate fits (a 3-tuple here made mapping[4] below raise IndexError).
+        mapping = (K_H, K_W, 1, 1, self.vector_lane, self.vector_lane, self.vector_lane)
+        tile_M_range = range(self.vector_lane, M_padded + 1, self.vector_lane) if M > self.vector_lane else [M_padded]
+        tile_N_range = range(self.vector_lane, N_padded + 1, self.vector_lane) if N > self.vector_lane else [N_padded]
+        tile_K_range = range(self.vector_lane, K_padded + 1, self.vector_lane) if K > self.vector_lane else [K_padded]
+        for o_h in range(1, O_H + 1):
+            i_h = 1 + (o_h - 1) * stride[0] + (K_H - 1) * dilation[0] # input rows needed for o_h output rows
+            for o_w in range(1, O_W + 1):
+                i_w = 1 + (o_w - 1) * stride[1] + (K_W - 1) * dilation[1] # width depends on o_w, not o_h
+                for tile_M in tile_M_range:
+                    for tile_N in tile_N_range:
+                        for tile_K in tile_K_range:
+                            weight_size = K_W * K_H * tile_K * tile_N
+                            input_size = i_w * i_h * tile_M * tile_K
+                            output_size = o_w * o_h * tile_M * tile_N
+                            used_spad_size = (weight_size + input_size + output_size) * self.precision
+                            if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size:
+                                max_used_spad_size = used_spad_size
+                                mapping = (K_H, K_W, o_h, o_w, tile_M, tile_N, tile_K)
+        # Split each padded GEMM dimension evenly over the outer-iteration count implied by the chosen tile.
+        tiles = tuple(p // math.ceil(p / t) for p, t in zip((M_padded, N_padded, K_padded), mapping[4:]))
+        return mapping[:4] + tiles
+
     def meta_kernel(self):
         wrapper = V.graph.wrapper_code
         arg_attributes = self.kernel_arg_attributes

From 6f16486e8089c36c71325fd5a11f5433e733aefe Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 9 Feb 2025 09:51:25 +0000
Subject: [PATCH 088/432] [Backendsim] Convert MemAccess to mem_fetch

---
 PyTorchSimBackend/CMakeLists.txt      |  15 +--
 PyTorchSimBackend/extern/ramulator2   |   2 +-
 PyTorchSimBackend/src/CMakeLists.txt  |   1 -
 PyTorchSimBackend/src/Cache_defs.h    | 142 ++++++++++++++++++++++
 PyTorchSimBackend/src/Core.cc         |  11 +-
 PyTorchSimBackend/src/Core.h          |   8 +-
 PyTorchSimBackend/src/Dram.cc         | 162 +++-----------------------
 PyTorchSimBackend/src/Dram.h          |  61 ++--------
 PyTorchSimBackend/src/Hashing.cc      |  54 +++++++++
 PyTorchSimBackend/src/Hashing.h       |   8 ++
 PyTorchSimBackend/src/Interconnect.cc |  72 ++++++------
 PyTorchSimBackend/src/Interconnect.h  |  26 ++---
 PyTorchSimBackend/src/Memfetch.h      |  90 ++++++++++++++
 PyTorchSimBackend/src/Simulator.cc    |  33 +++---
 PyTorchSimBackend/src/Simulator.h     |   2 +-
 PyTorchSimBackend/src/TMA.cc          |  17 +--
 PyTorchSimBackend/src/TMA.h           |  17 +--
 17 files changed, 410 insertions(+), 311 deletions(-)
 create mode 100644 PyTorchSimBackend/src/Cache_defs.h
 create mode 100644 PyTorchSimBackend/src/Memfetch.h

diff --git a/PyTorchSimBackend/CMakeLists.txt b/PyTorchSimBackend/CMakeLists.txt
index 3cb296d5..3776131e 100644
--- a/PyTorchSimBackend/CMakeLists.txt
+++ b/PyTorchSimBackend/CMakeLists.txt
@@ -27,12 +27,10 @@ message("BINARY DIR ${CMAKE_BINARY_DIR}")
 add_subdirectory("${PROJECT_SOURCE_DIR}/src")
 
 # Add libaray ramulator
-add_subdirectory("${PROJECT_SOURCE_DIR}/extern/ramulator_custom")
-
-# Add libaray ramulator
+include_directories("${PROJECT_SOURCE_DIR}/src")
 add_subdirectory("${PROJECT_SOURCE_DIR}/extern/ramulator2")
 include_directories("${PROJECT_SOURCE_DIR}/extern/ramulator2/src")
-include_directories("${PROJECT_SOURCE_DIR}/extern/ramulator2/resources/ndp_wrappers")
+include_directories("${PROJECT_SOURCE_DIR}/extern/ramulator2/resources/wrappers")
 
 # Add libaray booksim
 add_subdirectory("${PROJECT_SOURCE_DIR}/extern/booksim")
@@ -49,9 +47,6 @@ set_target_properties(onnx PROPERTIES FOLDER "extern/onnx")
 set_target_properties(onnx_proto PROPERTIES FOLDER "extern/onnx")
 
 target_include_directories(Simulator PUBLIC ${ONNX_INCLUDE_DIRS})
-target_link_libraries(Simulator ramulator1 booksim2 ramulator)
-target_link_libraries(Simulator ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs)
-
-target_include_directories(Simulator_lib PUBLIC ${ONNX_INCLUDE_DIRS})
-target_link_libraries(Simulator_lib ramulator1 booksim2 ramulator)
-target_link_libraries(Simulator_lib ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs)
\ No newline at end of file
+target_include_directories(Simulator PUBLIC ${PROJECT_SOURCE_DIR}/src)
+target_link_libraries(Simulator booksim2 ramulator)
+target_link_libraries(Simulator ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs)
\ No newline at end of file
diff --git a/PyTorchSimBackend/extern/ramulator2 b/PyTorchSimBackend/extern/ramulator2
index 10f8dcaa..7ac6f810 160000
--- a/PyTorchSimBackend/extern/ramulator2
+++ b/PyTorchSimBackend/extern/ramulator2
@@ -1 +1 @@
-Subproject commit 10f8dcaab94b5696988a92461500bd3212f82e7d
+Subproject commit 7ac6f810aaba6f59b9a911c5dbc205399cb1a65e
diff --git a/PyTorchSimBackend/src/CMakeLists.txt b/PyTorchSimBackend/src/CMakeLists.txt
index 0b7d93b3..65cd4dd4 100644
--- a/PyTorchSimBackend/src/CMakeLists.txt
+++ b/PyTorchSimBackend/src/CMakeLists.txt
@@ -12,4 +12,3 @@ file(GLOB_RECURSE SRC_FILES
 
 # build
 add_executable(${LIB_NAME} ${SRC_FILES})
-add_library(${LIB_NAME}_lib ${SRC_FILES})
diff --git a/PyTorchSimBackend/src/Cache_defs.h b/PyTorchSimBackend/src/Cache_defs.h
new file mode 100644
index 00000000..8cb75fce
--- /dev/null
+++ b/PyTorchSimBackend/src/Cache_defs.h
@@ -0,0 +1,142 @@
+#ifndef CACHE_DEFS_H
+#define CACHE_DEFS_H
+#include <bitset>
+#include <list>
+#include <map>
+#include <cstdint>
+#include <deque>
+
+const int SECTOR_CHUNCK_SIZE = 4;
+typedef std::bitset<SECTOR_CHUNCK_SIZE> SectorMask;
+enum CacheBlockState { INVALID, RESERVED, VALID, MODIFIED };
+enum CacheRequestStatus {
+  HIT,
+  HIT_RESERVED,
+  MISS,
+  RESERVATION_FAIL,
+  SECTOR_MISS,
+  MSHR_HIT,
+  NUM_CACHE_REQUEST_STATUS
+};
+static const char *cache_request_status_str[] = {
+    "HIT",         "HIT_RESERVED", "MISS", "RESERVATION_FAIL",
+    "SECTOR_MISS", "MSHR_HIT"};
+
+enum CacheReservationFailReason {
+  LINE_ALLOC_FAIL,
+  MISS_QUEUE_FULL,
+  MSHR_ENTRY_FAIL,
+  MSHR_MERGE_ENTRY_FAIL,
+  MSHR_RW_PENDING,
+  NUM_CACHE_RESERVATION_FAIL_REASON
+};
+static const char *cache_reservation_fail_reason_str[] = {
+    "LINE_ALLOCATE_FAIL", "MISS_QUEUE_FULL", "MSHR_ENTRY_FAIL",
+    "MSHR_MERGE_ENTRY_FAIL", "MSHR_RW_PENDING"};
+
+enum CacheEventType {
+  WRITE_BACK_REQUEST_SENT,
+  READ_REQUEST_SENT,
+  WRITE_REQUEST_SENT,
+  WRITE_ALLOCATE_SENT
+};
+
+struct EvictedBlockInfo {
+  uint64_t m_block_addr = 0;
+  uint32_t m_modified_size = 0;
+  SectorMask m_dirty_mask;
+  void set_info(uint64_t block_addr, uint32_t modified_size,
+                SectorMask dirty_mask) {
+    m_block_addr = block_addr;
+    m_modified_size = modified_size;
+    m_dirty_mask = dirty_mask;
+  }
+};
+struct CacheEvent {  // Record of one request sent by the cache (see CacheEventType).
+  CacheEvent() {}  // Placeholder for out-parameters; the event type is left unset.
+  CacheEvent(CacheEventType cache_event_type)
+      : m_cache_event_type(cache_event_type) {}
+  CacheEvent(CacheEventType cache_event_type, EvictedBlockInfo evicted_block)
+      : m_cache_event_type(cache_event_type), m_evicted_block(evicted_block) {}
+  CacheEventType m_cache_event_type;
+  EvictedBlockInfo m_evicted_block;  // only valid for WRITE_BACK_REQUEST_SENT
+  static bool was_event_sent(const std::deque<CacheEvent> &events,
+                             CacheEventType event_type,
+                             CacheEvent &found_event) {  // out: first match
+    for (auto &event : events) {
+      if (event.m_cache_event_type == event_type) {
+        found_event = event;  // left untouched when nothing matches
+        return true;
+      }
+    }
+    return false;
+  }
+  static bool was_write_sent(const std::deque<CacheEvent> &events) {
+    CacheEvent event;  // match is discarded; only the boolean matters
+    return was_event_sent(events, WRITE_REQUEST_SENT, event);
+  }
+  static bool was_read_sent(const std::deque<CacheEvent> &events) {
+    CacheEvent event;  // match is discarded; only the boolean matters
+    return was_event_sent(events, READ_REQUEST_SENT, event);
+  }
+  static bool was_writeback_sent(const std::deque<CacheEvent> &events,
+                                 CacheEvent &event) {  // out: by reference so the caller gets m_evicted_block
+    return was_event_sent(events, WRITE_BACK_REQUEST_SENT, event);
+  }
+  static bool was_write_allocate_sent(const std::deque<CacheEvent> &events) {
+    CacheEvent event;  // match is discarded; only the boolean matters
+    return was_event_sent(events, WRITE_ALLOCATE_SENT, event);
+  }
+};
+
+enum WritePolicy {
+  READ_ONLY,
+  WRITE_BACK,
+  WRITE_THROUGH,
+  WRITE_EVICT,
+  LOCAL_WB_GLOBAL_WT
+};
+static std::map<char, WritePolicy> WritePolicyMap = {{'R', READ_ONLY},
+                                                     {'B', WRITE_BACK},
+                                                     {'T', WRITE_THROUGH},
+                                                     {'E', WRITE_EVICT},
+                                                     {'L', LOCAL_WB_GLOBAL_WT}};
+
+enum AllocationPolicy { ON_MISS, ON_FILL, STREAMING };
+static std::map<char, AllocationPolicy> AllocationPolicyMap = {
+    {'m', ON_MISS}, {'f', ON_FILL}, {'s', STREAMING}};
+
+enum WriteAllocatePolicy {
+  NO_WRITE_ALLOCATE,
+  WRITE_ALLOCATE,
+  FETCH_ON_WRITE,
+  LAZY_FETCH_ON_READ
+};
+static std::map<char, WriteAllocatePolicy> WriteAllocatePolicyMap = {
+    {'N', NO_WRITE_ALLOCATE},
+    {'W', WRITE_ALLOCATE},
+    {'F', FETCH_ON_WRITE},
+    {'L', LAZY_FETCH_ON_READ}};
+
+enum CacheType { NORMAL, SECTOR };
+static std::map<char, CacheType> CacheTypeMap = {{'N', NORMAL}, {'S', SECTOR}};
+
+enum EvictPolicy { LRU, FIFO };
+static std::map<char, EvictPolicy> EvictPolicyMap = {{'L', LRU}, {'F', FIFO}};
+
+enum MshrConfig { ASSOC, SECTOR_ASSOC };
+static std::map<char, MshrConfig> MshrConfigMap = {{'A', ASSOC},
+                                                   {'S', SECTOR_ASSOC}};
+
+enum SetIndexFunction {
+  LINEAR_SET_FUNCTION,
+  BITWISE_XORING_FUNCTION,
+  HASH_IPOLY_FUNCTION,
+  CUSTOM_SET_FUNCTION
+};
+static std::map<char, SetIndexFunction> SetIndexFunctionMap = {
+    {'L', LINEAR_SET_FUNCTION},
+    {'X', BITWISE_XORING_FUNCTION},
+    {'P', HASH_IPOLY_FUNCTION},
+    {'C', CUSTOM_SET_FUNCTION}};
+#endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 92b955aa..f986797b 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -136,11 +136,10 @@ void Core::dma_cycle() {
       return;
     }
   }
-  /* Generate MemoryAccess */
-  std::vector<MemoryAccess*> access_vec = _tma.get_memory_access();
+  /* Generate memfetch */
+  std::vector<mem_fetch*> access_vec = _tma.get_memory_access();
   for (auto access : access_vec) {
-    access->core_id = _id;
-    access->start_cycle = _core_cycle;
+    access->set_start_cycle(_core_cycle);
     _request_queue.push(access);
   }
 
@@ -311,8 +310,8 @@ void Core::pop_memory_request() {
   _request_queue.pop();
 }
 
-void Core::push_memory_response(MemoryAccess *response) {
-  Instruction * owner_inst = response->owner_instruction;
+void Core::push_memory_response(mem_fetch* response) {
+  Instruction * owner_inst = static_cast<Instruction*>(response->get_custom_data());
 
   assert(owner_inst);
   assert(owner_inst->get_waiting_request());
diff --git a/PyTorchSimBackend/src/Core.h b/PyTorchSimBackend/src/Core.h
index f23ad739..c6fdb1ab 100644
--- a/PyTorchSimBackend/src/Core.h
+++ b/PyTorchSimBackend/src/Core.h
@@ -23,8 +23,8 @@ class Core {
   void dma_cycle();
   bool has_memory_request();
   void pop_memory_request();
-  MemoryAccess* top_memory_request() { return _request_queue.front(); }
-  void push_memory_response(MemoryAccess* response);
+  mem_fetch* top_memory_request() { return _request_queue.front(); }
+  void push_memory_response(mem_fetch* response);
   void print_stats();
   void print_current_stats();
   void finish_instruction(std::shared_ptr<Instruction>& inst);
@@ -69,7 +69,7 @@ class Core {
 
   std::vector<std::shared_ptr<Instruction>> _dma_waiting_queue;
   /* Interconnect queue */
-  std::queue<MemoryAccess*> _request_queue;
-  std::queue<MemoryAccess*> _response_queue;
+  std::queue<mem_fetch*> _request_queue;
+  std::queue<mem_fetch*> _response_queue;
   uint32_t _waiting_write_reqs;
 };
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc
index 1d564dc8..858e7f3c 100644
--- a/PyTorchSimBackend/src/Dram.cc
+++ b/PyTorchSimBackend/src/Dram.cc
@@ -1,148 +1,27 @@
 #include "Dram.h"
 
-uint32_t Dram::get_channel_id(MemoryAccess* access) {
+uint32_t Dram::get_channel_id(mem_fetch* access) {
   uint32_t channel_id;
   if (_n_ch_per_partition >= 16)
-    channel_id = ipoly_hash_function((new_addr_type)access->dram_address/_config.dram_req_size, 0, _n_ch_per_partition);
+    channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_config.dram_req_size, 0, _n_ch_per_partition);
   else
-    channel_id = ipoly_hash_function((new_addr_type)access->dram_address/_config.dram_req_size, 0, 16) % _n_ch_per_partition;
+    channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_config.dram_req_size, 0, 16) % _n_ch_per_partition;
   
-  channel_id += ((access->numa_id % _n_partitions)* _n_ch_per_partition);
+  channel_id += ((access->get_numa_id() % _n_partitions)* _n_ch_per_partition);
   return channel_id;
 }
 
-/* FIXME: Simple DRAM has bugs */
-SimpleDram::SimpleDram(SimulationConfig config)
-    : _latency(config.dram_latency) {
-  _cycles = 0;
-  _config = config;
-  _n_ch = config.dram_channels;
-  _n_partitions = config.dram_num_partitions;
-  _n_ch_per_partition = _n_ch / _n_partitions;
-  _waiting_queue.resize(_n_ch);
-  _response_queue.resize(_n_ch);
-}
-
-bool SimpleDram::running() { return false; }
-
-void SimpleDram::cycle() {
-  for (uint32_t ch = 0; ch < _n_ch; ch++) {
-    if (!_waiting_queue[ch].empty() &&
-        _waiting_queue[ch].front().first <= _cycles) {
-      _response_queue[ch].push(_waiting_queue[ch].front().second);
-      _waiting_queue[ch].pop();
-    }
-  }
-
-  _cycles++;
-}
-
-bool SimpleDram::is_full(uint32_t cid, MemoryAccess* request) { return false; }
-
-void SimpleDram::push(uint32_t cid, MemoryAccess* request) {
-  request->request = false;
-  std::pair<uint64_t, MemoryAccess*> entity;
-  entity.first = MAX(_cycles + _latency, _last_finish_cycle);
-  _last_finish_cycle = entity.first;
-  entity.second = request;
-  _waiting_queue[cid].push(entity);
-}
-
-bool SimpleDram::is_empty(uint32_t cid) { return _response_queue[cid].empty(); }
-
-MemoryAccess* SimpleDram::top(uint32_t cid) {
-  assert(!is_empty(cid));
-  return _response_queue[cid].front();
-}
-
-void SimpleDram::pop(uint32_t cid) {
-  assert(!is_empty(cid));
-  _response_queue[cid].pop();
-}
-
-DramRamulator::DramRamulator(SimulationConfig config)
-    : _mem(std::make_unique<ram::Ramulator>(config.dram_config_path,
-                                            config.num_cores, false)) {
-  _n_ch = config.dram_channels;
-  _config = config;
-  _cycles = 0;
-  _total_processed_requests.resize(_n_ch);
-  _processed_requests.resize(_n_ch);
-  for (int ch = 0; ch < _n_ch; ch++) {
-    _total_processed_requests[ch] = 0;
-    _processed_requests[ch] = 0;
-  }
-}
-
-bool DramRamulator::running() { return false; }
-
-void DramRamulator::cycle() {
-  _mem->tick();
-  _cycles++;
-  int interval = _config.dram_print_interval? _config.dram_print_interval: INT32_MAX;
-  int average = 0;
-  if (_cycles % interval == 0) {
-    for (int ch = 0; ch < _n_ch; ch++) {
-      float util = ((float)_processed_requests[ch]) / interval * 100;
-      _total_processed_requests[ch] += _processed_requests[ch];
-      average += _processed_requests[ch];
-      _processed_requests[ch] = 0;
-    }
-    spdlog::info("Avg DRAM: BW Util {:.2f}%", (float)average / (interval * _n_ch) * 100);
-  }
-}
-
-bool DramRamulator::is_full(uint32_t cid, MemoryAccess* request) {
-  return !_mem->isAvailable(cid, request->dram_address, request->write);
-}
-
-void DramRamulator::push(uint32_t cid, MemoryAccess* request) {
-  const addr_type atomic_bytes = _mem->getAtomicBytes();
-  const addr_type target_addr = request->dram_address;
-  // align address
-  const addr_type start_addr = target_addr - (target_addr % atomic_bytes);
-  assert(start_addr == target_addr);
-  assert(request->size == atomic_bytes);
-  int count = 0;
-  request->request = false;
-  _mem->push(cid, target_addr, request->write, request->core_id, request);
-}
-
-bool DramRamulator::is_empty(uint32_t cid) { return _mem->isEmpty(cid); }
-
-MemoryAccess* DramRamulator::top(uint32_t cid) {
-  assert(!is_empty(cid));
-  return (MemoryAccess*)_mem->top(cid);
-}
-
-void DramRamulator::pop(uint32_t cid) {
-  assert(!is_empty(cid));
-  _mem->pop(cid);
-  _processed_requests[cid]++;
-}
-
-void DramRamulator::print_stat() {
-  uint32_t total_reqs = 0;
-  for (int ch = 0; ch < _n_ch; ch++) {
-    _total_processed_requests[ch] += _processed_requests[ch];
-    float util = ((float)_total_processed_requests[ch]) / _cycles * 100;
-    spdlog::info("DRAM CH[{}]: AVG BW Util {:.2f}%", ch, util);
-    total_reqs += _total_processed_requests[ch];
-  }
-  float util = ((float)total_reqs / _n_ch) / _cycles * 100;
-  spdlog::info("DRAM: AVG BW Util {:.2f}%", util);
-  _mem->print_stats();
-}
-
 DramRamulator2::DramRamulator2(SimulationConfig config) {
   _n_ch = config.dram_channels;
   _req_size = config.dram_req_size;
   _n_partitions = config.dram_num_partitions;
   _n_ch_per_partition = _n_ch / _n_partitions;
   _config = config;
+  _m_caches.resize(_n_ch);
   _mem.resize(_n_ch);
   for (int ch = 0; ch < _n_ch; ch++) {
-    _mem[ch] = std::make_unique<NDPSim::Ramulator2>(
+    //_m_caches = std::make_unique<ReadOnlyCache>("L2 RO cache");
+    _mem[ch] = std::make_unique<Ramulator2>(
       ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, 1);
   }
   _tx_log2 = log2(_req_size);
@@ -159,42 +38,33 @@ void DramRamulator2::cycle() {
   }
 }
 
-bool DramRamulator2::is_full(uint32_t cid, MemoryAccess* request) {
+bool DramRamulator2::is_full(uint32_t cid, mem_fetch* request) {
   return _mem[cid]->full();
 }
 
-void DramRamulator2::push(uint32_t cid, MemoryAccess* request) {
+void DramRamulator2::push(uint32_t cid, mem_fetch* request) {
   addr_type atomic_bytes =_config.dram_req_size;
-  addr_type target_addr = request->dram_address;
+  addr_type target_addr = request->get_addr();
   // align address
   addr_type start_addr = target_addr - (target_addr % atomic_bytes);
   assert(start_addr == target_addr);
-  assert(request->size == atomic_bytes);
-  target_addr = (target_addr >> _tx_ch_log2) << _tx_log2;
-  NDPSim::mem_fetch* mf = new NDPSim::mem_fetch();
-  mf->addr = target_addr;
-  mf->size = request->size;
-  mf->write = request->write;
-  mf->request = true;
-  mf->origin_data = request;
-  _mem[cid]->push(mf);
+  assert(request->get_data_size() == atomic_bytes);
+  _mem[cid]->push(request);
 }
 
 bool DramRamulator2::is_empty(uint32_t cid) {
   return _mem[cid]->return_queue_top() == NULL;
 }
 
-MemoryAccess* DramRamulator2::top(uint32_t cid) {
+mem_fetch* DramRamulator2::top(uint32_t cid) {
   assert(!is_empty(cid));
-  NDPSim::mem_fetch* mf = _mem[cid]->return_queue_top();
-  ((MemoryAccess*)mf->origin_data)->request = false;
-  return (MemoryAccess*)mf->origin_data;
+  mem_fetch* mf = _mem[cid]->return_queue_top();
+  return mf;
 }
 
 void DramRamulator2::pop(uint32_t cid) {
   assert(!is_empty(cid));
-  NDPSim::mem_fetch* mf = _mem[cid]->return_queue_pop();
-  delete mf;
+  mem_fetch* mf = _mem[cid]->return_queue_pop();
 }
 
 void DramRamulator2::print_stat() {
diff --git a/PyTorchSimBackend/src/Dram.h b/PyTorchSimBackend/src/Dram.h
index 112d1783..67666bed 100644
--- a/PyTorchSimBackend/src/Dram.h
+++ b/PyTorchSimBackend/src/Dram.h
@@ -7,21 +7,21 @@
 
 #include "Common.h"
 #include "TMA.h"
-#include "ramulator/Ramulator.hpp"
 #include "ramulator2.hh"
 #include "Hashing.h"
+#include "Cache.h"
 
 class Dram {
  public:
   virtual ~Dram() = default;
   virtual bool running() = 0;
   virtual void cycle() = 0;
-  virtual bool is_full(uint32_t cid, MemoryAccess* request) = 0;
-  virtual void push(uint32_t cid, MemoryAccess* request) = 0;
+  virtual bool is_full(uint32_t cid, mem_fetch* request) = 0;
+  virtual void push(uint32_t cid, mem_fetch* request) = 0;
   virtual bool is_empty(uint32_t cid) = 0;
-  virtual MemoryAccess* top(uint32_t cid) = 0;
+  virtual mem_fetch* top(uint32_t cid) = 0;
   virtual void pop(uint32_t cid) = 0;
-  uint32_t get_channel_id(MemoryAccess* request);
+  uint32_t get_channel_id(mem_fetch* request);
   virtual void print_stat() {}
 
  protected:
@@ -32,63 +32,22 @@ class Dram {
   cycle_type _cycles;
 };
 
-class SimpleDram : public Dram {
- public:
-  SimpleDram(SimulationConfig config);
-  virtual bool running() override;
-  virtual void cycle() override;
-  virtual bool is_full(uint32_t cid, MemoryAccess* request) override;
-  virtual void push(uint32_t cid, MemoryAccess* request) override;
-  virtual bool is_empty(uint32_t cid) override;
-  virtual MemoryAccess* top(uint32_t cid) override;
-  virtual void pop(uint32_t cid) override;
-
- private:
-  uint32_t _latency;
-  double _bandwidth;
-
-  uint64_t _last_finish_cycle;
-  std::vector<std::queue<std::pair<addr_type, MemoryAccess*>>> _waiting_queue;
-  std::vector<std::queue<MemoryAccess*>> _response_queue;
-};
-
-class DramRamulator : public Dram {
- public:
-  DramRamulator(SimulationConfig config);
-
-  virtual bool running() override;
-  virtual void cycle() override;
-  virtual bool is_full(uint32_t cid, MemoryAccess* request) override;
-  virtual void push(uint32_t cid, MemoryAccess* request) override;
-  virtual bool is_empty(uint32_t cid) override;
-  virtual MemoryAccess* top(uint32_t cid) override;
-  virtual void pop(uint32_t cid) override;
-  virtual void print_stat() override;
-
- private:
-  std::unique_ptr<ram::Ramulator> _mem;
-  robin_hood::unordered_flat_map<uint64_t, MemoryAccess*> _waiting_mem_access;
-  std::queue<MemoryAccess*> _responses;
-
-  std::vector<uint64_t> _total_processed_requests;
-  std::vector<uint64_t> _processed_requests;
-};
-
 class DramRamulator2 : public Dram {
  public:
   DramRamulator2(SimulationConfig config);
 
   virtual bool running() override;
   virtual void cycle() override;
-  virtual bool is_full(uint32_t cid, MemoryAccess* request) override;
-  virtual void push(uint32_t cid, MemoryAccess* request) override;
+  virtual bool is_full(uint32_t cid, mem_fetch* request) override;
+  virtual void push(uint32_t cid, mem_fetch* request) override;
   virtual bool is_empty(uint32_t cid) override;
-  virtual MemoryAccess* top(uint32_t cid) override;
+  virtual mem_fetch* top(uint32_t cid) override;
   virtual void pop(uint32_t cid) override;
   virtual void print_stat() override;
 
  private:
-  std::vector<std::unique_ptr<NDPSim::Ramulator2>> _mem;
+  std::vector<std::unique_ptr<Cache>> _m_caches;
+  std::vector<std::unique_ptr<Ramulator2>> _mem;
   int _tx_ch_log2;
   int _tx_log2;
   int _req_size;
diff --git a/PyTorchSimBackend/src/Hashing.cc b/PyTorchSimBackend/src/Hashing.cc
index 45482867..868178ae 100644
--- a/PyTorchSimBackend/src/Hashing.cc
+++ b/PyTorchSimBackend/src/Hashing.cc
@@ -95,3 +95,57 @@ unsigned ipoly_hash_function(new_addr_type higher_bits, unsigned index,
     return 0;
   }
 }
+
+unsigned bitwise_hash_function(new_addr_type higher_bits, unsigned index,
+                               unsigned bank_set_num) {
+  return (index) ^ (higher_bits & (bank_set_num - 1));
+}
+
+unsigned PAE_hash_function(new_addr_type higher_bits, unsigned index,
+                           unsigned bank_set_num) {
+  // Page Address Entropy (PAE) hash; only bank_set_num == 32 is handled.
+  // XORs randomly selected bits from the page and bank bits, similar to
+  // Liu, Yuxi, et al. "Get Out of the Valley: Power-Efficient Address
+  // Mapping for GPUs." (ISCA 2018).
+  if (bank_set_num == 32) {
+    std::bitset<64> a(higher_bits);
+    std::bitset<5> b(index);
+    std::bitset<5> new_index(index);
+    new_index[0] = a[13] ^ a[10] ^ a[9] ^ a[5] ^ a[0] ^ b[3] ^ b[0] ^ b[0];
+    new_index[1] = a[12] ^ a[11] ^ a[6] ^ a[1] ^ b[3] ^ b[2] ^ b[1] ^ b[1];
+    new_index[2] = a[14] ^ a[9] ^ a[8] ^ a[7] ^ a[2] ^ b[1] ^ b[2];
+    new_index[3] = a[11] ^ a[10] ^ a[8] ^ a[3] ^ b[2] ^ b[3] ^ b[3];
+    new_index[4] = a[12] ^ a[9] ^ a[8] ^ a[5] ^ a[4] ^ b[1] ^ b[0] ^ b[4];
+
+    return new_index.to_ulong();
+  } else {  // unsupported bank_set_num
+    assert(0);
+    return 0;
+  }
+}
+
+unsigned mini_hash_function(new_addr_type higher_bits, unsigned index,
+                             unsigned bank_set_num) {
+  // Minimal XOR hash; only bank_set_num == 16 (a 4-bit index) is supported.
+  if (bank_set_num == 16) {
+    std::bitset<64> a(higher_bits);
+    std::bitset<4> b(index);
+    std::bitset<4> new_index(index);
+    // higher_bits bit 0 flips index bits 0-1; bit 1 flips index bits 2-3.
+    new_index[0] = a[0] ^ b[0];
+    new_index[1] = a[0] ^ b[1];
+    new_index[2] = a[1] ^ b[2];
+    new_index[3] = a[1] ^ b[3];
+
+    return new_index.to_ulong();
+  } else { /* Unsupported bank_set_num for this hashing function */
+    assert(
+        "\nmemory_partition_indexing error: The number of "
+        "channels should be "
+        "16 for the mini hashing index function. Other bank "
+        "numbers are not supported. Generate it by yourself! \n" &&
+        0);
+
+    return 0;
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Hashing.h b/PyTorchSimBackend/src/Hashing.h
index dc134792..da03de04 100644
--- a/PyTorchSimBackend/src/Hashing.h
+++ b/PyTorchSimBackend/src/Hashing.h
@@ -13,4 +13,12 @@ typedef unsigned long long new_addr_type;
 unsigned ipoly_hash_function(new_addr_type higher_bits, unsigned index,
                              unsigned bank_set_num);
 
+unsigned bitwise_hash_function(new_addr_type higher_bits, unsigned index,
+                               unsigned bank_set_num);
+
+unsigned PAE_hash_function(new_addr_type higher_bits, unsigned index,
+                           unsigned bank_set_num);
+
+unsigned mini_hash_function(new_addr_type higher_bits, unsigned index,
+                            unsigned bank_set_num);
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Interconnect.cc b/PyTorchSimBackend/src/Interconnect.cc
index dc62e402..49025c85 100644
--- a/PyTorchSimBackend/src/Interconnect.cc
+++ b/PyTorchSimBackend/src/Interconnect.cc
@@ -40,7 +40,7 @@ void SimpleInterconnect::cycle() {
   _cycles++;
 }
 
-void SimpleInterconnect::push(uint32_t src, uint32_t dest, MemoryAccess* request) {
+void SimpleInterconnect::push(uint32_t src, uint32_t dest, mem_fetch* request) {
   SimpleInterconnect::Entity entity;
   if(_in_buffers[src].empty())
     entity.finish_cycle =  _cycles + _latency;
@@ -51,7 +51,7 @@ void SimpleInterconnect::push(uint32_t src, uint32_t dest, MemoryAccess* request
   _in_buffers[src].push(entity);
 }
 
-bool SimpleInterconnect::is_full(uint32_t nid, MemoryAccess* request) {
+bool SimpleInterconnect::is_full(uint32_t nid, mem_fetch* request) {
   //TODO: limit buffersize
   return false;
 }
@@ -60,7 +60,7 @@ bool SimpleInterconnect::is_empty(uint32_t nid) {
   return _out_buffers[nid].empty();
 }
 
-MemoryAccess* SimpleInterconnect::top(uint32_t nid) {
+mem_fetch* SimpleInterconnect::top(uint32_t nid) {
   assert(!is_empty(nid));
   return _out_buffers[nid].front();
 }
@@ -93,13 +93,13 @@ void Booksim2Interconnect::cycle() {
   _booksim->run();
 }
 
-void Booksim2Interconnect::push(uint32_t src, uint32_t dest, MemoryAccess* request) {
+void Booksim2Interconnect::push(uint32_t src, uint32_t dest, mem_fetch* request) {
   booksim2::Interconnect::Type type = get_booksim_type(request);
   uint32_t size = get_packet_size(request);
   _booksim->push(request, 0, 0, size, type, src, dest);
 }
 
-bool Booksim2Interconnect::is_full(uint32_t nid, MemoryAccess* request) {
+bool Booksim2Interconnect::is_full(uint32_t nid, mem_fetch* request) {
   uint32_t size = get_packet_size(request);
   return _booksim->is_full(nid, 0, size);
 }
@@ -108,9 +108,9 @@ bool Booksim2Interconnect::is_empty(uint32_t nid) {
   return _booksim->is_empty(nid, 0);
 }
 
-MemoryAccess* Booksim2Interconnect::top(uint32_t nid) {
+mem_fetch* Booksim2Interconnect::top(uint32_t nid) {
   assert(!is_empty(nid));
-  return (MemoryAccess*) _booksim->top(nid, 0);
+  return (mem_fetch*) _booksim->top(nid, 0);
 }
 
 void Booksim2Interconnect::pop(uint32_t nid) {
@@ -122,44 +122,44 @@ void Booksim2Interconnect::print_stats() {
   _booksim->print_stats();
 }
 
-booksim2::Interconnect::Type Booksim2Interconnect::get_booksim_type(MemoryAccess* access) {
+booksim2::Interconnect::Type Booksim2Interconnect::get_booksim_type(mem_fetch* access) {
   booksim2::Interconnect::Type type;
-  if(access->write && access->request) {
-    /* Write request */
-    type = booksim2::Interconnect::Type::WRITE;
-  }
-  else if(access->write && !access->request) {
-    /* Write response */
-    type = booksim2::Interconnect::Type::WRITE_REPLY;
-  }
-  else if(!access->write && access->request){
-    /* Read request */
+  switch (access->get_type())
+  {
+  case mf_type::READ_REQUEST:
     type = booksim2::Interconnect::Type::READ;
-  } 
-  else if(!access->write && !access->request) {
-    /* Read reply */
+    break;
+  case mf_type::READ_REPLY:
     type = booksim2::Interconnect::Type::READ_REPLY;
+    break;
+  case mf_type::WRITE_REQUEST:
+    type = booksim2::Interconnect::Type::WRITE;
+    break;
+  case mf_type::WRITE_ACK:
+    type = booksim2::Interconnect::Type::WRITE_REPLY;
+    break;
+  default:
+    spdlog::error("[Interconenct] Unexpected memory type...");
+    break;
   }
   return type;
 }
 
-uint32_t Booksim2Interconnect::get_packet_size(MemoryAccess* access) {
+uint32_t Booksim2Interconnect::get_packet_size(mem_fetch* access) {
   uint32_t size;
-  if(access->write && access->request) {
-    /* Write request */
-    size = access->size;
-  }
-  else if(access->write && !access->request) {
-    /* Write response */
-    size = _ctrl_size;
-  }
-  else if(!access->write && access->request){
-    /* Read request */
+  switch (access->get_type())
+  {
+  case mf_type::READ_REQUEST:
+  case mf_type::WRITE_ACK:
     size = _ctrl_size;
-  } 
-  else if(!access->write && !access->request) {
-    /* Read reply */
-    size = access->size;
+    break;
+  case mf_type::READ_REPLY:
+  case mf_type::WRITE_REQUEST:
+    size = access->get_data_size();
+    break;
+  default:
+    spdlog::error("[Interconenct] Unexpected memory type...");
+    break;
   }
   return size;
 }
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Interconnect.h b/PyTorchSimBackend/src/Interconnect.h
index a47b8c6a..8467b7aa 100644
--- a/PyTorchSimBackend/src/Interconnect.h
+++ b/PyTorchSimBackend/src/Interconnect.h
@@ -12,10 +12,10 @@ class Interconnect {
   virtual ~Interconnect() = default;
   virtual bool running() = 0;
   virtual void cycle() = 0;
-  virtual void push(uint32_t src, uint32_t dest, MemoryAccess* request) = 0;
-  virtual bool is_full(uint32_t src, MemoryAccess* request) = 0;
+  virtual void push(uint32_t src, uint32_t dest, mem_fetch* request) = 0;
+  virtual bool is_full(uint32_t src, mem_fetch* request) = 0;
   virtual bool is_empty(uint32_t nid) = 0;
-  virtual MemoryAccess* top(uint32_t nid) = 0;
+  virtual mem_fetch* top(uint32_t nid) = 0;
   virtual void pop(uint32_t nid) = 0;
   virtual void print_stats() = 0;
 
@@ -32,10 +32,10 @@ class SimpleInterconnect : public Interconnect {
   virtual bool running() override;
   virtual void cycle() override;
   virtual void push(uint32_t src, uint32_t dest,
-                    MemoryAccess* request) override;
-  virtual bool is_full(uint32_t src, MemoryAccess* request) override;
+                    mem_fetch* request) override;
+  virtual bool is_full(uint32_t src, mem_fetch* request) override;
   virtual bool is_empty(uint32_t nid) override;
-  virtual MemoryAccess* top(uint32_t nid) override;
+  virtual mem_fetch* top(uint32_t nid) override;
   virtual void pop(uint32_t nid) override;
   virtual void print_stats() override {}
 
@@ -48,11 +48,11 @@ class SimpleInterconnect : public Interconnect {
   struct Entity {
     cycle_type finish_cycle;
     uint32_t dest;
-    MemoryAccess* access;
+    mem_fetch* access;
   };
 
   std::vector<std::queue<Entity>> _in_buffers;
-  std::vector<std::queue<MemoryAccess*>> _out_buffers;
+  std::vector<std::queue<mem_fetch*>> _out_buffers;
   std::vector<bool> _busy_node;
 };
 
@@ -62,10 +62,10 @@ class Booksim2Interconnect : public Interconnect {
   virtual bool running() override;
   virtual void cycle() override;
   virtual void push(uint32_t src, uint32_t dest,
-                    MemoryAccess* request) override;
-  virtual bool is_full(uint32_t src, MemoryAccess* request) override;
+                    mem_fetch* request) override;
+  virtual bool is_full(uint32_t src, mem_fetch* request) override;
   virtual bool is_empty(uint32_t nid) override;
-  virtual MemoryAccess* top(uint32_t nid) override;
+  virtual mem_fetch* top(uint32_t nid) override;
   virtual void pop(uint32_t nid) override;
   virtual void print_stats() override;
 
@@ -74,7 +74,7 @@ class Booksim2Interconnect : public Interconnect {
   std::string _config_path;
   std::unique_ptr<booksim2::Interconnect> _booksim;
 
-  booksim2::Interconnect::Type get_booksim_type(MemoryAccess* access);
-  uint32_t get_packet_size(MemoryAccess* access);
+  booksim2::Interconnect::Type get_booksim_type(mem_fetch* access);
+  uint32_t get_packet_size(mem_fetch* access);
 };
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Memfetch.h b/PyTorchSimBackend/src/Memfetch.h
new file mode 100644
index 00000000..f40fc781
--- /dev/null
+++ b/PyTorchSimBackend/src/Memfetch.h
@@ -0,0 +1,90 @@
+#ifndef MEM_FETCH_H
+#define MEM_FETCH_H
+
+#include "Cache_defs.h"
+
+typedef unsigned long long new_addr_type;
+
+enum mem_access_type {
+  GLOBAL_ACC_R,
+  GLOBAL_ACC_W,
+  L2_CACHE_WA, /* Data L2 cache write alloc */
+  L2_CACHE_WB, /* Data L2 cache write back */
+  NUM_MEM_ACCESS_TYPE
+};
+
+static const char* mem_access_type_str[] = {
+    "GLOBAL_ACC_R", "GLOBAL_ACC_W",  
+    "L2_CACHE_WA", "L2_CACHE_WB"};
+enum mf_type { READ_REQUEST = 0, WRITE_REQUEST, READ_REPLY, WRITE_ACK };
+
+class mem_fetch {
+ public:
+  mem_fetch(new_addr_type addr, mem_access_type acc_type, mf_type type,
+            unsigned data_size, unsigned request_id, unsigned numa_id=-1,
+            void* custom_data=NULL) :
+            m_addr(addr), m_mem_access_type(acc_type),
+            m_type(type), m_data_size(data_size),
+            m_request_id(request_id), m_numa_id(numa_id),
+            m_custom_data(custom_data) {}
+  mem_fetch(new_addr_type addr, mem_access_type acc_type, mf_type type,
+            unsigned data_size) : m_addr(addr), m_mem_access_type(acc_type),
+            m_type(type), m_data_size(data_size) {}
+  mem_fetch(new_addr_type addr, mem_access_type acc_type, mf_type type,
+            unsigned data_size, SectorMask sector_mask) :
+            m_addr(addr), m_mem_access_type(acc_type), m_type(type), m_data_size(data_size),
+            m_sector_mask(sector_mask) {}
+  mem_fetch(std::deque<mem_fetch*> mfs);  // for wrapping multiple mfs into one
+  /* Src & Des */
+  void set_core_id(int core_id) {m_core_id = core_id;}
+  int get_core_id() { return m_core_id; }
+  void set_channel(unsigned channel) { m_channel = channel; }
+  unsigned get_channel() { return m_channel; }
+  void set_numa_id(unsigned numa_id) { m_numa_id=numa_id; }
+  unsigned get_numa_id() { return m_numa_id; }
+  /* Data & size */
+  void set_data(void* data) { m_data = data; }
+  void* get_data() { return m_data; }
+  void set_data_size(unsigned size) { m_data_size = size; }
+  unsigned get_data_size() { return m_data_size; }
+  new_addr_type get_addr() { return m_addr; }
+  void set_addr(new_addr_type addr) { m_addr = addr; }
+  /* Mem info */
+  mem_access_type get_access_type() { return m_mem_access_type; }
+  mf_type get_type() { return m_type; }
+  void set_type(mf_type type) { m_type = type; }
+  bool is_write() { return m_type == mf_type::WRITE_REQUEST || m_type == mf_type::WRITE_ACK; }
+  void set_request_id(unsigned id) { m_request_id = id; }
+  unsigned get_request_id() { return m_request_id; }
+  SectorMask get_access_sector_mask() { return m_sector_mask; }
+  void set_dirty_mask(SectorMask dirty_mask) { m_dirty_mask = dirty_mask; }
+  SectorMask get_dirty_mask() { return m_dirty_mask; }
+  mem_fetch* get_original_mf() { return m_original_mf; }
+  bool is_atomic() { return false; }
+  void set_custom_data(void* custom_data) { m_custom_data = custom_data; }
+  void* get_custom_data() { return m_custom_data; }
+  /* Stat */
+  void set_start_cycle(uint64_t start_cycle) { m_start_cycle = start_cycle; }
+  uint64_t get_start_cycle() { return m_start_cycle; } 
+
+  std::string current_state = "NONE";
+  uint64_t request_cycle;
+  uint64_t response_cycle;
+ private:
+  uint64_t m_request_id;
+  unsigned m_data_size;
+  new_addr_type m_addr;
+  void* m_data = NULL;
+  mem_access_type m_mem_access_type;
+  mf_type m_type;
+  unsigned m_core_id;
+  unsigned m_channel;
+  unsigned m_numa_id;
+  SectorMask m_sector_mask;
+  SectorMask m_dirty_mask;
+  mem_fetch* m_original_mf;
+  void* m_custom_data = NULL;
+  uint64_t m_start_cycle = 0ULL;
+};
+
+#endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index a296897a..eef67064 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -23,16 +23,7 @@ Simulator::Simulator(SimulationConfig config)
   char* onnxim_path_env = std::getenv("TORCHSIM_DIR");
   std::string onnxim_path = onnxim_path_env != NULL?
     std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./");
-  if (config.dram_type == DramType::SIMPLE) {
-    _dram = std::make_unique<SimpleDram>(config);
-  } else if (config.dram_type == DramType::RAMULATOR1) {
-    std::string ramulator_config = fs::path(onnxim_path)
-                                       .append("configs")
-                                       .append(config.dram_config_path)
-                                       .string();
-    config.dram_config_path = ramulator_config;
-    _dram = std::make_unique<DramRamulator>(config);
-  } else if (config.dram_type == DramType::RAMULATOR2) {
+  if (config.dram_type == DramType::RAMULATOR2) {
     std::string ramulator_config = fs::path(onnxim_path)
                                        .append("configs")
                                        .append(config.dram_config_path)
@@ -108,8 +99,8 @@ void Simulator::icnt_cycle() {
     // PUHS core to ICNT. memory request
       int port_id = core_id * _noc_node_per_core + noc_id;
       if (_cores[core_id]->has_memory_request()) {
-        MemoryAccess *front = _cores[core_id]->top_memory_request();
-        front->core_id = core_id;
+        mem_fetch *front = _cores[core_id]->top_memory_request();
+        front->set_core_id(core_id);
         if (!_icnt->is_full(port_id, front)) {
           _icnt->push(port_id , get_dest_node(front), front);
           _cores[core_id]->pop_memory_request();
@@ -238,11 +229,21 @@ void Simulator::set_cycle_mask() {
   }
 }
 
-uint32_t Simulator::get_dest_node(MemoryAccess *access) {
-  if (access->request) {
+uint32_t Simulator::get_dest_node(mem_fetch *access) {
+  switch (access->get_type())
+  {
+  case mf_type::READ_REQUEST:
+  case mf_type::WRITE_REQUEST:
     return _config.num_cores * _noc_node_per_core + _dram->get_channel_id(access);
-  } else {
-    return access->core_id * _noc_node_per_core + (_dram->get_channel_id(access) % _noc_node_per_core);
+    break;
+  case mf_type::READ_REPLY:
+  case mf_type::WRITE_ACK:
+    return access->get_core_id() * _noc_node_per_core + (_dram->get_channel_id(access) % _noc_node_per_core);
+    break;
+  default:
+    spdlog::error("Unexpected memfetc type...");
+    return -1;
+    break;
   }
 }
 
diff --git a/PyTorchSimBackend/src/Simulator.h b/PyTorchSimBackend/src/Simulator.h
index 907808a6..7733e9c4 100644
--- a/PyTorchSimBackend/src/Simulator.h
+++ b/PyTorchSimBackend/src/Simulator.h
@@ -38,7 +38,7 @@ class Simulator {
   void icnt_cycle();
   bool running();
   void set_cycle_mask();
-  uint32_t get_dest_node(MemoryAccess *access);
+  uint32_t get_dest_node(mem_fetch *access);
   SimulationConfig _config;
   uint32_t _n_cores;
   uint32_t _noc_node_per_core;
diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc
index 048cedfd..89a3f311 100644
--- a/PyTorchSimBackend/src/TMA.cc
+++ b/PyTorchSimBackend/src/TMA.cc
@@ -18,23 +18,18 @@ void TMA::issue_tile(std::shared_ptr<Instruction> inst) {
   _finished = false;
 }
 
-std::vector<MemoryAccess*> TMA::get_memory_access() {
+std::vector<mem_fetch*> TMA::get_memory_access() {
   std::set<addr_type> addr_set = _current_inst->get_dram_address(_dram_req_size);
-  std::vector<MemoryAccess *> access_vec;
+  std::vector<mem_fetch *> access_vec;
   Tile* owner = (Tile*)_current_inst->get_owner();
   std::shared_ptr<TileSubGraph> owner_subgraph = owner->get_owner();
   spdlog::trace("[NUMA Trace] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
     owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write());
   for (auto addr: addr_set) {
-    MemoryAccess* access = new MemoryAccess({
-      .id = generate_mem_access_id(),
-      .dram_address = addr,
-      .size = _dram_req_size,
-      .write = _current_inst->is_dma_write(),
-      .request = true,
-      .numa_id = _current_inst->get_numa_id(),
-      .owner_instruction = _current_inst.get()
-    });
+    mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R;
+    mf_type type = _current_inst->is_dma_write() ? mf_type::WRITE_REQUEST : mf_type::READ_REQUEST;
+    mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, generate_mem_access_id(),
+      _current_inst->get_numa_id(), static_cast<void*>(_current_inst.get()));
     _current_inst->inc_waiting_request();
     access_vec.push_back(access);
   }
diff --git a/PyTorchSimBackend/src/TMA.h b/PyTorchSimBackend/src/TMA.h
index f9a9cbcc..f2ea3943 100644
--- a/PyTorchSimBackend/src/TMA.h
+++ b/PyTorchSimBackend/src/TMA.h
@@ -8,20 +8,7 @@
 #include "Instruction.h"
 #include "SimulationConfig.h"
 #include "Tile.h"
-
-typedef struct {
-  uint32_t id;
-  addr_type dram_address;
-  uint64_t size;
-  bool write;
-  bool request;
-  uint32_t core_id;
-  uint32_t numa_id=0;
-  Instruction* owner_instruction;
-  cycle_type start_cycle;
-  cycle_type dram_enter_cycle;
-  cycle_type dram_finish_cycle;
-} MemoryAccess;
+#include "Memfetch.h"
 
 struct VectorCompare {
     bool operator()(const std::vector<int>& a, const std::vector<int>& b) const {
@@ -96,7 +83,7 @@ class TMA {
   }
 
   std::shared_ptr<Instruction>& get_current_inst() { return _current_inst; }
-  std::vector<MemoryAccess*> get_memory_access();
+  std::vector<mem_fetch*> get_memory_access();
   uint32_t generate_mem_access_id();
   const uint32_t get_max_dim() { return _max_dim; }
 

From b61774a47ec66295c79f44eddd205f4be1ee6d76 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 9 Feb 2025 14:35:55 +0000
Subject: [PATCH 089/432] [Backendsim] Introduce L2 cache(Currently Readonly
 cache is available)

---
 PyTorchSimBackend/extern/ramulator2      |   2 +-
 PyTorchSimBackend/src/Cache.cc           | 712 +++++++++++++++++++++++
 PyTorchSimBackend/src/Cache.h            | 375 ++++++++++++
 PyTorchSimBackend/src/Cache_stats.cc     | 244 ++++++++
 PyTorchSimBackend/src/Cache_stats.h      |  51 ++
 PyTorchSimBackend/src/Common.cc          |   6 +
 PyTorchSimBackend/src/DelayQueue.cc      |  55 ++
 PyTorchSimBackend/src/DelayQueue.h       |  45 ++
 PyTorchSimBackend/src/Dram.cc            | 130 ++++-
 PyTorchSimBackend/src/Dram.h             |  12 +-
 PyTorchSimBackend/src/Memfetch.h         |  28 +-
 PyTorchSimBackend/src/SimulationConfig.h |   4 +
 PyTorchSimBackend/src/Simulator.cc       |   3 +-
 PyTorchSimBackend/src/TMA.cc             |   3 +-
 14 files changed, 1638 insertions(+), 32 deletions(-)
 create mode 100644 PyTorchSimBackend/src/Cache.cc
 create mode 100644 PyTorchSimBackend/src/Cache.h
 create mode 100644 PyTorchSimBackend/src/Cache_stats.cc
 create mode 100644 PyTorchSimBackend/src/Cache_stats.h
 create mode 100644 PyTorchSimBackend/src/DelayQueue.cc
 create mode 100644 PyTorchSimBackend/src/DelayQueue.h

diff --git a/PyTorchSimBackend/extern/ramulator2 b/PyTorchSimBackend/extern/ramulator2
index 7ac6f810..2ea90841 160000
--- a/PyTorchSimBackend/extern/ramulator2
+++ b/PyTorchSimBackend/extern/ramulator2
@@ -1 +1 @@
-Subproject commit 7ac6f810aaba6f59b9a911c5dbc205399cb1a65e
+Subproject commit 2ea9084120eb389846326bd765d55f5e64632b7d
diff --git a/PyTorchSimBackend/src/Cache.cc b/PyTorchSimBackend/src/Cache.cc
new file mode 100644
index 00000000..b7423424
--- /dev/null
+++ b/PyTorchSimBackend/src/Cache.cc
@@ -0,0 +1,712 @@
+#include "Cache.h"
+#include "Hashing.h"
+
+unsigned int LOGB2(unsigned int v) {
+  unsigned int shift;
+  unsigned int r;
+  r = 0;
+  shift = ((v & 0xFFFF0000) != 0) << 4;
+  v >>= shift;
+  r |= shift;
+  shift = ((v & 0xFF00) != 0) << 3;
+  v >>= shift;
+  r |= shift;
+  shift = ((v & 0xF0) != 0) << 2;
+  v >>= shift;
+  r |= shift;
+  shift = ((v & 0xC) != 0) << 1;
+  v >>= shift;
+  r |= shift;
+  shift = ((v & 0x2) != 0) << 0;
+  v >>= shift;
+  r |= shift;
+  return r;
+}
+
+void CacheConfig::init(std::string config) {
+  assert(config.size() > 0);
+  char cache_type, evict_policy, write_policy, alloc_policy, write_alloc_policy,
+      sif;
+  char mshr_type;
+  // sif : sector index function
+  int ntok =
+      sscanf(config.c_str(), "%c:%u:%u:%u,%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u",
+             &cache_type, &m_nset, &m_line_size, &m_assoc, &m_sector_size, &evict_policy,
+             &write_policy, &alloc_policy, &write_alloc_policy, &sif,
+             &mshr_type, &m_mshr_entries, &m_mshr_max_merge, &m_miss_queue_size,
+             &m_result_fifo_entries, &m_data_port_width);
+  assert(ntok >= 12);
+  m_valid = true;
+  m_cache_type = CacheTypeMap[cache_type];
+  m_evict_policy = EvictPolicyMap[evict_policy];
+  m_write_policy = WritePolicyMap[write_policy];
+  m_alloc_policy = AllocationPolicyMap[alloc_policy];
+  m_write_alloc_policy = WriteAllocatePolicyMap[write_alloc_policy];
+  m_set_index_function = SetIndexFunctionMap[sif];
+  m_mshr_type = MshrConfigMap[mshr_type];
+  m_line_size_log2 = LOGB2(m_line_size);
+  m_nset_log2 = LOGB2(m_nset);
+  m_atom_size = m_cache_type == SECTOR ? m_sector_size : m_line_size;
+  m_sector_size_log2 = LOGB2(m_sector_size);
+  m_origin_assoc = m_assoc;
+  m_origin_nset = m_nset;
+}
+
+uint32_t CacheConfig::get_set_index(uint64_t addr) const {
+  return hash_function(addr);
+}
+
+uint64_t CacheConfig::get_tag(uint64_t addr) const {
+  return addr & ~(uint64_t)(m_line_size - 1);
+}
+
+uint64_t CacheConfig::get_block_addr(uint64_t addr) const {
+  return addr & ~(uint64_t)(m_line_size - 1);
+}
+
+uint64_t CacheConfig::get_mshr_addr(uint64_t addr) const {
+  return addr & ~(uint64_t)(m_atom_size - 1);
+}
+
+// Map addr to a set index according to m_set_index_function.
+uint32_t CacheConfig::hash_function(uint64_t addr) const {
+  uint32_t set_index = 0;  // CUSTOM_SET_FUNCTION leaves this at 0
+  switch (m_set_index_function) {
+    case LINEAR_SET_FUNCTION:
+      set_index = (addr >> m_line_size_log2) & (m_nset - 1);
+      break;
+    case BITWISE_XORING_FUNCTION: {  // upper address bits XORed into index
+      uint64_t higher_bits = addr >> (m_line_size_log2 + m_nset_log2);
+      uint32_t index = (addr >> m_line_size_log2) & (m_nset - 1);
+      set_index = bitwise_hash_function(higher_bits, index, m_nset);
+    } break;
+    case HASH_IPOLY_FUNCTION: {  // upper address bits fed to the IPOLY hash
+      uint64_t higher_bits = addr >> (m_line_size_log2 + m_nset_log2);
+      uint32_t index = (addr >> m_line_size_log2) & (m_nset - 1);
+      set_index = ipoly_hash_function(higher_bits, index, m_nset);
+    } break;
+    case CUSTOM_SET_FUNCTION:
+      break;
+    default: assert(0);  // unknown set-index function
+  }
+  return set_index;
+}
+
+/* Normal Cache Block */
+void LineCacheBlock::allocate(uint64_t tag, uint64_t block_addr, uint32_t time,
+                              SectorMask mask) {
+  m_tag = tag;
+  m_block_addr = block_addr;
+  m_alloc_time = time;
+  m_last_access_time = time;
+  m_fill_time = 0;
+  m_status = RESERVED;
+  m_ignore_on_fill_status = false;
+  m_set_modified_on_fill = false;
+}
+
+void LineCacheBlock::fill(uint32_t time, SectorMask) {
+  m_fill_time = time;
+  m_status = m_set_modified_on_fill ? MODIFIED : VALID;
+}
+
+SectorMask LineCacheBlock::get_dirty_mask() {
+  SectorMask dirty_mask;
+  dirty_mask.set();
+  return dirty_mask;
+}
+
+/* Sector Cache Block */
+void SectorCacheBlock::allocate(uint64_t tag, uint64_t block_addr,
+                                uint32_t time, SectorMask sector_mask) {
+  // Allocate line
+  init();
+  m_tag = tag;
+  m_block_addr = block_addr;
+  uint32_t sidx = get_sector_index(sector_mask);
+  m_sector_alloc_time[sidx] = time;
+  m_sector_last_access_time[sidx] = time;
+  m_sector_fill_time[sidx] = 0;
+  m_status[sidx] = RESERVED;
+  m_ignore_on_fill_status[sidx] = false;
+  m_set_modified_on_fill_status[sidx] = false;
+  m_line_alloc_time = time;
+  m_line_last_access_time = time;
+  m_line_fill_time = 0;
+}
+
+void SectorCacheBlock::allocate_sector(uint32_t time, SectorMask sector_mask) {
+  // Reserve one more sector inside a line that already holds live sectors.
+  assert(is_valid_line());
+  const uint32_t sector = get_sector_index(sector_mask);
+  // A sector that is dirty right now must become MODIFIED again once its
+  // fill arrives, so capture that before the status is overwritten below.
+  m_set_modified_on_fill_status[sector] = (m_status[sector] == MODIFIED);
+  m_status[sector] = RESERVED;
+  m_ignore_on_fill_status[sector] = false;
+  m_readable[sector] = true;
+  m_sector_alloc_time[sector] = time;
+  m_sector_last_access_time[sector] = time;
+  m_sector_fill_time[sector] = 0;
+  m_line_last_access_time = time;
+  m_line_fill_time = 0;
+}
+
+void SectorCacheBlock::fill(uint32_t time, SectorMask sector_mask) {
+  uint32_t sidx = get_sector_index(sector_mask);
+  m_status[sidx] = m_set_modified_on_fill_status[sidx] ? MODIFIED : VALID;
+  m_sector_fill_time[sidx] = time;
+  m_line_fill_time = time;
+}
+bool SectorCacheBlock::is_valid_line() { return !(is_invalid_line()); }
+
+bool SectorCacheBlock::is_invalid_line() {
+  // all the sectors should be invalid
+  for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; ++i) {
+    if (m_status[i] != INVALID) return false;
+  }
+  return true;
+}
+
+bool SectorCacheBlock::is_reserved_line() {
+  // true as soon as any one sector is RESERVED
+  for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; ++i) {
+    if (m_status[i] == RESERVED) return true;
+  }
+  return false;
+}
+
+bool SectorCacheBlock::is_modified_line() {
+  for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; ++i) {
+    if (m_status[i] == MODIFIED) return true;
+  }
+  return false;
+}
+
+SectorMask SectorCacheBlock::get_dirty_mask() {
+  SectorMask dirty_mask;
+  dirty_mask.reset();
+  for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; ++i) {
+    if (m_status[i] == MODIFIED) dirty_mask.set(i);
+  }
+  return dirty_mask;
+}
+
+void SectorCacheBlock::init() {
+  for (int i = 0; i < SECTOR_CHUNCK_SIZE; i++) {
+    m_sector_alloc_time[i] = 0;
+    m_sector_fill_time[i] = 0;
+    m_sector_last_access_time[i] = 0;
+    m_status[i] = INVALID;
+    m_ignore_on_fill_status[i] = false;
+    m_set_modified_on_fill_status[i] = false;
+    m_readable[i] = true;
+  }
+  m_line_alloc_time = 0;
+  m_line_fill_time = 0;
+  m_line_last_access_time = 0;
+}
+
+CacheBlockState SectorCacheBlock::get_status(SectorMask mask) {
+  uint32_t sidx = get_sector_index(mask);
+  return m_status[sidx];
+}
+
+void SectorCacheBlock::set_status(CacheBlockState status, SectorMask mask) {
+  uint32_t sidx = get_sector_index(mask);
+  m_status[sidx] = status;
+}
+
+bool SectorCacheBlock::is_readable(SectorMask mask) {
+  uint32_t sidx = get_sector_index(mask);
+  return m_readable[sidx];
+}
+
+uint64_t SectorCacheBlock::get_last_access_time() {
+  return m_line_last_access_time;
+}
+
+uint64_t SectorCacheBlock::get_alloc_time() { return m_line_alloc_time; }
+
+void SectorCacheBlock::set_ignore_on_fill(bool ignore, SectorMask mask) {
+  uint32_t sidx = get_sector_index(mask);
+  m_ignore_on_fill_status[sidx] = ignore;
+}
+
+void SectorCacheBlock::set_modified_on_fill(bool modified, SectorMask mask) {
+  uint32_t sidx = get_sector_index(mask);
+  m_set_modified_on_fill_status[sidx] = modified;
+}
+
+void SectorCacheBlock::set_readable(bool readable, SectorMask mask) {
+  uint32_t sidx = get_sector_index(mask);
+  m_readable[sidx] = readable;
+}
+
+void SectorCacheBlock::set_last_access_time(uint64_t time,
+                                            SectorMask sector_mask) {
+  m_line_last_access_time = time;
+  uint32_t sidx = get_sector_index(sector_mask);
+  m_sector_last_access_time[sidx] = time;
+}
+
+uint32_t SectorCacheBlock::get_modified_size() {
+  // Result is the number of MODIFIED sectors multiplied by m_sector_size.
+  uint32_t dirty_sectors = 0;
+  for (unsigned s = 0; s < SECTOR_CHUNCK_SIZE; ++s)
+    dirty_sectors += (m_status[s] == MODIFIED) ? 1 : 0;
+  return dirty_sectors * m_sector_size;
+}
+
+/*Tag Array*/
+TagArray::TagArray(CacheConfig &config, int core_id, int type_id)
+    : m_config(config) {
+  uint32_t cache_lines_num = config.get_num_lines();
+  m_lines = new CacheBlock *[cache_lines_num];
+  for (uint32_t i = 0; i < cache_lines_num; ++i) {
+    if (config.get_cache_type() == SECTOR)
+      m_lines[i] = new SectorCacheBlock(config.get_sector_size());
+    else if (config.get_cache_type() == NORMAL)
+      m_lines[i] = new LineCacheBlock(config.get_sector_size());
+    else
+      assert(0);
+  }
+  init(core_id, type_id);
+}
+
+TagArray::~TagArray() {
+  uint32_t cache_lines_num = m_config.get_num_lines();
+  for (uint32_t i = 0; i < cache_lines_num; ++i) {
+    delete m_lines[i];
+  }
+  delete[] m_lines;
+}
+
+CacheRequestStatus TagArray::probe(uint64_t addr, uint32_t &idx, mem_fetch *mf,
+                                   bool probe_mode) const {
+  SectorMask sector_mask = mf->get_access_sector_mask();
+  return probe(addr, idx, sector_mask, mf, probe_mode);
+}
+
+CacheRequestStatus TagArray::probe(uint64_t addr, uint32_t &idx,
+                                   SectorMask mask, mem_fetch *mf,
+                                   bool probe_mode) const {
+  int set_index = m_config.get_set_index(addr);
+  uint64_t tag = m_config.get_tag(addr);
+  uint32_t valid_line = (uint32_t)-1;
+  uint32_t invalid_line = (uint32_t)-1;
+  uint64_t valid_timestamp = (uint64_t)-1;
+  bool all_reserved = true;
+  for (uint32_t way = 0; way < m_config.get_num_assoc(); way++) {
+    uint32_t index = set_index * m_config.get_num_assoc() + way;
+    CacheBlock *line = m_lines[index];
+    if (line->match_tag(tag)) {  // tag matched
+      if (line->get_status(mask) == RESERVED) {
+        idx = index;
+        return HIT_RESERVED;
+      } else if (line->get_status(mask) == VALID ||
+                 (line->get_status(mask) == MODIFIED &&
+                  line->is_readable(mask))) {
+        idx = index;
+        return HIT;
+      } else if ((line->get_status(mask) == MODIFIED &&
+                  !line->is_readable(mask)) ||
+                 (line->is_valid_line() && line->get_status(mask) == INVALID)) {
+        idx = index;
+        return SECTOR_MISS;
+      } else {
+        assert(line->get_status(mask) == INVALID);
+      }
+    }
+    if (!line->is_reserved_line()) {
+      all_reserved = false;
+      if (line->is_invalid_line()) {
+        invalid_line = index;
+      } else {
+        if (m_config.get_evict_policy() == LRU) {
+          if (line->get_last_access_time() < valid_timestamp) {
+            valid_timestamp = line->get_last_access_time();
+            valid_line = index;
+          }
+        } else if (m_config.get_evict_policy() == FIFO) {
+          if (line->get_alloc_time() < valid_timestamp) {
+            valid_timestamp = line->get_alloc_time();
+            valid_line = index;
+          }
+        }
+      }
+    }
+  }
+  if (all_reserved) {
+    assert(m_config.get_alloc_policy() == ON_MISS);
+    return RESERVATION_FAIL;
+  }
+  if (invalid_line != (uint32_t)-1) {
+    idx = invalid_line;
+  } else if (valid_line != (uint32_t)-1) {
+    idx = valid_line;
+  } else {
+    assert(0);
+  }
+  return MISS;
+}
+
+CacheRequestStatus TagArray::access(uint64_t addr, uint32_t time, uint32_t &idx,
+                                    mem_fetch *mf) {
+  bool wb = false;
+  EvictedBlockInfo evicted;
+  return access(addr, time, idx, mf, wb, evicted);
+}
+
+CacheRequestStatus TagArray::access(uint64_t addr, uint32_t time, uint32_t &idx,
+                                    mem_fetch *mf, bool &wb,
+                                    EvictedBlockInfo &evicted) {
+  is_used = true;
+  m_access++;
+  SectorMask sector_mask = mf->get_access_sector_mask();
+  uint64_t tag = m_config.get_tag(addr);
+  uint64_t block_addr = m_config.get_block_addr(addr);
+  CacheRequestStatus status = probe(addr, idx, mf);
+  switch (status) {
+    case HIT_RESERVED:
+      m_pending_hit++;
+      break;
+    case HIT:
+      m_lines[idx]->set_last_access_time(time, sector_mask);
+      break;
+    case SECTOR_MISS:
+      assert(m_config.get_cache_type() == SECTOR);
+      m_sector_miss++;
+      if (m_config.get_alloc_policy() == ON_MISS) {
+        ((SectorCacheBlock *)m_lines[idx])->allocate_sector(time, sector_mask);
+      }
+      break;
+    case MISS:
+      m_miss++;
+      if (m_config.get_alloc_policy() == ON_MISS) {
+        if (m_lines[idx]->is_modified_line()) {
+          wb = true;
+          evicted.set_info(m_lines[idx]->get_block_addr(), m_lines[idx]->get_modified_size(),
+                           m_lines[idx]->get_status(sector_mask));
+        }
+        m_lines[idx]->allocate(tag, block_addr, time, sector_mask);
+      }
+      break;
+    case RESERVATION_FAIL:
+      m_res_fail++;
+      break;
+  }
+  return status;
+}
+
+void TagArray::fill(uint64_t addr, uint32_t time, mem_fetch *mf) {
+  fill(addr, time, mf->get_access_sector_mask());
+}
+
+void TagArray::fill(uint32_t index, uint32_t time, mem_fetch *mf) {
+  assert(m_config.get_alloc_policy() == ON_MISS);
+  m_lines[index]->fill(time, mf->get_access_sector_mask());
+}
+
+void TagArray::fill(uint64_t addr, uint32_t time, SectorMask mask) {
+  uint32_t idx = (uint32_t)-1;  // sentinel: probe() may return without a line
+  CacheRequestStatus status = probe(addr, idx, mask);
+  if (status == RESERVATION_FAIL) return;  // nothing selected, nothing to fill
+  assert(status != SECTOR_MISS || m_config.get_cache_type() == SECTOR);
+  if (status == MISS)  // whole line absent: claim the line probe() picked
+    m_lines[idx]->allocate(m_config.get_tag(addr),
+                           m_config.get_block_addr(addr), time, mask);
+  else if (status == SECTOR_MISS)  // line present, this sector is not
+    ((SectorCacheBlock *)m_lines[idx])->allocate_sector(time, mask);
+  m_lines[idx]->fill(time, mask);
+}
+
+void TagArray::invalidate() {
+  if (!is_used) return;
+  for (uint32_t i = 0; i < m_config.get_num_lines(); i++) {
+    for (uint32_t j = 0; j < SECTOR_CHUNCK_SIZE; j++) {
+      m_lines[i]->set_status(INVALID, SectorMask().set(j));
+    }
+  }
+}
+
+void TagArray::init(int core_id, int type_id) {
+  m_core_id = core_id;
+  m_type_id = type_id;
+  m_access = 0;
+  m_miss = 0;
+  m_pending_hit = 0;
+  m_res_fail = 0;
+  m_sector_miss = 0;
+  is_used = false;
+}
+
+/* MSHR Table */
+bool MshrTable::probe(uint64_t block_addr) const {
+  return m_table.find(block_addr) != m_table.end();
+}
+
+bool MshrTable::full(uint64_t block_addr) const {
+  auto entry = m_table.find(block_addr);
+  if (entry != m_table.end())  // existing entry: bounded by merge capacity
+    return entry->second.m_list.size() >= m_max_merged;
+  return m_table.size() >= m_num_entries;  // new entry: bounded by table size
+}
+
+void MshrTable::add(uint64_t block_addr, mem_fetch *mf) {
+  assert(!full(block_addr));
+  MshrEntry &entry = m_table[block_addr];  // creates the entry on first use
+  entry.m_list.push_back(mf);
+  // This function only ever raises the atomic flag; it never clears it.
+  if (mf->is_atomic()) entry.m_has_atomic = true;
+}
+
+void MshrTable::mark_ready(uint64_t block_addr, bool &has_atomic) {
+  assert(probe(block_addr));
+  has_atomic = m_table[block_addr].m_has_atomic;  // out-param for the caller
+  m_current_response.push_back(block_addr);  // now served by *_next_access()
+}
+
+mem_fetch *MshrTable::pop_next_access() {
+  assert(access_ready());
+  uint64_t block_addr = m_current_response.front();
+  assert(probe(block_addr));
+  mem_fetch *mf = m_table[block_addr].m_list.front();
+  m_table[block_addr].m_list.pop_front();
+  if (m_table[block_addr].m_list.empty()) {
+    m_table.erase(block_addr);
+    m_current_response.pop_front();
+  }
+  return mf;
+}
+
+mem_fetch *MshrTable::top_next_access() {
+  assert(access_ready());
+  uint64_t block_addr = m_current_response.front();
+  assert(probe(block_addr));
+  mem_fetch *mf = m_table[block_addr].m_list.front();
+  return mf;
+}
+
+bool MshrTable::is_read_after_write_pending(uint64_t block_addr) {
+  auto entry = m_table.find(block_addr);  // find(): a query must not insert
+  if (entry == m_table.end()) return false;
+  bool write_found = false;
+  for (mem_fetch *mf : entry->second.m_list) {
+    if (mf->is_write())
+      write_found = true;  // Pending write
+    else if (write_found)
+      return true;  // Pending read after write
+  }
+  return false;
+}
+
+void MshrTable::print(FILE *fp) const {
+
+}
+
+/* Cache */
+Cache::Cache(std::string name, CacheConfig &config, int core_id, int type_id,
+             std::queue<mem_fetch*> *to_mem_queue)
+    : m_config(config), m_bandwidth_management(config) {
+  m_tag_array = new TagArray(config, core_id, type_id);
+  m_mshrs = new MshrTable(config.get_mshr_entries(),
+                                        config.get_mshr_max_merge());
+  m_name = name + std::to_string(core_id);
+  m_id = core_id;
+  m_to_mem_queue = to_mem_queue;
+}
+
+void Cache::cycle() {
+  // Forward at most one queued miss to the memory-side queue per call.
+  if (!m_miss_queue.empty()) {
+    m_to_mem_queue->push(m_miss_queue.front());
+    m_miss_queue.pop_front();
+  }
+  m_bandwidth_management.replenish_port_bandwidth();
+}
+
+void Cache::fill(mem_fetch *mf, uint32_t time) {
+  if (m_config.get_mshr_config() == SECTOR_ASSOC) {
+    assert(mf->get_original_mf());
+    assert(m_extra_mf_fields.find(mf->get_original_mf()) !=
+           m_extra_mf_fields.end());
+    m_extra_mf_fields[mf->get_original_mf()].pending_read--;
+    if (m_extra_mf_fields[mf->get_original_mf()].pending_read > 0) {
+      delete mf;
+      return;
+    } else {
+      mem_fetch *tmp = mf;
+      mf = mf->get_original_mf();
+      delete tmp;
+    }
+  }
+  assert(m_extra_mf_fields.find(mf) != m_extra_mf_fields.end());
+  ExtraMfFields field = m_extra_mf_fields[mf];
+  mf->set_data_size(field.m_data_size);
+  mf->set_addr(field.m_addr);
+  if (m_config.get_alloc_policy() == ON_MISS) {
+    m_tag_array->fill(field.m_cache_index, time, mf);
+  } else if (m_config.get_alloc_policy() == ON_FILL) {
+    m_tag_array->fill(field.m_block_addr, time, mf);
+  }
+  bool has_atomic = false;
+  m_mshrs->mark_ready(field.m_block_addr, has_atomic);
+  if (has_atomic) {
+    assert(m_config.get_alloc_policy() == ON_MISS);
+    CacheBlock *block = m_tag_array->get_block(field.m_cache_index);
+    if(!block->is_modified_line()) {
+      // m_tag_array->inc_dirty(); // TODO
+    }
+    block->set_status(MODIFIED, mf->get_access_sector_mask());
+  }
+  m_extra_mf_fields.erase(mf);
+  m_bandwidth_management.use_fill_port(mf);
+}
+
+bool Cache::waiting_for_fill(mem_fetch *mf) {
+  return m_extra_mf_fields.find(mf) != m_extra_mf_fields.end();
+}
+
+void Cache::send_read_request(uint64_t addr, uint64_t block_addr,
+                              uint32_t cache_index, mem_fetch *mf,
+                              uint32_t time, bool &do_miss,
+                              std::deque<CacheEvent> &events, bool read_only,
+                              bool ws) {
+  bool wb = false;
+  EvictedBlockInfo evicted;
+  send_read_request(addr, block_addr, cache_index, mf, time, do_miss, wb,
+                    evicted, events, read_only, ws);
+}
+
+void Cache::send_read_request(uint64_t addr, uint64_t block_addr,
+                              uint32_t cache_index, mem_fetch *mf,
+                              uint32_t time, bool &do_miss, bool &wb,
+                              EvictedBlockInfo &evicted,
+                              std::deque<CacheEvent> &events, bool read_only,
+                              bool wa) {
+  new_addr_type mshr_addr = m_config.get_mshr_addr(addr);
+  bool mshr_hit = m_mshrs->probe(mshr_addr);
+  bool mshr_avail = !m_mshrs->full(mshr_addr);
+  if (mshr_hit && mshr_avail) {
+    if (read_only)
+      m_tag_array->access(block_addr, time, cache_index, mf);
+    else
+      m_tag_array->access(block_addr, time, cache_index, mf, wb, evicted);
+    m_mshrs->add(mshr_addr, mf);
+    m_stats.inc_stats(mf->get_access_type(), MSHR_HIT);
+    do_miss = true;
+  } else if (!mshr_hit && mshr_avail && !miss_queue_full(0)) {
+    if (read_only)
+      m_tag_array->access(block_addr, time, cache_index, mf);
+    else
+      m_tag_array->access(block_addr, time, cache_index, mf, wb, evicted);
+    m_mshrs->add(mshr_addr, mf);
+    m_extra_mf_fields[mf] = ExtraMfFields();
+    m_extra_mf_fields[mf].m_valid = true;
+    m_extra_mf_fields[mf].m_block_addr = mshr_addr;
+    m_extra_mf_fields[mf].m_addr = mf->get_addr();
+    m_extra_mf_fields[mf].m_cache_index = cache_index;
+    m_extra_mf_fields[mf].m_data_size = mf->get_data_size();
+    m_extra_mf_fields[mf].pending_read = m_config.get_mshr_config() == SECTOR_ASSOC
+                            ? m_config.get_line_size() / m_config.get_sector_size()
+                            : 0;
+    mf->set_data_size(m_config.get_atom_size());
+    // assert(m_config.get_atom_size() <= PACKET_SIZE); //TODO: for now, it should be true
+    mf->set_addr(mshr_addr);
+    m_miss_queue.push_back(mf);
+    if (!wa) events.push_back(CacheEvent(READ_REQUEST_SENT));
+    do_miss = true;
+  } else if (mshr_hit && !mshr_avail) {
+    m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENTRY_FAIL);
+  } else if (!mshr_hit && !mshr_avail) {
+    m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENTRY_FAIL);
+  }
+}
+
+void Cache::BandwidthManagement::use_data_port(
+    mem_fetch *mf, CacheRequestStatus outcome,
+    const std::deque<CacheEvent> &events) {
+  uint32_t data_size = mf->get_data_size();
+  uint32_t port_width = m_config.get_data_port_width();
+  uint32_t data_cycles = 0;
+  CacheEvent event;
+  switch (outcome) {
+    case HIT:
+      data_cycles = data_size / port_width + ((data_size % port_width) ? 1 : 0);
+      m_data_port_occupied_cycles += data_cycles;
+      break;
+    case HIT_RESERVED:
+    case MISS:
+      if (CacheEvent::was_writeback_sent(events, event)) {
+        data_cycles = event.m_evicted_block.m_modified_size / port_width;
+        m_data_port_occupied_cycles += data_cycles;
+      }
+      break;
+    case SECTOR_MISS:
+    case RESERVATION_FAIL:
+      break;
+    default:
+      assert(0);
+  }
+}
+
+void Cache::BandwidthManagement::use_fill_port(mem_fetch *mf) {
+  unsigned fill_cycles =
+      m_config.get_atom_size() / m_config.get_data_port_width();
+  m_fill_port_occupied_cycles += fill_cycles;
+}
+
+void Cache::BandwidthManagement::replenish_port_bandwidth() {
+  if (m_data_port_occupied_cycles > 0) {
+    m_data_port_occupied_cycles--;
+  }
+  if (m_fill_port_occupied_cycles > 0) {
+    m_fill_port_occupied_cycles--;
+  }
+}
+
+bool Cache::BandwidthManagement::data_port_free() const {
+  return true; // ignore this feature
+}
+
+bool Cache::BandwidthManagement::fill_port_free() const {
+  return true;
+}
+
+/* Read-only Cache */
+CacheRequestStatus ReadOnlyCache::access(uint64_t addr, uint32_t time,
+                                         mem_fetch *mf,
+                                         std::deque<CacheEvent> &events) {
+  assert(mf->get_data_size() <= m_config.get_atom_size());
+  assert(m_config.get_write_policy() == READ_ONLY);
+  assert(!mf->is_write());
+  uint64_t block_addr = m_config.get_block_addr(addr);
+  uint32_t cache_index = (uint32_t)-1;
+  CacheRequestStatus status =
+      m_tag_array->probe(block_addr, cache_index, mf, true);
+  CacheRequestStatus cache_status = RESERVATION_FAIL;
+  if (status == HIT) {
+    cache_status = m_tag_array->access(block_addr, time, cache_index, mf);
+  } else if (status != RESERVATION_FAIL) {
+    if (!miss_queue_full(0)) {
+      bool do_miss = false;
+      send_read_request(addr, block_addr, cache_index, mf, time, do_miss,
+                        events, true, false);
+      if (do_miss)
+        cache_status = MISS;
+      else
+        cache_status = RESERVATION_FAIL;
+    } else {
+      cache_status = RESERVATION_FAIL;
+      m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL);
+    }
+  } else {
+    m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL);
+  }
+  m_stats.inc_stats(mf->get_access_type(),
+                    m_stats.select_stats_status(status, cache_status));
+
+  m_bandwidth_management.use_data_port(mf, cache_status, events);
+  return cache_status;
+}
diff --git a/PyTorchSimBackend/src/Cache.h b/PyTorchSimBackend/src/Cache.h
new file mode 100644
index 00000000..e09af658
--- /dev/null
+++ b/PyTorchSimBackend/src/Cache.h
@@ -0,0 +1,375 @@
+#ifndef CACHE_H_
+#define CACHE_H_
+#include <bitset>
+#include <cassert>
+#include <cstdint>
+#include <list>
+#include <queue>
+#include <memory>
+#include <string>
+#include <spdlog/fmt/ranges.h>
+#include <spdlog/spdlog.h>
+
+#include "Cache_defs.h"
+#include "Cache_stats.h"
+#include "DelayQueue.h"
+#include "Memfetch.h"
+
+class CacheConfig {
+ public:
+  CacheConfig() {}
+  void init(std::string config);
+  bool disabled() const { return m_disabled; }
+  uint32_t get_line_size() const { return m_line_size; }
+  uint32_t get_atom_size() const { return m_atom_size; }
+  uint32_t get_num_lines() const { return m_nset * m_assoc; }
+  uint32_t get_num_assoc() const { return m_assoc; }
+  uint32_t get_max_assoc() const { return m_origin_assoc; }
+  uint32_t get_max_sets() const { return m_origin_nset; }
+  uint32_t get_num_sets() const { return m_nset; }
+  void set_sets(uint32_t sets) { m_nset = sets; }
+  void set_assoc (uint32_t assoc) { m_assoc = assoc; }
+  uint32_t get_mshr_entries() const { return m_mshr_entries; }
+  uint32_t get_mshr_max_merge() const { return m_mshr_max_merge; }
+  uint32_t get_miss_queue_size() const { return m_miss_queue_size; }
+  uint32_t get_sector_size() const { return m_sector_size; }  // const getter
+  uint32_t get_set_index(uint64_t addr) const;
+  uint64_t get_tag(uint64_t addr) const;
+  uint64_t get_block_addr(uint64_t addr) const;
+  uint64_t get_mshr_addr(uint64_t addr) const;
+  CacheType get_cache_type() const { return m_cache_type; }
+  EvictPolicy get_evict_policy() const { return m_evict_policy; }
+  WritePolicy get_write_policy() const { return m_write_policy; }
+  WriteAllocatePolicy get_write_alloc_policy() const {
+    return m_write_alloc_policy;
+  }
+  AllocationPolicy get_alloc_policy() const { return m_alloc_policy; }
+  MshrConfig get_mshr_config() const { return m_mshr_type; }
+  uint32_t get_nset() const { return m_nset; }
+  uint32_t get_total_size_in_kb() const {
+    return (m_line_size * m_nset * m_assoc) / 1024;
+  }
+  uint32_t get_origin_size () const {
+    return m_line_size * m_origin_assoc * m_origin_nset;
+  }
+  uint32_t get_data_port_width() const { return m_data_port_width; }
+ protected:
+  bool m_valid = false;
+  bool m_disabled = false;
+  uint32_t m_origin_nset = 0;
+  uint32_t m_line_size = 0;
+  uint32_t m_line_size_log2 = 0;
+  uint32_t m_nset = 0;
+  uint32_t m_nset_log2 = 0;
+  uint32_t m_assoc = 0;
+  uint32_t m_origin_assoc = 0;
+  uint32_t m_atom_size = 0;
+  uint32_t m_sector_size = 0;
+  uint32_t m_sector_size_log2 = 0;
+  uint32_t m_mshr_entries = 0;
+  uint32_t m_mshr_max_merge = 0;
+  uint32_t m_miss_queue_size = 0;
+  uint32_t m_result_fifo_entries = 0;
+  uint32_t m_data_port_width = 0;
+  CacheType m_cache_type;
+  EvictPolicy m_evict_policy;
+  WritePolicy m_write_policy;
+  WriteAllocatePolicy m_write_alloc_policy;
+  AllocationPolicy m_alloc_policy;
+  MshrConfig m_mshr_type;
+  SetIndexFunction m_set_index_function;
+
+  uint32_t hash_function(uint64_t addr) const;
+};
+
+class CacheBlock {  // abstract interface shared by line and sector blocks
+ public:
+  virtual ~CacheBlock() = default;  // blocks are deleted through CacheBlock *
+  virtual void allocate(uint64_t tag, uint64_t block_addr, uint32_t time,
+                        SectorMask sector_mask) = 0;
+  virtual void fill(uint32_t time, SectorMask sector_mask) = 0;
+  virtual bool match_tag(uint64_t tag) { return m_tag == tag; }
+  virtual uint64_t get_block_addr() { return m_block_addr; }
+  virtual bool is_valid_line() = 0;
+  virtual bool is_invalid_line() = 0;
+  virtual bool is_reserved_line() = 0;
+  virtual bool is_modified_line() = 0;
+  virtual SectorMask get_dirty_mask() = 0;
+  virtual CacheBlockState get_status(SectorMask mask) = 0;
+  virtual void set_status(CacheBlockState status, SectorMask mask) = 0;
+  virtual bool is_readable(SectorMask mask) = 0;
+  virtual uint64_t get_last_access_time() = 0;
+  virtual uint64_t get_alloc_time() = 0;
+  virtual void set_ignore_on_fill(bool ignore, SectorMask sector_mask) = 0;
+  virtual void set_modified_on_fill(bool modified, SectorMask sector_mask) = 0;
+  virtual void set_readable(bool readable, SectorMask sector_mask) = 0;
+  virtual void set_last_access_time(uint64_t time, SectorMask sector_mask) = 0;
+  virtual uint32_t get_modified_size() = 0;
+ protected:
+  uint64_t m_tag = 0;         // zero until a derived allocate() assigns it
+  uint64_t m_block_addr = 0;  // zero until a derived allocate() assigns it
+};
+
+class LineCacheBlock : public CacheBlock {
+ public:
+  LineCacheBlock(uint32_t sector_size) : m_sector_size(sector_size) {};
+  virtual void allocate(uint64_t tag, uint64_t block_addr, uint32_t time,
+                        SectorMask sector_mask) override;
+  virtual void fill(uint32_t time, SectorMask sector_mask) override;
+  virtual bool is_valid_line() override { return m_status == VALID; }
+  virtual bool is_invalid_line() override { return m_status == INVALID; }
+  virtual bool is_reserved_line() override { return m_status == RESERVED; }
+  virtual bool is_modified_line() override { return m_status == MODIFIED; }
+  virtual SectorMask get_dirty_mask() override;
+  virtual CacheBlockState get_status(SectorMask mask) override {
+    return m_status;
+  }
+  virtual void set_status(CacheBlockState status, SectorMask mask) override {
+    m_status = status;
+  }
+  virtual bool is_readable(SectorMask mask) override { return m_readable; }
+  virtual uint64_t get_last_access_time() override {
+    return m_last_access_time;
+  }
+  virtual uint64_t get_alloc_time() override { return m_alloc_time; }
+  virtual void set_ignore_on_fill(bool ignore,
+                                  SectorMask sector_mask) override {
+    m_ignore_on_fill_status = ignore;
+  }
+  virtual void set_modified_on_fill(bool modified,
+                                    SectorMask sector_mask) override {
+    m_set_modified_on_fill = modified;
+  }
+  virtual void set_readable(bool readable, SectorMask sector_mask) override {
+    m_readable = readable;
+  }
+  virtual void set_last_access_time(uint64_t time,
+                                    SectorMask sector_mask) override {
+    m_last_access_time = time;
+  }
+  virtual uint32_t get_modified_size() override {
+    return SECTOR_CHUNCK_SIZE * m_sector_size;
+  }
+
+ protected:
+  uint64_t m_alloc_time = 0;
+  uint32_t m_sector_size = 0;
+  uint64_t m_last_access_time = 0;
+  uint64_t m_fill_time = 0;
+  CacheBlockState m_status = INVALID;
+  bool m_ignore_on_fill_status = false;
+  bool m_set_modified_on_fill = false;
+  bool m_readable = true;
+};
+
+class SectorCacheBlock : public CacheBlock {  // one status per sector
+ public:
+  SectorCacheBlock(uint32_t sector_size) : m_sector_size(sector_size) { init(); }
+  virtual void allocate(uint64_t tag, uint64_t block_addr, uint32_t time,
+                        SectorMask sector_mask) override;
+  virtual void allocate_sector(uint32_t time, SectorMask sector_mask);
+  virtual void fill(uint32_t time, SectorMask sector_mask) override;
+  virtual bool is_valid_line() override;
+  virtual bool is_invalid_line() override;
+  virtual bool is_reserved_line() override;
+  virtual bool is_modified_line() override;
+  virtual SectorMask get_dirty_mask() override;
+  virtual CacheBlockState get_status(SectorMask mask) override;
+  virtual void set_status(CacheBlockState status, SectorMask mask) override;
+  virtual bool is_readable(SectorMask mask) override;
+  virtual uint64_t get_last_access_time() override;
+  virtual uint64_t get_alloc_time() override;
+  virtual void set_ignore_on_fill(bool ignore, SectorMask sector_mask) override;
+  virtual void set_modified_on_fill(bool modified,
+                                    SectorMask sector_mask) override;
+  virtual void set_readable(bool readable, SectorMask sector_mask) override;
+  virtual void set_last_access_time(uint64_t time,
+                                    SectorMask sector_mask) override;
+  virtual uint32_t get_modified_size() override;
+
+ private:
+  uint64_t m_sector_alloc_time[SECTOR_CHUNCK_SIZE] = {0};
+  uint64_t m_sector_fill_time[SECTOR_CHUNCK_SIZE] = {0};
+  uint64_t m_sector_last_access_time[SECTOR_CHUNCK_SIZE] = {0};
+  uint32_t m_sector_size = 0;
+  uint64_t m_line_alloc_time = 0;  // 64-bit to match the uint64_t getters
+  uint64_t m_line_fill_time = 0;
+  uint64_t m_line_last_access_time = 0;
+  CacheBlockState m_status[SECTOR_CHUNCK_SIZE] = {INVALID};
+  bool m_ignore_on_fill_status[SECTOR_CHUNCK_SIZE] = {false};
+  bool m_set_modified_on_fill_status[SECTOR_CHUNCK_SIZE] = {false};
+  bool m_readable[SECTOR_CHUNCK_SIZE] = {true};  // sets [0] only; see init()
+
+  void init();  // resets every per-sector and per-line field
+
+  uint32_t get_sector_index(SectorMask sector_mask) {
+    assert(sector_mask.count() == 1);  // exactly one sector per request
+    for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; ++i) {
+      if (sector_mask.to_ulong() & (1 << i)) return i;
+    }
+    assert(false);
+    return 0;
+  }
+};
+
+class TagArray {
+ public:
+  TagArray(CacheConfig &config, int core_id, int type_id);
+  ~TagArray();
+  CacheRequestStatus probe(uint64_t addr, uint32_t &idx, mem_fetch *mf,
+                           bool probe_mode = false) const;
+  CacheRequestStatus probe(uint64_t addr, uint32_t &idx, SectorMask mask,
+                           mem_fetch *mf = NULL, bool probe_mode = false) const;
+  CacheRequestStatus access(uint64_t addr, uint32_t time, uint32_t &idx,
+                            mem_fetch *mf);
+  CacheRequestStatus access(uint64_t addr, uint32_t time, uint32_t &idx,
+                            mem_fetch *mf, bool &wb,
+                            EvictedBlockInfo &evicted_block);
+  void fill(uint64_t addr, uint32_t time, mem_fetch *mf);
+  void fill(uint32_t idx, uint32_t time, mem_fetch *mf);
+  void fill(uint64_t addr, uint32_t time, SectorMask mask);
+  uint32_t size() const { return m_config.get_num_lines(); }
+  CacheBlock *get_block(uint32_t idx) const { return m_lines[idx]; }
+  void invalidate();
+
+ protected:
+  CacheConfig &m_config;
+  CacheBlock **m_lines; /* N banks x M sets x assoc lines in total */
+  uint32_t m_core_id;
+  uint32_t m_type_id;
+  uint32_t m_access;
+  uint32_t m_miss;
+  uint32_t m_pending_hit;
+  uint32_t m_res_fail;
+  uint32_t m_sector_miss;
+  bool is_used;
+  void init(int core_id, int type_id);
+};
+
+class MshrTable {
+ public:
+  MshrTable(uint32_t num_entries, uint32_t max_merged)
+      : m_num_entries(num_entries), m_max_merged(max_merged) {}
+  bool probe(uint64_t block_addr) const;
+  bool full(uint64_t block_addr) const;
+  void add(uint64_t block_addr, mem_fetch *mf);
+  bool busy() const { return false; }
+  void mark_ready(uint64_t block_addr, bool &has_atomic);
+  bool access_ready() const { return !m_current_response.empty(); }
+  mem_fetch *pop_next_access();
+  mem_fetch *top_next_access();
+  bool is_read_after_write_pending(uint64_t block_addr);
+  void print(FILE *fp) const;
+
+ private:
+  const unsigned m_num_entries;
+  const unsigned m_max_merged;
+
+  struct MshrEntry {
+    std::deque<mem_fetch *> m_list;
+    bool m_has_atomic = false;
+  };
+  std::map<uint64_t, MshrEntry> m_table;
+  std::map<uint64_t, MshrEntry> m_line_table;
+  bool m_current_response_ready;
+  std::deque<uint64_t> m_current_response;
+};
+
+class Cache {
+ public:
+  Cache(std::string name, CacheConfig &config, int core_id, int type_id,
+        std::queue<mem_fetch*> *to_mem_queue);
+  virtual ~Cache() {  // virtual: Cache is a polymorphic base (ReadOnlyCache)
+    delete m_tag_array;
+    delete m_mshrs;
+  }
+  virtual CacheRequestStatus access(uint64_t addr, uint32_t time, mem_fetch *mf,
+                                    std::deque<CacheEvent> &event) = 0;
+  virtual void cycle();
+  virtual void fill(mem_fetch *mf, uint32_t time);
+  virtual bool waiting_for_fill(mem_fetch *mf);
+  virtual bool access_ready() { return m_mshrs->access_ready(); }
+  virtual mem_fetch *pop_next_access() { return m_mshrs->pop_next_access(); }
+  virtual mem_fetch *top_next_access() { return m_mshrs->top_next_access(); }
+  virtual void invalidate() { m_tag_array->invalidate(); }
+
+  virtual bool data_port_free() {
+    return m_bandwidth_management.data_port_free();
+  }
+  virtual bool fill_port_free() {
+    return m_bandwidth_management.fill_port_free();
+  }
+  // virtual bool miss_queue_size(bool from_ndp);
+  virtual void force_tag_access(uint64_t addr, uint32_t time, SectorMask mask) {
+    m_tag_array->fill(addr, time, mask);
+  }
+  virtual CacheStats get_stats() const { return m_stats; }
+  virtual void print_cache_stats() {}
+  
+ protected:
+  uint32_t m_id;
+  std::string m_name;
+  CacheConfig &m_config;
+  TagArray *m_tag_array;
+  MshrTable *m_mshrs;
+  std::deque<mem_fetch*> m_miss_queue;
+  std::queue<mem_fetch*> *m_to_mem_queue;
+  CacheStats m_stats;
+  struct ExtraMfFields {
+    bool m_valid = false;
+    uint64_t m_block_addr;
+    uint64_t m_addr;
+    uint32_t m_cache_index;
+    uint32_t m_data_size;
+    uint32_t pending_read;
+  };
+  class BandwidthManagement {
+   public:
+    BandwidthManagement(CacheConfig &config) : m_config(config) {}
+    void use_data_port(mem_fetch *mf, CacheRequestStatus outcome,
+                       const std::deque<CacheEvent> &events);
+    void use_fill_port(mem_fetch *mf);
+    void replenish_port_bandwidth();
+    bool data_port_free() const;
+    bool fill_port_free() const;
+
+   protected:
+    const CacheConfig &m_config;
+    int m_data_port_occupied_cycles = 0;
+    int m_fill_port_occupied_cycles = 0;
+  };
+
+  std::map<mem_fetch *, ExtraMfFields> m_extra_mf_fields;
+  BandwidthManagement m_bandwidth_management;
+
+ protected:
+  /// Checks whether this request can be handled on this cycle. num_misses is
+  /// the max # of misses to be handled on this cycle.
+  bool miss_queue_full(uint32_t num_misses) {
+    const auto needed = m_miss_queue.size() + num_misses;
+    return needed > m_config.get_miss_queue_size();
+  }
+  // Read miss handler without write back
+  void send_read_request(uint64_t addr, uint64_t block_addr,
+                         uint32_t cache_index, mem_fetch *mf, uint32_t time,
+                         bool &do_miss, std::deque<CacheEvent> &events,
+                         bool read_only, bool wa);
+  // Read miss handler. Check MSHR hit or available
+  void send_read_request(uint64_t addr, uint64_t block_addr,
+                         uint32_t cache_index, mem_fetch *mf, uint32_t time,
+                         bool &do_miss, bool &wb, EvictedBlockInfo &eviced,
+                         std::deque<CacheEvent> &events, bool read_only,
+                         bool wa);
+};
+
+class ReadOnlyCache : public Cache {
+ public:
+  ReadOnlyCache(std::string name, CacheConfig &config, int core_id, int type_id,
+                std::queue<mem_fetch*> *to_mem_queue)
+      : Cache(name, config, core_id, type_id, to_mem_queue) {}
+
+  virtual CacheRequestStatus access(uint64_t addr, uint32_t time, mem_fetch *mf,
+                                    std::deque<CacheEvent> &event) override;
+};
+
+#endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Cache_stats.cc b/PyTorchSimBackend/src/Cache_stats.cc
new file mode 100644
index 00000000..4d076686
--- /dev/null
+++ b/PyTorchSimBackend/src/Cache_stats.cc
@@ -0,0 +1,244 @@
+#include "Cache_stats.h"
+#include "Memfetch.h"
+
+// Zero-initialise every counter table, port-cycle counter and interval snapshot.
+CacheStats::CacheStats() {
+  // Fail rows must cover both index spaces used on them: request status
+  // (operator+, operator+=) and reservation-fail reason (check_fail_valid()).
+  const int fail_cols = std::max<int>(NUM_CACHE_REQUEST_STATUS,
+                                      NUM_CACHE_RESERVATION_FAIL_REASON);
+  m_stats.assign(NUM_MEM_ACCESS_TYPE,
+                 std::vector<uint64_t>(NUM_CACHE_REQUEST_STATUS, 0));
+  m_fail_stats.assign(NUM_MEM_ACCESS_TYPE,
+                      std::vector<uint64_t>(fail_cols, 0));
+  m_cache_port_available_cycles = m_cache_data_port_busy_cycles = 0;
+  m_cache_fill_port_busy_cycles = 0;
+  m_prev_hit = m_prev_miss = 0;
+}
+
+void CacheStats::clear() {
+  for (auto &row : m_stats) std::fill(row.begin(), row.end(), 0);
+  for (auto &row : m_fail_stats) std::fill(row.begin(), row.end(), 0);
+  m_cache_port_available_cycles = 0;
+  m_cache_data_port_busy_cycles = 0;
+  m_cache_fill_port_busy_cycles = 0;
+  // Restart interval tracking too: a stale snapshot would make the next interval query underflow.
+  m_prev_hit = m_prev_miss = 0;
+}
+
+void CacheStats::inc_stats(int access_type, int access_outcome) {  // count one access of this type with this outcome
+  assert(check_valid(access_type, access_outcome));  // both indices must be inside the outcome table
+  m_stats[access_type][access_outcome]++;
+}
+
+void CacheStats::inc_fail_stats(int access_type, int fail_outcome) {  // count one reservation failure of this type/reason
+  assert(check_fail_valid(access_type, fail_outcome));  // reason is checked against the fail-reason range
+  m_fail_stats[access_type][fail_outcome]++;
+}
+
+CacheRequestStatus CacheStats::select_stats_status(
+    CacheRequestStatus probe, CacheRequestStatus access) const {
+  // The probe result is reported instead of the access result in two cases:
+  // a reserved hit whose access did not fail reservation, and a sector miss
+  // that the access reported as a plain MISS. Otherwise the access result wins.
+  const bool reserved_hit = probe == HIT_RESERVED && access != RESERVATION_FAIL;
+  const bool sector_miss = probe == SECTOR_MISS && access == MISS;
+  return (reserved_hit || sector_miss) ? probe : access;
+}
+
+uint64_t &CacheStats::operator()(int access_type, int access_outcome,
+                                 bool fail_outcome) {
+  // fail_outcome selects the reservation-fail table; otherwise the outcome table.
+  if (!fail_outcome) {
+    assert(check_valid(access_type, access_outcome));
+    return m_stats[access_type][access_outcome];
+  }
+  assert(check_fail_valid(access_type, access_outcome));
+  return m_fail_stats[access_type][access_outcome];
+}
+
+uint64_t CacheStats::operator()(int access_type, int access_outcome,
+                                bool fail_outcome) const {
+  // Read-only lookup: fail_outcome picks the reservation-fail table.
+  if (!fail_outcome) {
+    assert(check_valid(access_type, access_outcome));
+    return m_stats[access_type][access_outcome];
+  }
+  assert(check_fail_valid(access_type, access_outcome));
+  return m_fail_stats[access_type][access_outcome];
+}
+
+// Returns a fresh CacheStats whose tables and port counters are the
+// element-wise sum of *this and `other`.
+CacheStats CacheStats::operator+(const CacheStats &other) {
+  CacheStats total;
+  for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
+    for (int status = 0; status < NUM_CACHE_REQUEST_STATUS; status++) {
+      total.m_stats[type][status] = m_stats[type][status] + other.m_stats[type][status];
+      total.m_fail_stats[type][status] =
+          m_fail_stats[type][status] + other.m_fail_stats[type][status];
+    }
+  }
+  total.m_cache_port_available_cycles = m_cache_port_available_cycles + other.m_cache_port_available_cycles;
+  total.m_cache_data_port_busy_cycles = m_cache_data_port_busy_cycles + other.m_cache_data_port_busy_cycles;
+  total.m_cache_fill_port_busy_cycles = m_cache_fill_port_busy_cycles + other.m_cache_fill_port_busy_cycles;
+  return total;
+}
+
+CacheStats &CacheStats::operator+=(const CacheStats &other) {
+  // In-place element-wise accumulation of both tables and the port counters.
+  for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++)
+    for (int status = 0; status < NUM_CACHE_REQUEST_STATUS; status++) {
+      m_stats[type][status] += other.m_stats[type][status];
+      m_fail_stats[type][status] += other.m_fail_stats[type][status];
+    }
+  m_cache_port_available_cycles += other.m_cache_port_available_cycles;
+  m_cache_data_port_busy_cycles += other.m_cache_data_port_busy_cycles;
+  m_cache_fill_port_busy_cycles += other.m_cache_fill_port_busy_cycles;
+  return *this;
+}
+
+// Total number of accesses whose recorded outcome is exactly HIT, summed
+// over every access type (HIT_RESERVED is not included).
+uint64_t CacheStats::get_hit() const {
+  uint64_t total = 0;
+  for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
+    total += m_stats[type][HIT];
+  }
+  return total;
+}
+
+// Read-side hit count: HIT plus HIT_RESERVED outcomes recorded for
+// GLOBAL_ACC_R accesses.
+uint64_t CacheStats::get_read_hit() const {
+  const mem_access_type read_types[] = {GLOBAL_ACC_R};
+  const CacheRequestStatus hit_kinds[] = {HIT, HIT_RESERVED};
+  uint64_t total = 0;
+  for (mem_access_type type : read_types) {
+    for (CacheRequestStatus kind : hit_kinds) total += m_stats[type][kind];
+  }
+  return total;
+}
+
+// Write-side hit count: HIT plus HIT_RESERVED outcomes recorded for
+// GLOBAL_ACC_W, L2_CACHE_WA and L2_CACHE_WB accesses.
+uint64_t CacheStats::get_write_hit() const {
+  const mem_access_type write_types[] = {GLOBAL_ACC_W, L2_CACHE_WA, L2_CACHE_WB};
+  const CacheRequestStatus hit_kinds[] = {HIT, HIT_RESERVED};
+  uint64_t total = 0;
+  for (mem_access_type type : write_types) {
+    for (CacheRequestStatus kind : hit_kinds) total += m_stats[type][kind];
+  }
+  return total;
+}
+
+
+// Total MISS plus SECTOR_MISS outcomes, summed over every access type.
+uint64_t CacheStats::get_miss() const {
+  uint64_t total = 0;
+  for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
+    const std::vector<uint64_t> &row = m_stats[type];
+    total += row[MISS] + row[SECTOR_MISS];
+  }
+  return total;
+}
+
+// Read-side miss count: MISS plus SECTOR_MISS outcomes recorded for
+// GLOBAL_ACC_R accesses.
+uint64_t CacheStats::get_read_miss() const {
+  const mem_access_type read_types[] = {GLOBAL_ACC_R};
+  const CacheRequestStatus miss_kinds[] = {MISS, SECTOR_MISS};
+  uint64_t total = 0;
+  for (mem_access_type type : read_types) {
+    for (CacheRequestStatus kind : miss_kinds) total += m_stats[type][kind];
+  }
+  return total;
+}
+
+// Write-side miss count: MISS plus SECTOR_MISS outcomes recorded for
+// GLOBAL_ACC_W, L2_CACHE_WA and L2_CACHE_WB accesses.
+uint64_t CacheStats::get_write_miss() const {
+  const mem_access_type write_types[] = {GLOBAL_ACC_W, L2_CACHE_WA, L2_CACHE_WB};
+  const CacheRequestStatus miss_kinds[] = {MISS, SECTOR_MISS};
+  uint64_t total = 0;
+  for (mem_access_type type : write_types) {
+    for (CacheRequestStatus kind : miss_kinds) total += m_stats[type][kind];
+  }
+  return total;
+}
+
+// Number of recorded accesses: every outcome that is a hit, reserved hit,
+// miss or sector miss, summed over all access types.
+uint64_t CacheStats::get_accesses() const {
+  uint64_t total = 0;
+  for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
+    const std::vector<uint64_t> &row = m_stats[type];
+    total += row[HIT] + row[MISS] + row[SECTOR_MISS] + row[HIT_RESERVED];
+  }
+  return total;
+}
+
+uint64_t CacheStats::get_interval_hit() {
+  uint64_t prev_hit = m_prev_hit;
+  m_prev_hit = get_hit();
+  // Hits gained since the previous call; the stored snapshot has just been advanced.
+  return m_prev_hit - prev_hit;
+}
+
+uint64_t CacheStats::get_interval_miss() {
+  uint64_t prev_miss = m_prev_miss;
+  m_prev_miss = get_miss();
+  // Misses gained since the previous call; the stored snapshot has just been advanced.
+  return m_prev_miss - prev_miss;
+}
+
+void CacheStats::print_stats(FILE *out, const char *cache_name) const {
+  uint64_t hit = get_hit();
+  uint64_t miss = get_miss();
+  fprintf(out, "\tCache Hit : %llu, Cache Miss : %llu, Hit Ratio : %.2f\n", hit,
+          miss, (float)hit / (get_accesses()));
+  std::vector<uint32_t> total_access;
+  total_access.resize(NUM_MEM_ACCESS_TYPE, 0);
+  for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
+    for (int status = 0; status < NUM_CACHE_REQUEST_STATUS; status++) {
+      fprintf(out, "\t%s[%s][%s] = %llu\n", cache_name,
+              mem_access_type_str[type], cache_request_status_str[status],
+              m_stats[type][status]);
+      if (status != RESERVATION_FAIL && status != MSHR_HIT)
+        total_access[type] += m_stats[type][status];
+    }
+  }
+  for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
+    fprintf(out, "\t%s[%s][TOTAL] = %u\n", cache_name,
+            mem_access_type_str[type], total_access[type]);
+  }
+}
+
+void CacheStats::print_fail_stats(FILE *out, const char *cache_name) const {
+  for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
+    for (int status = 0; status < NUM_CACHE_REQUEST_STATUS; status++) {
+      fprintf(out, "\t%s[%s][%s] = %llu\n", cache_name,
+              mem_access_type_str[type],
+              cache_reservation_fail_reason_str[status],
+              m_fail_stats[type][status]);
+    }
+  }
+}
+
+void CacheStats ::print_energy_stats(FILE *out, const char *cache_name) const {
+  fprintf(out, "%s_RH: %llu\n", cache_name, get_read_hit());
+  fprintf(out, "%s_RM: %llu\n", cache_name, get_read_miss());
+  fprintf(out, "%s_WH: %llu\n", cache_name, get_write_hit());
+  fprintf(out, "%s_WM: %llu\n", cache_name, get_write_miss());
+}
+
+bool CacheStats::check_valid(int access_type, int access_outcome) const {
+  return (access_type >= 0 && access_type < NUM_MEM_ACCESS_TYPE &&  // valid access-type index
+          access_outcome >= 0 && access_outcome < NUM_CACHE_REQUEST_STATUS);  // valid request-status index
+}
+
+bool CacheStats::check_fail_valid(int access_type, int fail_outcome) const {
+  return (access_type >= 0 && access_type < NUM_MEM_ACCESS_TYPE &&  // valid access-type index
+          fail_outcome >= 0 &&
+          fail_outcome < NUM_CACHE_RESERVATION_FAIL_REASON);  // valid reservation-fail reason index
+}
diff --git a/PyTorchSimBackend/src/Cache_stats.h b/PyTorchSimBackend/src/Cache_stats.h
new file mode 100644
index 00000000..1bf92d8a
--- /dev/null
+++ b/PyTorchSimBackend/src/Cache_stats.h
@@ -0,0 +1,63 @@
+#ifndef CACHE_STATS_H
+#define CACHE_STATS_H
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "Cache_defs.h"
+
+// Hit/miss and reservation-fail counters for one cache, kept per memory
+// access type, together with port-occupancy cycle counters.
+class CacheStats {
+ public:
+  CacheStats();
+  // Zeroes the counter tables and the port cycle counters.
+  void clear();
+  // Increment one cell of the outcome table / the reservation-fail table.
+  void inc_stats(int access_type, int access_outcome);
+  void inc_fail_stats(int access_type, int fail_outcome);
+  // Chooses which of the probe/access results should be recorded.
+  CacheRequestStatus select_stats_status(CacheRequestStatus probe,
+                                         CacheRequestStatus access) const;
+  // Direct cell access; fail_outcome selects the reservation-fail table.
+  uint64_t &operator()(int access_type, int access_outcome, bool fail_outcome);
+  uint64_t operator()(int access_type, int access_outcome,
+                      bool fail_outcome) const;
+  // Element-wise sums of two statistics objects.
+  CacheStats operator+(const CacheStats &other);
+  CacheStats &operator+=(const CacheStats &other);
+  // NOTE(review): no definition exists in Cache_stats.cc; calling it will not link.
+  void aggregate_stats();
+  // Aggregated counts over the outcome table.
+  uint64_t get_hit() const;
+  uint64_t get_read_hit() const;
+  uint64_t get_write_hit() const;
+  uint64_t get_miss() const;
+  uint64_t get_read_miss() const;
+  uint64_t get_write_miss() const;
+  uint64_t get_accesses() const;
+  // Hits/misses gained since the previous call; each call advances the snapshot.
+  uint64_t get_interval_hit();
+  uint64_t get_interval_miss();
+  void print_stats(FILE *out, const char *cache_name = "CacheStats") const;
+  void print_fail_stats(FILE *out, const char *cache_name = "CacheStats") const;
+  void print_energy_stats(FILE *out,
+                          const char *cache_name = "CacheStats") const;
+
+ private:
+  bool check_valid(int type, int status) const;
+  bool check_fail_valid(int type, int fail) const;
+
+  std::vector<std::vector<uint64_t>> m_stats;       // [access type][request status]
+  std::vector<std::vector<uint64_t>> m_fail_stats;  // [access type][reservation-fail reason]
+
+  uint64_t m_cache_port_available_cycles;
+  uint64_t m_cache_data_port_busy_cycles;
+  uint64_t m_cache_fill_port_busy_cycles;
+
+  // Totals seen by the last get_interval_hit()/get_interval_miss() call.
+  uint64_t m_prev_hit;
+  uint64_t m_prev_miss;
+};
+#endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index 8c437b14..236a5088 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -46,6 +46,12 @@ SimulationConfig initialize_config(json config) {
   if (config.contains("dram_num_partitions"))
     parsed_config.dram_num_partitions = config["dram_num_partitions"];
 
+  /* L2D config: cache geometry string and hit latency (cycles pushed onto the cache latency queue) */
+  if (config.contains("l2d_config"))
+    parsed_config.l2d_config_str = config["l2d_config"];
+  if (config.contains("l2d_hit_latency"))
+    parsed_config.l2d_hit_latency = config["l2d_hit_latency"];
+
   /* Icnt config */
   if ((std::string)config["icnt_type"] == "simple")
     parsed_config.icnt_type = IcntType::SIMPLE;
diff --git a/PyTorchSimBackend/src/DelayQueue.cc b/PyTorchSimBackend/src/DelayQueue.cc
new file mode 100644
index 00000000..fd1463fa
--- /dev/null
+++ b/PyTorchSimBackend/src/DelayQueue.cc
@@ -0,0 +1,55 @@
+#include "DelayQueue.h"
+#include "Memfetch.h"
+
+template <typename T>
+void DelayQueue<T>::push(T data, int delay) {  // enqueue `data`, ready `delay` cycles from now
+  assert(m_only_latency);  // this overload is only valid on latency-only queues
+  m_size++;
+  m_queue.push(QueueEntry{data, m_cycle + delay});  // finish cycle = current cycle + delay
+}
+
+template <typename T>
+void DelayQueue<T>::push(T data, int delay, int interval) {
+  assert(m_issued == false);  // caller must not push while a previous issue is still pending
+  m_size++;
+  m_queue.push(QueueEntry{data, m_cycle + delay});  // finish cycle = current cycle + delay
+  if(!m_only_latency) m_issued = true;  // latency-only queues never enter the issued state
+  m_interval = interval;  // counted down by cycle(), which clears m_issued once it reaches zero
+}
+
+template <typename T>
+void DelayQueue<T>::pop() {  // remove the front entry
+  assert(arrived());  // only an entry whose finish cycle has been reached may be removed
+  m_queue.pop();
+  m_size--;
+}
+
+template <typename T>
+T DelayQueue<T>::top() {  // returns a copy of the front entry's payload
+  assert(arrived());  // the front entry must exist and have reached its finish cycle
+  return m_queue.front().data;
+}
+
+template <typename T>
+bool DelayQueue<T>::arrived() {  // true when the front entry exists and its finish cycle is not in the future
+  return !m_queue.empty() && (m_queue.front().finish_cycle <= m_cycle);
+}
+
+template <typename T>
+bool DelayQueue<T>::queue_empty() {  // true when no entry is stored, whether or not any has arrived
+  return m_queue.empty();
+}
+
+template <typename T>
+bool DelayQueue<T>::full() {  // true while an issue is pending, or when a positive max size has been reached
+  return m_issued || (m_max_size > 0 && m_size >= m_max_size);  // m_max_size <= 0 disables the capacity check
+}
+
+template <typename T>
+void DelayQueue<T>::cycle() {  // advance the queue by one cycle
+  if (m_interval > 0) m_interval--;  // count down the issue interval
+  if (m_interval <= 0) m_issued = false;  // interval elapsed: the queue accepts pushes again
+  m_cycle++;
+}
+
+template class DelayQueue<mem_fetch*>;
diff --git a/PyTorchSimBackend/src/DelayQueue.h b/PyTorchSimBackend/src/DelayQueue.h
new file mode 100644
index 00000000..67d08a23
--- /dev/null
+++ b/PyTorchSimBackend/src/DelayQueue.h
@@ -0,0 +1,50 @@
+#ifndef PairDelayQueue_H
+#define PairDelayQueue_H
+#include <cassert>
+#include <cstdint>
+#include <queue>
+#include <string>
+
+// FIFO of items that each become available once the queue's internal cycle
+// counter reaches the item's finish cycle. On queues that are not
+// latency-only, the three-argument push() marks the queue full() until the
+// given interval has been counted down by cycle().
+template <typename T>
+class DelayQueue {
+ public:
+  // Default-constructed queues use the in-class defaults below, i.e. the same
+  // state as DelayQueue(name) with an empty name.
+  DelayQueue() {}
+  // max_size <= 0 disables the capacity check in full().
+  DelayQueue(std::string name, bool only_latency, int max_size)
+      : m_name(name), m_max_size(max_size), m_only_latency(only_latency) {}
+  DelayQueue(std::string name) : DelayQueue(name, false, -1) {}
+  // Latency-only push; asserts that the queue was built with only_latency.
+  void push(T data, int delay);
+  // Push that additionally records `interval`; asserts no issue is pending.
+  void push(T data, int delay, int interval);
+  void pop();          // requires arrived()
+  T top();             // requires arrived()
+  int size() { return m_size; }
+  bool arrived();      // front entry exists and its finish cycle has been reached
+  bool queue_empty();
+  bool full();
+  void cycle();        // advance the internal cycle counter by one
+
+ private:
+  struct QueueEntry {
+    T data;
+    uint64_t finish_cycle = 0;
+  };
+  // In-class defaults keep every member defined, even for DelayQueue().
+  std::string m_name;
+  int m_interval = 0;
+  uint64_t m_cycle = 0;
+  int m_size = 0;
+  int m_max_size = -1;
+  bool m_issued = false;
+  bool m_only_latency = false;
+  std::queue<QueueEntry> m_queue;
+};
+#endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc
index 858e7f3c..a8bb6398 100644
--- a/PyTorchSimBackend/src/Dram.cc
+++ b/PyTorchSimBackend/src/Dram.cc
@@ -6,24 +6,42 @@ uint32_t Dram::get_channel_id(mem_fetch* access) {
     channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_config.dram_req_size, 0, _n_ch_per_partition);
   else
     channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_config.dram_req_size, 0, 16) % _n_ch_per_partition;
-  
+
   channel_id += ((access->get_numa_id() % _n_partitions)* _n_ch_per_partition);
   return channel_id;
 }
 
-DramRamulator2::DramRamulator2(SimulationConfig config) {
+DramRamulator2::DramRamulator2(SimulationConfig config, cycle_type* core_cycle) {
+  _core_cycles = core_cycle;
   _n_ch = config.dram_channels;
   _req_size = config.dram_req_size;
   _n_partitions = config.dram_num_partitions;
   _n_ch_per_partition = _n_ch / _n_partitions;
   _config = config;
-  _m_caches.resize(_n_ch);
   _mem.resize(_n_ch);
+
+  spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}", config.max_dram_bandwidth(), config.dram_freq, _n_ch, _req_size);
+  /* Initialize DRAM Channels */
   for (int ch = 0; ch < _n_ch; ch++) {
-    //_m_caches = std::make_unique<ReadOnlyCache>("L2 RO cache");
+    m_to_crossbar_queue.push_back(std::queue<mem_fetch*>());
+    m_from_crossbar_queue.push_back(std::queue<mem_fetch*>());
     _mem[ch] = std::make_unique<Ramulator2>(
       ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, 1);
   }
+
+  /* Initialize L2 cache */
+  _m_caches.resize(_n_ch);
+  _m_cache_config.init(config.l2d_config_str);
+  spdlog::info("[Config/L2] Total Size: {} KB, Partition Size: {} KB, Set: {}, Assoc: {}, Line Size: {}B Sector Size: {}B",
+              _m_cache_config.get_total_size_in_kb() * _n_ch, _m_cache_config.get_total_size_in_kb(),
+              _m_cache_config.get_num_sets(), _m_cache_config.get_num_assoc(),
+              _m_cache_config.get_line_size(), _m_cache_config.get_sector_size());
+  for (int ch = 0; ch < _n_ch; ch++) {
+    m_to_mem_queue.push_back(std::queue<mem_fetch*>());
+    m_cache_latency_queue.push_back(DelayQueue<mem_fetch*>("cache_latency_queue", true, 0));
+    _m_caches[ch] = std::make_unique<ReadOnlyCache>("L2 RO cache", _m_cache_config, ch, 0, &m_to_mem_queue[ch]);
+  }
+
   _tx_log2 = log2(_req_size);
   _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2;
 }
@@ -32,39 +50,119 @@ bool DramRamulator2::running() {
   return false;
 }
 
+void DramRamulator2::cache_cycle() {
+  uint32_t line_size = _m_cache_config.get_line_size();
+  uint32_t sector_size = _m_cache_config.get_sector_size();
+  for (int i = 0; i < _n_ch; i++) {
+    m_cache_latency_queue[i].cycle();
+    // NDP to Cache. Read Only cache
+    if (!m_from_crossbar_queue[i].empty() && !m_from_crossbar_queue[i].front()->is_write() &&
+        _m_caches[i]->data_port_free()) {
+      mem_fetch* req = m_from_crossbar_queue[i].front();
+      req->set_access_sector_mask(line_size, sector_size);
+      std::deque<CacheEvent> events;
+      CacheRequestStatus status = _m_caches[i]->access(
+          req->get_addr(), *_core_cycles, req, events);
+      bool write_sent = CacheEvent::was_write_sent(events);
+      bool read_sent = CacheEvent::was_read_sent(events);
+      if (status == HIT) {
+        if (!write_sent) {
+          req->set_reply();
+          m_cache_latency_queue[i].push(req, _config.l2d_hit_latency);
+        }
+        m_from_crossbar_queue[i].pop();
+      } else if (status != RESERVATION_FAIL) {
+        if (req->is_write() &&
+            (_m_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
+              _m_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
+          req->set_reply();
+          m_cache_latency_queue[i].push(req, _config.l2d_hit_latency);
+        }
+        m_from_crossbar_queue[i].pop();
+      } else {
+        // Status Reservation fail
+        assert(!write_sent);
+        assert(!read_sent);
+      }
+    }
+
+    /* Write request is go mem directly */
+    if(!m_from_crossbar_queue[i].empty() && m_from_crossbar_queue[i].front()->is_write()) {
+      mem_fetch* req = m_from_crossbar_queue[i].front();
+      m_to_mem_queue[i].push(req);
+      m_from_crossbar_queue[i].pop();
+    }
+
+    if (_m_caches[i]->access_ready() &&
+        !m_cache_latency_queue[i].full()) {
+      mem_fetch* req = _m_caches[i]->top_next_access();
+      req->current_state = "L2 top next access";
+      if (req->is_request()) req->set_reply();
+      m_cache_latency_queue[i].push(req, _config.l2d_hit_latency);
+      _m_caches[i]->pop_next_access();
+    }
+
+    if (m_cache_latency_queue[i].arrived()) {
+      mem_fetch* req = m_cache_latency_queue[i].top();
+      m_to_crossbar_queue[i].push(req);
+      m_cache_latency_queue[i].pop();
+    }
+    _m_caches[i]->cycle();
+  }
+}
+
 void DramRamulator2::cycle() {
   for (int ch = 0; ch < _n_ch; ch++) {
     _mem[ch]->cycle();
+    // From Cache to Ramulator
+    if (!m_to_mem_queue[ch].empty()) {
+      mem_fetch* mf = m_to_mem_queue[ch].front();
+      _mem[ch]->push(mf);
+      m_to_mem_queue[ch].pop();
+    }
+    // From memory response
+    if (_mem[ch]->return_queue_top()) {
+      mem_fetch* req = _mem[ch]->return_queue_top();
+      if (_m_caches[ch]->waiting_for_fill(req)) {
+        if (_m_caches[ch]->fill_port_free()) {
+          _m_caches[ch]->fill(req, *_core_cycles);
+          _mem[ch]->return_queue_pop();
+        }
+      } else {
+        if (req->get_access_type() == L2_CACHE_WB &&
+            req->get_type() == WRITE_ACK) {
+          _mem[ch]->return_queue_pop();
+          delete req;
+        } else if (req->get_access_type() == GLOBAL_ACC_W &&
+          req->get_type() == WRITE_ACK) {
+          m_to_crossbar_queue[ch].push(req);
+          _mem[ch]->return_queue_pop();
+        }
+      }
+    }
   }
 }
 
 bool DramRamulator2::is_full(uint32_t cid, mem_fetch* request) {
-  return _mem[cid]->full();
+  return false; //m_from_crossbar_queue[cid].full(); Infinite length
 }
 
 void DramRamulator2::push(uint32_t cid, mem_fetch* request) {
-  addr_type atomic_bytes =_config.dram_req_size;
-  addr_type target_addr = request->get_addr();
-  // align address
-  addr_type start_addr = target_addr - (target_addr % atomic_bytes);
-  assert(start_addr == target_addr);
-  assert(request->get_data_size() == atomic_bytes);
-  _mem[cid]->push(request);
+  m_from_crossbar_queue[cid].push(request);
 }
 
 bool DramRamulator2::is_empty(uint32_t cid) {
-  return _mem[cid]->return_queue_top() == NULL;
+  return m_to_crossbar_queue[cid].empty();
 }
 
 mem_fetch* DramRamulator2::top(uint32_t cid) {
   assert(!is_empty(cid));
-  mem_fetch* mf = _mem[cid]->return_queue_top();
-  return mf;
+  return m_to_crossbar_queue[cid].front();
 }
 
 void DramRamulator2::pop(uint32_t cid) {
   assert(!is_empty(cid));
-  mem_fetch* mf = _mem[cid]->return_queue_pop();
+  m_to_crossbar_queue[cid].pop();
 }
 
 void DramRamulator2::print_stat() {
diff --git a/PyTorchSimBackend/src/Dram.h b/PyTorchSimBackend/src/Dram.h
index 67666bed..fc0dd5e1 100644
--- a/PyTorchSimBackend/src/Dram.h
+++ b/PyTorchSimBackend/src/Dram.h
@@ -10,12 +10,14 @@
 #include "ramulator2.hh"
 #include "Hashing.h"
 #include "Cache.h"
+#include "DelayQueue.h"
 
 class Dram {
  public:
   virtual ~Dram() = default;
   virtual bool running() = 0;
   virtual void cycle() = 0;
+  virtual void cache_cycle() = 0;
   virtual bool is_full(uint32_t cid, mem_fetch* request) = 0;
   virtual void push(uint32_t cid, mem_fetch* request) = 0;
   virtual bool is_empty(uint32_t cid) = 0;
@@ -26,18 +28,25 @@ class Dram {
 
  protected:
   SimulationConfig _config;
+  CacheConfig _m_cache_config;
   uint32_t _n_ch;
   uint32_t _n_partitions;
   uint32_t _n_ch_per_partition;
   cycle_type _cycles;
+
+  std::vector<DelayQueue<mem_fetch*>> m_cache_latency_queue;
+  std::vector<std::queue<mem_fetch*>> m_from_crossbar_queue;
+  std::vector<std::queue<mem_fetch*>> m_to_crossbar_queue;
+  std::vector<std::queue<mem_fetch*>> m_to_mem_queue;
 };
 
 class DramRamulator2 : public Dram {
  public:
-  DramRamulator2(SimulationConfig config);
+  DramRamulator2(SimulationConfig config, cycle_type *core_cycle);
 
   virtual bool running() override;
   virtual void cycle() override;
+  virtual void cache_cycle() override;
   virtual bool is_full(uint32_t cid, mem_fetch* request) override;
   virtual void push(uint32_t cid, mem_fetch* request) override;
   virtual bool is_empty(uint32_t cid) override;
@@ -48,6 +57,7 @@ class DramRamulator2 : public Dram {
  private:
   std::vector<std::unique_ptr<Cache>> _m_caches;
   std::vector<std::unique_ptr<Ramulator2>> _mem;
+  cycle_type* _core_cycles;
   int _tx_ch_log2;
   int _tx_log2;
   int _req_size;
diff --git a/PyTorchSimBackend/src/Memfetch.h b/PyTorchSimBackend/src/Memfetch.h
index f40fc781..5eb659cf 100644
--- a/PyTorchSimBackend/src/Memfetch.h
+++ b/PyTorchSimBackend/src/Memfetch.h
@@ -1,6 +1,6 @@
 #ifndef MEM_FETCH_H
 #define MEM_FETCH_H
-
+#include <spdlog/spdlog.h>
 #include "Cache_defs.h"
 
 typedef unsigned long long new_addr_type;
@@ -18,22 +18,18 @@ static const char* mem_access_type_str[] = {
     "L2_CACHE_WA", "L2_CACHE_WB"};
 enum mf_type { READ_REQUEST = 0, WRITE_REQUEST, READ_REPLY, WRITE_ACK };
 
+inline unsigned long long unique_uid = 0;  // C++17 inline variable: one counter program-wide ('static' gave each translation unit its own copy)
+
 class mem_fetch {
  public:
   mem_fetch(new_addr_type addr, mem_access_type acc_type, mf_type type,
-            unsigned data_size, unsigned request_id, unsigned numa_id=-1,
+            unsigned data_size, unsigned numa_id=-1,
             void* custom_data=NULL) :
             m_addr(addr), m_mem_access_type(acc_type),
             m_type(type), m_data_size(data_size),
-            m_request_id(request_id), m_numa_id(numa_id),
-            m_custom_data(custom_data) {}
-  mem_fetch(new_addr_type addr, mem_access_type acc_type, mf_type type,
-            unsigned data_size) : m_addr(addr), m_mem_access_type(acc_type),
-            m_type(type), m_data_size(data_size) {}
-  mem_fetch(new_addr_type addr, mem_access_type acc_type, mf_type type,
-            unsigned data_size, SectorMask sector_mask) :
-            m_addr(addr), m_mem_access_type(acc_type), m_type(type), m_data_size(data_size),
-            m_sector_mask(sector_mask) {}
+            m_numa_id(numa_id), m_custom_data(custom_data) {
+    m_request_id = unique_uid++;
+  }
   mem_fetch(std::deque<mem_fetch*> mfs);  // for wrapping multiple mfs into one
   /* Src & Des */
   void set_core_id(int core_id) {m_core_id = core_id;}
@@ -56,11 +52,21 @@ class mem_fetch {
   bool is_write() { return m_type == mf_type::WRITE_REQUEST || m_type == mf_type::WRITE_ACK; }
   void set_request_id(unsigned id) { m_request_id = id; }
   unsigned get_request_id() { return m_request_id; }
+  void set_access_sector_mask(uint32_t line_size, uint32_t sector_size) { m_sector_mask.set((m_addr % line_size) / sector_size); }  // marks sector index (offset of m_addr within its line / sector_size); both sizes must be non-zero
   SectorMask get_access_sector_mask() { return m_sector_mask; }
   void set_dirty_mask(SectorMask dirty_mask) { m_dirty_mask = dirty_mask; }
   SectorMask get_dirty_mask() { return m_dirty_mask; }
   mem_fetch* get_original_mf() { return m_original_mf; }
   bool is_atomic() { return false; }
+  bool is_request() { return m_type == mf_type::READ_REQUEST || m_type == mf_type::WRITE_REQUEST; }  // true until set_reply() has converted the type
+  void set_reply() {  // convert a request into its matching response type
+    switch (m_type) {
+      case mf_type::READ_REQUEST: m_type = mf_type::READ_REPLY; break;
+      case mf_type::WRITE_REQUEST: m_type = mf_type::WRITE_ACK; break;
+      default:  // already a reply/ack: type is left unchanged and the misuse is logged
+        spdlog::error("Unexpected mf_type in the set_reply");
+    }
+  }
   void set_custom_data(void* custom_data) { m_custom_data = custom_data; }
   void* get_custom_data() { return m_custom_data; }
   /* Stat */
diff --git a/PyTorchSimBackend/src/SimulationConfig.h b/PyTorchSimBackend/src/SimulationConfig.h
index 29296cf7..8069c4d5 100644
--- a/PyTorchSimBackend/src/SimulationConfig.h
+++ b/PyTorchSimBackend/src/SimulationConfig.h
@@ -26,6 +26,10 @@ struct SimulationConfig {
   uint32_t dram_print_interval;
   std::string dram_config_path;
 
+  /* L2 Cache config */
+  std::string l2d_config_str = "S:64:128:16,32,L:R:m:L:L,A:192:4,32:0,32";
+  uint32_t l2d_hit_latency = 1;
+
   /* ICNT config */
   IcntType icnt_type;
   uint32_t icnt_node_per_core = 1;
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index eef67064..278f2db6 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -30,7 +30,7 @@ Simulator::Simulator(SimulationConfig config)
                                        .string();
     spdlog::info("Ramulator2 config: {}", ramulator_config);
     config.dram_config_path = ramulator_config;
-    _dram = std::make_unique<DramRamulator2>(config);
+    _dram = std::make_unique<DramRamulator2>(config, &_core_cycles);
   } else {
     spdlog::error("[Configuration] Invalid DRAM type...!");
     exit(EXIT_FAILURE);
@@ -84,6 +84,7 @@ void Simulator::core_cycle() {
     }
     _cores[core_id]->cycle();
   }
+  _dram->cache_cycle();
   _core_cycles++;
 }
 
diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc
index 89a3f311..cdff9869 100644
--- a/PyTorchSimBackend/src/TMA.cc
+++ b/PyTorchSimBackend/src/TMA.cc
@@ -28,8 +28,7 @@ std::vector<mem_fetch*> TMA::get_memory_access() {
   for (auto addr: addr_set) {
     mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R;
     mf_type type = _current_inst->is_dma_write() ? mf_type::WRITE_REQUEST : mf_type::READ_REQUEST;
-    mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, generate_mem_access_id(),
-      _current_inst->get_numa_id(), static_cast<void*>(_current_inst.get()));
+    mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, _current_inst->get_numa_id(), static_cast<void*>(_current_inst.get()));
     _current_inst->inc_waiting_request();
     access_vec.push_back(access);
   }

From 89c1b379bccf5c2d33f5fb5e445be425d598880d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 10 Feb 2025 04:34:35 +0000
Subject: [PATCH 090/432] [Backendsim] Configuration log fix

---
 PyTorchSimBackend/src/Common.cc          | 4 ++--
 PyTorchSimBackend/src/Interconnect.cc    | 1 -
 PyTorchSimBackend/src/Simulator.cc       | 8 ++++----
 PyTorchSimBackend/src/TileGraphParser.cc | 4 ++--
 4 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index 236a5088..c7aee546 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -78,13 +78,13 @@ SimulationConfig initialize_config(json config) {
       std::string core_partition = "core_" + std::to_string(i);
       uint32_t partition_id = uint32_t(config["partition"][core_partition]);
       parsed_config.partiton_map[i] = partition_id;
-      spdlog::info("CPU {}: Partition {}", i, partition_id);
+      spdlog::info("[Config/Core] CPU {}: Partition {}", i, partition_id);
     }
   } else {
     /* Default: all partition 0 */
     for (int i=0; i<parsed_config.num_cores; i++) {
       parsed_config.partiton_map[i] = 0;
-      spdlog::info("CPU {}: Partition {}", i, 0);
+      spdlog::info("[Config/Core] CPU {}: Partition {}", i, 0);
     }
   }
   return parsed_config;
diff --git a/PyTorchSimBackend/src/Interconnect.cc b/PyTorchSimBackend/src/Interconnect.cc
index 49025c85..8a684ff7 100644
--- a/PyTorchSimBackend/src/Interconnect.cc
+++ b/PyTorchSimBackend/src/Interconnect.cc
@@ -2,7 +2,6 @@
 
 SimpleInterconnect::SimpleInterconnect(SimulationConfig config)
   :  _latency(config.icnt_latency) {
-  spdlog::info("Initialize SimpleInterconnect");
   _cycles = 0;
   _config = config;
   _n_nodes = config.num_cores + config.dram_channels;
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 278f2db6..5f89cead 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -3,11 +3,9 @@
 Simulator::Simulator(SimulationConfig config)
     : _config(config), _core_cycles(0) {
   // Create dram object
-  spdlog::info("Simulator Configuration:");
   for (int i=0; i<config.num_cores;i++)
-    spdlog::info("[Config] Core {}: {} MHz, Spad size: {} KB",
+    spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB",
       i, config.core_freq , config.sram_size);
-  spdlog::info("[Config] DRAM Bandwidth {} GB/s", config.max_dram_bandwidth());
   _core_period = 1000000 / (config.core_freq);
   _icnt_period = 1000000 / (config.icnt_freq);
   _dram_period = 1000000 / (config.dram_freq);
@@ -28,7 +26,7 @@ Simulator::Simulator(SimulationConfig config)
                                        .append("configs")
                                        .append(config.dram_config_path)
                                        .string();
-    spdlog::info("Ramulator2 config: {}", ramulator_config);
+    spdlog::info("[Config/DRAM] Ramulator2 config: {}", ramulator_config);
     config.dram_config_path = ramulator_config;
     _dram = std::make_unique<DramRamulator2>(config, &_core_cycles);
   } else {
@@ -38,8 +36,10 @@ Simulator::Simulator(SimulationConfig config)
 
   // Create interconnect object
   if (config.icnt_type == IcntType::SIMPLE) {
+    spdlog::info("[Config/Interconnect] SimpleInerconnect selected");
     _icnt = std::make_unique<SimpleInterconnect>(config);
   } else if (config.icnt_type == IcntType::BOOKSIM2) {
+    spdlog::info("[Config/Interconnect] BookSim2 selected");
     _icnt = std::make_unique<Booksim2Interconnect>(config);
   } else {
     spdlog::error("[Configuration] {} Invalid interconnect type...!");
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 0813243e..0669fe3e 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -573,7 +573,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
     for (auto it = address_info.begin(); it != address_info.end(); ++it) {
       uint64_t value = it.value();
       _arg_to_address[it.key()] = value;
-      spdlog::info("[TOGParser] Address Attribute key: {} address: 0x{:x}", it.key(), value);
+      spdlog::info("[TOGParser/Attribute] Address Attribute key: {} address: 0x{:x}", it.key(), value);
     }
   }
   if (_attribute_json.contains("address_numa_stride")) {
@@ -583,7 +583,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
       for (auto value : value_list) {
         _arg_numa_stride[it.key()].push_back(value);
       }
-      spdlog::info("[TOGParser] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", "));
+      spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", "));
     }
   }
 

From 3ef28a6ae7058a46fac768bf36e6d15506855550 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 11 Feb 2025 04:39:46 +0000
Subject: [PATCH 091/432] [Backendsim] Make L2 cache selectable

---
 PyTorchSimBackend/src/Cache_stats.cc     |  14 +--
 PyTorchSimBackend/src/Common.cc          |  12 ++
 PyTorchSimBackend/src/Dram.cc            | 140 ++++++++---------------
 PyTorchSimBackend/src/Dram.h             |   9 +-
 PyTorchSimBackend/src/L2Cache.cc         |  98 ++++++++++++++++
 PyTorchSimBackend/src/L2Cache.h          |  51 +++++++++
 PyTorchSimBackend/src/SimulationConfig.h |   5 +-
 PyTorchSimBackend/src/Simulator.cc       |   1 +
 8 files changed, 223 insertions(+), 107 deletions(-)
 create mode 100644 PyTorchSimBackend/src/L2Cache.cc
 create mode 100644 PyTorchSimBackend/src/L2Cache.h

diff --git a/PyTorchSimBackend/src/Cache_stats.cc b/PyTorchSimBackend/src/Cache_stats.cc
index 4d076686..fe6800dd 100644
--- a/PyTorchSimBackend/src/Cache_stats.cc
+++ b/PyTorchSimBackend/src/Cache_stats.cc
@@ -195,13 +195,13 @@ uint64_t CacheStats::get_interval_miss() {
 void CacheStats::print_stats(FILE *out, const char *cache_name) const {
   uint64_t hit = get_hit();
   uint64_t miss = get_miss();
-  fprintf(out, "\tCache Hit : %llu, Cache Miss : %llu, Hit Ratio : %.2f\n", hit,
+  fprintf(out, "\tCache Hit : %lu, Cache Miss : %lu, Hit Ratio : %.2f\n", hit,
           miss, (float)hit / (get_accesses()));
   std::vector<uint32_t> total_access;
   total_access.resize(NUM_MEM_ACCESS_TYPE, 0);
   for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
     for (int status = 0; status < NUM_CACHE_REQUEST_STATUS; status++) {
-      fprintf(out, "\t%s[%s][%s] = %llu\n", cache_name,
+      fprintf(out, "\t%s[%s][%s] = %lu\n", cache_name,
               mem_access_type_str[type], cache_request_status_str[status],
               m_stats[type][status]);
       if (status != RESERVATION_FAIL && status != MSHR_HIT)
@@ -217,7 +217,7 @@ void CacheStats::print_stats(FILE *out, const char *cache_name) const {
 void CacheStats::print_fail_stats(FILE *out, const char *cache_name) const {
   for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
     for (int status = 0; status < NUM_CACHE_REQUEST_STATUS; status++) {
-      fprintf(out, "\t%s[%s][%s] = %llu\n", cache_name,
+      fprintf(out, "\t%s[%s][%s] = %lu\n", cache_name,
               mem_access_type_str[type],
               cache_reservation_fail_reason_str[status],
               m_fail_stats[type][status]);
@@ -226,10 +226,10 @@ void CacheStats::print_fail_stats(FILE *out, const char *cache_name) const {
 }
 
 void CacheStats ::print_energy_stats(FILE *out, const char *cache_name) const {
-  fprintf(out, "%s_RH: %llu\n", cache_name, get_read_hit());
-  fprintf(out, "%s_RM: %llu\n", cache_name, get_read_miss());
-  fprintf(out, "%s_WH: %llu\n", cache_name, get_write_hit());
-  fprintf(out, "%s_WM: %llu\n", cache_name, get_write_miss());
+  fprintf(out, "%s_RH: %lu\n", cache_name, get_read_hit());
+  fprintf(out, "%s_RM: %lu\n", cache_name, get_read_miss());
+  fprintf(out, "%s_WH: %lu\n", cache_name, get_write_hit());
+  fprintf(out, "%s_WM: %lu\n", cache_name, get_write_miss());
 }
 
 bool CacheStats::check_valid(int access_type, int access_outcome) const {
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index c7aee546..7b3e0714 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -47,6 +47,18 @@ SimulationConfig initialize_config(json config) {
     parsed_config.dram_num_partitions = config["dram_num_partitions"];
 
    /* L2D config */
+  if (config.contains("l2d_type")) {
+    if ((std::string)config["l2d_type"] == "nocache")
+      parsed_config.l2d_type = L2CacheType::NOCACHE;
+    else if ((std::string)config["l2d_type"] == "readonly")
+      parsed_config.l2d_type = L2CacheType::READONLY;
+    else
+      throw std::runtime_error(fmt::format("Not implemented l2 cache type {} ",
+                                          (std::string)config["l2d_type"]));
+  } else {
+    parsed_config.l2d_type = L2CacheType::NOCACHE;
+  }
+
   if (config.contains("l2d_config"))
     parsed_config.l2d_config_str = config["l2d_config"];
   if (config.contains("l2d_hit_latency"))
diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc
index a8bb6398..f59d359c 100644
--- a/PyTorchSimBackend/src/Dram.cc
+++ b/PyTorchSimBackend/src/Dram.cc
@@ -11,135 +11,85 @@ uint32_t Dram::get_channel_id(mem_fetch* access) {
   return channel_id;
 }
 
-DramRamulator2::DramRamulator2(SimulationConfig config, cycle_type* core_cycle) {
+Dram::Dram(SimulationConfig config, cycle_type* core_cycle) {
   _core_cycles = core_cycle;
   _n_ch = config.dram_channels;
   _req_size = config.dram_req_size;
   _n_partitions = config.dram_num_partitions;
   _n_ch_per_partition = _n_ch / _n_partitions;
   _config = config;
-  _mem.resize(_n_ch);
 
   spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}", config.max_dram_bandwidth(), config.dram_freq, _n_ch, _req_size);
   /* Initialize DRAM Channels */
   for (int ch = 0; ch < _n_ch; ch++) {
     m_to_crossbar_queue.push_back(std::queue<mem_fetch*>());
     m_from_crossbar_queue.push_back(std::queue<mem_fetch*>());
-    _mem[ch] = std::make_unique<Ramulator2>(
-      ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, 1);
   }
 
   /* Initialize L2 cache */
   _m_caches.resize(_n_ch);
-  _m_cache_config.init(config.l2d_config_str);
-  spdlog::info("[Config/L2] Total Size: {} KB, Partition Size: {} KB, Set: {}, Assoc: {}, Line Size: {}B Sector Size: {}B",
-              _m_cache_config.get_total_size_in_kb() * _n_ch, _m_cache_config.get_total_size_in_kb(),
-              _m_cache_config.get_num_sets(), _m_cache_config.get_num_assoc(),
-              _m_cache_config.get_line_size(), _m_cache_config.get_sector_size());
-  for (int ch = 0; ch < _n_ch; ch++) {
-    m_to_mem_queue.push_back(std::queue<mem_fetch*>());
-    m_cache_latency_queue.push_back(DelayQueue<mem_fetch*>("cache_latency_queue", true, 0));
-    _m_caches[ch] = std::make_unique<ReadOnlyCache>("L2 RO cache", _m_cache_config, ch, 0, &m_to_mem_queue[ch]);
+  if (config.l2d_type == L2CacheType::NOCACHE) {
+    std::string name = "No cache";
+    spdlog::info("[Config/L2Cache] No L2 cache");
+    for (int ch = 0; ch < _n_ch; ch++)
+      _m_caches[ch] = new NoL2Cache(name, _m_cache_config, ch, _core_cycles, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]);
+  } else if (config.l2d_type == L2CacheType::READONLY) {
+    std::string name = "L2 ReadOnly cache";
+    _m_cache_config.init(config.l2d_config_str);
+    spdlog::info("[Config/L2Cache] Total Size: {} KB, Partition Size: {} KB, Set: {}, Assoc: {}, Line Size: {}B Sector Size: {}B",
+            _m_cache_config.get_total_size_in_kb() * _n_ch, _m_cache_config.get_total_size_in_kb(),
+            _m_cache_config.get_num_sets(), _m_cache_config.get_num_assoc(),
+            _m_cache_config.get_line_size(), _m_cache_config.get_sector_size());
+    for (int ch = 0; ch < _n_ch; ch++)
+      _m_caches[ch] = new ReadOnlyL2Cache(name, _m_cache_config, ch, _core_cycles, _config.l2d_hit_latency, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]);
+  } else {
+    spdlog::error("[Config/L2D] Invalid L2 cache type...!");
+    exit(EXIT_FAILURE);
   }
+}
 
+DramRamulator2::DramRamulator2(SimulationConfig config, cycle_type* core_cycle) : Dram(config, core_cycle) {
+  /* Initialize DRAM Channels */
+  _mem.resize(_n_ch);
+  for (int ch = 0; ch < _n_ch; ch++) {
+    _mem[ch] = std::make_unique<Ramulator2>(
+      ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, 1);
+  }
   _tx_log2 = log2(_req_size);
   _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2;
 }
 
 bool DramRamulator2::running() {
+  for (int ch = 0; ch < _n_ch; ch++) {
+    if (mem_fetch* req = _mem[ch]->return_queue_top())
+      return true;
+    if (mem_fetch* req = _m_caches[ch]->top())
+      return true;
+  }
   return false;
 }
 
-void DramRamulator2::cache_cycle() {
-  uint32_t line_size = _m_cache_config.get_line_size();
-  uint32_t sector_size = _m_cache_config.get_sector_size();
-  for (int i = 0; i < _n_ch; i++) {
-    m_cache_latency_queue[i].cycle();
-    // NDP to Cache. Read Only cache
-    if (!m_from_crossbar_queue[i].empty() && !m_from_crossbar_queue[i].front()->is_write() &&
-        _m_caches[i]->data_port_free()) {
-      mem_fetch* req = m_from_crossbar_queue[i].front();
-      req->set_access_sector_mask(line_size, sector_size);
-      std::deque<CacheEvent> events;
-      CacheRequestStatus status = _m_caches[i]->access(
-          req->get_addr(), *_core_cycles, req, events);
-      bool write_sent = CacheEvent::was_write_sent(events);
-      bool read_sent = CacheEvent::was_read_sent(events);
-      if (status == HIT) {
-        if (!write_sent) {
-          req->set_reply();
-          m_cache_latency_queue[i].push(req, _config.l2d_hit_latency);
-        }
-        m_from_crossbar_queue[i].pop();
-      } else if (status != RESERVATION_FAIL) {
-        if (req->is_write() &&
-            (_m_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
-              _m_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
-          req->set_reply();
-          m_cache_latency_queue[i].push(req, _config.l2d_hit_latency);
-        }
-        m_from_crossbar_queue[i].pop();
-      } else {
-        // Status Reservation fail
-        assert(!write_sent);
-        assert(!read_sent);
-      }
-    }
-
-    /* Write request is go mem directly */
-    if(!m_from_crossbar_queue[i].empty() && m_from_crossbar_queue[i].front()->is_write()) {
-      mem_fetch* req = m_from_crossbar_queue[i].front();
-      m_to_mem_queue[i].push(req);
-      m_from_crossbar_queue[i].pop();
-    }
+void DramRamulator2::cycle() {
+  for (int ch = 0; ch < _n_ch; ch++) {
+    _mem[ch]->cycle();
 
-    if (_m_caches[i]->access_ready() &&
-        !m_cache_latency_queue[i].full()) {
-      mem_fetch* req = _m_caches[i]->top_next_access();
-      req->current_state = "L2 top next access";
-      if (req->is_request()) req->set_reply();
-      m_cache_latency_queue[i].push(req, _config.l2d_hit_latency);
-      _m_caches[i]->pop_next_access();
+    // From Cache to DRAM
+    if (mem_fetch* req = _m_caches[ch]->top()) {
+      _mem[ch]->push(req);
+      _m_caches[ch]->pop();
     }
 
-    if (m_cache_latency_queue[i].arrived()) {
-      mem_fetch* req = m_cache_latency_queue[i].top();
-      m_to_crossbar_queue[i].push(req);
-      m_cache_latency_queue[i].pop();
+    // From DRAM to Cache
+    if (mem_fetch* req = _mem[ch]->return_queue_top()) {
+      if(_m_caches[ch]->push(req))
+        _mem[ch]->return_queue_pop();
     }
-    _m_caches[i]->cycle();
   }
 }
 
-void DramRamulator2::cycle() {
+void DramRamulator2::cache_cycle()  {
   for (int ch = 0; ch < _n_ch; ch++) {
-    _mem[ch]->cycle();
-    // From Cache to Ramulator
-    if (!m_to_mem_queue[ch].empty()) {
-      mem_fetch* mf = m_to_mem_queue[ch].front();
-      _mem[ch]->push(mf);
-      m_to_mem_queue[ch].pop();
-    }
-    // From memory response
-    if (_mem[ch]->return_queue_top()) {
-      mem_fetch* req = _mem[ch]->return_queue_top();
-      if (_m_caches[ch]->waiting_for_fill(req)) {
-        if (_m_caches[ch]->fill_port_free()) {
-          _m_caches[ch]->fill(req, *_core_cycles);
-          _mem[ch]->return_queue_pop();
-        }
-      } else {
-        if (req->get_access_type() == L2_CACHE_WB &&
-            req->get_type() == WRITE_ACK) {
-          _mem[ch]->return_queue_pop();
-          delete req;
-        } else if (req->get_access_type() == GLOBAL_ACC_W &&
-          req->get_type() == WRITE_ACK) {
-          m_to_crossbar_queue[ch].push(req);
-          _mem[ch]->return_queue_pop();
-        }
-      }
-    }
+    _m_caches[ch]->cycle();
   }
 }
 
diff --git a/PyTorchSimBackend/src/Dram.h b/PyTorchSimBackend/src/Dram.h
index fc0dd5e1..137a9811 100644
--- a/PyTorchSimBackend/src/Dram.h
+++ b/PyTorchSimBackend/src/Dram.h
@@ -11,9 +11,11 @@
 #include "Hashing.h"
 #include "Cache.h"
 #include "DelayQueue.h"
+#include "L2Cache.h"
 
 class Dram {
  public:
+  Dram(SimulationConfig config, cycle_type* core_cycle);
   virtual ~Dram() = default;
   virtual bool running() = 0;
   virtual void cycle() = 0;
@@ -32,12 +34,14 @@ class Dram {
   uint32_t _n_ch;
   uint32_t _n_partitions;
   uint32_t _n_ch_per_partition;
+  uint32_t _req_size;
   cycle_type _cycles;
-
+  cycle_type* _core_cycles;
   std::vector<DelayQueue<mem_fetch*>> m_cache_latency_queue;
   std::vector<std::queue<mem_fetch*>> m_from_crossbar_queue;
   std::vector<std::queue<mem_fetch*>> m_to_crossbar_queue;
   std::vector<std::queue<mem_fetch*>> m_to_mem_queue;
+  std::vector<L2Cache*> _m_caches;
 };
 
 class DramRamulator2 : public Dram {
@@ -55,12 +59,9 @@ class DramRamulator2 : public Dram {
   virtual void print_stat() override;
 
  private:
-  std::vector<std::unique_ptr<Cache>> _m_caches;
   std::vector<std::unique_ptr<Ramulator2>> _mem;
-  cycle_type* _core_cycles;
   int _tx_ch_log2;
   int _tx_log2;
-  int _req_size;
 };
 
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/L2Cache.cc b/PyTorchSimBackend/src/L2Cache.cc
new file mode 100644
index 00000000..69ec58bc
--- /dev/null
+++ b/PyTorchSimBackend/src/L2Cache.cc
@@ -0,0 +1,98 @@
+#include "L2Cache.h"
+
+bool NoL2Cache::push(mem_fetch* req) {
+  l_to_xbar_queue->push(req);
+  return true;
+}
+void NoL2Cache::cycle() {
+  if (!l_from_xbar_queue->empty()) {
+    mem_fetch* req = l_from_xbar_queue->front();
+    l_to_mem_queue.push(req);
+    l_from_xbar_queue->pop();
+  }
+}
+
+ReadOnlyL2Cache::ReadOnlyL2Cache(std::string name,  CacheConfig &cache_config, uint32_t id, 
+  cycle_type *core_cycle, uint32_t l2d_hit_latency,
+  std::queue<mem_fetch*> *to_xbar_queue, std::queue<mem_fetch*> *from_xbar_queue) :
+  L2Cache(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue) {
+  l_cache = std::make_unique<ReadOnlyCache>(name, cache_config, id, 0, &l_to_mem_queue);
+  l_from_cache_queue = DelayQueue<mem_fetch*>(l_name + "_latency_queue", true, 0);
+}
+
+bool ReadOnlyL2Cache::push(mem_fetch* req) {
+  if (l_cache->waiting_for_fill(req)) {
+    if (!l_cache->fill_port_free())
+      return false;
+    l_cache->fill(req, *l_core_cycle);
+  } else {
+    if (req->get_access_type() == L2_CACHE_WB && req->get_type() == WRITE_ACK) {
+      delete req;
+    } else if (req->get_access_type() == GLOBAL_ACC_W && req->get_type() == WRITE_ACK) {
+      l_to_xbar_queue->push(req);
+    }
+  }
+  return true;
+}
+
+void ReadOnlyL2Cache::cycle() {
+  l_from_cache_queue.cycle();
+  l_cache->cycle();
+
+  // Mem to Cache. Read Only cache
+  uint32_t line_size = l_cache_config.get_line_size();
+  uint32_t sector_size = l_cache_config.get_sector_size();
+
+  /* Read request*/
+  if (!l_from_xbar_queue->empty() && !l_from_xbar_queue->front()->is_write() &&
+        l_cache->data_port_free()) {
+    mem_fetch* req = l_from_xbar_queue->front();
+    req->set_access_sector_mask(line_size, sector_size);
+    std::deque<CacheEvent> events;
+    CacheRequestStatus status = l_cache->access(
+        req->get_addr(), *l_core_cycle, req, events);
+    bool write_sent = CacheEvent::was_write_sent(events);
+    bool read_sent = CacheEvent::was_read_sent(events);
+    if (status == HIT) {
+      if (!write_sent) {
+        req->set_reply();
+        l_from_cache_queue.push(req, l2d_hit_latency);
+      }
+      l_from_xbar_queue->pop();
+    } else if (status != RESERVATION_FAIL) {
+      if (req->is_write() &&
+          (l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
+            l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
+        req->set_reply();
+        l_from_cache_queue.push(req, l2d_hit_latency);
+      }
+      l_from_xbar_queue->pop();
+    } else {
+      // Status Reservation fail
+      assert(!write_sent);
+      assert(!read_sent);
+    }
+  }
+
+  /* Write request is go mem directly */
+  if(!l_from_xbar_queue->empty() && l_from_xbar_queue->front()->is_write()) {
+    mem_fetch* req = l_from_xbar_queue->front();
+    l_to_mem_queue.push(req);
+    l_from_xbar_queue->pop();
+  }
+
+  if (l_cache->access_ready() &&
+      !l_from_cache_queue.full()) {
+    mem_fetch* req = l_cache->top_next_access();
+    req->current_state = "L2 top next access";
+    if (req->is_request()) req->set_reply();
+    l_from_cache_queue.push(req, l2d_hit_latency);
+    l_cache->pop_next_access();
+  }
+
+  if (l_from_cache_queue.arrived()) {
+    mem_fetch* req = l_from_cache_queue.top();
+    l_to_xbar_queue->push(req);
+    l_from_cache_queue.pop();
+  }
+}
diff --git a/PyTorchSimBackend/src/L2Cache.h b/PyTorchSimBackend/src/L2Cache.h
new file mode 100644
index 00000000..f8ca55d1
--- /dev/null
+++ b/PyTorchSimBackend/src/L2Cache.h
@@ -0,0 +1,66 @@
+#ifndef PYTORCHSIM_L2CACHE_H
+#define PYTORCHSIM_L2CACHE_H
+
+#include <memory>
+#include <string>
+#include <queue>
+#include "Memfetch.h"
+#include "Cache.h"
+#include "Instruction.h"
+
+// Per-channel L2 stage between the crossbar queues and DRAM: cycle() consumes
+// requests from the crossbar, top()/pop() expose requests bound for memory,
+// and push() accepts responses coming back from DRAM.
+class L2Cache {
+public:
+  L2Cache(std::string name, CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
+    uint32_t l2d_hit_latency, std::queue<mem_fetch*> *to_xbar_queue,
+    std::queue<mem_fetch*> *from_xbar_queue) :
+    // Initializers are listed in member declaration order.
+    l_core_cycle(core_cycle), l_name(name), l_cache_config(cache_config), l_id(id),
+    l2d_hit_latency(l2d_hit_latency),
+    l_to_xbar_queue(to_xbar_queue), l_from_xbar_queue(from_xbar_queue) {}
+  // Subclasses are created with new and held as L2Cache*, so keep deletion
+  // through the base pointer well defined.
+  virtual ~L2Cache() = default;
+  virtual void cycle()=0;
+  // Push memory response from DRAM
+  virtual bool push(mem_fetch* req)=0;
+  // Pop memory request from Cache
+  void pop() { l_to_mem_queue.pop(); }
+  mem_fetch* top() { return l_to_mem_queue.empty() ? NULL : l_to_mem_queue.front(); }
+
+protected:
+  cycle_type *l_core_cycle;   // Core cycle
+  std::string l_name;         // L2 name
+  CacheConfig l_cache_config; // L2 cache config
+  uint32_t l_id;              // L2 partition id
+  uint32_t l2d_hit_latency;
+  std::queue<mem_fetch*> *l_to_xbar_queue;
+  std::queue<mem_fetch*> *l_from_xbar_queue;
+  std::queue<mem_fetch*> l_to_mem_queue;
+  DelayQueue<mem_fetch*> l_from_cache_queue;
+  std::unique_ptr<Cache> l_cache;
+};
+
+// Pass-through: forwards crossbar requests to memory and DRAM responses to the crossbar.
+class NoL2Cache : public L2Cache {
+public:
+  NoL2Cache(std::string name,  CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
+    std::queue<mem_fetch*> *to_xbar_queue, std::queue<mem_fetch*> *from_xbar_queue) :
+    L2Cache(name, cache_config, id, core_cycle, 0, to_xbar_queue, from_xbar_queue) {}
+  void cycle() override;
+  bool push(mem_fetch* req) override;  // Push memory response from DRAM
+};
+
+// Routes read requests through a ReadOnlyCache; write requests go to memory directly.
+class ReadOnlyL2Cache : public L2Cache {
+public:
+  ReadOnlyL2Cache(std::string name,  CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
+    uint32_t l2d_hit_latency, std::queue<mem_fetch*> *to_xbar_queue,
+    std::queue<mem_fetch*> *from_xbar_queue);
+  void cycle() override;
+  bool push(mem_fetch* req) override;  // Push memory response from DRAM
+};
+
+#endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/SimulationConfig.h b/PyTorchSimBackend/src/SimulationConfig.h
index 8069c4d5..4d6e9c52 100644
--- a/PyTorchSimBackend/src/SimulationConfig.h
+++ b/PyTorchSimBackend/src/SimulationConfig.h
@@ -9,6 +9,8 @@ enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 };
 
 enum class IcntType { SIMPLE, BOOKSIM2 };
 
+enum class L2CacheType { NOCACHE, READONLY };
+
 struct SimulationConfig {
   /* Core config */
   uint32_t num_cores;
@@ -27,7 +29,8 @@ struct SimulationConfig {
   std::string dram_config_path;
 
   /* L2 Cache config */
-  std::string l2d_config_str = "S:64:128:16,32,L:R:m:L:L,A:192:4,32:0,32";
+  L2CacheType l2d_type = L2CacheType::NOCACHE;
+  std::string l2d_config_str;
   uint32_t l2d_hit_latency = 1;
 
   /* ICNT config */
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 5f89cead..41ba3eb8 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -84,6 +84,7 @@ void Simulator::core_cycle() {
     }
     _cores[core_id]->cycle();
   }
+  /* L2 cache */
   _dram->cache_cycle();
   _core_cycles++;
 }

From 0048f9bc66bfbe3daddc34b180110686f8bbaf53 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Tue, 11 Feb 2025 13:28:23 +0000
Subject: [PATCH 092/432] [Backendsim] Make systolic array number configurable
 in a core

---
 PyTorchSimBackend/src/Common.cc          |   1 +
 PyTorchSimBackend/src/Core.cc            | 143 ++++++++++++++++-------
 PyTorchSimBackend/src/Core.h             |  24 ++--
 PyTorchSimBackend/src/SimulationConfig.h |   1 +
 4 files changed, 119 insertions(+), 50 deletions(-)

diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index 7b3e0714..5ee16e94 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -21,6 +21,7 @@ SimulationConfig initialize_config(json config) {
   parsed_config.num_cores = config["num_cores"];
   parsed_config.core_freq = config["core_freq"];
   parsed_config.sram_size = config["sram_size"];
+  parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"];
   parsed_config.core_print_interval = get_config_value<uint32_t>(config, "core_print_interval");
 
   /* DRAM config */
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index f986797b..2a393c51 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -5,10 +5,15 @@ Core::Core(uint32_t id, SimulationConfig config)
       _config(config),
       _core_cycle(0),
       _stat_tma_cycle(0),
+      _num_systolic_array_per_core(config.num_systolic_array_per_core),
       _tma(id, config.dram_req_size) {
   _sram_size = _config.sram_size * 1024;
   _used_sram_size = 0;
-  _compute_pipeline.resize(NR_COMPUTE_UNIT);
+  _sa_compute_pipeline.resize(_num_systolic_array_per_core);
+  _stat_tot_sa_compute_cycle.resize(_num_systolic_array_per_core);
+  _stat_sa_compute_cycle.resize(_num_systolic_array_per_core);
+  _stat_tot_sa_compute_idle_cycle.resize(_num_systolic_array_per_core);
+  _stat_sa_compute_idle_cycle.resize(_num_systolic_array_per_core);
 }
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
@@ -37,30 +42,69 @@ std::shared_ptr<Tile> Core::pop_finished_tile() {
   return result;
 }
 
-void Core::compute_cycle() {
-  for (int i=0; i<NR_COMPUTE_UNIT; i++) {
-    auto& target_pipeline = _compute_pipeline.at(i);
+std::queue<std::shared_ptr<Instruction>>& Core::get_compute_pipeline(int compute_type) {
+  if (compute_type == VECTOR_UNIT)
+    return _vu_compute_pipeline;
+  else if (compute_type == SYSTOLIC_ARRAY) {
+    uint32_t sa_idx = _systolic_array_rr;
+    _systolic_array_rr = (_systolic_array_rr + 1) % _num_systolic_array_per_core;
+    return _sa_compute_pipeline.at(sa_idx);
+  }
+  else {
+    spdlog::error("Undefined compute type");
+    exit(EXIT_FAILURE);
+  }
+}
+
+void Core::vu_cycle() {
+  bool retry = true;
+  while (retry) {
+    if (!_vu_compute_pipeline.empty()) {
+      _stat_vu_compute_cycle++;
+      if(_vu_compute_pipeline.front()->finish_cycle <= _core_cycle) {
+        int bubble = _vu_compute_pipeline.front()->bubble_cycle;
+        _stat_vu_compute_idle_cycle += bubble;
+        _stat_vu_compute_cycle -= bubble;
+        finish_instruction(_vu_compute_pipeline.front());
+        _vu_compute_pipeline.pop();
+      } else {
+        retry = false;
+      }
+    } else {
+      _stat_vu_compute_idle_cycle++;
+      retry = false;
+    }
+  }
+}
+
+void Core::sa_cycle() {
+  for (int i=0; i<_num_systolic_array_per_core; i++) {
     bool retry = true;
     while (retry) {
-      if (!target_pipeline.empty()) {
-        _stat_compute_cycle[i]++;
-        if(target_pipeline.front()->finish_cycle <= _core_cycle) {
-          int bubble = target_pipeline.front()->bubble_cycle;
-          _stat_compute_idle_cycle[i] += bubble;
-          _stat_compute_cycle[i] -= bubble;
-          finish_instruction(target_pipeline.front());
-          target_pipeline.pop();
+      if (!_sa_compute_pipeline.at(i).empty()) {
+        _stat_sa_compute_cycle.at(i)++;
+        if(_sa_compute_pipeline.at(i).front()->finish_cycle <= _core_cycle) {
+          int bubble = _sa_compute_pipeline.at(i).front()->bubble_cycle;
+          _stat_sa_compute_idle_cycle.at(i) += bubble;
+          _stat_sa_compute_cycle.at(i) -= bubble;
+          finish_instruction(_sa_compute_pipeline.at(i).front());
+          _sa_compute_pipeline.at(i).pop();
         } else {
           retry = false;
         }
       } else {
-        _stat_compute_idle_cycle[i]++;
+        _stat_sa_compute_idle_cycle.at(i)++;
         retry = false;
       }
     }
   }
 }
 
+void Core::compute_cycle() {
+  vu_cycle();
+  sa_cycle();
+}
+
 void Core::dma_cycle() {
   /* Check finished dma operation */
   for (int i=0; i<_dma_waiting_queue.size(); i++){
@@ -205,7 +249,7 @@ void Core::cycle() {
           break;
         case Opcode::COMP:
           {
-            auto& target_pipeline = _compute_pipeline.at(inst->get_compute_type());
+            auto& target_pipeline = get_compute_pipeline(inst->get_compute_type());
             if (target_pipeline.empty())
               inst->finish_cycle = _core_cycle + inst->get_compute_cycle();
             else {
@@ -295,8 +339,9 @@ void Core::finish_instruction(std::shared_ptr<Instruction>& inst) {
 bool Core::running() {
   bool running = false;
   running = running || _tiles.size() > 0;
-  for (int i=0; i<NR_COMPUTE_UNIT;i++)
-    running = running || !_compute_pipeline.at(i).empty();
+  running = running || !_vu_compute_pipeline.empty();
+  for (int i=0; i<_num_systolic_array_per_core;i++)
+    running = running || !_sa_compute_pipeline.at(i).empty();
   running = running || !_dma_waiting_queue.empty();
   running = running || !_tma.empty();
   running = running || !_ld_inst_queue.empty();
@@ -325,47 +370,62 @@ bool Core::can_issue_compute(std::shared_ptr<Instruction>& inst) {
 }
 
 void Core::print_stats() {
-  update_stats();
-  spdlog::info(
-      "Core [{}] : MatMul active cycle {} Vector active cycle {} ",
-      _id, _stat_tot_compute_cycle[SYSTOLIC_ARRAY], _stat_tot_compute_cycle[VECTOR_UNIT]);
+  // Fold the still-open interval into the totals before reporting them.
+  update_stats();
+  std::vector<float> sa_utilization;
+  for (int i=0; i<_num_systolic_array_per_core; i++)
+    sa_utilization.push_back(static_cast<float>(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle);
   spdlog::info(
-      "Core [{}] : TMA active cycle {} TMA idle cycle {} Systolic Array idle cycle {} Vector unit idle cycle {}",
-      _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle, _stat_tot_compute_idle_cycle[SYSTOLIC_ARRAY], _stat_compute_idle_cycle[VECTOR_UNIT]);
-  spdlog::info("Core [{}] : Systolic Array Utilization(%) {:.2f}, Vector Unit Utilization(%) {:.2f}, Total cycle: {}",
-    _id, static_cast<float>(_stat_tot_compute_cycle[SYSTOLIC_ARRAY] * 100) / _core_cycle,
-    static_cast<float>(_stat_tot_compute_cycle[VECTOR_UNIT] * 100) / _core_cycle, _core_cycle);
+      "Core [{}] : Vector active cycle {}", _id, _stat_tot_vu_compute_cycle);
+  for (int i=0; i<_num_systolic_array_per_core; i++)
+    spdlog::info("Core [{}] : Systolic array[{}] active cycle {}", _id, i, _stat_tot_sa_compute_cycle.at(i));
+  spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle);
+  spdlog::info("Core [{}] : Vector unit idle cycle {}", _id, _stat_tot_vu_compute_idle_cycle);
+  for (int i=0; i<_num_systolic_array_per_core; i++)
+    spdlog::info("Core [{}] : Systolic Array[{}] idle cycle {}", _id, i, _stat_tot_sa_compute_idle_cycle.at(i));
+  spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}", _id, static_cast<float>(_stat_tot_vu_compute_cycle * 100) / _core_cycle);
+  for (int i=0; i<_num_systolic_array_per_core; i++)
+    spdlog::info("Core [{}] : Systolic Array[{}] Utilization(%) {:.2f}", _id, i, sa_utilization.at(i));
+  spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle);
 }
 
 void Core::print_current_stats() {
+  // Interval counters are normalized by the print interval, matching the vector unit below.
+  std::vector<float> sa_utilization;
+  for (int i=0; i<_num_systolic_array_per_core; i++)
+    sa_utilization.push_back(static_cast<float>(_stat_sa_compute_cycle.at(i) * 100) / _config.core_print_interval);
   auto level = spdlog::level::info;
   if(_id != 0)
     level = spdlog::level::debug;
-  spdlog::log(level,
-      "Core [{}] : MatMul active cycle {} Vector active cycle {} ",
-      _id, _stat_compute_cycle[SYSTOLIC_ARRAY], _stat_compute_cycle[VECTOR_UNIT]);
-  spdlog::log(level,
-      "Core [{}] : TMA active cycle {} TMA idle cycle {} Systolic Array idle cycle {} Vector unit idle cycle {}",
-      _id, _stat_tma_cycle, _stat_tma_idle_cycle, _stat_compute_idle_cycle[SYSTOLIC_ARRAY], _stat_compute_idle_cycle[VECTOR_UNIT]);
-  spdlog::log(level,
-      "Core [{}] : Systolic Array Utilization(%) {:.2f}, Vector Unit Utilization(%) {:.2f}, Total cycle: {}",
-      _id, static_cast<float>(_stat_compute_cycle[SYSTOLIC_ARRAY] * 100) / _config.core_print_interval,
-      static_cast<float>(_stat_compute_cycle[VECTOR_UNIT] * 100) / _config.core_print_interval, _core_cycle);
+  spdlog::log(level, "Core [{}] : Vector active cycle {}", _id, _stat_vu_compute_cycle);
+  for (int i=0; i<_num_systolic_array_per_core; i++)
+    spdlog::log(level, "Core [{}] : Systolic array[{}] active cycle {}", _id, i, _stat_sa_compute_cycle.at(i));
+  spdlog::log(level, "Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tma_cycle, _stat_tma_idle_cycle);
+  spdlog::log(level, "Core [{}] : Vector unit idle cycle {}", _id, _stat_vu_compute_idle_cycle);
+  for (int i=0; i<_num_systolic_array_per_core; i++)
+    spdlog::log(level, "Core [{}] : Systolic Array[{}] idle cycle {}", _id, i, _stat_sa_compute_idle_cycle.at(i));
+  spdlog::log(level, "Core [{}] : Vector Unit Utilization(%) {:.2f}", _id, static_cast<float>(_stat_vu_compute_cycle * 100) / _config.core_print_interval);
+  for (int i=0; i<_num_systolic_array_per_core; i++)
+    spdlog::log(level, "Core [{}] : Systolic Array[{}] Utilization(%) {:.2f}", _id, i, sa_utilization.at(i));
+  spdlog::log(level, "Core [{}] : Total cycle {}", _id, _core_cycle);
   update_stats();
 }
 
 void Core::update_stats() {
-  _stat_tot_compute_cycle[SYSTOLIC_ARRAY] += _stat_compute_cycle[SYSTOLIC_ARRAY];
-  _stat_tot_compute_cycle[VECTOR_UNIT] += _stat_compute_cycle[VECTOR_UNIT];
+  for (int i=0; i<_num_systolic_array_per_core; i++) {
+    _stat_tot_sa_compute_cycle.at(i) += _stat_sa_compute_cycle.at(i);
+    _stat_tot_sa_compute_idle_cycle.at(i) += _stat_sa_compute_idle_cycle.at(i);
+    _stat_sa_compute_cycle.at(i) = 0;
+    _stat_sa_compute_idle_cycle.at(i) = 0;
+  }
+
+  _stat_tot_vu_compute_cycle += _stat_vu_compute_cycle;
+  _stat_tot_vu_compute_idle_cycle += _stat_vu_compute_idle_cycle;
   _stat_tot_tma_cycle += _stat_tma_cycle;
   _stat_tot_tma_idle_cycle += _stat_tma_idle_cycle;
-  _stat_tot_compute_idle_cycle[SYSTOLIC_ARRAY] += _stat_compute_idle_cycle[SYSTOLIC_ARRAY];
-  _stat_compute_idle_cycle[VECTOR_UNIT] += _stat_compute_idle_cycle[VECTOR_UNIT];
 
-  _stat_compute_cycle[SYSTOLIC_ARRAY] = 0;
-  _stat_compute_cycle[VECTOR_UNIT] = 0;
+  _stat_vu_compute_cycle = 0;
   _stat_tma_cycle = 0;
   _stat_tma_idle_cycle = 0;
-  _stat_compute_idle_cycle[SYSTOLIC_ARRAY] = 0;
-  _stat_compute_idle_cycle[VECTOR_UNIT] = 0;
+  _stat_vu_compute_idle_cycle = 0;
 }
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Core.h b/PyTorchSimBackend/src/Core.h
index c6fdb1ab..7af19050 100644
--- a/PyTorchSimBackend/src/Core.h
+++ b/PyTorchSimBackend/src/Core.h
@@ -4,6 +4,7 @@
 #include <memory>
 #include <vector>
 #include <fmt/core.h>
+#include <fmt/ranges.h>
 
 #include "Dram.h"
 #include "Tile.h"
@@ -19,6 +20,8 @@ class Core {
   void issue(std::shared_ptr<Tile> tile);
   std::shared_ptr<Tile> pop_finished_tile();
   void cycle();
+  void vu_cycle();
+  void sa_cycle();
   void compute_cycle();
   void dma_cycle();
   bool has_memory_request();
@@ -28,7 +31,7 @@ class Core {
   void print_stats();
   void print_current_stats();
   void finish_instruction(std::shared_ptr<Instruction>& inst);
-  cycle_type get_compute_cycles() { return _stat_tot_compute_cycle[SYSTOLIC_ARRAY]; }
+  std::queue<std::shared_ptr<Instruction>>& get_compute_pipeline(int compute_type);
   enum {
     VECTOR_UNIT,
     SYSTOLIC_ARRAY,
@@ -40,30 +43,37 @@ class Core {
   void update_stats();
 
   /* Core id & config file */
-  const uint32_t _id;   
+  const uint32_t _id;
   const SimulationConfig _config;
   size_t _sram_size;
   size_t _used_sram_size;
+  uint32_t _num_systolic_array_per_core;
+  uint32_t _systolic_array_rr = 0;
 
   /* TMA Unit */
   TMA _tma;
 
   /* cycle */
   cycle_type _core_cycle;
-  cycle_type _stat_tot_compute_cycle[NR_COMPUTE_UNIT] = {0, };
+  cycle_type _stat_tot_vu_compute_cycle = 0;
+  std::vector<cycle_type> _stat_tot_sa_compute_cycle;
   cycle_type _stat_tot_tma_cycle = 0;
   cycle_type _stat_tot_tma_idle_cycle = 0;
-  cycle_type _stat_tot_compute_idle_cycle[NR_COMPUTE_UNIT] = {0, };
+  cycle_type _stat_tot_vu_compute_idle_cycle = 0;
+  std::vector<cycle_type> _stat_tot_sa_compute_idle_cycle;
 
-  cycle_type _stat_compute_cycle[NR_COMPUTE_UNIT] = {0, };
+  cycle_type _stat_vu_compute_cycle = 0;
+  std::vector<cycle_type> _stat_sa_compute_cycle;
   cycle_type _stat_tma_cycle = 0;
   cycle_type _stat_tma_idle_cycle = 0;
-  cycle_type _stat_compute_idle_cycle[NR_COMPUTE_UNIT] = {0, };
+  cycle_type _stat_vu_compute_idle_cycle = 0;
+  std::vector<cycle_type> _stat_sa_compute_idle_cycle;
 
   std::vector<std::shared_ptr<Tile>> _tiles;
   std::queue<std::shared_ptr<Tile>> _finished_tiles;
 
-  std::vector<std::queue<std::shared_ptr<Instruction>>> _compute_pipeline;
+  std::queue<std::shared_ptr<Instruction>> _vu_compute_pipeline;
+  std::vector<std::queue<std::shared_ptr<Instruction>>> _sa_compute_pipeline;
   std::queue<std::shared_ptr<Instruction>> _ld_inst_queue;
   std::queue<std::shared_ptr<Instruction>> _st_inst_queue;
 
diff --git a/PyTorchSimBackend/src/SimulationConfig.h b/PyTorchSimBackend/src/SimulationConfig.h
index 4d6e9c52..031cd0a7 100644
--- a/PyTorchSimBackend/src/SimulationConfig.h
+++ b/PyTorchSimBackend/src/SimulationConfig.h
@@ -17,6 +17,7 @@ struct SimulationConfig {
   uint32_t core_freq;
   uint32_t sram_size;
   uint32_t core_print_interval = 0;
+  uint32_t num_systolic_array_per_core = 1;
 
   /* DRAM config */
   DramType dram_type;

From 6ce180da193784dff9eb15a9c39fd7c3c9bfd05f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 11 Feb 2025 22:48:18 +0900
Subject: [PATCH 093/432] Update Common.cc

---
 PyTorchSimBackend/src/Common.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index 5ee16e94..d7a37583 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -21,7 +21,8 @@ SimulationConfig initialize_config(json config) {
   parsed_config.num_cores = config["num_cores"];
   parsed_config.core_freq = config["core_freq"];
   parsed_config.sram_size = config["sram_size"];
-  parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"];
+  if (config.contains("num_systolic_array_per_core"))
+    parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"];
   parsed_config.core_print_interval = get_config_value<uint32_t>(config, "core_print_interval");
 
   /* DRAM config */
@@ -101,4 +102,4 @@ SimulationConfig initialize_config(json config) {
     }
   }
   return parsed_config;
-}
\ No newline at end of file
+}

From f6d1c78f66068a066f039c528b30eb42ab5bdd07 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Tue, 11 Feb 2025 14:22:50 +0000
Subject: [PATCH 094/432] [Fix] TOGSim tag mismatch fix

---
 PyTorchSimBackend/src/Core.cc            | 2 +-
 PyTorchSimBackend/src/TileGraphParser.cc | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 2a393c51..3a54b0c6 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -82,7 +82,6 @@ void Core::sa_cycle() {
     bool retry = true;
     while (retry) {
       if (!_sa_compute_pipeline.at(i).empty()) {
-        _stat_sa_compute_cycle.at(i)++;
         if(_sa_compute_pipeline.at(i).front()->finish_cycle <= _core_cycle) {
           int bubble = _sa_compute_pipeline.at(i).front()->bubble_cycle;
           _stat_sa_compute_idle_cycle.at(i) += bubble;
@@ -90,6 +89,7 @@ void Core::sa_cycle() {
           finish_instruction(_sa_compute_pipeline.at(i).front());
           _sa_compute_pipeline.at(i).pop();
         } else {
+          _stat_sa_compute_cycle.at(i)++;
           retry = false;
         }
       } else {
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 0669fe3e..ce39f474 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -308,11 +308,13 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         }
       }
 
-      uint32_t step = std::stoi(tog_parser->getMetaByName("systolic_size"));
+      uint32_t systolic_size = std::stoi(tog_parser->getMetaByName("systolic_size"));
       for (auto loop_idx: tag_idx_list) {
         if (iter.find(loop_idx) == iter.end())
           tag_list.push_back(0);
         else {
+          uint32_t step = (uint32_t)tog_parser->get_loop_step(loop_idx);
+          step = step > systolic_size ? systolic_size : step;
           auto iter_value = getLoopIndexValue(iter, loop_idx) / step;
           tag_list.push_back(iter_value);
         }

From e0e7d9e4d51100a9394f8719c41667b14a28c0b7 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Tue, 11 Feb 2025 14:24:08 +0000
Subject: [PATCH 095/432] [Frontend] CONV sub-tiling support

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 11 +++--
 PyTorchSimFrontend/mlir/mlir_template.py      | 46 ++++++++-----------
 2 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 4e664a32..ede7ada9 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -78,7 +78,7 @@
   %stride_w = arith.constant {{ STRIDE_W }} : index
 
   affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
-    affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }}{
+    affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
       affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
         affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
           %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
@@ -98,10 +98,10 @@
                 %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
                 // Load input matrix
                 memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_I_H }}, {{ TILE_I_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
+                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
                 // Load kernel matrix
                 memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K_H }}, {{ TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
                 affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                   affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
                     affine.for %tile_k_h = 0 to {{ TILE_K_H }} {
@@ -227,6 +227,7 @@ def render(self,
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
         TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
         TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
+        SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1
 
         kernel.loop_size = [K_H, K_W, O_H, O_W, BATCH, O_C, I_C]
 
@@ -258,6 +259,10 @@ def render(self,
             TILE_K_W=TILE_K_W,
             SUB_TILE_M=SUB_TILE_M,
             SUB_TILE_N=SUB_TILE_N,
+            SUB_TILE_I_H=SUB_TILE_I_H,
+            SUB_TILE_I_W=SUB_TILE_I_W,
+            SUB_TILE_K_H=SUB_TILE_K_H,
+            SUB_TILE_K_W=SUB_TILE_K_W,
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
             STRIDE_H=self.stride[0],
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 24714d76..6778e7c8 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -149,37 +149,29 @@ def gemm_combination_mapping(self, M, N, K):
     def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation):
         spad_size = self.spad_info["spad_size"] * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
-        m_pad_factor = self.vector_lane if M > self.vector_lane else 8
-        n_pad_factor = self.vector_lane if N > self.vector_lane else 8
-        k_pad_factor = self.vector_lane if K > self.vector_lane else 8
-        M_padded = ((M + m_pad_factor - 1) // m_pad_factor) * m_pad_factor
-        N_padded = ((N + n_pad_factor - 1) // n_pad_factor) * n_pad_factor
-        K_padded = ((K + k_pad_factor - 1) // k_pad_factor) * k_pad_factor
 
         max_used_spad_size = 0
-        mapping = (self.vector_lane, self.vector_lane, self.vector_lane)
-        tile_M_range = range(self.vector_lane, M_padded + 1, self.vector_lane) if M > self.vector_lane else [M_padded]
-        tile_N_range = range(self.vector_lane, N_padded + 1, self.vector_lane) if N > self.vector_lane else [N_padded]
-        tile_K_range = range(self.vector_lane, K_padded + 1, self.vector_lane) if K > self.vector_lane else [K_padded]
+        M, N, K = self.gemm_combination_mapping(M, N, K)
         for o_h in range(1, O_H + 1):
             for o_w in range(1, O_W + 1):
-                i_h = 1 + (o_h - 1) * stride[0] + (K_H - 1) * dilation[0]
-                i_w = 1 + (o_h - 1) * stride[1] + (K_W - 1) * dilation[1]
-                for tile_M in tile_M_range:
-                    for tile_N in tile_N_range:
-                        for tile_K in tile_K_range:
-                            weight_size = K_W * K_H * tile_K * tile_N
-                            input_size = i_w * i_h * tile_M * tile_K
-                            output_size = o_w * o_h * tile_M * tile_N
-                            used_spad_size = (weight_size + input_size + output_size) * self.precision
-                            if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size:
-                                max_used_spad_size = used_spad_size
-                                mapping = (K_H, K_W, o_h, o_w, tile_M, tile_N, tile_K)
-
-        Outer_M = math.ceil(M_padded / mapping[4])
-        Outer_N = math.ceil(N_padded / mapping[5])
-        Outer_K = math.ceil(K_padded / mapping[6])
-        mapping = (mapping[0], mapping[1], mapping[2], mapping[3], M_padded // Outer_M, N_padded // Outer_N, K_padded // Outer_K)
+                for k_h in range(1, K_H + 1):
+                    for k_w in range(1, K_W + 1):
+                        i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]
+                        i_w = 1 + (o_h - 1) * stride[1] + (k_w - 1) * dilation[1]
+                        weight_size = k_w * k_h * K * N
+                        input_size = i_w * i_h * M * K
+                        output_size = o_w * o_h * M * N
+                        used_spad_size = (weight_size + input_size + output_size) * self.precision
+                        if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size:
+                            max_used_spad_size = used_spad_size
+                            mapping = (k_h, k_w, o_h, o_w, M, N, K)
+        if max_used_spad_size == 0:
+            raise RuntimeError("Cannot find a valid mapping")
+        Outer_K_H = math.ceil(K_H / mapping[0])
+        Outer_K_W = math.ceil(K_W / mapping[1])
+        Outer_O_H = math.ceil(O_H / mapping[2])
+        Outer_O_W = math.ceil(O_W / mapping[3])
+        mapping = (math.ceil(K_H / Outer_K_H), math.ceil(K_W / Outer_K_W), math.ceil(O_H / Outer_O_H), math.ceil(O_W / Outer_O_W), M, N, K)
         return mapping
 
     def meta_kernel(self):

From c220a0c68c6c39ce4996420508994d5addf4201e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 12 Feb 2025 04:19:14 +0000
Subject: [PATCH 096/432] [Backendsim] Skip instruction generation for
 independent loop

---
 PyTorchSimBackend/src/TileGraphParser.cc | 26 ++++++++++++++++++++++++
 PyTorchSimBackend/src/TileGraphParser.h  |  5 +++++
 2 files changed, 31 insertions(+)

diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index ce39f474..bb8a65bc 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -328,6 +328,11 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         int stride_idx = calculateAddress(outer_loop_size, tog_parser->lookupNumaInfo(base_addr_name));
         numa_id = total_idx / stride_idx;
       }
+      /* Check need to make this memory node */
+      std::vector<int> key = tog_parser->calc_tag(accum_tag_list, tag_list, tag_stride_list);
+      if (tog_parser->check_memory_tag(base_addr_name, key))
+        continue;
+      tog_parser->register_memory_tag(base_addr_name, key);
 
       printIndexMap("[TOGParser] Load Node " + mem_node->get_base_addr_name() + " Numa_id: " + std::to_string(numa_id), iter);
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
@@ -674,6 +679,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
     auto indices = iter.get_indices();
     for (auto loop : _loop_nodes.at(last_outer_idx)) {
       std::shared_ptr<TileLoopNode> outer_loop = std::static_pointer_cast<TileLoopNode>(loop);
+      this->clear_tag_table(); // Clear tag table for each inner loop
       std::vector<std::shared_ptr<Tile>> sub_tiles = outer_loop->get_tiles_from_iter(this, indices);
 
       /* insert tiles to subgraph */
@@ -728,6 +734,26 @@ void TileGraphParser::register_tile(std::shared_ptr<TileNode> tile_node) {
   }
 }
 
+std::vector<int> TileGraphParser::calc_tag(std::vector<int>& accum_tag, std::vector<int>& tag_idx, std::vector<int>& tag_stride) {
+  int key_offset = 0;
+  std::vector<int> tag_key;
+  for (int i=0; i<tag_idx.size(); i++)
+    key_offset += tag_idx.at(i) * tag_stride.at(i);
+  for (auto accum_dim : accum_tag)
+    tag_key.push_back(accum_dim);
+  tag_key.push_back(key_offset);
+  return tag_key;
+}
+
+void TileGraphParser::register_memory_tag(std::string name, std::vector<int>& tag_key) {
+  assert(_tag_table.find(std::make_pair(name, tag_key))==_tag_table.end());
+  _tag_table[std::make_pair(name, tag_key)] = true;
+}
+
+bool TileGraphParser::check_memory_tag(std::string name, std::vector<int>& tag_key) {
+  return _tag_table.find(std::make_pair(name, tag_key))==_tag_table.end() ? false : true;
+}
+
 std::shared_ptr<TileNode> TileGraphParser::get_top_loop() {
   if (_loop_nodes.empty())
     return nullptr;
diff --git a/PyTorchSimBackend/src/TileGraphParser.h b/PyTorchSimBackend/src/TileGraphParser.h
index b1f3a283..ebbc6aa3 100644
--- a/PyTorchSimBackend/src/TileGraphParser.h
+++ b/PyTorchSimBackend/src/TileGraphParser.h
@@ -74,6 +74,10 @@ class TileGraphParser {
   int getCoreIdFromJson(const json& attribute_json, int subgraph_id);
   std::string getMetaByName(std::string key) { return _tog_meta[key]; }
   const json& get_attribute_file() { return _attribute_json; }
+  std::vector<int> calc_tag(std::vector<int>& accum_tag, std::vector<int>& tag_idx, std::vector<int>& tag_stride);
+  void register_memory_tag(std::string name, std::vector<int>& tag_key);
+  bool check_memory_tag(std::string name, std::vector<int>& tag_key);
+  void clear_tag_table() { _tag_table.clear(); }
  private:
   void register_tile(std::shared_ptr<TileNode> tile_node);
   void _tile_generate() {}
@@ -91,6 +95,7 @@ class TileGraphParser {
   std::map<std::string, std::vector<uint32_t>> _arg_numa_stride;
   std::map<std::string, std::tuple<int, int, LoopType>> _loop_size_map;
   std::map<std::string, std::string> _tog_meta;
+  std::map<std::pair<std::string, std::vector<int>>, bool> _tag_table;
 };
 
 class TileComputeNode : public TileNode {

From e4a1ae41190a796782d3780b0597c87240f7d7d3 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Thu, 13 Feb 2025 10:03:51 +0000
Subject: [PATCH 097/432] [Frontend] Convolution Optimization

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 20 +++++++-------
 PyTorchSimFrontend/mlir/mlir_template.py      | 26 ++++++++++---------
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index ede7ada9..89f48876 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -102,14 +102,14 @@
                 // Load kernel matrix
                 memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
                     : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
-                affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
-                  affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
-                    affine.for %tile_k_h = 0 to {{ TILE_K_H }} {
-                      affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
+                  affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                    affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
+                      affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
                         %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                         %tile_i_w = affine.apply #map_I_W(%tile_o_w, %tile_k_w)
-                        %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
                         %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_i_w)
+                        %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
                         %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
                         %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
                         %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
@@ -239,13 +239,13 @@ def render(self,
             KERNEL_NAME=self.name,
             KERNEL_DEF=self.def_kernel(),
             kernel=kernel,
-            BATCH=X.layout.size[0],
-            I_C=X.layout.size[1],
+            BATCH=BATCH,
+            I_C=I_C,
             I_H=X.layout.size[2],
             I_W=X.layout.size[3],
-            O_C=W.layout.size[0],
-            K_H=W.layout.size[2],
-            K_W=W.layout.size[3],
+            O_C=O_C,
+            K_H=K_H,
+            K_W=K_W,
             O_H=O_H,
             O_W=O_W,
             TILE_M=TILE_M,
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 6778e7c8..48e1d1e9 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -147,31 +147,33 @@ def gemm_combination_mapping(self, M, N, K):
         return mapping
 
     def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation):
-        spad_size = self.spad_info["spad_size"] * self.vector_lane
+        spad_size_per_lane = self.spad_info["spad_size"]
+        spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
 
         max_used_spad_size = 0
         M, N, K = self.gemm_combination_mapping(M, N, K)
-        for o_h in range(1, O_H + 1):
-            for o_w in range(1, O_W + 1):
-                for k_h in range(1, K_H + 1):
-                    for k_w in range(1, K_W + 1):
+        max_k_h_w = 1 # maximize kernel size
+        for o_h in sympy.divisors(O_H):
+            for o_w in sympy.divisors(O_W):
+                for k_h in sympy.divisors(K_H):
+                    for k_w in sympy.divisors(K_W):
                         i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]
-                        i_w = 1 + (o_h - 1) * stride[1] + (k_w - 1) * dilation[1]
+                        i_w = 1 + (o_w - 1) * stride[1] + (k_w - 1) * dilation[1]
                         weight_size = k_w * k_h * K * N
                         input_size = i_w * i_h * M * K
                         output_size = o_w * o_h * M * N
                         used_spad_size = (weight_size + input_size + output_size) * self.precision
-                        if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size:
+                        weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
+                        input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
+                        output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M, N)
+                        used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                        if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < spad_size_per_lane and max_k_h_w <= k_h * k_w:
                             max_used_spad_size = used_spad_size
+                            max_k_h_w = k_h * k_w
                             mapping = (k_h, k_w, o_h, o_w, M, N, K)
         if max_used_spad_size == 0:
             raise RuntimeError("Cannot find a valid mapping")
-        Outer_K_H = math.ceil(K_H / mapping[0])
-        Outer_K_W = math.ceil(K_W / mapping[1])
-        Outer_O_H = math.ceil(O_H / mapping[2])
-        Outer_O_W = math.ceil(O_W / mapping[3])
-        mapping = (math.ceil(K_H / Outer_K_H), math.ceil(K_W / Outer_K_W), math.ceil(O_H / Outer_O_H), math.ceil(O_W / Outer_O_W), M, N, K)
         return mapping
 
     def meta_kernel(self):

From eea5904e23bf4d3d28aadf2e0d73bac0f03c2f2c Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Thu, 13 Feb 2025 10:05:03 +0000
Subject: [PATCH 098/432] [TOGSim] Instruction stats

---
 PyTorchSimBackend/src/Core.cc       | 18 +++++++++++++++++-
 PyTorchSimBackend/src/Core.h        |  3 +++
 PyTorchSimBackend/src/Instruction.h |  2 +-
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 3a54b0c6..fbc7c8a6 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -14,6 +14,8 @@ Core::Core(uint32_t id, SimulationConfig config)
   _stat_sa_compute_cycle.resize(_num_systolic_array_per_core);
   _stat_tot_sa_compute_idle_cycle.resize(_num_systolic_array_per_core);
   _stat_sa_compute_idle_cycle.resize(_num_systolic_array_per_core);
+  _stat_tot_sa_inst.resize(_num_systolic_array_per_core);
+  _stat_tot_sa_inst.resize(static_cast<size_t>(Opcode::COUNT), 0);
 }
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
@@ -228,6 +230,7 @@ void Core::cycle() {
                             fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
                             fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
               issued = true;
+              _stat_skip_dma++;
               break;
             } else {
               spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle,
@@ -262,6 +265,9 @@ void Core::cycle() {
                           opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle);
             target_pipeline.push(inst);
             issued = true;
+            if (inst->get_compute_type()) {
+              _stat_gemm_inst++;
+            }
           }
           break;
         case Opcode::BAR:
@@ -273,6 +279,8 @@ void Core::cycle() {
             } else {
               _tma.register_tag_waiter(inst->subgraph_id, key, inst);
             }
+            spdlog::trace("[Core {}][{}] {} ISSUED", _id, _core_cycle,
+                          opcode_to_string(inst->get_opcode()));
             issued = true;
           }
           break;
@@ -282,6 +290,7 @@ void Core::cycle() {
       }
 
       if (issued) {
+        _stat_tot_sa_inst.at(static_cast<size_t>(inst->get_opcode()))++;
         auto it = instructions.begin() + j; // Position 2 is the third element
         instructions.erase(it);
         break;
@@ -371,6 +380,13 @@ bool Core::can_issue_compute(std::shared_ptr<Instruction>& inst) {
 
 void Core::print_stats() {
   std::vector<float> sa_utilization;
+  for (int i=0; i < static_cast<size_t>(Opcode::COUNT); i++) {
+    if (i == static_cast<size_t>(Opcode::COMP))
+      spdlog::info("Core [{}] : {} inst count {} (GEMM: {}, Vector: {})", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_tot_sa_inst.at(i), _stat_gemm_inst, _stat_tot_sa_inst.at(i) - _stat_gemm_inst);
+    else
+      spdlog::info("Core [{}] : {} inst count {}", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_tot_sa_inst.at(i));
+  }
+  spdlog::trace("Core [{}] : SKipped MOVIN inst count {}", _id, _stat_skip_dma);
   for (int i=0; i<_num_systolic_array_per_core; i++)
     sa_utilization.push_back(static_cast<float>(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle);
   spdlog::info(
@@ -378,7 +394,7 @@ void Core::print_stats() {
   for (int i=0; i<_num_systolic_array_per_core; i++)
     spdlog::info("Core [{}] : Systolic array[{}] active cycle {}", _id, i, _stat_tot_sa_compute_cycle.at(i));
   spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle);
-  spdlog::info("Core[{}] : Vector unit idle cycle {}", _id, _stat_vu_compute_idle_cycle);
+  spdlog::info("Core [{}] : Vector unit idle cycle {}", _id, _stat_vu_compute_idle_cycle);
   for (int i=0; i<_num_systolic_array_per_core; i++)
     spdlog::info("Core [{}] : Systolic Array[{}] idle cycle [{}]", _id, i, _stat_tot_sa_compute_cycle.at(i));
   spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}", _id, static_cast<float>(_stat_tot_vu_compute_cycle * 100) / _core_cycle);
diff --git a/PyTorchSimBackend/src/Core.h b/PyTorchSimBackend/src/Core.h
index 7af19050..0babeb6f 100644
--- a/PyTorchSimBackend/src/Core.h
+++ b/PyTorchSimBackend/src/Core.h
@@ -61,6 +61,9 @@ class Core {
   cycle_type _stat_tot_tma_idle_cycle = 0;
   cycle_type _stat_tot_vu_compute_idle_cycle = 0;
   std::vector<cycle_type> _stat_tot_sa_compute_idle_cycle;
+  std::vector<uint64_t> _stat_tot_sa_inst;
+  uint64_t _stat_gemm_inst = 0;
+  uint64_t _stat_skip_dma = 0;
 
   cycle_type _stat_vu_compute_cycle = 0;
   std::vector<cycle_type> _stat_sa_compute_cycle;
diff --git a/PyTorchSimBackend/src/Instruction.h b/PyTorchSimBackend/src/Instruction.h
index 014ff41e..acf416b0 100644
--- a/PyTorchSimBackend/src/Instruction.h
+++ b/PyTorchSimBackend/src/Instruction.h
@@ -11,7 +11,7 @@
 #include <memory>
 #include <vector>
 
-enum class Opcode { MOVIN, MOVOUT, COMP, BAR};
+enum class Opcode { MOVIN, MOVOUT, COMP, BAR, COUNT};
 
 typedef uint64_t addr_type;
 typedef uint64_t cycle_type;

From f981abf2c50668dd2cc29177ecc7ad73c4cbd4ce Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Thu, 13 Feb 2025 10:05:52 +0000
Subject: [PATCH 099/432] [Fix] Tile Latency Bug Fix

---
 PyTorchSimFrontend/extension_codecache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index d1e7671b..ee4e8c17 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -201,8 +201,8 @@ def load(cls, source_code,
 
         # Create TOG
         offset = vectorlane_size
-        if kwargs['loop_size'] is not None and kwargs['loop_size'][0] < vectorlane_size:
-            offset = kwargs['loop_size'][0]
+        if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
+            offset = kwargs['loop_size'][-3]
         tile_graph_generator = tog_generator(origins)
         tile_graph_generator.load_file(raw_tog_path)
         tile_graph_generator.generate_tile_graph(

From bf6e44ff3a8a6da6ecd4168fc91e89dbebe8905c Mon Sep 17 00:00:00 2001
From: Yunseon Shin <ysshin@postech.ac.kr>
Date: Fri, 14 Feb 2025 15:09:33 +0000
Subject: [PATCH 100/432] [Frontend] Multi-tile CONV

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 118 +++++++++++++++++-
 PyTorchSimFrontend/mlir/mlir_template.py      |  33 ++++-
 2 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 89f48876..33d7619f 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -49,7 +49,7 @@
 #offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W * TILE_M, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
 #offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
 
-memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 
@@ -73,7 +73,6 @@
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-  %K_W = arith.constant {{ K_W }} : index
   %stride_h = arith.constant {{ STRIDE_H }} : index
   %stride_w = arith.constant {{ STRIDE_W }} : index
 
@@ -134,6 +133,100 @@
 }
 """
 
+MULTI_TILE_CONV_TEMPLATE = r"""
+#map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ I_C }} + d2 * {{ I_C * (I_W + 2 * PADDING_W) }} + d3)> // input (I_H, BATCH, I_W, I_C)
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
+#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
+#map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
+#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(1 * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
+#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W * TILE_M, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
+#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
+
+memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+
+func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
+  %c_mvin = arith.constant 2 : index
+  %c_mvin2 = arith.constant 1 : index
+  %c_mvin3 = arith.constant 14 : index
+  %c_mvout = arith.constant 3 : index
+  %vstride = arith.constant 1 : index
+  %input_axis = arith.constant 3 : index
+  %weight_axis = arith.constant 2 : index
+  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+  %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+  %output_buffer = memref.get_global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+  %tag = memref.alloc() : memref<1xi32>
+  %tag0 = memref.alloc() : memref<1xi32>
+  %tag1 = memref.alloc() : memref<1xi32>
+  %tag2 = memref.alloc() : memref<1xi32>
+  %tag3 = memref.alloc() : memref<1xi32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %stride_h = arith.constant {{ STRIDE_H }} : index
+  %stride_w = arith.constant {{ STRIDE_W }} : index
+
+  affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
+    affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
+      affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
+        affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
+          %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
+          // Initialize output
+          {%- if BIAS %}
+          memref.dma_start %Bias[%tile_n], %output_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride
+              : memref<{{ O_C }}xf32>, memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_O_H }}, {{ TILE_O_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+          {%- else %}
+          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
+          {%- endif %}
+          affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
+            affine.for %k_w = 0 to {{ 1 }} step {{ 1 }} {
+              affine.for %tile_k = 0 to {{ TILE_K }} step {{ TILE_K }} {
+                %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
+                %index_i_w = affine.apply #map_I_W(%o_w, %k_w)
+                %index1 = affine.apply #map1(%index_i_h, %index_i_w, %tile_m, %tile_k) // input index
+                %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
+                // Load input matrix
+                memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
+                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
+                // Load kernel matrix
+                memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
+                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+                affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
+                  affine.for %tile_k_w = 0 to 1 {
+                    affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
+                      affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
+                        %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
+                        %tile_i_w = affine.apply #map_I_W(%tile_o_w, %tile_k_w)
+                        %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_i_w)
+                        %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                        %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
+                        %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                        %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                        %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                        linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                              outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                      } { inner_loop=true }
+                    } { inner_loop=true }
+                  } { inner_loop=true }
+                } { inner_loop=true }
+              } { accumulation_loop=true }
+            } { accumulation_loop=true }
+          } { accumulation_loop=true }
+          // Store output matrix
+          memref.dma_start %output_buffer[%c0, %c0, %c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
+              : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+        } { outer_loop=true }
+      } { outer_loop=true }
+    } { outer_loop=true }
+  } { outer_loop=true }
+  return
+}
+"""
+
 WRAPPER_TEMPLATE = r"""
 def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, {{ OUT }}):
     # Padding input
@@ -144,7 +237,11 @@ def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif
     {{ INPUT }}_padding[:, :, {{ PADDING_H }}:{{ INPUT }}.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:{{ INPUT }}.shape[3] + {{ PADDING_W }}] = {{ INPUT }}
 
     # Tanspose tensors
+    {%- if MULTI_TILE %}
+    t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 0, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, BATCH, I_W, I_C)
+    {% else %}
     t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
+    {% endif -%}
     t_{{ WEIGHT }} = {{ WEIGHT }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
     t_{{ OUT }} = {{ OUT }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
 
@@ -160,8 +257,8 @@ def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
         self.stride = kwargs["stride"]
         self.padding = kwargs["padding"]
         self.dilation = kwargs["dilation"]
-        weight_shape = [str(i) for i in input_nodes[1].layout.size]
-        self.function_name = "Conv2D_" + "_".join(weight_shape)+ "_" \
+        self.weight_shape = [str(i) for i in input_nodes[1].layout.size]
+        self.function_name = "Conv2D_" + "_".join(self.weight_shape)+ "_" \
             + "_".join([str(i) for i in self.stride]) \
             + "_" + "_".join([str(i) for i in self.padding]) \
             + "_" + "_".join([str(i) for i in self.dilation])
@@ -176,6 +273,10 @@ def is_transposed(self, node):
                   raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
         return False
 
+    def is_multi_tile(self, I_C):
+        return False
+        return I_C < 16 # 16 is hard-coded for now. This should be changed to a better heuristic.
+
     # Can use math.multi ?
     def def_kernel(self) ->str:
         X, W = self.input_nodes[0], self.input_nodes[1]
@@ -228,6 +329,12 @@ def render(self,
         TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
         TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
         SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1
+        conv_template = CONV_TEMPLATE
+        if self.is_multi_tile(I_C):
+          conv_template = MULTI_TILE_CONV_TEMPLATE
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation)
+          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
+          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
 
         kernel.loop_size = [K_H, K_W, O_H, O_W, BATCH, O_C, I_C]
 
@@ -273,7 +380,7 @@ def render(self,
             DATA_SIZE=4,
             BIAS=Bias
         )
-        code = self._template_from_string(CONV_TEMPLATE).render(**kernel.render_options)
+        code = self._template_from_string(conv_template).render(**kernel.render_options)
 
         self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
         self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K_W * TILE_K_H * TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
@@ -297,6 +404,7 @@ def outer_func_render(self, kernel_name, input_args):
             OUT=input_args[3] if len(input_args) == 4 else input_args[2],
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
+            MULTI_TILE=self.is_multi_tile(int(self.weight_shape[1])),
             VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
             BACKENDSIM_EAGER_MODE=extension_config.CONFIG_BACKENDSIM_EAGER_MODE,
             HASH_VALUE=self.hash_value
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 48e1d1e9..9c45d6be 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -150,6 +150,7 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
+        max_spad_per_lane = spad_size_per_lane // 2 # double buffer
 
         max_used_spad_size = 0
         M, N, K = self.gemm_combination_mapping(M, N, K)
@@ -168,7 +169,7 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
                         input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
                         output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M, N)
                         used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
-                        if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < spad_size_per_lane and max_k_h_w <= k_h * k_w:
+                        if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w:
                             max_used_spad_size = used_spad_size
                             max_k_h_w = k_h * k_w
                             mapping = (k_h, k_w, o_h, o_w, M, N, K)
@@ -176,6 +177,36 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
             raise RuntimeError("Cannot find a valid mapping")
         return mapping
 
+    def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation):
+        spad_size_per_lane = self.spad_info["spad_size"]
+        spad_size = spad_size_per_lane * self.vector_lane
+        max_spad_size = spad_size // 2
+        max_spad_per_lane = spad_size_per_lane // 2
+
+        max_used_spad_size = 0
+        M, N, K = self.gemm_combination_mapping(M, N, K * K_W)
+        max_k_h_w = K_W
+        for o_h in sympy.divisors(O_H):
+            for o_w in sympy.divisors(O_W):
+                for k_h in sympy.divisors(K_H):
+                    i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]
+                    i_w = 1 + (o_w - 1) * stride[1] + (K_W - 1) * dilation[1]
+                    weight_size = 1 * k_h * K * N
+                    input_size = i_w * i_h * M * K
+                    output_size = o_w * o_h * M * N
+                    used_spad_size = (weight_size + input_size + output_size) * self.precision
+                    weight_size_per_lane = self.get_spad_size_per_lane(1 * k_h * K, N)
+                    input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
+                    output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M, N)
+                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h:
+                        max_used_spad_size = used_spad_size
+                        max_k_h_w = k_h
+                        mapping = (k_h, K_W, o_h, o_w, M, N, K)
+        if max_used_spad_size == 0:
+            raise RuntimeError("Cannot find a valid mapping")
+        return mapping
+
     def meta_kernel(self):
         wrapper = V.graph.wrapper_code
         arg_attributes = self.kernel_arg_attributes

From d25aa021619060d83f1a7fe65cbf7055b7cfa252 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Sat, 15 Feb 2025 07:54:32 +0000
Subject: [PATCH 101/432] [Backendsim] TPUv3 HBM2 config

---
 .../ramulator2_configs/HBM2_TPUv3.yaml        | 25 ++++++++++++++++
 ...stolic_ws_128x128_c1_simple_noc_tpuv3.json | 30 +++++++++++++++++++
 ...stolic_ws_128x128_c2_simple_noc_tpuv3.json | 30 +++++++++++++++++++
 PyTorchSimBackend/extern/ramulator2           |  2 +-
 4 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json

diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml b/PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml
new file mode 100644
index 00000000..e6543d14
--- /dev/null
+++ b/PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml
@@ -0,0 +1,25 @@
+Frontend:
+  impl: GEM5
+
+MemorySystem:
+  impl: GenericDRAM
+  clock_ratio: 1
+
+  DRAM:
+    impl: HBM2
+    org:
+      preset: HBM2_8Gb
+      channel: 1
+    timing:
+      preset: HBM2_1.8Gbps
+
+  Controller:
+    impl: Generic
+    Scheduler:
+      impl: FRFCFS
+    RefreshManager:
+      impl: AllBank
+    plugins:
+
+  AddrMapper:
+    impl: RoBaRaCoCh
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
new file mode 100644
index 00000000..64c19a1d
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
@@ -0,0 +1,30 @@
+{
+  "num_cores" : 1,
+  "core_freq" : 940,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 2,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" : 940,
+  "dram_channels": 32,
+  "dram_req_size": 32,
+  "dram_latency" : 10,
+  "dram_size" : 32,
+  "dram_nbl" : 1,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
+
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 7000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
+
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 2,
+  "partition": {
+    "core_0":0,
+    "core_1":0
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
new file mode 100644
index 00000000..21f75c0e
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
@@ -0,0 +1,30 @@
+{
+  "num_cores" : 2,
+  "core_freq" : 940,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 2,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" : 940,
+  "dram_channels": 32,
+  "dram_req_size": 32,
+  "dram_latency" : 10,
+  "dram_size" : 32,
+  "dram_nbl" : 1,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
+
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 7000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
+
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 2,
+  "partition": {
+    "core_0":0,
+    "core_1":0
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/extern/ramulator2 b/PyTorchSimBackend/extern/ramulator2
index 2ea90841..00efad33 160000
--- a/PyTorchSimBackend/extern/ramulator2
+++ b/PyTorchSimBackend/extern/ramulator2
@@ -1 +1 @@
-Subproject commit 2ea9084120eb389846326bd765d55f5e64632b7d
+Subproject commit 00efad33121408dcd3443465835649b120080395

From 1b490dcc4f1bf5d5e2ebafd3fc22866a37afc1bf Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 15 Feb 2025 15:11:33 +0000
Subject: [PATCH 102/432] [Backendsim] minor issue fix for accessing cache stat

---
 .gitmodules                          | 3 +++
 PyTorchSimBackend/extern/stonneCore  | 1 +
 PyTorchSimBackend/src/Cache_stats.cc | 4 ++--
 3 files changed, 6 insertions(+), 2 deletions(-)
 create mode 160000 PyTorchSimBackend/extern/stonneCore

diff --git a/.gitmodules b/.gitmodules
index 831a8746..f65e5f2b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,3 +13,6 @@
 [submodule "PyTorchSimBackend/extern/ramulator2"]
 	path = PyTorchSimBackend/extern/ramulator2
 	url = https://github.com/PSAL-POSTECH/ramulator2
+[submodule "PyTorchSimBackend/extern/stonneCore"]
+	path = PyTorchSimBackend/extern/stonneCore
+	url = https://github.com/PSAL-POSTECH/stonne_core.git
diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
new file mode 160000
index 00000000..80a20d68
--- /dev/null
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -0,0 +1 @@
+Subproject commit 80a20d68eb613b12b8361d215ada2bec7fb9cce9
diff --git a/PyTorchSimBackend/src/Cache_stats.cc b/PyTorchSimBackend/src/Cache_stats.cc
index fe6800dd..eacd0c2f 100644
--- a/PyTorchSimBackend/src/Cache_stats.cc
+++ b/PyTorchSimBackend/src/Cache_stats.cc
@@ -6,7 +6,7 @@ CacheStats::CacheStats() {
   m_fail_stats.resize(NUM_MEM_ACCESS_TYPE);
   for (int i = 0; i < NUM_MEM_ACCESS_TYPE; i++) {
     m_stats[i].resize(NUM_CACHE_REQUEST_STATUS, 0);
-    m_fail_stats[i].resize(NUM_CACHE_REQUEST_STATUS, 0);
+    m_fail_stats[i].resize(NUM_CACHE_RESERVATION_FAIL_REASON, 0);
   }
   m_cache_port_available_cycles = 0;
   m_cache_data_port_busy_cycles = 0;
@@ -216,7 +216,7 @@ void CacheStats::print_stats(FILE *out, const char *cache_name) const {
 
 void CacheStats::print_fail_stats(FILE *out, const char *cache_name) const {
   for (int type = 0; type < NUM_MEM_ACCESS_TYPE; type++) {
-    for (int status = 0; status < NUM_CACHE_REQUEST_STATUS; status++) {
+    for (int status = 0; status < NUM_CACHE_RESERVATION_FAIL_REASON; status++) {
       fprintf(out, "\t%s[%s][%s] = %lu\n", cache_name,
               mem_access_type_str[type],
               cache_reservation_fail_reason_str[status],

From cf832f6ff09aaeeeee05273ea7312420ea0d5835 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 15 Feb 2025 15:12:41 +0000
Subject: [PATCH 103/432] [Backendsim] change internal functions type in core

---
 PyTorchSimBackend/src/Core.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimBackend/src/Core.h b/PyTorchSimBackend/src/Core.h
index 0babeb6f..00ab8f4a 100644
--- a/PyTorchSimBackend/src/Core.h
+++ b/PyTorchSimBackend/src/Core.h
@@ -20,10 +20,6 @@ class Core {
   void issue(std::shared_ptr<Tile> tile);
   std::shared_ptr<Tile> pop_finished_tile();
   void cycle();
-  void vu_cycle();
-  void sa_cycle();
-  void compute_cycle();
-  void dma_cycle();
   bool has_memory_request();
   void pop_memory_request();
   mem_fetch* top_memory_request() { return _request_queue.front(); }
@@ -39,6 +35,10 @@ class Core {
   };
 
  protected:
+  void dma_cycle();
+  void compute_cycle();
+  void vu_cycle();
+  void sa_cycle();
   bool can_issue_compute(std::shared_ptr<Instruction>& inst);
   void update_stats();
 

From da69d21f15fa1edc960724d82ac74a5eb5484360 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 15 Feb 2025 15:15:45 +0000
Subject: [PATCH 104/432] [Backendsim] Integrate Stonne engine to model sparse
 core

---
 PyTorchSimBackend/CMakeLists.txt         |  8 ++-
 PyTorchSimBackend/extern/stonneCore      |  2 +-
 PyTorchSimBackend/src/Common.cc          |  7 +++
 PyTorchSimBackend/src/SimulationConfig.h |  3 +
 PyTorchSimBackend/src/SparseCore.cc      | 77 ++++++++++++++++++++++++
 PyTorchSimBackend/src/SparseCore.h       | 25 ++++++++
 6 files changed, 119 insertions(+), 3 deletions(-)
 create mode 100644 PyTorchSimBackend/src/SparseCore.cc
 create mode 100644 PyTorchSimBackend/src/SparseCore.h

diff --git a/PyTorchSimBackend/CMakeLists.txt b/PyTorchSimBackend/CMakeLists.txt
index 3776131e..a9bda0e8 100644
--- a/PyTorchSimBackend/CMakeLists.txt
+++ b/PyTorchSimBackend/CMakeLists.txt
@@ -40,13 +40,17 @@ add_subdirectory("${PROJECT_SOURCE_DIR}/extern/protobuf/cmake" EXCLUDE_FROM_ALL)
 set_target_properties(libprotoc PROPERTIES FOLDER "external/protobuf")
 set_target_properties(protoc PROPERTIES FOLDER "external/protobuf")
 
+# Add libaray stonne core
+add_subdirectory("${PROJECT_SOURCE_DIR}/extern/stonneCore")
+
 # Add libaray onnx
 add_definitions("-DONNX_NAMESPACE=onnx")
 add_subdirectory("${PROJECT_SOURCE_DIR}/extern/onnx" EXCLUDE_FROM_ALL)
 set_target_properties(onnx PROPERTIES FOLDER "extern/onnx")
 set_target_properties(onnx_proto PROPERTIES FOLDER "extern/onnx")
 
+target_include_directories(Simulator PUBLIC ${PROJECT_SOURCE_DIR}/extern/stonneCore/include)
 target_include_directories(Simulator PUBLIC ${ONNX_INCLUDE_DIRS})
 target_include_directories(Simulator PUBLIC ${PROJECT_SOURCE_DIR}/src)
-target_link_libraries(Simulator booksim2 ramulator)
-target_link_libraries(Simulator ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs)
\ No newline at end of file
+target_link_libraries(Simulator booksim2 ramulator sstStonne)
+target_link_libraries(Simulator ${PROTOBUF_LIB} onnx_proto sstStonne ${CONAN_LIBS} stdc++fs)
diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index 80a20d68..88986c27 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit 80a20d68eb613b12b8361d215ada2bec7fb9cce9
+Subproject commit 88986c2713622b574d0ce61194c19a62af91c31c
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index d7a37583..a0602f3f 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -18,6 +18,13 @@ SimulationConfig initialize_config(json config) {
   SimulationConfig parsed_config;
 
   /* Core configs */
+  const std::string core_type = config.value("core_type", std::string("os_mesh"));  // key absent -> OS_MESH, matching the SimulationConfig member default
+  if (core_type == "os_mesh")
+    parsed_config.core_type = CoreType::OS_MESH;
+  else if (core_type == "stonne")
+    parsed_config.core_type = CoreType::STONNE;
+  else  // unknown core type: fail loudly and name the right setting
+    throw std::runtime_error(fmt::format("Not implemented core type {} ", core_type));
   parsed_config.num_cores = config["num_cores"];
   parsed_config.core_freq = config["core_freq"];
   parsed_config.sram_size = config["sram_size"];
diff --git a/PyTorchSimBackend/src/SimulationConfig.h b/PyTorchSimBackend/src/SimulationConfig.h
index 031cd0a7..eac2bbb5 100644
--- a/PyTorchSimBackend/src/SimulationConfig.h
+++ b/PyTorchSimBackend/src/SimulationConfig.h
@@ -5,6 +5,8 @@
 
 using json = nlohmann::json;
 
+enum class CoreType { OS_MESH, STONNE };
+
 enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 };
 
 enum class IcntType { SIMPLE, BOOKSIM2 };
@@ -13,6 +15,7 @@ enum class L2CacheType { NOCACHE, READONLY };
 
 struct SimulationConfig {
   /* Core config */
+  CoreType core_type = CoreType::OS_MESH;
   uint32_t num_cores;
   uint32_t core_freq;
   uint32_t sram_size;
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
new file mode 100644
index 00000000..fca68b3c
--- /dev/null
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -0,0 +1,77 @@
+#include "SparseCore.h"
+
+SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) {
+  std::string hardware_configuration = "/workspace/sstStonne/tests/sparseflex_op_128mses_128_bw.cfg"; //FIXME.
+  stonneCore = new SST_STONNE::sstStonne(hardware_configuration);
+};
+
+bool SparseCore::running() {
+  return !_request_queue.empty() || !_response_queue.empty() || !stonneCore->isFinished();
+}
+
+bool SparseCore::can_issue(const std::shared_ptr<Tile>& op) {
+  return !running();
+}
+
+std::shared_ptr<Tile> SparseCore::pop_finished_tile() {
+  return nullptr;
+}
+
+void SparseCore::cycle() {
+  /* One tick: advance STONNE, forward at most one new request, hand back at most one response. */
+  stonneCore->cycle();
+  /* Send Memory Request: wrap the STONNE request in a mem_fetch and queue it for the memory side */
+  if (SimpleMem::Request* req = stonneCore->popRequest()) {
+    mem_access_type acc_type;
+    mf_type type;
+    switch(req->getcmd()) {
+      case SimpleMem::Request::Read:
+        acc_type = mem_access_type::GLOBAL_ACC_R;
+        type = mf_type::READ_REQUEST;
+        break;
+      case SimpleMem::Request::Write:
+        acc_type = mem_access_type::GLOBAL_ACC_W;
+        type = mf_type::WRITE_REQUEST;
+        break;
+      default:
+        spdlog::error("[SparseCore] Invalid request type from core");
+        return;  // NOTE(review): this also skips response delivery for this cycle - confirm intended
+    }
+    mem_fetch* req_wrapper = new mem_fetch(req->getAddress(), acc_type, type, _config.dram_req_size, -1, req);  // req rides along as the last argument; read back below via get_custom_data()
+    _request_queue.push(req_wrapper);
+  }
+
+  if (!_response_queue.empty()) {  // Receive Memory Response
+    mem_fetch* resp_wrapper = _response_queue.front();
+    _response_queue.pop();  // pop the queue we just read; popping _request_queue dropped an unsent request and left this entry to be freed twice
+    SimpleMem::Request* resp = static_cast<SimpleMem::Request*>(resp_wrapper->get_custom_data());
+    resp->setReply();
+    stonneCore->pushResponse(resp);
+    delete resp_wrapper;  // only the wrapper is freed here; resp itself was handed to stonneCore above
+  }
+}
+
+bool SparseCore::has_memory_request() {
+  return !_request_queue.empty();
+}
+
+void SparseCore::pop_memory_request() {
+  if (!_request_queue.empty()) {
+    _request_queue.pop();
+  }
+}
+
+void SparseCore::push_memory_response(mem_fetch* response) {
+  _response_queue.push(response);
+}
+
+void SparseCore::print_stats() {
+  stonneCore->printStats();
+  std::cout << "Pending Requests: " << _request_queue.size() << std::endl;
+  std::cout << "Pending Responses: " << _response_queue.size() << std::endl;
+}
+
+void SparseCore::print_current_stats() {
+  std::cout << "Current SparseCore Status:" << std::endl;
+  print_stats();
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/SparseCore.h b/PyTorchSimBackend/src/SparseCore.h
new file mode 100644
index 00000000..628c5449
--- /dev/null
+++ b/PyTorchSimBackend/src/SparseCore.h
@@ -0,0 +1,25 @@
+#include "Core.h"
+#include "sstStonne.h"
+#include "SimpleMem.h"
+
+class SparseCore : public Core {
+public:
+  SparseCore(uint32_t id, SimulationConfig config);
+  ~SparseCore() = default;
+  bool running();
+  bool can_issue(const std::shared_ptr<Tile>& op);
+  std::shared_ptr<Tile> pop_finished_tile();
+  void cycle();
+  bool has_memory_request();
+  void pop_memory_request();
+  mem_fetch* top_memory_request() { return _request_queue.front(); }
+  void push_memory_response(mem_fetch* response);
+  void print_stats();
+  void print_current_stats();
+
+private:
+  SST_STONNE::sstStonne *stonneCore;
+  /* Interconnect queue */
+  std::queue<mem_fetch*> _request_queue;
+  std::queue<mem_fetch*> _response_queue;
+};
\ No newline at end of file

From 003ac8f414bd1bfbf62636cfded234d5038e9332 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 15 Feb 2025 15:22:33 +0000
Subject: [PATCH 105/432] [Backendsim] Separate header files

---
 PyTorchSimBackend/CMakeLists.txt                         | 6 ++++--
 PyTorchSimBackend/{src => include}/Cache.h               | 0
 PyTorchSimBackend/{src => include}/Cache_defs.h          | 0
 PyTorchSimBackend/{src => include}/Cache_stats.h         | 0
 PyTorchSimBackend/{src => include}/Common.h              | 0
 PyTorchSimBackend/{src => include}/Core.h                | 0
 PyTorchSimBackend/{src => include}/DelayQueue.h          | 0
 PyTorchSimBackend/{src => include}/Dram.h                | 0
 PyTorchSimBackend/{src => include}/Hashing.h             | 0
 PyTorchSimBackend/{src => include}/Instruction.h         | 0
 PyTorchSimBackend/{src => include}/Interconnect.h        | 0
 PyTorchSimBackend/{src => include}/L2Cache.h             | 0
 PyTorchSimBackend/{src => include}/Memfetch.h            | 0
 PyTorchSimBackend/{src => include}/Model.h               | 0
 PyTorchSimBackend/{src => include}/SimulationConfig.h    | 0
 PyTorchSimBackend/{src => include}/Simulator.h           | 0
 PyTorchSimBackend/{src => include}/SparseCore.h          | 0
 PyTorchSimBackend/{src => include}/TMA.h                 | 0
 PyTorchSimBackend/{src => include}/Tile.h                | 0
 PyTorchSimBackend/{src => include}/TileGraph.h           | 0
 PyTorchSimBackend/{src => include}/TileGraphParser.h     | 0
 PyTorchSimBackend/{src => include}/scheduler/Scheduler.h | 6 +++---
 22 files changed, 7 insertions(+), 5 deletions(-)
 rename PyTorchSimBackend/{src => include}/Cache.h (100%)
 rename PyTorchSimBackend/{src => include}/Cache_defs.h (100%)
 rename PyTorchSimBackend/{src => include}/Cache_stats.h (100%)
 rename PyTorchSimBackend/{src => include}/Common.h (100%)
 rename PyTorchSimBackend/{src => include}/Core.h (100%)
 rename PyTorchSimBackend/{src => include}/DelayQueue.h (100%)
 rename PyTorchSimBackend/{src => include}/Dram.h (100%)
 rename PyTorchSimBackend/{src => include}/Hashing.h (100%)
 rename PyTorchSimBackend/{src => include}/Instruction.h (100%)
 rename PyTorchSimBackend/{src => include}/Interconnect.h (100%)
 rename PyTorchSimBackend/{src => include}/L2Cache.h (100%)
 rename PyTorchSimBackend/{src => include}/Memfetch.h (100%)
 rename PyTorchSimBackend/{src => include}/Model.h (100%)
 rename PyTorchSimBackend/{src => include}/SimulationConfig.h (100%)
 rename PyTorchSimBackend/{src => include}/Simulator.h (100%)
 rename PyTorchSimBackend/{src => include}/SparseCore.h (100%)
 rename PyTorchSimBackend/{src => include}/TMA.h (100%)
 rename PyTorchSimBackend/{src => include}/Tile.h (100%)
 rename PyTorchSimBackend/{src => include}/TileGraph.h (100%)
 rename PyTorchSimBackend/{src => include}/TileGraphParser.h (100%)
 rename PyTorchSimBackend/{src => include}/scheduler/Scheduler.h (93%)

diff --git a/PyTorchSimBackend/CMakeLists.txt b/PyTorchSimBackend/CMakeLists.txt
index a9bda0e8..0d36d463 100644
--- a/PyTorchSimBackend/CMakeLists.txt
+++ b/PyTorchSimBackend/CMakeLists.txt
@@ -27,7 +27,7 @@ message("BINARY DIR ${CMAKE_BINARY_DIR}")
 add_subdirectory("${PROJECT_SOURCE_DIR}/src")
 
 # Add libaray ramulator
-include_directories("${PROJECT_SOURCE_DIR}/src")
+include_directories("${PROJECT_SOURCE_DIR}/include")
 add_subdirectory("${PROJECT_SOURCE_DIR}/extern/ramulator2")
 include_directories("${PROJECT_SOURCE_DIR}/extern/ramulator2/src")
 include_directories("${PROJECT_SOURCE_DIR}/extern/ramulator2/resources/wrappers")
@@ -50,7 +50,9 @@ set_target_properties(onnx PROPERTIES FOLDER "extern/onnx")
 set_target_properties(onnx_proto PROPERTIES FOLDER "extern/onnx")
 
 target_include_directories(Simulator PUBLIC ${PROJECT_SOURCE_DIR}/extern/stonneCore/include)
-target_include_directories(Simulator PUBLIC ${ONNX_INCLUDE_DIRS})
+target_include_directories(Simulator PUBLIC ${PROJECT_SOURCE_DIR}/include)
+target_include_directories(Simulator PUBLIC ${PROJECT_SOURCE_DIR}/include/scheduler)
 target_include_directories(Simulator PUBLIC ${PROJECT_SOURCE_DIR}/src)
+target_include_directories(Simulator PUBLIC ${ONNX_INCLUDE_DIRS})
 target_link_libraries(Simulator booksim2 ramulator sstStonne)
 target_link_libraries(Simulator ${PROTOBUF_LIB} onnx_proto sstStonne ${CONAN_LIBS} stdc++fs)
diff --git a/PyTorchSimBackend/src/Cache.h b/PyTorchSimBackend/include/Cache.h
similarity index 100%
rename from PyTorchSimBackend/src/Cache.h
rename to PyTorchSimBackend/include/Cache.h
diff --git a/PyTorchSimBackend/src/Cache_defs.h b/PyTorchSimBackend/include/Cache_defs.h
similarity index 100%
rename from PyTorchSimBackend/src/Cache_defs.h
rename to PyTorchSimBackend/include/Cache_defs.h
diff --git a/PyTorchSimBackend/src/Cache_stats.h b/PyTorchSimBackend/include/Cache_stats.h
similarity index 100%
rename from PyTorchSimBackend/src/Cache_stats.h
rename to PyTorchSimBackend/include/Cache_stats.h
diff --git a/PyTorchSimBackend/src/Common.h b/PyTorchSimBackend/include/Common.h
similarity index 100%
rename from PyTorchSimBackend/src/Common.h
rename to PyTorchSimBackend/include/Common.h
diff --git a/PyTorchSimBackend/src/Core.h b/PyTorchSimBackend/include/Core.h
similarity index 100%
rename from PyTorchSimBackend/src/Core.h
rename to PyTorchSimBackend/include/Core.h
diff --git a/PyTorchSimBackend/src/DelayQueue.h b/PyTorchSimBackend/include/DelayQueue.h
similarity index 100%
rename from PyTorchSimBackend/src/DelayQueue.h
rename to PyTorchSimBackend/include/DelayQueue.h
diff --git a/PyTorchSimBackend/src/Dram.h b/PyTorchSimBackend/include/Dram.h
similarity index 100%
rename from PyTorchSimBackend/src/Dram.h
rename to PyTorchSimBackend/include/Dram.h
diff --git a/PyTorchSimBackend/src/Hashing.h b/PyTorchSimBackend/include/Hashing.h
similarity index 100%
rename from PyTorchSimBackend/src/Hashing.h
rename to PyTorchSimBackend/include/Hashing.h
diff --git a/PyTorchSimBackend/src/Instruction.h b/PyTorchSimBackend/include/Instruction.h
similarity index 100%
rename from PyTorchSimBackend/src/Instruction.h
rename to PyTorchSimBackend/include/Instruction.h
diff --git a/PyTorchSimBackend/src/Interconnect.h b/PyTorchSimBackend/include/Interconnect.h
similarity index 100%
rename from PyTorchSimBackend/src/Interconnect.h
rename to PyTorchSimBackend/include/Interconnect.h
diff --git a/PyTorchSimBackend/src/L2Cache.h b/PyTorchSimBackend/include/L2Cache.h
similarity index 100%
rename from PyTorchSimBackend/src/L2Cache.h
rename to PyTorchSimBackend/include/L2Cache.h
diff --git a/PyTorchSimBackend/src/Memfetch.h b/PyTorchSimBackend/include/Memfetch.h
similarity index 100%
rename from PyTorchSimBackend/src/Memfetch.h
rename to PyTorchSimBackend/include/Memfetch.h
diff --git a/PyTorchSimBackend/src/Model.h b/PyTorchSimBackend/include/Model.h
similarity index 100%
rename from PyTorchSimBackend/src/Model.h
rename to PyTorchSimBackend/include/Model.h
diff --git a/PyTorchSimBackend/src/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h
similarity index 100%
rename from PyTorchSimBackend/src/SimulationConfig.h
rename to PyTorchSimBackend/include/SimulationConfig.h
diff --git a/PyTorchSimBackend/src/Simulator.h b/PyTorchSimBackend/include/Simulator.h
similarity index 100%
rename from PyTorchSimBackend/src/Simulator.h
rename to PyTorchSimBackend/include/Simulator.h
diff --git a/PyTorchSimBackend/src/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
similarity index 100%
rename from PyTorchSimBackend/src/SparseCore.h
rename to PyTorchSimBackend/include/SparseCore.h
diff --git a/PyTorchSimBackend/src/TMA.h b/PyTorchSimBackend/include/TMA.h
similarity index 100%
rename from PyTorchSimBackend/src/TMA.h
rename to PyTorchSimBackend/include/TMA.h
diff --git a/PyTorchSimBackend/src/Tile.h b/PyTorchSimBackend/include/Tile.h
similarity index 100%
rename from PyTorchSimBackend/src/Tile.h
rename to PyTorchSimBackend/include/Tile.h
diff --git a/PyTorchSimBackend/src/TileGraph.h b/PyTorchSimBackend/include/TileGraph.h
similarity index 100%
rename from PyTorchSimBackend/src/TileGraph.h
rename to PyTorchSimBackend/include/TileGraph.h
diff --git a/PyTorchSimBackend/src/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
similarity index 100%
rename from PyTorchSimBackend/src/TileGraphParser.h
rename to PyTorchSimBackend/include/TileGraphParser.h
diff --git a/PyTorchSimBackend/src/scheduler/Scheduler.h b/PyTorchSimBackend/include/scheduler/Scheduler.h
similarity index 93%
rename from PyTorchSimBackend/src/scheduler/Scheduler.h
rename to PyTorchSimBackend/include/scheduler/Scheduler.h
index 1ceb9f4d..21567547 100644
--- a/PyTorchSimBackend/src/scheduler/Scheduler.h
+++ b/PyTorchSimBackend/include/scheduler/Scheduler.h
@@ -1,8 +1,8 @@
 #pragma once
 #include <robin_hood.h>
-#include "../Tile.h"
-#include "../Common.h"
-#include "../TileGraph.h"
+#include "Tile.h"
+#include "Common.h"
+#include "TileGraph.h"
 
 class Scheduler {
  public:

From e9050e34a914fb0b60114c556ff13970e49f5905 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 15 Feb 2025 15:35:11 +0000
Subject: [PATCH 106/432] [Backendsim] Add stonne config path argument & set
 dummy instruction

---
 PyTorchSimBackend/include/SimulationConfig.h |  1 +
 PyTorchSimBackend/include/SparseCore.h       |  3 ++-
 PyTorchSimBackend/src/Common.cc              |  8 +++++--
 PyTorchSimBackend/src/SparseCore.cc          | 25 ++++++++++++++++++--
 4 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h
index eac2bbb5..b8995897 100644
--- a/PyTorchSimBackend/include/SimulationConfig.h
+++ b/PyTorchSimBackend/include/SimulationConfig.h
@@ -16,6 +16,7 @@ enum class L2CacheType { NOCACHE, READONLY };
 struct SimulationConfig {
   /* Core config */
   CoreType core_type = CoreType::OS_MESH;
+  std::string stonne_config_path;
   uint32_t num_cores;
   uint32_t core_freq;
   uint32_t sram_size;
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index 628c5449..e5ba3abe 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -5,9 +5,10 @@
 class SparseCore : public Core {
 public:
   SparseCore(uint32_t id, SimulationConfig config);
-  ~SparseCore() = default;
+  ~SparseCore();
   bool running();
   bool can_issue(const std::shared_ptr<Tile>& op);
+  void issue(std::shared_ptr<Tile> tile);
   std::shared_ptr<Tile> pop_finished_tile();
   void cycle();
   bool has_memory_request();
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index a0602f3f..e9b0d114 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -20,9 +20,13 @@ SimulationConfig initialize_config(json config) {
   /* Core configs */
   if ((std::string)config["core_type"] == "os_mesh")
     parsed_config.core_type = CoreType::OS_MESH;
-  else if ((std::string)config["core_type"] == "stonne")
+  else if ((std::string)config["core_type"] == "stonne"){
     parsed_config.core_type = CoreType::STONNE;
-  else
+    if (config.contains("stonne_config_path"))
+      parsed_config.stonne_config_path = config["stonne_config_path"];
+    else
+      throw std::runtime_error("Stonne config path is missing");
+  } else
     throw std::runtime_error(fmt::format("Not implemented dram type {} ",
                                          (std::string)config["core_type"]));
   parsed_config.num_cores = config["num_cores"];
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index fca68b3c..b0f9bae8 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -1,10 +1,31 @@
 #include "SparseCore.h"
 
 SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) {
-  std::string hardware_configuration = "/workspace/sstStonne/tests/sparseflex_op_128mses_128_bw.cfg"; //FIXME.
-  stonneCore = new SST_STONNE::sstStonne(hardware_configuration);
+  stonneCore = new SST_STONNE::sstStonne(config.stonne_config_path);
+
+  // Dummy instruction
+  SST_STONNE::StonneOpDesc opDesc;
+  opDesc.operation = Layer_t::outerProductGEMM;
+  opDesc.GEMM_K = 512;
+  opDesc.GEMM_N = 64;
+  opDesc.GEMM_M = 64;
+  opDesc.GEMM_T_K = 4;
+  opDesc.GEMM_T_N = 1;
+  opDesc.mem_init = "/workspace/sstStonne/tests/outerproduct/outerproduct_gemm_mem.ini";
+  opDesc.mem_matrix_c_file_name = "/workspace/sstStonne/tests/outerproduct/result.out";
+  opDesc.matrix_a_dram_address = 0;
+  opDesc.matrix_b_dram_address = 12444;
+  opDesc.matrix_c_dram_address = 24608;
+  opDesc.rowpointer_matrix_a_init = "/workspace/sstStonne/tests/outerproduct/outerproduct_gemm_rowpointerA.in";
+  opDesc.colpointer_matrix_a_init = "/workspace/sstStonne/tests/outerproduct/outerproduct_gemm_colpointerA.in";
+  opDesc.rowpointer_matrix_b_init = "/workspace/sstStonne/tests/outerproduct/outerproduct_gemm_rowpointerB.in";
+  opDesc.colpointer_matrix_b_init = "/worksnpace/sstStonne/tests/outerproduct/outerproduct_gemm_colpointerB.in";
+  stonneCore->setup(opDesc);
+  stonneCore->init(1);
 };
 
+SparseCore::~SparseCore() { delete stonneCore; }
+
 bool SparseCore::running() {
   return !_request_queue.empty() || !_response_queue.empty() || !stonneCore->isFinished();
 }

From f1c8381637e571e8d620835e52b138a7a0cd032e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 15 Feb 2025 15:51:01 +0000
Subject: [PATCH 107/432] [Backendsim] Fix stat printing

---
 PyTorchSimBackend/src/Core.cc | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index fbc7c8a6..e7a7e876 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -380,6 +380,8 @@ bool Core::can_issue_compute(std::shared_ptr<Instruction>& inst) {
 
 void Core::print_stats() {
   std::vector<float> sa_utilization;
+  update_stats();
+  spdlog::info("===== Instructions count =====");
   for (int i=0; i < static_cast<size_t>(Opcode::COUNT); i++) {
     if (i == static_cast<size_t>(Opcode::COMP))
       spdlog::info("Core [{}] : {} inst count {} (GEMM: {}, Vector: {})", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_tot_sa_inst.at(i), _stat_gemm_inst, _stat_tot_sa_inst.at(i) - _stat_gemm_inst);
@@ -387,21 +389,16 @@ void Core::print_stats() {
       spdlog::info("Core [{}] : {} inst count {}", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_tot_sa_inst.at(i));
   }
   spdlog::trace("Core [{}] : SKipped MOVIN inst count {}", _id, _stat_skip_dma);
+  spdlog::info("========= Core stat =========");
   for (int i=0; i<_num_systolic_array_per_core; i++)
     sa_utilization.push_back(static_cast<float>(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle);
-  spdlog::info(
-      "Core [{}] : Vector active cycle {}", _id, _stat_tot_vu_compute_cycle);
   for (int i=0; i<_num_systolic_array_per_core; i++)
-    spdlog::info("Core [{}] : Systolic array[{}] active cycle {}", _id, i, _stat_tot_sa_compute_cycle.at(i));
+    spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i),
+      _stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i));
   spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle);
-  spdlog::info("Core [{}] : Vector unit idle cycle {}", _id, _stat_vu_compute_idle_cycle);
-  for (int i=0; i<_num_systolic_array_per_core; i++)
-    spdlog::info("Core [{}] : Systolic Array[{}] idle cycle [{}]", _id, i, _stat_tot_sa_compute_cycle.at(i));
-  spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}", _id, static_cast<float>(_stat_tot_vu_compute_cycle * 100) / _core_cycle);
-  for (int i=0; i<_num_systolic_array_per_core; i++)
-    spdlog::info("Core [{}] : Systolic Array[{}] Utilization(%) {:.2f}", _id, i, sa_utilization.at(i));
+  spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id,
+    static_cast<float>(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle);
   spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle);
-  update_stats();
 }
 
 void Core::print_current_stats() {
@@ -411,17 +408,17 @@ void Core::print_current_stats() {
   auto level = spdlog::level::info;
   if(_id != 0)
     level = spdlog::level::debug;
-  spdlog::log(level, "Core [{}] : Vector active cycle {}", _id, _stat_vu_compute_cycle);
-  for (int i=0; i<_num_systolic_array_per_core; i++)
-    spdlog::log(level, "Core [{}] : Systolic array[{}] active cycle {}", _id, i, _stat_sa_compute_cycle.at(i));
-  spdlog::log(level, "Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tma_cycle, _stat_tma_idle_cycle);
-  spdlog::log(level, "Core [{}] : Vector unit idle cycle {}", _id, _stat_vu_compute_idle_cycle);
+
+  spdlog::info("========= Core stat =========");
   for (int i=0; i<_num_systolic_array_per_core; i++)
-    spdlog::log(level, "Core [{}] : Systolic Array[{}] idle cycle {}", _id, i, _stat_sa_compute_idle_cycle.at(i));
-  spdlog::log(level, "Core [{}] : Vector Unit Utilization(%) {:.2f}", _id, static_cast<float>(_stat_vu_compute_cycle * 100) / _config.core_print_interval);
+    sa_utilization.push_back(static_cast<float>(_stat_sa_compute_cycle.at(i) * 100) / _core_cycle);
   for (int i=0; i<_num_systolic_array_per_core; i++)
-    spdlog::log(level, "Core [{}] : Systolic Array[{}] Utilization(%) {:.2f}", _id, i, sa_utilization.at(i));
-  spdlog::log(level, "Core [{}] : Total cycle {}", _id, _core_cycle);
+    spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i),
+      _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i));
+  spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tma_cycle, _stat_tma_idle_cycle);
+  spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id,
+    static_cast<float>(_stat_vu_compute_cycle * 100) / _core_cycle, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle);
+  spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle);
   update_stats();
 }
 

From cbd018b5c4fae209d9121b6361632ecfd7b258cb Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 15 Feb 2025 17:06:08 +0000
Subject: [PATCH 108/432] [Backend] Fix Sparse Core modeling

---
 PyTorchSimBackend/extern/stonneCore    |  2 +-
 PyTorchSimBackend/include/Core.h       | 25 ++++++++++++-----------
 PyTorchSimBackend/include/Simulator.h  |  1 +
 PyTorchSimBackend/include/SparseCore.h | 15 +++++++-------
 PyTorchSimBackend/src/Core.cc          |  4 +++-
 PyTorchSimBackend/src/Simulator.cc     | 18 +++++++++++++----
 PyTorchSimBackend/src/SparseCore.cc    | 28 +++++++++++---------------
 PyTorchSimBackend/src/main.cc          |  4 +++-
 8 files changed, 54 insertions(+), 43 deletions(-)

diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index 88986c27..2fb60343 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit 88986c2713622b574d0ce61194c19a62af91c31c
+Subproject commit 2fb60343d6f75d5fd2d53d50a465ee9cbab24c0a
diff --git a/PyTorchSimBackend/include/Core.h b/PyTorchSimBackend/include/Core.h
index 00ab8f4a..30a404f9 100644
--- a/PyTorchSimBackend/include/Core.h
+++ b/PyTorchSimBackend/include/Core.h
@@ -15,18 +15,19 @@ class Core {
  public:
   Core(uint32_t id, SimulationConfig config);
   ~Core() = default;
-  bool running();
-  bool can_issue(const std::shared_ptr<Tile>& op);
-  void issue(std::shared_ptr<Tile> tile);
-  std::shared_ptr<Tile> pop_finished_tile();
-  void cycle();
-  bool has_memory_request();
-  void pop_memory_request();
-  mem_fetch* top_memory_request() { return _request_queue.front(); }
-  void push_memory_response(mem_fetch* response);
-  void print_stats();
-  void print_current_stats();
-  void finish_instruction(std::shared_ptr<Instruction>& inst);
+  virtual bool running();
+  virtual bool can_issue(const std::shared_ptr<Tile>& op);
+  virtual void issue(std::shared_ptr<Tile> tile);
+  virtual std::shared_ptr<Tile> pop_finished_tile();
+  virtual void cycle();
+  virtual void print_stats();
+  virtual void print_current_stats();
+  virtual void finish_instruction(std::shared_ptr<Instruction>& inst);
+  virtual bool has_memory_request();
+  virtual void pop_memory_request();
+  virtual mem_fetch* top_memory_request() { return _request_queue.front(); }
+  virtual void push_memory_response(mem_fetch* response);
+
   std::queue<std::shared_ptr<Instruction>>& get_compute_pipeline(int compute_type);
   enum {
     VECTOR_UNIT,
diff --git a/PyTorchSimBackend/include/Simulator.h b/PyTorchSimBackend/include/Simulator.h
index 7733e9c4..f00513ac 100644
--- a/PyTorchSimBackend/include/Simulator.h
+++ b/PyTorchSimBackend/include/Simulator.h
@@ -5,6 +5,7 @@
 #include <string>
 #include "Common.h"
 #include "Core.h"
+#include "SparseCore.h"
 #include "Dram.h"
 #include "Interconnect.h"
 #include "scheduler/Scheduler.h"
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index e5ba3abe..a9f58086 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -6,17 +6,16 @@ class SparseCore : public Core {
 public:
   SparseCore(uint32_t id, SimulationConfig config);
   ~SparseCore();
-  bool running();
-  bool can_issue(const std::shared_ptr<Tile>& op);
-  void issue(std::shared_ptr<Tile> tile);
-  std::shared_ptr<Tile> pop_finished_tile();
-  void cycle();
+  bool running() override;
+  bool can_issue(const std::shared_ptr<Tile>& op) override;
+  void issue(std::shared_ptr<Tile> tile) override;
+  void cycle() override;
   bool has_memory_request();
   void pop_memory_request();
   mem_fetch* top_memory_request() { return _request_queue.front(); }
-  void push_memory_response(mem_fetch* response);
-  void print_stats();
-  void print_current_stats();
+  void push_memory_response(mem_fetch* response) override;
+  void print_stats() override;
+  void print_current_stats() override;
 
 private:
   SST_STONNE::sstStonne *stonneCore;
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index e7a7e876..d8fbf466 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -358,7 +358,9 @@ bool Core::running() {
   return running;
 }
 
-bool Core::has_memory_request() { return _request_queue.size() > 0; }
+bool Core::has_memory_request() {
+  return !_request_queue.empty();
+}
 
 void Core::pop_memory_request() {
   _request_queue.pop();
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 41ba3eb8..7d7fd4ab 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -42,15 +42,25 @@ Simulator::Simulator(SimulationConfig config)
     spdlog::info("[Config/Interconnect] BookSim2 selected");
     _icnt = std::make_unique<Booksim2Interconnect>(config);
   } else {
-    spdlog::error("[Configuration] {} Invalid interconnect type...!");
+    spdlog::error("[Configuration] Invalid interconnect type...!");
     exit(EXIT_FAILURE);
   }
   _icnt_interval = config.icnt_print_interval;
 
   // Create core objects
-  _cores.resize(config.num_cores);
-  for (int core_index = 0; core_index < _n_cores; core_index++)
-    _cores[core_index] = std::make_unique<Core>(core_index, _config);
+  _cores.resize(_n_cores);
+  for (int core_index = 0; core_index < _n_cores; core_index++) {
+    if (config.core_type == CoreType::OS_MESH) {
+      _cores.at(core_index) = std::make_unique<Core>(core_index, _config);
+    } else if (config.core_type == CoreType::STONNE) {
+      _cores.at(core_index) = std::make_unique<SparseCore>(core_index, _config);
+    } else {
+      spdlog::error("[Configuration] Invalid core type...!");
+      exit(EXIT_FAILURE);
+    }
+  }
+
+
 
   // Initialize Scheduler
   for (int i=0; i<config.num_patition;i++)
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index b0f9bae8..382f4ea1 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -11,15 +11,15 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
   opDesc.GEMM_M = 64;
   opDesc.GEMM_T_K = 4;
   opDesc.GEMM_T_N = 1;
-  opDesc.mem_init = "/workspace/sstStonne/tests/outerproduct/outerproduct_gemm_mem.ini";
-  opDesc.mem_matrix_c_file_name = "/workspace/sstStonne/tests/outerproduct/result.out";
+  opDesc.mem_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini";
+  opDesc.mem_matrix_c_file_name = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/result.out";
   opDesc.matrix_a_dram_address = 0;
   opDesc.matrix_b_dram_address = 12444;
   opDesc.matrix_c_dram_address = 24608;
-  opDesc.rowpointer_matrix_a_init = "/workspace/sstStonne/tests/outerproduct/outerproduct_gemm_rowpointerA.in";
-  opDesc.colpointer_matrix_a_init = "/workspace/sstStonne/tests/outerproduct/outerproduct_gemm_colpointerA.in";
-  opDesc.rowpointer_matrix_b_init = "/workspace/sstStonne/tests/outerproduct/outerproduct_gemm_rowpointerB.in";
-  opDesc.colpointer_matrix_b_init = "/worksnpace/sstStonne/tests/outerproduct/outerproduct_gemm_colpointerB.in";
+  opDesc.rowpointer_matrix_a_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in";
+  opDesc.colpointer_matrix_a_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in";
+  opDesc.rowpointer_matrix_b_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in";
+  opDesc.colpointer_matrix_b_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in";
   stonneCore->setup(opDesc);
   stonneCore->init(1);
 };
@@ -30,19 +30,17 @@ bool SparseCore::running() {
   return !_request_queue.empty() || !_response_queue.empty() || !stonneCore->isFinished();
 }
 
+void SparseCore::issue(std::shared_ptr<Tile> tile) {};
+
 bool SparseCore::can_issue(const std::shared_ptr<Tile>& op) {
   return !running();
 }
 
-std::shared_ptr<Tile> SparseCore::pop_finished_tile() {
-  return nullptr;
-}
-
 void SparseCore::cycle() {
   stonneCore->cycle();
 
   /* Send Memory Request */
-  if (SimpleMem::Request* req = stonneCore->popRequest()) {
+  while (SimpleMem::Request* req = stonneCore->popRequest()) {
     mem_access_type acc_type;
     mf_type type;
     switch(req->getcmd()) {
@@ -62,12 +60,12 @@ void SparseCore::cycle() {
     _request_queue.push(req_wrapper);
   }
 
-  if (!_response_queue.empty()) {
+  while (!_response_queue.empty()) {
     mem_fetch* resp_wrapper = _response_queue.front();
-    _request_queue.pop();
     SimpleMem::Request* resp = static_cast<SimpleMem::Request*>(resp_wrapper->get_custom_data());
     resp->setReply();
     stonneCore->pushResponse(resp);
+    _response_queue.pop();
     delete resp_wrapper;
   }
 }
@@ -77,9 +75,7 @@ bool SparseCore::has_memory_request() {
 }
 
 void SparseCore::pop_memory_request() {
-  if (!_request_queue.empty()) {
-    _request_queue.pop();
-  }
+  _request_queue.pop();
 }
 
 void SparseCore::push_memory_response(mem_fetch* response) {
diff --git a/PyTorchSimBackend/src/main.cc b/PyTorchSimBackend/src/main.cc
index c7d9684b..7974aab2 100644
--- a/PyTorchSimBackend/src/main.cc
+++ b/PyTorchSimBackend/src/main.cc
@@ -39,7 +39,9 @@ void launchKernel(Simulator* simulator, std::string onnx_path, std::string attri
 
 Simulator* create_simulator(std::string config_path) {
   json config_json;
-  loadConfig(config_path, config_json);
+  if(!loadConfig(config_path, config_json)) {
+    exit(1);
+  }
   SimulationConfig config = initialize_config(config_json);
   auto simulator = new Simulator(config);
   return simulator;

From ef7fbc4a01484e2e5726f6e583a38de0a3b8d7de Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 16 Feb 2025 06:10:23 +0000
Subject: [PATCH 109/432] [Backend] Set ws_mesh as default core

---
 PyTorchSimBackend/src/Common.cc | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index e9b0d114..2f521837 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -18,17 +18,21 @@ SimulationConfig initialize_config(json config) {
   SimulationConfig parsed_config;
 
   /* Core configs */
-  if ((std::string)config["core_type"] == "os_mesh")
+  if (config.contains("core_type")) {
+    if ((std::string)config["core_type"] == "os_mesh")
+      parsed_config.core_type = CoreType::OS_MESH;
+    else if ((std::string)config["core_type"] == "stonne"){
+      parsed_config.core_type = CoreType::STONNE;
+      if (config.contains("stonne_config_path"))
+        parsed_config.stonne_config_path = config["stonne_config_path"];
+      else
+        throw std::runtime_error("Stonne config path is missing");
+    } else
+      throw std::runtime_error(fmt::format("Not implemented dram type {} ",
+                                          (std::string)config["core_type"]));
+  } else {
     parsed_config.core_type = CoreType::OS_MESH;
-  else if ((std::string)config["core_type"] == "stonne"){
-    parsed_config.core_type = CoreType::STONNE;
-    if (config.contains("stonne_config_path"))
-      parsed_config.stonne_config_path = config["stonne_config_path"];
-    else
-      throw std::runtime_error("Stonne config path is missing");
-  } else
-    throw std::runtime_error(fmt::format("Not implemented dram type {} ",
-                                         (std::string)config["core_type"]));
+  }
   parsed_config.num_cores = config["num_cores"];
   parsed_config.core_freq = config["core_freq"];
   parsed_config.sram_size = config["sram_size"];

From 0d5d7bae9700d19abeb5b804296a94fcea0ab0e9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 16 Feb 2025 06:13:19 +0000
Subject: [PATCH 110/432] [Backend] Fix core dataflow name

---
 PyTorchSimBackend/include/SimulationConfig.h | 4 ++--
 PyTorchSimBackend/src/Common.cc              | 6 +++---
 PyTorchSimBackend/src/Simulator.cc           | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h
index b8995897..bbdd27c3 100644
--- a/PyTorchSimBackend/include/SimulationConfig.h
+++ b/PyTorchSimBackend/include/SimulationConfig.h
@@ -5,7 +5,7 @@
 
 using json = nlohmann::json;
 
-enum class CoreType { OS_MESH, STONNE };
+enum class CoreType { WS_MESH, STONNE };
 
 enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 };
 
@@ -15,7 +15,7 @@ enum class L2CacheType { NOCACHE, READONLY };
 
 struct SimulationConfig {
   /* Core config */
-  CoreType core_type = CoreType::OS_MESH;
+  CoreType core_type = CoreType::WS_MESH;
   std::string stonne_config_path;
   uint32_t num_cores;
   uint32_t core_freq;
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index 2f521837..60da8942 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -19,8 +19,8 @@ SimulationConfig initialize_config(json config) {
 
   /* Core configs */
   if (config.contains("core_type")) {
-    if ((std::string)config["core_type"] == "os_mesh")
-      parsed_config.core_type = CoreType::OS_MESH;
+    if ((std::string)config["core_type"] == "ws_mesh")
+      parsed_config.core_type = CoreType::WS_MESH;
     else if ((std::string)config["core_type"] == "stonne"){
       parsed_config.core_type = CoreType::STONNE;
       if (config.contains("stonne_config_path"))
@@ -31,7 +31,7 @@ SimulationConfig initialize_config(json config) {
       throw std::runtime_error(fmt::format("Not implemented dram type {} ",
                                           (std::string)config["core_type"]));
   } else {
-    parsed_config.core_type = CoreType::OS_MESH;
+    parsed_config.core_type = CoreType::WS_MESH;
   }
   parsed_config.num_cores = config["num_cores"];
   parsed_config.core_freq = config["core_freq"];
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 7d7fd4ab..724a8bd4 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -50,7 +50,7 @@ Simulator::Simulator(SimulationConfig config)
   // Create core objects
   _cores.resize(_n_cores);
   for (int core_index = 0; core_index < _n_cores; core_index++) {
-    if (config.core_type == CoreType::OS_MESH) {
+    if (config.core_type == CoreType::WS_MESH) {
       _cores.at(core_index) = std::make_unique<Core>(core_index, _config);
     } else if (config.core_type == CoreType::STONNE) {
       _cores.at(core_index) = std::make_unique<SparseCore>(core_index, _config);

From c963100619f873e3b4bf1a1f0f2f03a68b515a80 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 16 Feb 2025 07:20:48 +0000
Subject: [PATCH 111/432] [Backendsim] Support stonne tog node

---
 AsmParser/onnx_utility.py                   |  52 +++++++++
 AsmParser/tog_generator.py                  |   5 +-
 PyTorchSimBackend/extern/stonneCore         |   2 +-
 PyTorchSimBackend/include/Tile.h            |   6 +
 PyTorchSimBackend/include/TileGraphParser.h | 119 +++++++++++++++++++-
 PyTorchSimBackend/src/Core.cc               |   2 +-
 PyTorchSimBackend/src/SparseCore.cc         |  38 +++----
 PyTorchSimBackend/src/TileGraphParser.cc    |  61 ++++++++++
 8 files changed, 257 insertions(+), 28 deletions(-)

diff --git a/AsmParser/onnx_utility.py b/AsmParser/onnx_utility.py
index ac48001c..0ee0fda0 100644
--- a/AsmParser/onnx_utility.py
+++ b/AsmParser/onnx_utility.py
@@ -96,6 +96,58 @@ def __init__(self, inst_list=list(), cycle=0, overlapping_cycle=0, compute_type=
         self.torchsim_overlapping_cycle = overlapping_cycle
         self.torchsim_compute_type = compute_type
 
+class stonne_node(node):  # graph node that carries the parameters of one STONNE operation as torchsim_stonne_* attributes
+    def __init__(self, tile_info, node_id=0):  # tile_info: mapping read only through .get(); missing keys fall back to the defaults below
+        super().__init__(node_id)
+        self.torchsim_stonne_operation = tile_info.get("stonne_operation", "CONV")  # operation name, "CONV" when absent
+        self.torchsim_stonne_layer_name = tile_info.get("stonne_layer_name", "")
+        self.torchsim_stonne_mem_init = tile_info.get("stonne_mem_init", "")
+
+        # Convolution Parameters
+        self.torchsim_stonne_R = tile_info.get("stonne_R", 1)
+        self.torchsim_stonne_S = tile_info.get("stonne_S", 1)
+        self.torchsim_stonne_C = tile_info.get("stonne_C", 1)
+        self.torchsim_stonne_K = tile_info.get("stonne_K", 1)
+        self.torchsim_stonne_G = tile_info.get("stonne_G", 1)
+        self.torchsim_stonne_N = tile_info.get("stonne_N", 1)
+        self.torchsim_stonne_X = tile_info.get("stonne_X", 1)
+        self.torchsim_stonne_Y = tile_info.get("stonne_Y", 1)
+        self.torchsim_stonne_X_ = tile_info.get("stonne_X_", 1)
+        self.torchsim_stonne_Y_ = tile_info.get("stonne_Y_", 1)
+        self.torchsim_stonne_strides = tile_info.get("stonne_strides", 1)
+
+        # Convolution Tile Parameters
+        self.torchsim_stonne_T_R = tile_info.get("stonne_T_R", 1)
+        self.torchsim_stonne_T_S = tile_info.get("stonne_T_S", 1)
+        self.torchsim_stonne_T_C = tile_info.get("stonne_T_C", 1)
+        self.torchsim_stonne_T_K = tile_info.get("stonne_T_K", 1)
+        self.torchsim_stonne_T_G = tile_info.get("stonne_T_G", 1)
+        self.torchsim_stonne_T_N = tile_info.get("stonne_T_N", 1)
+        self.torchsim_stonne_T_X_ = tile_info.get("stonne_T_X_", 1)
+        self.torchsim_stonne_T_Y_ = tile_info.get("stonne_T_Y_", 1)
+
+        # GEMM Parameters
+        self.torchsim_stonne_GEMM_K = tile_info.get("stonne_GEMM_K", 1)
+        self.torchsim_stonne_GEMM_N = tile_info.get("stonne_GEMM_N", 1)
+        self.torchsim_stonne_GEMM_M = tile_info.get("stonne_GEMM_M", 1)
+        self.torchsim_stonne_GEMM_T_K = tile_info.get("stonne_GEMM_T_K", 1)
+        self.torchsim_stonne_GEMM_T_N = tile_info.get("stonne_GEMM_T_N", 1)
+        self.torchsim_stonne_GEMM_T_M = tile_info.get("stonne_GEMM_T_M", 1)
+
+        # Memory Addresses
+        self.torchsim_stonne_matrix_a_dram_address = tile_info.get("stonne_matrix_a_dram_address", 0)
+        self.torchsim_stonne_matrix_b_dram_address = tile_info.get("stonne_matrix_b_dram_address", 0)
+        self.torchsim_stonne_matrix_c_dram_address = tile_info.get("stonne_matrix_c_dram_address", 0)
+        self.torchsim_stonne_mem_matrix_c_file_name = tile_info.get("stonne_mem_matrix_c_file_name", "")
+
+        # Bitmap and CSR Data
+        self.torchsim_stonne_bitmap_matrix_a_init = tile_info.get("stonne_bitmap_matrix_a_init", "")
+        self.torchsim_stonne_bitmap_matrix_b_init = tile_info.get("stonne_bitmap_matrix_b_init", "")
+        self.torchsim_stonne_rowpointer_matrix_a_init = tile_info.get("stonne_rowpointer_matrix_a_init", "")
+        self.torchsim_stonne_colpointer_matrix_a_init = tile_info.get("stonne_colpointer_matrix_a_init", "")
+        self.torchsim_stonne_rowpointer_matrix_b_init = tile_info.get("stonne_rowpointer_matrix_b_init", "")
+        self.torchsim_stonne_colpointer_matrix_b_init = tile_info.get("stonne_colpointer_matrix_b_init", "")
+
 def connect_nodes(parent, child):
     child.add_parent(parent)
     parent.add_child(child)
diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 27fd4f92..6a665b7b 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -7,7 +7,7 @@
 if __name__ == "__main__":
-    from onnx_utility import node, loop_index_node, loop_end_node, load_node, store_node, memory_wait_node, compute_node, connect_nodes, dump_onnx_graph
+    from onnx_utility import node, loop_index_node, loop_end_node, load_node, store_node, memory_wait_node, compute_node, stonne_node, connect_nodes, dump_onnx_graph  # script mode also needs stonne_node: _create_node() instantiates it
 else:
-    from AsmParser.onnx_utility import node, loop_index_node, loop_end_node, load_node, store_node, memory_wait_node, compute_node, connect_nodes, dump_onnx_graph
+    from AsmParser.onnx_utility import node, loop_index_node, loop_end_node, load_node, store_node, memory_wait_node, compute_node, stonne_node, connect_nodes, dump_onnx_graph
 
 
 def import_module_from_path(module_name, path):
@@ -31,6 +31,7 @@ class tog_generator:
     LoopNodeKind = 2
     DMANodeKind = 3
     DMAWaitNodeKind = 4
+    StonneNodeKind = 5
     def __init__(self, origins="Unknown") -> None:
         self.module_name = "tile_operation_graph"
         self.module = None
@@ -104,6 +105,8 @@ def _create_node(self, dump_data):
             tile_info["tag_divider_list"] = dump_data["tag_divider_list"]
             tile_info["base_addr"] = dump_data["base_address"]
             new_node = memory_wait_node(tile_info, node_id=node_id)
+        elif node_type == self.StonneNodeKind:
+            new_node = stonne_node(dump_data, node_id=node_id)
         else:
             print("Unexpected node_type :", node_type)
             exit(1)
diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index 2fb60343..0c9414cf 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit 2fb60343d6f75d5fd2d53d50a465ee9cbab24c0a
+Subproject commit 0c9414cf4311d551996ca69a8c8b1507aed81b11
diff --git a/PyTorchSimBackend/include/Tile.h b/PyTorchSimBackend/include/Tile.h
index 36da1f1e..d86e62fb 100644
--- a/PyTorchSimBackend/include/Tile.h
+++ b/PyTorchSimBackend/include/Tile.h
@@ -40,6 +40,10 @@ class Tile {
     _nr_finished_insts++;
   };
   bool all_insts_finshed() { return _nr_insts == _nr_finished_insts; }
+  void* get_custom_data() { return _custom_data; }  // opaque per-tile payload; the caller casts it back to the concrete type
+  void set_custom_data(void* custom_data ) { _custom_data = custom_data; }  // stores the raw pointer as given; nothing is copied here
+  void set_stonne_tile(bool stonne_tile) { _stonne_tile = stonne_tile; }  // flag checked by the cores' can_issue()
+  bool is_stonne_tile() { return _stonne_tile; }  // false unless set_stonne_tile(true) was called
   
  protected:
   std::shared_ptr<TileSubGraph> _onwer_graph;
@@ -50,6 +54,8 @@ class Tile {
   size_t _nr_finished_insts = 0;
   std::deque<std::shared_ptr<Instruction>> _instructions;
   std::vector<std::shared_ptr<Tile>> _child_tiles;
+  void *_custom_data = nullptr;  // opaque payload set via set_custom_data(); null until then so get_custom_data() never returns garbage
+  bool _stonne_tile=false;
 };
 
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index ebbc6aa3..a10a2063 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -6,6 +6,7 @@
 #include <google/protobuf/io/zero_copy_stream_impl.h>
 #include "TileGraph.h"
 #include "Instruction.h"
+#include "sstStonne.h"
 #include "onnx/defs/schema.h"
 #include "onnx/onnx-operators_pb.h"
 #include "onnx/onnx_pb.h"
@@ -18,7 +19,8 @@ enum class TileType{
   LOAD_NODE,
   STORE_NODE,
   COMPUTE_NODE,
-  MEMORY_WAIT_NODE
+  MEMORY_WAIT_NODE,
+  STONNE_NODE,
 };
 
 enum class LoopType {
@@ -153,8 +155,6 @@ class TileMemoryWaitNode : public TileNode {
   std::string _base_addr_name;
 };
 
-
-
 class TileLoopNode : public TileNode {
  public:
  TileLoopNode(onnx::NodeProto& node);
@@ -179,3 +179,116 @@ class TileLoopEndNode : public TileNode {
  public:
   TileLoopEndNode(onnx::NodeProto& node) : TileNode(node) {}
 };
+
+class TileStonneNode : public TileNode {  // tile-graph node holding a StonneOpDesc filled from torchsim_stonne_* ONNX attributes
+ public:
+  TileStonneNode(onnx::NodeProto& node) : TileNode(node) {  // throws std::runtime_error on an unknown operation name
+    for (const auto& attribute : node.attribute()) {  // by const reference: avoids copying each AttributeProto
+      if (attribute.name() == "torchsim_stonne_operation") {
+        std::string op_type = attribute.s();
+        if (op_type == "CONV") {
+            desc.operation = Layer_t::CONV;
+        } else if (op_type == "GEMM") {
+            desc.operation = Layer_t::GEMM;
+        } else if (op_type == "POOL") {
+            desc.operation = Layer_t::POOL;
+        } else if (op_type == "FC") {
+            desc.operation = Layer_t::FC;
+        } else if (op_type == "SPARSE_DENSE") {
+            desc.operation = Layer_t::SPARSE_DENSE;
+        } else if (op_type == "bitmapSpMSpM") {
+            desc.operation = Layer_t::bitmapSpMSpM;
+        } else if (op_type == "csrSpMM") {
+            desc.operation = Layer_t::csrSpMM;
+        } else if (op_type == "outerProductGEMM") {
+            desc.operation = Layer_t::outerProductGEMM;
+        } else if (op_type == "gustavsonsGEMM") {
+            desc.operation = Layer_t::gustavsonsGEMM;
+        } else {
+            spdlog::error("[TileStonneNode] Unknown operation type: {}", op_type);
+            throw std::runtime_error("Invalid operation type in TileStonneNode");
+        }
+      } else if (attribute.name() == "torchsim_stonne_layer_name") {
+          desc.layer_name = attribute.s();
+      } else if (attribute.name() == "torchsim_stonne_mem_init") {
+          desc.mem_init = attribute.s();
+      } else if (attribute.name() == "torchsim_stonne_R") {
+          desc.R = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_S") {
+          desc.S = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_C") {
+          desc.C = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_K") {
+          desc.K = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_G") {
+          desc.G = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_N") {
+          desc.N = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_X") {
+          desc.X = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_Y") {
+          desc.Y = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_X_") {
+          desc.X_ = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_Y_") {
+          desc.Y_ = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_strides") {
+          desc.strides = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_T_R") {
+          desc.T_R = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_T_S") {
+          desc.T_S = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_T_C") {
+          desc.T_C = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_T_K") {
+          desc.T_K = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_T_G") {
+          desc.T_G = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_T_N") {
+          desc.T_N = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_T_X_") {
+          desc.T_X_ = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_T_Y_") {
+          desc.T_Y_ = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_GEMM_K") {
+          desc.GEMM_K = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_GEMM_N") {
+          desc.GEMM_N = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_GEMM_M") {
+          desc.GEMM_M = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_GEMM_T_K") {
+          desc.GEMM_T_K = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_GEMM_T_N") {
+          desc.GEMM_T_N = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_GEMM_T_M") {
+          desc.GEMM_T_M = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_matrix_a_dram_address") {
+          desc.matrix_a_dram_address = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_matrix_b_dram_address") {
+          desc.matrix_b_dram_address = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_matrix_c_dram_address") {
+          desc.matrix_c_dram_address = attribute.i();
+      } else if (attribute.name() == "torchsim_stonne_mem_matrix_c_file_name") {
+          desc.mem_matrix_c_file_name = attribute.s();
+      } else if (attribute.name() == "torchsim_stonne_bitmap_matrix_a_init") {
+          desc.bitmap_matrix_a_init = attribute.s();
+      } else if (attribute.name() == "torchsim_stonne_bitmap_matrix_b_init") {
+          desc.bitmap_matrix_b_init = attribute.s();
+      } else if (attribute.name() == "torchsim_stonne_rowpointer_matrix_a_init") {
+          desc.rowpointer_matrix_a_init = attribute.s();
+      } else if (attribute.name() == "torchsim_stonne_colpointer_matrix_a_init") {
+          desc.colpointer_matrix_a_init = attribute.s();
+      } else if (attribute.name() == "torchsim_stonne_rowpointer_matrix_b_init") {
+          desc.rowpointer_matrix_b_init = attribute.s();
+      } else if (attribute.name() == "torchsim_stonne_colpointer_matrix_b_init") {
+          desc.colpointer_matrix_b_init = attribute.s();
+      } else {
+          spdlog::warn("[TileStonneNode] Unrecognized attribute: {}", attribute.name());  // unknown attributes are logged and skipped, not fatal
+      }
+    }
+  }
+  SST_STONNE::StonneOpDesc* getDesc() { return &desc; }  // pointer to the member below; valid only while this node is alive
+  void print_node() override;
+ private:
+  SST_STONNE::StonneOpDesc desc;
+};
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index d8fbf466..245d6618 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -20,7 +20,7 @@ Core::Core(uint32_t id, SimulationConfig config)
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
   /* Check SRAM is enough to run tile */
-  return op->get_required_sram_size() + _used_sram_size <= _sram_size && _tiles.size() < 2;
+  return op->get_required_sram_size() + _used_sram_size <= _sram_size && _tiles.size() < 2 && !op->is_stonne_tile();  // tiles flagged as stonne are rejected here; SparseCore::can_issue() is the one that accepts them
 }
 
 void Core::issue(std::shared_ptr<Tile> op) {
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 382f4ea1..4e9126e3 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -2,38 +2,24 @@
 
 SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) {
   stonneCore = new SST_STONNE::sstStonne(config.stonne_config_path);
-
-  // Dummy instruction
-  SST_STONNE::StonneOpDesc opDesc;
-  opDesc.operation = Layer_t::outerProductGEMM;
-  opDesc.GEMM_K = 512;
-  opDesc.GEMM_N = 64;
-  opDesc.GEMM_M = 64;
-  opDesc.GEMM_T_K = 4;
-  opDesc.GEMM_T_N = 1;
-  opDesc.mem_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini";
-  opDesc.mem_matrix_c_file_name = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/result.out";
-  opDesc.matrix_a_dram_address = 0;
-  opDesc.matrix_b_dram_address = 12444;
-  opDesc.matrix_c_dram_address = 24608;
-  opDesc.rowpointer_matrix_a_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in";
-  opDesc.colpointer_matrix_a_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in";
-  opDesc.rowpointer_matrix_b_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in";
-  opDesc.colpointer_matrix_b_init = "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in";
-  stonneCore->setup(opDesc);
   stonneCore->init(1);
 };
 
 SparseCore::~SparseCore() { delete stonneCore; }
 
 bool SparseCore::running() {
-  return !_request_queue.empty() || !_response_queue.empty() || !stonneCore->isFinished();
+  return !(_request_queue.empty() && _response_queue.empty() && _tiles.empty());  // busy while any queue or the tile list is non-empty
 }
 
-void SparseCore::issue(std::shared_ptr<Tile> tile) {};
+void SparseCore::issue(std::shared_ptr<Tile> tile) {  // configure the STONNE core from the tile's descriptor and start tracking the tile
+  SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(tile->get_custom_data());  // NOTE(review): no null/type check; assumes a stonne tile whose custom data is a StonneOpDesc (see can_issue) — confirm
+  stonneCore->setup(*opDesc);
+  stonneCore->init(1);
+  _tiles.push_back(tile);  // running() reports busy while _tiles is non-empty
+};
 
 bool SparseCore::can_issue(const std::shared_ptr<Tile>& op) {
-  return !running();
+  return !running() && op->is_stonne_tile();  // accept only when idle, and only tiles flagged as stonne tiles
 }
 
 void SparseCore::cycle() {
@@ -60,6 +46,7 @@ void SparseCore::cycle() {
     _request_queue.push(req_wrapper);
   }
 
+  /* Send Memory Response */
   while (!_response_queue.empty()) {
     mem_fetch* resp_wrapper = _response_queue.front();
     SimpleMem::Request* resp = static_cast<SimpleMem::Request*>(resp_wrapper->get_custom_data());
@@ -68,6 +55,13 @@ void SparseCore::cycle() {
     _response_queue.pop();
     delete resp_wrapper;
   }
+
+  if (!_tiles.empty() && stonneCore->isFinished()) {  // retire the in-flight tile; the empty() guard avoids front() on an empty container
+    std::shared_ptr<Tile> target_tile = _tiles.front();
+    target_tile->set_status(Tile::Status::FINISH);
+    _finished_tiles.push(target_tile);
+    _tiles.erase(_tiles.begin());
+  }
 }
 
 bool SparseCore::has_memory_request() {
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index bb8a65bc..a5922fa1 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -128,6 +128,8 @@ TileType TileNode::get_tile_type(std::string type) {
     return TileType::COMPUTE_NODE;
   else if (type == "memory_wait_node")
     return TileType::MEMORY_WAIT_NODE;
+  else if (type == "stonne_node")
+    return TileType::STONNE_NODE;
   spdlog::error("[TileGraphParser] Invalid node type...");
   exit(EXIT_FAILURE);
 }
@@ -226,6 +228,41 @@ TileMemoryWaitNode::TileMemoryWaitNode(onnx::NodeProto& node) : TileNode(node) {
   }
 }
 
+void TileStonneNode::print_node() {  // debug-log the base node info, then every field of the STONNE descriptor
+  TileNode::print_node();
+  std::string spaces(get_depth(), '\t');  // one tab per depth level, used as the prefix of each log line
+
+  spdlog::debug("{} operation: {}", spaces, static_cast<int>(desc.operation));  // operation is logged as its integer enum value
+  spdlog::debug("{} layer_name: {}", spaces, desc.layer_name);
+  spdlog::debug("{} mem_init: {}", spaces, desc.mem_init);
+
+  // Convolution Parameters
+  spdlog::debug("{} R: {}, S: {}, C: {}, K: {}, G: {}, N: {}", spaces, desc.R, desc.S, desc.C, desc.K, desc.G, desc.N);
+  spdlog::debug("{} X: {}, Y: {}, X_: {}, Y_: {}, strides: {}", spaces, desc.X, desc.Y, desc.X_, desc.Y_, desc.strides);
+
+  // Convolution Tile Parameters
+  spdlog::debug("{} T_R: {}, T_S: {}, T_C: {}, T_K: {}, T_G: {}, T_N: {}", spaces, desc.T_R, desc.T_S, desc.T_C, desc.T_K, desc.T_G, desc.T_N);
+  spdlog::debug("{} T_X_: {}, T_Y_: {}", spaces, desc.T_X_, desc.T_Y_);
+
+  // GEMM Parameters
+  spdlog::debug("{} GEMM_K: {}, GEMM_N: {}, GEMM_M: {}", spaces, desc.GEMM_K, desc.GEMM_N, desc.GEMM_M);
+  spdlog::debug("{} GEMM_T_K: {}, GEMM_T_N: {}, GEMM_T_M: {}", spaces, desc.GEMM_T_K, desc.GEMM_T_N, desc.GEMM_T_M);
+
+  // Memory Addresses
+  spdlog::debug("{} matrix_a_dram_address: {}", spaces, desc.matrix_a_dram_address);
+  spdlog::debug("{} matrix_b_dram_address: {}", spaces, desc.matrix_b_dram_address);
+  spdlog::debug("{} matrix_c_dram_address: {}", spaces, desc.matrix_c_dram_address);
+  spdlog::debug("{} mem_matrix_c_file_name: {}", spaces, desc.mem_matrix_c_file_name);
+
+  // Bitmap and CSR Data
+  spdlog::debug("{} bitmap_matrix_a_init: {}", spaces, desc.bitmap_matrix_a_init);
+  spdlog::debug("{} bitmap_matrix_b_init: {}", spaces, desc.bitmap_matrix_b_init);
+  spdlog::debug("{} rowpointer_matrix_a_init: {}", spaces, desc.rowpointer_matrix_a_init);
+  spdlog::debug("{} colpointer_matrix_a_init: {}", spaces, desc.colpointer_matrix_a_init);
+  spdlog::debug("{} rowpointer_matrix_b_init: {}", spaces, desc.rowpointer_matrix_b_init);
+  spdlog::debug("{} colpointer_matrix_b_init: {}", spaces, desc.colpointer_matrix_b_init);
+}
+
 void TileMemoryWaitNode::print_node() {
   TileNode::print_node();
   std::string spaces(get_depth(), '\t');
@@ -531,6 +568,26 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       parent->append_child(child);
       /* Create new tile */
       tile_vec.push_back(child);
+    } else if (tile_node->get_type() == TileType::STONNE_NODE) {
+        printIndexMap("[TOGParser] Stonne Node ", iter);
+        std::shared_ptr<TileStonneNode> stonne_node = std::static_pointer_cast<TileStonneNode>(tile_node);
+        /* Index/tag lists are left empty; they are only passed to the dummy instruction below */
+        std::vector<int> iter_list;
+        std::vector<int> tag_list;
+        std::vector<int> tag_stride_list;
+        std::vector<int> accum_tag_list;
+
+        /* Put dummy computation instruction */
+        std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
+          Opcode::COMP, 0,
+          0, 0,
+          std::vector<size_t>(), 0, iter_list,
+          iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
+        );
+        link_map[tile_node] = inst;
+        tile_vec.back()->append_instuction(inst);  // NOTE(review): assumes tile_vec already holds a tile at this point — confirm
+        tile_vec.back()->set_custom_data(stonne_node->getDesc());  // raw pointer into the TileStonneNode; the node must outlive the tile — TODO confirm
+        tile_vec.back()->set_stonne_tile(true);  // flag read by Core/SparseCore can_issue()
     }
   }
 
@@ -642,6 +699,10 @@ TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
       std::shared_ptr<TileMemoryWaitNode> tile_node = std::make_shared<TileMemoryWaitNode>(node_proto);
       /* Register output */
       register_tile(tile_node);
+    } else if (type == TileType::STONNE_NODE) {
+      std::shared_ptr<TileStonneNode> tile_node = std::make_shared<TileStonneNode>(node_proto);
+      /* Register output */
+      register_tile(tile_node);
     }
   }
 

From 20471e753d8aea8efa79537240e22a838c9143ce Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 16 Feb 2025 08:06:24 +0000
Subject: [PATCH 112/432] [Backend/stonne] fix stat printing

---
 PyTorchSimBackend/extern/stonneCore |  2 +-
 PyTorchSimBackend/src/Simulator.cc  | 34 ++++++++++++++---------------
 PyTorchSimBackend/src/SparseCore.cc |  3 ---
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index 0c9414cf..a19b780b 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit 0c9414cf4311d551996ca69a8c8b1507aed81b11
+Subproject commit a19b780bd942812d30f96720ec03ebefb9a8fd03
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 724a8bd4..77a721c1 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -3,9 +3,6 @@
 Simulator::Simulator(SimulationConfig config)
     : _config(config), _core_cycles(0) {
   // Create dram object
-  for (int i=0; i<config.num_cores;i++)
-    spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB",
-      i, config.core_freq , config.sram_size);
   _core_period = 1000000 / (config.core_freq);
   _icnt_period = 1000000 / (config.icnt_freq);
   _dram_period = 1000000 / (config.dram_freq);
@@ -21,6 +18,22 @@ Simulator::Simulator(SimulationConfig config)
   char* onnxim_path_env = std::getenv("TORCHSIM_DIR");
   std::string onnxim_path = onnxim_path_env != NULL?
     std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./");
+
+  // Create core objects
+  _cores.resize(_n_cores);
+  for (int core_index = 0; core_index < _n_cores; core_index++) {
+    if (config.core_type == CoreType::WS_MESH) {
+      spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB", core_index, config.core_freq , config.sram_size);
+      _cores.at(core_index) = std::make_unique<Core>(core_index, _config);
+    } else if (config.core_type == CoreType::STONNE) {
+      spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq);
+      _cores.at(core_index) = std::make_unique<SparseCore>(core_index, _config);
+    } else {
+      spdlog::error("[Configuration] Invalid core type...!");
+      exit(EXIT_FAILURE);
+    }
+  }
+
   if (config.dram_type == DramType::RAMULATOR2) {
     std::string ramulator_config = fs::path(onnxim_path)
                                        .append("configs")
@@ -47,21 +60,6 @@ Simulator::Simulator(SimulationConfig config)
   }
   _icnt_interval = config.icnt_print_interval;
 
-  // Create core objects
-  _cores.resize(_n_cores);
-  for (int core_index = 0; core_index < _n_cores; core_index++) {
-    if (config.core_type == CoreType::WS_MESH) {
-      _cores.at(core_index) = std::make_unique<Core>(core_index, _config);
-    } else if (config.core_type == CoreType::STONNE) {
-      _cores.at(core_index) = std::make_unique<SparseCore>(core_index, _config);
-    } else {
-      spdlog::error("[Configuration] Invalid core type...!");
-      exit(EXIT_FAILURE);
-    }
-  }
-
-
-
   // Initialize Scheduler
   for (int i=0; i<config.num_patition;i++)
     _partition_scheduler.push_back(std::make_unique<Scheduler>(Scheduler(config, &_core_cycles, &_core_time, i)));
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 4e9126e3..62e62006 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -78,11 +78,8 @@ void SparseCore::push_memory_response(mem_fetch* response) {
 
 void SparseCore::print_stats() {
   stonneCore->printStats();
-  std::cout << "Pending Requests: " << _request_queue.size() << std::endl;
-  std::cout << "Pending Responses: " << _response_queue.size() << std::endl;
 }
 
 void SparseCore::print_current_stats() {
-  std::cout << "Current SparseCore Status:" << std::endl;
   print_stats();
 }
\ No newline at end of file

From 8f1d36e452109a4a2dd79b8a22a35cac81ee8eeb Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 16 Feb 2025 08:37:01 +0000
Subject: [PATCH 113/432] [Backend/Stonne] Log configuration file info

---
 PyTorchSimBackend/extern/stonneCore    |  2 +-
 PyTorchSimBackend/include/SparseCore.h |  1 +
 PyTorchSimBackend/src/SparseCore.cc    | 17 +++++++++++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index a19b780b..c0105e3a 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit a19b780bd942812d30f96720ec03ebefb9a8fd03
+Subproject commit c0105e3a5961f4730d31fbd01dcb3caebc94c06f
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index a9f58086..c446331f 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -1,6 +1,7 @@
 #include "Core.h"
 #include "sstStonne.h"
 #include "SimpleMem.h"
+#include "Config.h"
 
 class SparseCore : public Core {
 public:
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 62e62006..41bd734d 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -3,6 +3,23 @@
 SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) {
   stonneCore = new SST_STONNE::sstStonne(config.stonne_config_path);
   stonneCore->init(1);
+  Config stonneConfig = stonneCore->getStonneConfig();  // values below are read only to log a configuration summary
+  unsigned int core_freq = config.core_freq; // MHz
+  unsigned int num_ms = stonneConfig.m_MSNetworkCfg.ms_size;
+  unsigned int dn_bw = stonneConfig.m_SDMemoryCfg.n_read_ports;
+  unsigned int dn_width = stonneConfig.m_SDMemoryCfg.port_width;
+  unsigned int rn_bw = stonneConfig.m_SDMemoryCfg.n_write_ports;
+  unsigned int rn_width = stonneConfig.m_SDMemoryCfg.port_width;  // same port_width field as the distribution network
+
+  double compute_throughput = static_cast<double>(num_ms) * core_freq / 1e3; // GFLOPs/sec (ms_size * MHz / 1e3); assumes one op per multiplier per cycle — TODO confirm
+  double dn_bandwidth = static_cast<double>(dn_bw) * dn_width * core_freq * 1e6 / 8.0 / 1e9; // GB/s (ports * bits * Hz / 8 / 1e9)
+  double rn_bandwidth = static_cast<double>(rn_bw) * rn_width * core_freq * 1e6 / 8.0 / 1e9; // GB/s (ports * bits * Hz / 8 / 1e9)
+
+  spdlog::info("[Config/StonneCore {}] Compute Throughput: {:.2f} GFLOPs/sec", id, compute_throughput);
+  spdlog::info("[Config/StonneCore {}] Distribution Network Bandwidth: {:.2f} GB/s ({} ports x {} bits)",
+             id, dn_bandwidth, dn_bw, dn_width);
+  spdlog::info("[Config/StonneCore {}] Reduction Network Bandwidth: {:.2f} GB/s ({} ports x {} bits)",
+             id, rn_bandwidth, rn_bw, rn_width);
 };
 
 SparseCore::~SparseCore() { delete stonneCore; }

From c916a66fe64fd6ba934b8715b0152eed1f964554 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sun, 16 Feb 2025 07:32:46 +0000
Subject: [PATCH 114/432] [Frontend] Single Batch CONV Template

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 148 ++++++++++++++++--
 1 file changed, 139 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 33d7619f..bae366ba 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -73,8 +73,6 @@
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-  %stride_h = arith.constant {{ STRIDE_H }} : index
-  %stride_w = arith.constant {{ STRIDE_W }} : index
 
   affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
     affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
@@ -167,8 +165,6 @@
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-  %stride_h = arith.constant {{ STRIDE_H }} : index
-  %stride_w = arith.constant {{ STRIDE_W }} : index
 
   affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
     affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
@@ -227,6 +223,118 @@
 }
 """
 
+SINGLE_BATCH_CONV_TEMPLATE = r"""
+// Single Batch Conv2D kernel
+// BATCH = {{ BATCH }}
+// I_C = {{ I_C }}
+// I_H = {{ I_H }}
+// I_W = {{ I_W }}
+// O_C = {{ O_C }}
+// K_H = {{ K_H }}
+// K_W = {{ K_W }}
+// O_H = {{ O_H }}
+// O_W = {{ O_W }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// PADDING_H = {{ PADDING_H }}
+// PADDING_W = {{ PADDING_W }}
+// STRIDE_H = {{ STRIDE_H }}
+// STRIDE_W = {{ STRIDE_W }}
+// DILATION_H = {{ DILATION_H }}
+// DILATION_W = {{ DILATION_W }}
+// DATA_STYPE = {{ DATA_STYPE }}
+// DATA_SIZE = {{ DATA_SIZE }}
+
+#map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * O_H * O_C }} + d1 * {{ O_W * O_C }} + d2 * {{ O_C }} + d3)> // output (BATCH, O_H, O_W, O_C)
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * (I_H + 2 * PADDING_W) * I_C }} + d1 * {{ (I_W + 2 * PADDING_W) * I_C }} + d2 * {{ I_C }} + d3)> // input (BATCH, I_H, I_W, I_C) Stride should be changed if kernel stride > 1
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
+#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
+#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
+#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W, TILE_K) }} + d1)>
+#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
+
+memref.global @X_spad : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+
+func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
+  %c_mvin = arith.constant 2 : index
+  %c_mvin2 = arith.constant 1 : index
+  %c_mvin3 = arith.constant 14 : index
+  %c_mvout = arith.constant 3 : index
+  %vstride = arith.constant 1 : index
+  %input_axis = arith.constant 3 : index
+  %weight_axis = arith.constant 2 : index
+  %input_buffer = memref.get_global @X_spad : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>
+  %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+  %output_buffer = memref.get_global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+  %tag = memref.alloc() : memref<1xi32>
+  %tag0 = memref.alloc() : memref<1xi32>
+  %tag1 = memref.alloc() : memref<1xi32>
+  %tag2 = memref.alloc() : memref<1xi32>
+  %tag3 = memref.alloc() : memref<1xi32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+
+  affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
+    affine.for %tile_m = 0 to {{ O_W }} step {{ TILE_M }} {
+      affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
+        %index0 = affine.apply #map0(%c0, %o_h, %tile_m, %tile_n)
+        // Initialize output
+        {%- if BIAS %}
+        memref.dma_start %Bias[%tile_n], %output_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride // not implemented yet
+            : memref<{{ O_C }}xf32>, memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ TILE_O_H }}, {{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_O_H * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+        {%- else %}
+        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+        {%- endif %}
+        affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
+          affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
+            affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
+              %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
+              %index1 = affine.apply #map1(%c0, %index_i_h, %k_w, %tile_k) // input index
+              %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
+              // Load input matrix
+              memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
+                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ SUB_TILE_I_H }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_H * TILE_I_W * TILE_K }}, {{ TILE_I_W * TILE_K }}, 1, {{ TILE_I_W }}]}
+              // Load kernel matrix
+              memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
+                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+              affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
+                affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                  affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
+                    affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
+                      %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
+                      %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
+                      %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                      %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                    } { inner_loop=true }
+                  } { inner_loop=true }
+                } { inner_loop=true }
+              } { inner_loop=true }
+            } { accumulation_loop=true }
+          } { accumulation_loop=true }
+        } { accumulation_loop=true }
+        // Store output matrix
+        memref.dma_start %output_buffer[%c0, %c0, %c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
+            : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+      } { outer_loop=true }
+    } { outer_loop=true }
+  } { outer_loop=true }
+  return
+}
+"""
+
 WRAPPER_TEMPLATE = r"""
 def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, {{ OUT }}):
     # Padding input
@@ -239,16 +347,25 @@ def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif
     # Tanspose tensors
     {%- if MULTI_TILE %}
     t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 0, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, BATCH, I_W, I_C)
-    {% else %}
+    {% elif SINGLE_BATCH %}
+    t_{{ INPUT }} = {{ INPUT }}_padding.permute(0, 2, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (BATCH, I_H, I_W, I_C)
+    {% else -%}
     t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
     {% endif -%}
     t_{{ WEIGHT }} = {{ WEIGHT }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
+    {%- if SINGLE_BATCH %}
+    t_{{ OUT }} = {{ OUT }}.permute(0, 2, 3, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
+    {% else -%}
     t_{{ OUT }} = {{ OUT }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
-
+    {% endif -%}
     {{ KERNEL_NAME }}(t_{{ INPUT }}, t_{{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, t_{{ OUT }})
 
     # Transpose back
+    {%- if SINGLE_BATCH %}
+    {{ OUT }}.copy_(t_{{ OUT }}.permute(0, 3, 1, 2).contiguous()) # (BATCH, O_H, O_W, O_C) -> (BATCH, O_C, O_H, O_W)
+    {% else -%}
     {{ OUT }}.copy_(t_{{ OUT }}.permute(2, 3, 0, 1).contiguous()) # (O_H, O_W, BATCH, O_C) -> (BATCH, O_C, O_H, O_W)
+    {% endif -%}
 """
 
 class MLIRConvTemplate(MLIRTemplate):
@@ -258,6 +375,7 @@ def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
         self.padding = kwargs["padding"]
         self.dilation = kwargs["dilation"]
         self.weight_shape = [str(i) for i in input_nodes[1].layout.size]
+        self.input_shape = [i for i in input_nodes[0].layout.size]
         self.function_name = "Conv2D_" + "_".join(self.weight_shape)+ "_" \
             + "_".join([str(i) for i in self.stride]) \
             + "_" + "_".join([str(i) for i in self.padding]) \
@@ -277,6 +395,9 @@ def is_multi_tile(self, I_C):
         return False
         return I_C < 16 # 16 is hard-coded for now. This should be changed to a better heuristic.
 
+    def is_single_batch(self, BATCH):
+        return BATCH == 1  # True only for batch size 1; render() and outer_func_render() use this to pick the single-batch conv kernel/layout
+
     # Can use math.multi ?
     def def_kernel(self) ->str:
         X, W = self.input_nodes[0], self.input_nodes[1]
@@ -335,6 +456,14 @@ def render(self,
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation)
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
+        elif self.is_single_batch(BATCH):
+          conv_template = SINGLE_BATCH_CONV_TEMPLATE
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
+          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
+          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
+          SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
+          SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+          BATCH = O_W # For TOG latency (heuristic)
 
         kernel.loop_size = [K_H, K_W, O_H, O_W, BATCH, O_C, I_C]
 
@@ -346,8 +475,8 @@ def render(self,
             KERNEL_NAME=self.name,
             KERNEL_DEF=self.def_kernel(),
             kernel=kernel,
-            BATCH=BATCH,
-            I_C=I_C,
+            BATCH=X.layout.size[0],
+            I_C=X.layout.size[1],
             I_H=X.layout.size[2],
             I_W=X.layout.size[3],
             O_C=O_C,
@@ -404,7 +533,8 @@ def outer_func_render(self, kernel_name, input_args):
             OUT=input_args[3] if len(input_args) == 4 else input_args[2],
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
-            MULTI_TILE=self.is_multi_tile(int(self.weight_shape[1])),
+            MULTI_TILE=self.is_multi_tile(self.input_shape[1]),
+            SINGLE_BATCH=self.is_single_batch(self.input_shape[0]),
             VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
             BACKENDSIM_EAGER_MODE=extension_config.CONFIG_BACKENDSIM_EAGER_MODE,
             HASH_VALUE=self.hash_value

From 44fbe70fec58d1014ebd399955a1c57220da3351 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sun, 16 Feb 2025 11:14:57 +0000
Subject: [PATCH 115/432] [Frontend] Single CONV stride=2

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 23 +++++++-------
 PyTorchSimFrontend/mlir/mlir_template.py      | 30 +++++++++++++++++++
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index bae366ba..023696eb 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -250,14 +250,14 @@
 // DATA_SIZE = {{ DATA_SIZE }}
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * O_H * O_C }} + d1 * {{ O_W * O_C }} + d2 * {{ O_C }} + d3)> // output (BATCH, O_H, O_W, O_C)
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * (I_H + 2 * PADDING_W) * I_C }} + d1 * {{ (I_W + 2 * PADDING_W) * I_C }} + d2 * {{ I_C }} + d3)> // input (BATCH, I_H, I_W, I_C) Stride should be changed if kernel stride > 1
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * I_C }} + d1 * {{ I_C }} + d2 * {{ I_C * STRIDE_W }} + d3)> // input (I_H, (k_w), I_W, I_C) // duplicate for k_w
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
 #map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
 #offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
-#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W, TILE_K) }} + d1)>
+#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M * TILE_K_W, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
 #offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
 
-memref.global @X_spad : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>
+memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 
@@ -269,7 +269,7 @@
   %vstride = arith.constant 1 : index
   %input_axis = arith.constant 3 : index
   %weight_axis = arith.constant 2 : index
-  %input_buffer = memref.get_global @X_spad : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>
+  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
   %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %output_buffer = memref.get_global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
@@ -297,11 +297,11 @@
           affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
             affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
               %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
-              %index1 = affine.apply #map1(%c0, %index_i_h, %k_w, %tile_k) // input index
+              %index1 = affine.apply #map1(%index_i_h, %k_w, %c0, %tile_k) // input index
               %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
               // Load input matrix
               memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ SUB_TILE_I_H }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_H * TILE_I_W * TILE_K }}, {{ TILE_I_W * TILE_K }}, 1, {{ TILE_I_W }}]}
+                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_K_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
               // Load kernel matrix
               memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
                   : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
@@ -313,7 +313,7 @@
                       %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
                       %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
                       %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
                       %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                       %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                       linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
@@ -458,12 +458,13 @@ def render(self,
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
         elif self.is_single_batch(BATCH):
           conv_template = SINGLE_BATCH_CONV_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
-          SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
+          TILE_I_W = TILE_K_W
+          TILE_O_W = 1
+          SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-          BATCH = O_W # For TOG latency (heuristic)
+          BATCH = TILE_M # For TOG latency (heuristic)
 
         kernel.loop_size = [K_H, K_W, O_H, O_W, BATCH, O_C, I_C]
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 9c45d6be..a2310d8d 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -207,6 +207,36 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation)
             raise RuntimeError("Cannot find a valid mapping")
         return mapping
 
+    def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation):  # Search tile sizes for the single-batch conv kernel that fit the scratchpad budget; raises RuntimeError if none fits
+        spad_size_per_lane = self.spad_info["spad_size"]  # per-lane scratchpad capacity (presumably bytes — confirm against spad_info)
+        spad_size = spad_size_per_lane * self.vector_lane  # capacity summed over all vector lanes
+        max_spad_size = spad_size // 2  # NOTE(review): only half is budgeted, presumably for double buffering — confirm
+        max_spad_per_lane = spad_size_per_lane // 2  # same half budget, applied per lane
+
+        max_used_spad_size = 0  # footprint of the best candidate so far; 0 means nothing accepted yet
+        M, N, K = self.gemm_combination_mapping(O_W, N, K)  # GEMM tile sizes with O_W as the M dimension; the incoming M argument is discarded here
+        max_k_h_w = 1  # k_h * k_w of the most recently accepted candidate
+        for o_h in sympy.divisors(O_H):  # candidate tile extents are the divisors of O_H, K_H and K_W
+            for k_h in sympy.divisors(K_H):
+                for k_w in sympy.divisors(K_W):
+                    i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]  # input-tile height implied by o_h output rows and k_h kernel rows
+                    i_w = 1 + (M - 1) * stride[1] + (k_w - 1) * dilation[1]  # input-tile width implied by M output columns and k_w kernel columns
+                    weight_size = k_w * k_h * K * N  # element count of the weight tile
+                    input_size = i_w * i_h * k_w * K  # element count budgeted for the input tile (note the extra k_w factor on top of i_w)
+                    output_size = M * o_h * N  # element count of the output tile
+                    used_spad_size = (weight_size + input_size + output_size) * self.precision  # total footprint; precision is presumably bytes per element — confirm
+                    weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
+                    input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * k_w, K)
+                    output_size_per_lane = self.get_spad_size_per_lane(M * o_h, N)
+                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision  # per-lane footprint in the same units
+                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w:  # strictly under both budgets, strictly larger than the best so far, and k_h * k_w must not shrink
+                        max_used_spad_size = used_spad_size
+                        max_k_h_w = k_h * k_w
+                        mapping = (k_h, k_w, o_h, M, M, N, K)  # render() unpacks this as TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K
+        if max_used_spad_size == 0:  # no candidate was accepted, so `mapping` was never assigned
+            raise RuntimeError("Cannot find a valid mapping")
+        return mapping
+
     def meta_kernel(self):
         wrapper = V.graph.wrapper_code
         arg_attributes = self.kernel_arg_attributes

From 2978f60edf76dfb5e06ea96c6bef2e2771823465 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sun, 16 Feb 2025 14:39:52 +0000
Subject: [PATCH 116/432] [Frontend] CONV various template

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 159 ++++++++++++++++--
 1 file changed, 146 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 023696eb..a267c93f 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -248,6 +248,114 @@
 // DILATION_W = {{ DILATION_W }}
 // DATA_STYPE = {{ DATA_STYPE }}
 // DATA_SIZE = {{ DATA_SIZE }}
+#map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * O_H * O_C }} + d1 * {{ O_W * O_C }} + d2 * {{ O_C }} + d3)> // output (BATCH, O_H, O_W, O_C)
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * (I_H + 2 * PADDING_W) * I_C }} + d1 * {{ (I_W + 2 * PADDING_W) * I_C }} + d2 * {{ I_C }} + d3)> // input (BATCH, I_H, I_W, I_C) Stride should be changed if kernel stride > 1
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
+#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
+#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
+#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W, TILE_K) }} + d1)>
+#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
+memref.global @X_spad : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
+  %c_mvin = arith.constant 2 : index
+  %c_mvin2 = arith.constant 1 : index
+  %c_mvin3 = arith.constant 14 : index
+  %c_mvout = arith.constant 3 : index
+  %vstride = arith.constant 1 : index
+  %input_axis = arith.constant 3 : index
+  %weight_axis = arith.constant 2 : index
+  %input_buffer = memref.get_global @X_spad : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>
+  %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+  %output_buffer = memref.get_global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+  %tag = memref.alloc() : memref<1xi32>
+  %tag0 = memref.alloc() : memref<1xi32>
+  %tag1 = memref.alloc() : memref<1xi32>
+  %tag2 = memref.alloc() : memref<1xi32>
+  %tag3 = memref.alloc() : memref<1xi32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
+    affine.for %tile_m = 0 to {{ O_W }} step {{ TILE_M }} {
+      affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
+        %index0 = affine.apply #map0(%c0, %o_h, %tile_m, %tile_n)
+        // Initialize output
+        {%- if BIAS %}
+        memref.dma_start %Bias[%tile_n], %output_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride // not implemented yet
+            : memref<{{ O_C }}xf32>, memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ TILE_O_H }}, {{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_O_H * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+        {%- else %}
+        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+        {%- endif %}
+        affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
+          affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
+            affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
+              %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
+              %index1 = affine.apply #map1(%c0, %index_i_h, %k_w, %tile_k) // input index
+              %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
+              // Load input matrix
+              memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
+                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ SUB_TILE_I_H }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_H * TILE_I_W * TILE_K }}, {{ TILE_I_W * TILE_K }}, 1, {{ TILE_I_W }}]}
+              // Load kernel matrix
+              memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
+                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+              affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
+                affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                  affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
+                    affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
+                      %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
+                      %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
+                      %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                      %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                    } { inner_loop=true }
+                  } { inner_loop=true }
+                } { inner_loop=true }
+              } { inner_loop=true }
+            } { accumulation_loop=true }
+          } { accumulation_loop=true }
+        } { accumulation_loop=true }
+        // Store output matrix
+        memref.dma_start %output_buffer[%c0, %c0, %c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
+            : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+      } { outer_loop=true }
+    } { outer_loop=true }
+  } { outer_loop=true }
+  return
+}
+"""
+
+SINGLE_BATCH_CONV_STRIDE_TEMPLATE = r"""
+// Single Batch Conv2D (Stride != 1) kernel
+// BATCH = {{ BATCH }}
+// I_C = {{ I_C }}
+// I_H = {{ I_H }}
+// I_W = {{ I_W }}
+// O_C = {{ O_C }}
+// K_H = {{ K_H }}
+// K_W = {{ K_W }}
+// O_H = {{ O_H }}
+// O_W = {{ O_W }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// PADDING_H = {{ PADDING_H }}
+// PADDING_W = {{ PADDING_W }}
+// STRIDE_H = {{ STRIDE_H }}
+// STRIDE_W = {{ STRIDE_W }}
+// DILATION_H = {{ DILATION_H }}
+// DILATION_W = {{ DILATION_W }}
+// DATA_STYPE = {{ DATA_STYPE }}
+// DATA_SIZE = {{ DATA_SIZE }}
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * O_H * O_C }} + d1 * {{ O_W * O_C }} + d2 * {{ O_C }} + d3)> // output (BATCH, O_H, O_W, O_C)
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * I_C }} + d1 * {{ I_C }} + d2 * {{ I_C * STRIDE_W }} + d3)> // input (I_H, (k_w), I_W, I_C) // duplicate for k_w
@@ -349,13 +457,13 @@ def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif
     t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 0, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, BATCH, I_W, I_C)
     {% elif SINGLE_BATCH %}
     t_{{ INPUT }} = {{ INPUT }}_padding.permute(0, 2, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (BATCH, I_H, I_W, I_C)
-    {% else -%}
+    {% else %}
     t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
     {% endif -%}
     t_{{ WEIGHT }} = {{ WEIGHT }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
     {%- if SINGLE_BATCH %}
     t_{{ OUT }} = {{ OUT }}.permute(0, 2, 3, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
-    {% else -%}
+    {% else %}
     t_{{ OUT }} = {{ OUT }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
     {% endif -%}
     {{ KERNEL_NAME }}(t_{{ INPUT }}, t_{{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, t_{{ OUT }})
@@ -363,7 +471,7 @@ def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif
     # Transpose back
     {%- if SINGLE_BATCH %}
     {{ OUT }}.copy_(t_{{ OUT }}.permute(0, 3, 1, 2).contiguous()) # (BATCH, O_H, O_W, O_C) -> (BATCH, O_C, O_H, O_W)
-    {% else -%}
+    {% else %}
     {{ OUT }}.copy_(t_{{ OUT }}.permute(2, 3, 0, 1).contiguous()) # (O_H, O_W, BATCH, O_C) -> (BATCH, O_C, O_H, O_W)
     {% endif -%}
 """
@@ -392,7 +500,6 @@ def is_transposed(self, node):
         return False
 
     def is_multi_tile(self, I_C):
-        return False
         return I_C < 16 # 16 is hard-coded for now. This should be changed to a better heuristic.
 
     def is_single_batch(self, BATCH):
@@ -450,18 +557,44 @@ def render(self,
         TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
         TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
         SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1
+        x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
+        w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_K_H * TILE_K, TILE_N)
+        y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
+        x_spad_size = TILE_I_W * TILE_I_H * TILE_M * TILE_K
+        w_spad_size = TILE_K_W * TILE_K_H * TILE_K * TILE_N
+        y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
         conv_template = CONV_TEMPLATE
         if self.is_multi_tile(I_C):
           conv_template = MULTI_TILE_CONV_TEMPLATE
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation)
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-        elif self.is_single_batch(BATCH):
+          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
+          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_H * TILE_K, TILE_N)
+          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
+          x_spad_size = TILE_I_W * TILE_I_H * TILE_M * TILE_K
+          w_spad_size = TILE_K_H * TILE_K * TILE_N
+          y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
+        elif self.is_single_batch(BATCH) and self.stride[0] == 1:
           conv_template = SINGLE_BATCH_CONV_TEMPLATE
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
+          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
+          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
+          SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
+          SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+          BATCH = TILE_M # For TOG latency (heuristic)
+          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H, TILE_K)
+          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
+          x_spad_size = TILE_I_W * TILE_I_H * TILE_K
+          y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
+        elif self.is_single_batch(BATCH) and self.stride[0] != 1:
+          conv_template = SINGLE_BATCH_CONV_STRIDE_TEMPLATE
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-          TILE_I_W = TILE_K_W
-          TILE_O_W = 1
+          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_I_H * TILE_M, TILE_K)
+          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N)
+          x_spad_size = TILE_K_W * TILE_I_H * TILE_M * TILE_K
+          y_spad_size = TILE_O_H * TILE_M * TILE_N
           SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
           BATCH = TILE_M # For TOG latency (heuristic)
@@ -512,12 +645,12 @@ def render(self,
         )
         code = self._template_from_string(conv_template).render(**kernel.render_options)
 
-        self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K_W * TILE_K_H * TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header = f"float X_spad[{TILE_I_W * TILE_I_H * TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float W_spad[{TILE_K_W * TILE_K_H * TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float Y_spad[{TILE_O_H * TILE_O_W * TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
+        self.header = f"float X_spad[{x_spad_size_per_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float W_spad[{w_spad_size_per_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float Y_spad[{y_spad_size_per_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header = f"float X_spad[{x_spad_size}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float W_spad[{w_spad_size}] __attribute__ ((section(\".spad\")));\n"
+        self.gem5_header += f"float Y_spad[{y_spad_size}] __attribute__ ((section(\".spad\")));\n"
 
         kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
         kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=self.input_reorder)

From f792fc06a1d61c20df6ec26889810e68e42f5482 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Sun, 16 Feb 2025 12:06:38 +0000
Subject: [PATCH 117/432] [Frontend] Implement extern kernel call lowering path
 with small example

---
 PyTorchSimFrontend/extension_op.py              | 17 +++++++++++++++++
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py |  2 ++
 PyTorchSimFrontend/mlir/mlir_lowering.py        | 16 +++++++++++++++-
 3 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 PyTorchSimFrontend/extension_op.py

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
new file mode 100644
index 00000000..0503ccb9
--- /dev/null
+++ b/PyTorchSimFrontend/extension_op.py
@@ -0,0 +1,17 @@
+import torch
+from torch._inductor.select_algorithm import ExternKernelChoice
+
+class MLIRExternKernelChoice(ExternKernelChoice):
+    def call_name(self):
+        return f"torch.ops.extension_op.{self.name}"
+
+custom_lib = torch.library.Library("extension_op", "DEF")
+
+# FIXME: Custom op is defined in this file for example. Need refactoring
+def _sparse_mm(a, b, out):
+    print("PYTHON CUSTOM OP EXAMPLE")
+    out.copy_(a + b)
+
+custom_lib.define("_sparse_mm(Tensor a, Tensor b, Tensor out) -> Tensor")
+custom_lib.impl("_sparse_mm", _sparse_mm, "PrivateUse1")
+custom_lib.impl("_sparse_mm", _sparse_mm, "AutogradPrivateUse1")
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 5deefeef..e5bb66ae 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -93,6 +93,8 @@ def write_header(self):
                 from {extension_codecache.__name__} import CustomAsyncCompile
                 from torch._inductor.select_algorithm import extern_kernels
 
+                import PyTorchSimFrontend.extension_op
+
                 aten = torch.ops.aten
                 inductor_ops = torch.ops.inductor
                 assert_size_stride = torch._C._dynamo.guards.assert_size_stride
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index e7ca37eb..845e2f0b 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -3,15 +3,18 @@
 import torch
 from torch._inductor.lowering import lowerings
 from torch._inductor.kernel.mm_common import mm_args
+# from torch._inductor.select_algorithm import ExternKernelChoice
 from torch._inductor import ir
 from torch._inductor.virtualized import V
 from torch._inductor.ir import TensorBox
+from PyTorchSimFrontend.extension_op import MLIRExternKernelChoice
 from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
 from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
 from PyTorchSimFrontend.mlir.mlir_conv_template import MLIRConvTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
 
 aten = torch.ops.aten
+aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm")
 
 def tuned_mm(mat1, mat2, * ,layout=None):
     m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
@@ -141,7 +144,18 @@ def custom_maxpool(
     mlir_template = MLIRMaxPoolTemplate([x], layout, **kwargs)
     return mlir_template.generate().output_node(), x # FIXME: x is dummy IRNode, indices are not used in our case
 
+def sparse_addmm(*args, **kwargs):
+    print("Custom sparse addmm")
+    _, sp_mat1, sp_mat2 = args
+    mat1_layout = sp_mat1.layout
+    mat2_layout = sp_mat2.layout
+    layout = ir.FlexibleLayout(
+            device=mat1_layout.device, dtype=mat1_layout.dtype, size=[mat1_layout.size[0], mat2_layout.size[1]]  # FIXME: Example code for aten op overwrite by externkernel call
+        )
+    return aten_spmm.bind((sp_mat1, sp_mat2), layout).output_node()
+
 lowerings.update({getattr(aten.mm, overload): tuned_mm for overload in aten.mm.overloads()})
 lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()})
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
-lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
\ No newline at end of file
+lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
+lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
\ No newline at end of file

From 91c5ab246c49c771e9b03a2e0871ac1f4b22b2b3 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Sun, 16 Feb 2025 15:25:45 +0000
Subject: [PATCH 118/432] [Frontend] Flexagon frontend implemented

---
 PyTorchSimFrontend/extension_op.py | 138 ++++++++++++++++++++++++++++-
 1 file changed, 135 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 0503ccb9..10ad7224 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -1,17 +1,149 @@
+import os
+import subprocess
+import math
+import struct
 import torch
 from torch._inductor.select_algorithm import ExternKernelChoice
 
+from PyTorchSimFrontend import extension_config
+
 class MLIRExternKernelChoice(ExternKernelChoice):
     def call_name(self):
         return f"torch.ops.extension_op.{self.name}"
 
 custom_lib = torch.library.Library("extension_op", "DEF")
 
-# FIXME: Custom op is defined in this file for example. Need refactoring
 def _sparse_mm(a, b, out):
     print("PYTHON CUSTOM OP EXAMPLE")
     out.copy_(a + b)
 
+def generate_outer_product_matrix(a, outer, inner, name):
+	a_cpu = a.cpu()
+	value_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+		'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini')
+	row_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+		f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointer{name}.in')
+	col_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+		f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointer{name}.in')
+
+	with open(value_pointer, "a") as fd, open(row_pointer, "w") as rp, open(col_pointer, "w") as cp:
+    #generating matrix
+		n_nonzeros=0
+		for o in range(outer):  # col major
+			rp.write(str(n_nonzeros)+","); # writing the index
+			for i in range(inner):
+				value = a_cpu[i, o]
+				if value:  # value is generated
+					if((i==(inner-1)) and (o==(outer-1))):
+						cp.write(str(i))
+					else:
+						cp.write(str(i)+","); #writing the row index
+					ba = bytearray(struct.pack(">f", value))  # generating list of bytes
+					my_int = int.from_bytes(ba, "big")
+					fd.write(str(my_int))
+					fd.write(",")
+					n_nonzeros+=1
+
+def flexagon_frontend(a, b, out):
+	print("FLEXAGON FRONTEND")
+	x_shape = a.shape
+	w_shape = b.shape
+
+	M = a.shape[0]
+	N = b.shape[1]
+	K = b.shape[0]
+
+	def calculate_sparsity(tensor):
+		total_elements = tensor.numel()
+		zero_elements = torch.sum(tensor.cpu() == 0)
+		sparsity_ratio = zero_elements / total_elements * 100
+		return math.ceil(sparsity_ratio.item())
+
+	x_sparsity = calculate_sparsity(a)
+	w_sparsity = calculate_sparsity(b)
+	assert(x_sparsity >= 0 and x_sparsity < 100)
+	assert(w_sparsity >= 0 and w_sparsity < 100)
+	print(f"A Sparsity: {x_sparsity}")
+	print(f"B Sparsity: {w_sparsity}")
+
+	# Generating inputs
+	dir_path = os.path.join(
+		extension_config.CONFIG_TORCHSIM_DIR,
+		'PyTorchSimBackend/extern/stonneCore/tests/outerproduct'
+	)
+	os.makedirs(dir_path, exist_ok=True)
+
+	value_path = os.path.join(
+		extension_config.CONFIG_TORCHSIM_DIR,
+		'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini'
+	)
+
+	if os.path.exists(value_path):
+		os.remove(value_path)
+		print(f"Deleted: {value_path}")
+	else:
+		print(f"File does not exist: {value_path}")
+
+	generate_outer_product_matrix(a, K, M, "A")
+	generate_outer_product_matrix(b, K, N, "B")
+
+
+	graph = {
+		0: {
+			"node_id": 0,
+			"node_name": "root",
+			"node_type": 0,
+			"parents": [],
+			"children": [1]
+		},
+		1: {
+			"node_id": 1,
+			"node_name": "loopNode",
+			"node_type": 2,
+			"parents": [0],
+			"children": [2],
+			"loop_index": "loop_arg000",
+			"loop_start": 0,
+			"loop_end": 1,
+			"loop_step": 1,
+			"loop_type": "outer_loop"
+		},
+		2: {
+			"node_id": 2,
+			"node_name": "stonneNode",
+			"node_type": 5,
+			"parents": [1],
+			"children": [],
+			# Operation Type
+			"stonne_operation": "outerProductGEMM",
+
+			# GEMM Parameters
+			"stonne_GEMM_K": K,
+			"stonne_GEMM_N": N,
+			"stonne_GEMM_M": M,
+			"stonne_GEMM_T_K": 4,	# Currently fixed
+			"stonne_GEMM_T_N": 1,	# Currently fixed
+			"stonne_GEMM_T_M": 1,  # 기본값 설정 (T_M이 빠져있으므로 1로 설정)
+
+			# Memory Initialization & File Paths
+			"stonne_mem_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini'),
+			"stonne_mem_matrix_c_file_name": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/result.out'),
+
+			# Memory Addresses
+			"stonne_matrix_a_dram_address": 0,
+			"stonne_matrix_b_dram_address": 12444,
+			"stonne_matrix_c_dram_address": 24608,
+
+			# CSR & Bitmap Initialization
+			"stonne_rowpointer_matrix_a_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in'),
+			"stonne_colpointer_matrix_a_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in'),
+			"stonne_rowpointer_matrix_b_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in'),
+			"stonne_colpointer_matrix_b_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in'),
+		}
+	}
+
+	out.copy_(a + b)
+
 custom_lib.define("_sparse_mm(Tensor a, Tensor b, Tensor out) -> Tensor")
-custom_lib.impl("_sparse_mm", _sparse_mm, "PrivateUse1")
-custom_lib.impl("_sparse_mm", _sparse_mm, "AutogradPrivateUse1")
+custom_lib.impl("_sparse_mm", flexagon_frontend, "PrivateUse1")
+custom_lib.impl("_sparse_mm", flexagon_frontend, "AutogradPrivateUse1")

From 197b24319ab738a7e300865c89fda59733ffaad7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 16 Feb 2025 15:57:43 +0000
Subject: [PATCH 119/432] [Frontend/stonne] Integrate stonne engine with
 pytorch frontend

---
 .../configs/stonne_c1_simple_noc_tpuv3.json   |  32 +++
 PyTorchSimBackend/extern/stonneCore           |   2 +-
 PyTorchSimFrontend/extension_op.py            | 267 ++++++++++--------
 3 files changed, 177 insertions(+), 124 deletions(-)
 create mode 100644 PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json

diff --git a/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
new file mode 100644
index 00000000..3e32c02b
--- /dev/null
+++ b/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
@@ -0,0 +1,32 @@
+{
+  "core_type" : "stonne",
+  "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
+  "num_cores" : 1,
+  "core_freq" : 940,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 2,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" : 940,
+  "dram_channels": 32,
+  "dram_req_size": 32,
+  "dram_latency" : 10,
+  "dram_size" : 32,
+  "dram_nbl" : 1,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
+
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 7000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
+
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 2,
+  "partition": {
+    "core_0":0,
+    "core_1":0
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index c0105e3a..629fecdd 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit c0105e3a5961f4730d31fbd01dcb3caebc94c06f
+Subproject commit 629fecdde00d3d76a08da8e213e21aebd82f5b8d
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 10ad7224..9a8aeeab 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -4,8 +4,11 @@
 import struct
 import torch
 from torch._inductor.select_algorithm import ExternKernelChoice
-
+from AsmParser.tog_generator import tog_generator
+from torch._inductor.codecache import write
+from PyTorchSimFrontend.extension_codecache import get_write_path
 from PyTorchSimFrontend import extension_config
+from Simulator.simulator import BackendSimulator
 
 class MLIRExternKernelChoice(ExternKernelChoice):
     def call_name(self):
@@ -18,131 +21,149 @@ def _sparse_mm(a, b, out):
     out.copy_(a + b)
 
 def generate_outer_product_matrix(a, outer, inner, name):
-	a_cpu = a.cpu()
-	value_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-		'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini')
-	row_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-		f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointer{name}.in')
-	col_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-		f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointer{name}.in')
-
-	with open(value_pointer, "a") as fd, open(row_pointer, "w") as rp, open(col_pointer, "w") as cp:
+    a_cpu = a.cpu()
+    value_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+        'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini')
+    row_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointer{name}.in')
+    col_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointer{name}.in')
+
+    with open(value_pointer, "a") as fd, open(row_pointer, "w") as rp, open(col_pointer, "w") as cp:
     #generating matrix
-		n_nonzeros=0
-		for o in range(outer):  # col major
-			rp.write(str(n_nonzeros)+","); # writing the index
-			for i in range(inner):
-				value = a_cpu[i, o]
-				if value:  # value is generated
-					if((i==(inner-1)) and (o==(outer-1))):
-						cp.write(str(i))
-					else:
-						cp.write(str(i)+","); #writing the row index
-					ba = bytearray(struct.pack(">f", value))  # generating list of bytes
-					my_int = int.from_bytes(ba, "big")
-					fd.write(str(my_int))
-					fd.write(",")
-					n_nonzeros+=1
+        n_nonzeros=0
+        for o in range(outer):  # col major
+            rp.write(str(n_nonzeros)+","); # writing the index
+            for i in range(inner):
+                value = a_cpu[i, o]
+                if value:  # value is generated
+                    if((i==(inner-1)) and (o==(outer-1))):
+                        cp.write(str(i))
+                    else:
+                        cp.write(str(i)+","); #writing the row index
+                    ba = bytearray(struct.pack(">f", value))  # generating list of bytes
+                    my_int = int.from_bytes(ba, "big")
+                    fd.write(str(my_int))
+                    fd.write(",")
+                    n_nonzeros+=1
 
 def flexagon_frontend(a, b, out):
-	print("FLEXAGON FRONTEND")
-	x_shape = a.shape
-	w_shape = b.shape
-
-	M = a.shape[0]
-	N = b.shape[1]
-	K = b.shape[0]
-
-	def calculate_sparsity(tensor):
-		total_elements = tensor.numel()
-		zero_elements = torch.sum(tensor.cpu() == 0)
-		sparsity_ratio = zero_elements / total_elements * 100
-		return math.ceil(sparsity_ratio.item())
-
-	x_sparsity = calculate_sparsity(a)
-	w_sparsity = calculate_sparsity(b)
-	assert(x_sparsity >= 0 and x_sparsity < 100)
-	assert(w_sparsity >= 0 and w_sparsity < 100)
-	print(f"A Sparsity: {x_sparsity}")
-	print(f"B Sparsity: {w_sparsity}")
-
-	# Generating inputs
-	dir_path = os.path.join(
-		extension_config.CONFIG_TORCHSIM_DIR,
-		'PyTorchSimBackend/extern/stonneCore/tests/outerproduct'
-	)
-	os.makedirs(dir_path, exist_ok=True)
-
-	value_path = os.path.join(
-		extension_config.CONFIG_TORCHSIM_DIR,
-		'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini'
-	)
-
-	if os.path.exists(value_path):
-		os.remove(value_path)
-		print(f"Deleted: {value_path}")
-	else:
-		print(f"File does not exist: {value_path}")
-
-	generate_outer_product_matrix(a, K, M, "A")
-	generate_outer_product_matrix(b, K, N, "B")
-
-
-	graph = {
-		0: {
-			"node_id": 0,
-			"node_name": "root",
-			"node_type": 0,
-			"parents": [],
-			"children": [1]
-		},
-		1: {
-			"node_id": 1,
-			"node_name": "loopNode",
-			"node_type": 2,
-			"parents": [0],
-			"children": [2],
-			"loop_index": "loop_arg000",
-			"loop_start": 0,
-			"loop_end": 1,
-			"loop_step": 1,
-			"loop_type": "outer_loop"
-		},
-		2: {
-			"node_id": 2,
-			"node_name": "stonneNode",
-			"node_type": 5,
-			"parents": [1],
-			"children": [],
-			# Operation Type
-			"stonne_operation": "outerProductGEMM",
-
-			# GEMM Parameters
-			"stonne_GEMM_K": K,
-			"stonne_GEMM_N": N,
-			"stonne_GEMM_M": M,
-			"stonne_GEMM_T_K": 4,	# Currently fixed
-			"stonne_GEMM_T_N": 1,	# Currently fixed
-			"stonne_GEMM_T_M": 1,  # 기본값 설정 (T_M이 빠져있으므로 1로 설정)
-
-			# Memory Initialization & File Paths
-			"stonne_mem_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini'),
-			"stonne_mem_matrix_c_file_name": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/result.out'),
-
-			# Memory Addresses
-			"stonne_matrix_a_dram_address": 0,
-			"stonne_matrix_b_dram_address": 12444,
-			"stonne_matrix_c_dram_address": 24608,
-
-			# CSR & Bitmap Initialization
-			"stonne_rowpointer_matrix_a_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in'),
-			"stonne_colpointer_matrix_a_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in'),
-			"stonne_rowpointer_matrix_b_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in'),
-			"stonne_colpointer_matrix_b_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in'),
-		}
-	}
-
-	out.copy_(a + b)
+    print("FLEXAGON FRONTEND")
+    x_shape = a.shape
+    w_shape = b.shape
+
+    M = a.shape[0]
+    N = b.shape[1]
+    K = b.shape[0]
+
+    def calculate_sparsity(tensor):
+        total_elements = tensor.numel()
+        zero_elements = torch.sum(tensor.cpu() == 0)
+        sparsity_ratio = zero_elements / total_elements * 100
+        return math.ceil(sparsity_ratio.item())
+
+    x_sparsity = calculate_sparsity(a)
+    w_sparsity = calculate_sparsity(b)
+    assert(x_sparsity >= 0 and x_sparsity < 100)
+    assert(w_sparsity >= 0 and w_sparsity < 100)
+    print(f"A Sparsity: {x_sparsity}")
+    print(f"B Sparsity: {w_sparsity}")
+
+    # Generating inputs
+    dir_path = os.path.join(
+        extension_config.CONFIG_TORCHSIM_DIR,
+        'PyTorchSimBackend/extern/stonneCore/tests/outerproduct'
+    )
+    os.makedirs(dir_path, exist_ok=True)
+
+    value_path = os.path.join(
+        extension_config.CONFIG_TORCHSIM_DIR,
+        'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini'
+    )
+
+    if os.path.exists(value_path):
+        os.remove(value_path)
+        print(f"Deleted: {value_path}")
+    else:
+        print(f"File does not exist: {value_path}")
+
+    generate_outer_product_matrix(a, K, M, "A")
+    generate_outer_product_matrix(b, K, N, "B")
+
+
+    graph = {
+        0: {
+            "node_id": 0,
+            "node_name": "root",
+            "node_type": 0,
+            "parents": [],
+            "children": [1]
+        },
+        1: {
+            "node_id": 1,
+            "node_name": "loopNode",
+            "node_type": 2,
+            "parents": [0],
+            "children": [2],
+            "loop_index": "loop_arg000",
+            "loop_start": 0,
+            "loop_end": 1,
+            "loop_step": 1,
+            "loop_type": "outer_loop"
+        },
+        2: {
+            "node_id": 2,
+            "node_name": "stonneNode",
+            "node_type": 5,
+            "parents": [1],
+            "children": [],
+            # Operation Type
+            "stonne_operation": "outerProductGEMM",
+
+            # GEMM Parameters
+            "stonne_GEMM_K": K,
+            "stonne_GEMM_N": N,
+            "stonne_GEMM_M": M,
+            "stonne_GEMM_T_K": 4,	# Currently fixed
+            "stonne_GEMM_T_N": 1,	# Currently fixed
+            "stonne_GEMM_T_M": 1,
+
+            # Memory Initialization & File Paths
+            "stonne_mem_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini'),
+            "stonne_mem_matrix_c_file_name": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/result.out'),
+
+            # Memory Addresses
+            "stonne_matrix_a_dram_address": 0,
+            "stonne_matrix_b_dram_address": 12444,
+            "stonne_matrix_c_dram_address": 24608,
+
+            # CSR & Bitmap Initialization
+            "stonne_rowpointer_matrix_a_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in'),
+            "stonne_colpointer_matrix_a_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in'),
+            "stonne_rowpointer_matrix_b_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in'),
+            "stonne_colpointer_matrix_b_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in'),
+        }
+    }
+    source_code = "graph = " + str(graph)
+
+    write_path = get_write_path(source_code)
+    key, raw_tog_path = write(source_code, "py", specified_dir=write_path)
+    tile_graph_generator = tog_generator(["flexagon_matmul"])
+    tile_graph_generator.load_file(raw_tog_path)
+    tile_graph_generator.generate_tile_graph(
+        os.path.join(write_path, "tile_graph.onnx"),
+        cycle_list=[0],
+        offset=0,
+        vector_lane=0
+    )
+    onnx_path = os.path.join(write_path, "tile_graph.onnx")
+    #attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")
+    backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
+    stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json'
+    backsim = BackendSimulator(backend_path, stonne_config_path)
+    result_path = backsim.simulation(onnx_path)
+    result = BackendSimulator.get_result_from_file(result_path)
+    out.copy_(a + b)
 
 custom_lib.define("_sparse_mm(Tensor a, Tensor b, Tensor out) -> Tensor")
 custom_lib.impl("_sparse_mm", flexagon_frontend, "PrivateUse1")

From 1edd59c850678bc0a1d59bccba8a1e20123c8f0e Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Sun, 16 Feb 2025 16:43:37 +0000
Subject: [PATCH 120/432] [Frontend] Stonne input generator debug

---
 PyTorchSimFrontend/extension_op.py | 102 ++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 22 deletions(-)

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 9a8aeeab..b6a3cc65 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -20,38 +20,98 @@ def _sparse_mm(a, b, out):
     print("PYTHON CUSTOM OP EXAMPLE")
     out.copy_(a + b)
 
-def generate_outer_product_matrix(a, outer, inner, name):
+# def generate_outer_product_matrix(a, outer, inner, name):
+#     a_cpu = a.cpu()
+#     value_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+#         'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini')
+#     row_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+#         f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointer{name}.in')
+#     col_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+#         f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointer{name}.in')
+
+#     with open(value_pointer, "a") as fd, open(row_pointer, "w") as rp, open(col_pointer, "w") as cp:
+#     #generating matrix
+#         n_nonzeros=0
+#         for o in range(outer):  # col major
+#             rp.write(str(n_nonzeros)+","); # writing the index
+#             for i in range(inner):
+#                 value = a_cpu[i, o]
+#                 if value:
+#                     if((i==(inner-1)) and (o==(outer-1))):
+#                         cp.write(str(i))
+#                     else:
+#                         cp.write(str(i)+","); #writing the row index
+#                     ba = bytearray(struct.pack(">f", value))  # generating list of bytes
+#                     my_int = int.from_bytes(ba, "big")
+#                     fd.write(str(my_int))
+#                     fd.write(",")
+#                     n_nonzeros+=1
+#         rp.write(str(n_nonzeros))
+#         next_address_matrix=n_nonzeros*4
+#     return next_address_matrix
+
+def generate_outer_product_matrix(a, b, M, K, N):
+    # Generating matrix A
+    data_width = 4
     a_cpu = a.cpu()
+    b_cpu = b.cpu()
     value_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
         'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini')
-    row_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointer{name}.in')
-    col_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointer{name}.in')
-
-    with open(value_pointer, "a") as fd, open(row_pointer, "w") as rp, open(col_pointer, "w") as cp:
-    #generating matrix
+    rowA_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in')
+    colA_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in')
+    rowB_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in')
+    colB_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in')
+    with open(value_pointer, "w") as fd, open(rowA_pointer, "w") as rpA, open(colA_pointer, "w") as cpA, open(rowB_pointer, "w") as rpB, open(colB_pointer, "w") as cpB:
+        #generating matrixA
+        n_nonzeros=0
+        for k in range(K):  # col major
+            initial_values=0
+            rpA.write(str(n_nonzeros)+","); # writing the index of A
+            for m in range(M):
+                if(a_cpu[m, k]):  # value is nonzero
+                    if((m==(M-1)) and (k==(K-1))):
+                        cpA.write(str(m))
+                    else:
+                        cpA.write(str(m)+","); #writing the row index
+                    initial_values+=1
+                    value = a_cpu[m, k]
+                    ba = bytearray(struct.pack(">f", value))  # generating list of bytes
+                    my_int = int.from_bytes(ba, "big")
+                    fd.write(str(my_int))
+                    fd.write(",")
+                    n_nonzeros+=1
+        rpA.write(str(n_nonzeros))
+        address_matrix_b=n_nonzeros*data_width
+        #Generating matrix B
         n_nonzeros=0
-        for o in range(outer):  # col major
-            rp.write(str(n_nonzeros)+","); # writing the index
-            for i in range(inner):
-                value = a_cpu[i, o]
-                if value:  # value is generated
-                    if((i==(inner-1)) and (o==(outer-1))):
-                        cp.write(str(i))
+        for k in range(0,K):  # Row major
+            initial_values=0
+            rpB.write(str(n_nonzeros)+","); # writing the index of A
+            for n in range(0,N):
+                if(b_cpu[k, n]):  # value is nonzero
+                    if((k==(K-1)) and (n==(N-1))):
+                        cpB.write(str(n))
                     else:
-                        cp.write(str(i)+","); #writing the row index
+                        cpB.write(str(n)+","); #writing the row index
+
+                    initial_values+=1
+                    value = b_cpu[k, n]
                     ba = bytearray(struct.pack(">f", value))  # generating list of bytes
                     my_int = int.from_bytes(ba, "big")
                     fd.write(str(my_int))
                     fd.write(",")
                     n_nonzeros+=1
 
+        rpB.write(str(n_nonzeros))
+        fd.write(str(0)) # Adding a final 0 to the memory which will never be used. This is just to avoid having a last comma.
+        address_matrix_c=address_matrix_b+(n_nonzeros*data_width)
+
 def flexagon_frontend(a, b, out):
     print("FLEXAGON FRONTEND")
-    x_shape = a.shape
-    w_shape = b.shape
-
     M = a.shape[0]
     N = b.shape[1]
     K = b.shape[0]
@@ -87,9 +147,7 @@ def calculate_sparsity(tensor):
     else:
         print(f"File does not exist: {value_path}")
 
-    generate_outer_product_matrix(a, K, M, "A")
-    generate_outer_product_matrix(b, K, N, "B")
-
+    generate_outer_product_matrix(a, b, M, K, N)
 
     graph = {
         0: {

From ece216f34abb34bde043c10c9d31269bcb18b681 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 17 Feb 2025 01:40:45 +0000
Subject: [PATCH 121/432] [Cleanup] Set gitignore file

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..88eb2fb8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+PyTorchSimBackend/build/
+.vscode

From 952414e4b3ea7c3f1a57b57fb19124bb23916f94 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 17 Feb 2025 03:29:36 +0000
Subject: [PATCH 122/432] [Frontend/Stonne] Fix address calculation

---
 PyTorchSimBackend/extern/stonneCore |  2 +-
 PyTorchSimFrontend/extension_op.py  | 43 ++++-------------------------
 2 files changed, 6 insertions(+), 39 deletions(-)

diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index 629fecdd..3fd4145e 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit 629fecdde00d3d76a08da8e213e21aebd82f5b8d
+Subproject commit 3fd4145eaadb986700f0dc1cc000edc31df2440f
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index b6a3cc65..580e749f 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -16,40 +16,6 @@ def call_name(self):
 
 custom_lib = torch.library.Library("extension_op", "DEF")
 
-def _sparse_mm(a, b, out):
-    print("PYTHON CUSTOM OP EXAMPLE")
-    out.copy_(a + b)
-
-# def generate_outer_product_matrix(a, outer, inner, name):
-#     a_cpu = a.cpu()
-#     value_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-#         'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini')
-#     row_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-#         f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointer{name}.in')
-#     col_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-#         f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointer{name}.in')
-
-#     with open(value_pointer, "a") as fd, open(row_pointer, "w") as rp, open(col_pointer, "w") as cp:
-#     #generating matrix
-#         n_nonzeros=0
-#         for o in range(outer):  # col major
-#             rp.write(str(n_nonzeros)+","); # writing the index
-#             for i in range(inner):
-#                 value = a_cpu[i, o]
-#                 if value:
-#                     if((i==(inner-1)) and (o==(outer-1))):
-#                         cp.write(str(i))
-#                     else:
-#                         cp.write(str(i)+","); #writing the row index
-#                     ba = bytearray(struct.pack(">f", value))  # generating list of bytes
-#                     my_int = int.from_bytes(ba, "big")
-#                     fd.write(str(my_int))
-#                     fd.write(",")
-#                     n_nonzeros+=1
-#         rp.write(str(n_nonzeros))
-#         next_address_matrix=n_nonzeros*4
-#     return next_address_matrix
-
 def generate_outer_product_matrix(a, b, M, K, N):
     # Generating matrix A
     data_width = 4
@@ -109,6 +75,7 @@ def generate_outer_product_matrix(a, b, M, K, N):
         rpB.write(str(n_nonzeros))
         fd.write(str(0)) # Adding a final 0 to the memory which will never be used. This is just to avoid having a last comma.
         address_matrix_c=address_matrix_b+(n_nonzeros*data_width)
+    return 0, address_matrix_b, address_matrix_c
 
 def flexagon_frontend(a, b, out):
     print("FLEXAGON FRONTEND")
@@ -147,7 +114,7 @@ def calculate_sparsity(tensor):
     else:
         print(f"File does not exist: {value_path}")
 
-    generate_outer_product_matrix(a, b, M, K, N)
+    dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N)
 
     graph = {
         0: {
@@ -191,9 +158,9 @@ def calculate_sparsity(tensor):
             "stonne_mem_matrix_c_file_name": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/result.out'),
 
             # Memory Addresses
-            "stonne_matrix_a_dram_address": 0,
-            "stonne_matrix_b_dram_address": 12444,
-            "stonne_matrix_c_dram_address": 24608,
+            "stonne_matrix_a_dram_address": dram_a_address,
+            "stonne_matrix_b_dram_address": dram_b_address,
+            "stonne_matrix_c_dram_address": dram_c_address,
 
             # CSR & Bitmap Initialization
             "stonne_rowpointer_matrix_a_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in'),

From 7e6545985b440120eacd697344cc8876336e9797 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 17 Feb 2025 11:54:10 +0000
Subject: [PATCH 123/432] [Backendsim] Fix stonne config

---
 PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
index 3e32c02b..b23cac74 100644
--- a/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
@@ -9,18 +9,17 @@
 
   "dram_type" : "ramulator2",
   "dram_freq" : 940,
-  "dram_channels": 32,
+  "dram_channels": 8,
   "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_size" : 32,
+  "dram_size" : 16,
   "dram_nbl" : 1,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
   "icnt_type" : "simple",
   "icnt_latency" : 7,
   "icnt_freq" : 7000,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m2.icnt",
 
   "precision" : 4,
   "scheduler" : "simple",

From db8b282f22a580250106a87e3b971f5116290d8e Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 17 Feb 2025 12:05:37 +0000
Subject: [PATCH 124/432] [Fix] MaxPool template fixed

---
 PyTorchSimFrontend/mlir/mlir_lowering.py         | 3 ++-
 PyTorchSimFrontend/mlir/mlir_maxpool_template.py | 9 ++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index 845e2f0b..81e6e8ec 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -158,4 +158,5 @@ def sparse_addmm(*args, **kwargs):
 lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()})
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
 lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
-lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
\ No newline at end of file
+lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
+# lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # TODO: maxpool shpuld be implemeneted through llir
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
index f8c58b8d..79493fdd 100644
--- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
@@ -17,9 +17,8 @@
 func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X], outputs=[Y], names_str="X, Y")}} {
   %c_mvin = arith.constant 2 : index
   %c_mvout = arith.constant 3 : index
-  %dummy = arith.constant 2 : index
-  %in_chunk = arith.constant {{ in_tile * 2}} : index
-  %out_chunk = arith.constant {{ out_tile * 2}} : index
+  %axis = arith.constant 1 : index
+  %vstride = arith.constant 1 : index
   %X_buffer = memref.get_global @X_spad : memref<{{ in_tile }}x{{ in_tile }}xf32, 1>
   %Y_buffer = memref.get_global @Y_spad : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
@@ -27,8 +26,8 @@
   affine.for %i = 0 to {{ BCH }} step {{ out_tile }} {
     affine.for %j = 0 to {{ W }} step {{ out_tile }} {
       %index0 = affine.apply #map0(%i, %j)
-      affine.dma_start %X[%index0], %X_buffer[%c0, %c0], %tag[0], %c_mvin, %dummy, %in_chunk : memref<{{ IN }}xf32>, memref<{{ in_tile }}x{{ in_tile }}xf32, 1>, memref<1xi32>
-      affine.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %tag[0], %c_mvout, %dummy, %out_chunk : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>, memref<{{ OUT }}xf32>, memref<1xi32>
+      memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride : memref<{{ IN }}xf32>, memref<{{ in_tile }}x{{ in_tile }}xf32, 1>, memref<1xi32>
+      memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag[%c0], %axis, %vstride : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>, memref<{{ OUT }}xf32>, memref<1xi32>
     } { outer_loop=true }
   } { outer_loop=true }
   return

From d6cccf4ede5999994f61a37a166f4c7e2b761995 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 17 Feb 2025 12:38:44 +0000
Subject: [PATCH 125/432] [Frontend] Support 3D transpose + reduction

---
 .../mlir/mlir_codegen_backend.py              | 74 +++++++++++++------
 1 file changed, 53 insertions(+), 21 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index e5bb66ae..c055b723 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -738,7 +738,7 @@ def get_padding_type(self):
                     return 1
         return 0
 
-    def convert_index(self, expr):
+    def convert_index(self, expr, buffer):
         if len(expr.free_symbols) != 1:
             raise NotImplementedError("Not supporting this view operation...!")
 
@@ -758,10 +758,12 @@ def convert_index(self, expr):
         args = ", ".join(map(str, indices))
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
-        index = self.cse.generate(self.loads, f"affine.apply #{map_var}({args})")
+        index = self.cse.generate(buffer, f"affine.apply #{map_var}({args})")
         return index
 
-    def parse_indices(self, expr) -> common.CSEVariable:
+    def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
+        if buffer is None:
+            buffer = self.loads
         # Constant case
         if expr.is_number:
             return self.get_const_cse(int(expr))
@@ -773,11 +775,11 @@ def parse_indices(self, expr) -> common.CSEVariable:
         indices = []
         for arg in expr.args:
             if arg.is_Mul and arg.args[0].is_number:
-                new_arg = sympy.Symbol(str(self.convert_index(arg.args[1])))
+                new_arg = sympy.Symbol(str(self.convert_index(arg.args[1], buffer)))
                 expr = expr.replace(arg.args[1], new_arg)
                 indices.append(str(new_arg))
             elif not arg.is_number:
-                new_arg = sympy.Symbol(str(self.convert_index(arg)))
+                new_arg = sympy.Symbol(str(self.convert_index(arg, buffer)))
                 expr = expr.replace(arg, new_arg)
                 indices.append(str(new_arg))
         indices.sort()
@@ -787,7 +789,7 @@ def parse_indices(self, expr) -> common.CSEVariable:
         args = ", ".join(map(str, indices))
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
-        index = self.cse.generate(self.loads, f"affine.apply #{map_var}({args})")
+        index = self.cse.generate(buffer, f"affine.apply #{map_var}({args})")
         return index
 
     def load(self, name: str, index: sympy.Expr):
@@ -888,6 +890,7 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
             )
             type_name = mlir_common.DTYPE_TO_MLIR[dtype]
             acc_var = init
+            ret_var = acc
             reduced_shape = type_name
             init = self.cse.generate(self.reduction_prefix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
             if len(self.ranges) == 1: # 1-D vector to scalar
@@ -920,27 +923,49 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
                     acc_var = init_vec
                     var_info = [vec_len, mlir_common.DTYPE_TO_MLIR[dtype]]
                 self.register_var_info(acc, var_info)
+            elif len(self.ranges) == 3:
+                vec_len = self.kernel_group.tile_desc.get_vlane_stride()
+                tile_size = list(self.kernel_group.tile_desc.get_tile_size_per_lane())
+                tile_size.pop(self.kernel_group.tile_desc.vlane_split_axis)
+                flattened_size = f"vector<{self.var_info[value][0]}x{type_name}>"
+
+                reduce_axis_size = tile_size[-1]
+                reduced_size = self.var_info[value][0]//reduce_axis_size
+                # It is column majored per lane tile
+                expaned_size = f"vector<{reduce_axis_size}x{reduced_size}x{type_name}>"
+                value = self.cse.generate(self.compute, f"vector.shape_cast %{value} : {flattened_size} to {expaned_size}")
+                shape = expaned_size
+
+                # Edge case for scalar
+                if vec_len == 1:
+                    raise NotImplementedError()
+                reduced_shape = f"vector<{reduced_size}x{type_name}>"
+                init_vec = self.cse.generate(self.reduction_prefix, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
+                axis = "0"
+                acc_var = init_vec
+                var_info = [reduced_size, mlir_common.DTYPE_TO_MLIR[dtype]]
+                self.register_var_info(acc, var_info)
+                #ret_var = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{acc} : {reduced_shape} to {reduced_shape2}")
             else:
                 raise NotImplementedError()
 
             self.reduction_vars[acc] = (reduction_type, iterator, acc_var, reduced_shape)
             out = self.cse.generate(self.compute, reduction_combine_vec(reduction_type, value, iterator, axis=axis, shape=shape, reduced_shape=reduced_shape))
             self.affine_yield[out] = reduced_shape
-
             self.reduction_cse.reduction_cache[reduction_key] = acc
             self.iterator_cse.reduction_cache[reduction_key] = iterator
             self.init_cse.reduction_cache[reduction_key] = init_vec
-        return acc
+        return ret_var
 
     def store_reduction(self, name, index, value):
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         index = self.rename_indexing(index)
-        index_var = self.parse_indices(index)
+        index_var = self.parse_indices(index, buffer=self.reductions_suffix)
 
         # Tile is always reuduced in inner loop
-        local_tile_desc, index_var = self.get_dma_info(name, index, index_var, broadcast=False)
+        local_tile_desc, index_var = self.get_dma_info(name, index, index_var, broadcast=False, buffer=self.reductions_suffix)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -949,7 +974,8 @@ def store_reduction(self, name, index, value):
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
 
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.reductions_suffix, index_var, index)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.reductions_suffix,
+                                                                         index_var, index, buffer=self.reduction_suffix)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
@@ -1167,13 +1193,17 @@ def get_dma_info2(self, name, index):
         vlane_split_axis = int(current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
         return vlane_split_axis, vlane_stride, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
 
-    def get_dma_info(self, name, index, index_var, broadcast=True): # Need more argument?
+    def get_dma_info(self, name, index, index_var, broadcast=True, buffer=None): # Need more argument?
         """
         A tile descriptor exists that is configured on a kernel group
         DMA desc should be adjusted according to buffer.
         Therefore, this function shoulde determin DRAM, SRAM stride and
         vectorlane mapping policy
         """
+        # Use loads as default
+        if buffer is None:
+            buffer = self.loads
+
         # TODO.
         kg_tile_desc = self.kernel_group.tile_desc
         buffer_info = self.buffer_types[name]
@@ -1194,7 +1224,7 @@ def get_dma_info(self, name, index, index_var, broadcast=True): # Need more argu
             output_expr = str(index).replace('index', 'd')
             input_argument = ",".join(["%index" + str(i) if i in local_dims else f"%{fake_dim}" for i in total_dims])
             map_var = self.map_cse.generate(self.global_vars, f"affine_map<({input_expr}) -> ({output_expr})>")
-            index_var = self.cse.generate(self.loads, f"affine.apply #{map_var}({input_argument})")
+            index_var = self.cse.generate(buffer, f"affine.apply #{map_var}({input_argument})")
             local_dims = total_dims # Brodatcast tile shape
 
         if kg_tile_desc.vlane_split_axis in local_dims:
@@ -1232,10 +1262,9 @@ def get_dma_info(self, name, index, index_var, broadcast=True): # Need more argu
         elif len(local_dims) == 3:
             is_reduction = self.reduction_depth < 3
             if is_reduction:
-                #local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in dims], [1, 0])
-                #local_tile_desc.vlane_split_axis = local_vlane_split_axis
-                #local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
-                raise NotImplementedError("Currently not implemented... ;)")
+                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [1, 2, 0])
+                local_tile_desc.vlane_split_axis = local_vlane_split_axis
+                local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
             else:
                 local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims])
                 local_tile_desc.vlane_split_axis = local_vlane_split_axis
@@ -1347,14 +1376,17 @@ def adjust_tile_size(self):
         if len(self.itervars) >= 3 and self.reduction_depth < len(self.itervars):
             raise NotImplementedError()
 
-    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, code_buffer, indices, raw_index, is_template=False):
+    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, code_buffer, indices, raw_index, is_template=False, buffer=None):
         c_type = mlir_common.DTYPE_TO_C[dtype]
         # Make sure each lane's buffer has at least two element
         tile_size = max(tile_size_per_lane, 2) * self.vector_lane
 
+        if buffer is None:
+            buffer = self.loads
+
         if dtype == torch.bool and not is_template:
             mapping = self.map_cse.generate(self.global_vars, f"affine_map<({indices}) -> ({indices} floordiv 8)>")
-            indices = self.cse.generate(self.loads, f"affine.apply #{mapping}(%{indices})") # FIXME. Only loads?
+            indices = self.cse.generate(buffer, f"affine.apply #{mapping}(%{indices})") # FIXME. Only loads?
 
         if name not in self.global_vars_dict:
             self.global_vars_dict[name] = list()
@@ -1368,13 +1400,13 @@ def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape
             self.global_vars_dict[name].append(str(raw_index))
         else:
             new_name = f"{name}_{self.global_vars_dict[name].index(str(raw_index))}"
-        buffer = self.cse.generate(code_buffer, f"memref.get_global @{new_name}_spad : {dram_tile_shape}")
+        sram_var = self.cse.generate(code_buffer, f"memref.get_global @{new_name}_spad : {dram_tile_shape}")
 
         zero_cse = self.get_const_cse(0)
         sram_dims = len(dram_tile_shape.split("x")) - 1
         sram_index_var = ",".join([f"%{zero_cse}"] * sram_dims)
 
-        return buffer, indices, sram_index_var
+        return sram_var, indices, sram_index_var
 
     def get_const_cse(self, value, dtype="index") -> common.CSEVariable:
         # Type convert

From 6f43c0c2a4a5a7ec2ad5e74a91f47373db22c930 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 17 Feb 2025 12:41:48 +0000
Subject: [PATCH 126/432] [Frontend] Remove legacy tile

---
 .../mlir/mlir_codegen_backend.py              | 111 ------------------
 PyTorchSimFrontend/mlir/mlir_common.py        |  46 --------
 PyTorchSimFrontend/mlir/mlir_template.py      |   2 +-
 3 files changed, 1 insertion(+), 158 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index c055b723..c821f36d 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1082,117 +1082,6 @@ def codegen_nodes(self, nodes, kernel_name):
             write_atomic(gem5_write_path, self.gem5_header.getvalue())
         return src_code
 
-    def get_dma_info2(self, name, index):
-        current_tile = mlir_common.MLIRTile(self.tile_desc.n_row, self.tile_desc.n_col, self.tile_desc.vector_lane, self.tile_desc.used_vector_lane)
-        cv = self.get_constant_vector(index)
-        cv2 = self.get_constant_vector2(index)
-        tile_size_per_lane = self.tile_desc.get_tile_size_per_lane()            # FIXME. move this
-        tile_size_per_lane = 2 if tile_size_per_lane==1 else tile_size_per_lane # Avoid scalar operation
-
-        if len(cv) != len(cv2) and len(cv2) == 3:
-            print("Mismatch! ", cv)
-            # FIXME. this is really shitty code :(
-            cv = cv2#[[1 if x[0] == 0 else x[0], x[1]] for x in cv]
-
-        # Case 0. Tile is 0-D scalar
-        if len(cv) == 0:
-            # Use only one vectorlane to handle scalar data
-            current_tile.n_row = 1
-            current_tile.n_col = 1
-            current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
-            current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
-            mm_stride, tile_size_per_lane = 1, 1
-            vlane_stride = current_tile.get_vlane_stride()
-        # Case 1. Tile is 1-D vector type
-        elif len(cv) == 1 and len(cv) <= self.reduction_depth:
-            current_tile.n_row = 1
-            current_tile.n_col = self.tile_desc.get_tile_size()
-            current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
-            current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case
-            vlane_stride = current_tile.get_vlane_stride()
-            mm_stride = current_tile.n_col
-            if self.is_scalar(name): # scalar to vector broadcasting
-                mm_stride = 0
-                current_tile.n_row, current_tile.n_col = current_tile.n_col, current_tile.n_row
-        # Case 2. Tile is 1-D vector type with reduction
-        elif len(cv) == 1 and len(cv) == self.reduction_depth + 1:
-            # Use only one vectorlane to reduce a vector
-            current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
-            current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
-            current_tile.n_row = 1
-            current_tile.n_col = self.tile_desc.get_tile_size()
-            current_tile.used_vector_lane = 1
-            vlane_stride = current_tile.get_vlane_stride()
-            mm_stride = 0 # don't care
-            tile_size_per_lane = current_tile.get_tile_size_per_lane()
-            if self.is_scalar(name): # scalar to vector broadcasting
-                current_tile.n_row, current_tile.n_col = current_tile.n_col, current_tile.n_row
-        # Case 3. Tile is 2-D tile
-        elif len(cv) == 2:
-            is_reduction = self.reduction_depth == 1
-            if cv[0][0] != 0 and cv[1][0] != 0:
-                is_transposed = cv[0][0] < cv[1][0]
-                if is_transposed:
-                    current_tile.n_row = self.tile_desc.n_col
-                    current_tile.n_col = self.tile_desc.n_row
-                    mm_stride = self.ranges[0]
-                else:
-                    current_tile.n_row = self.tile_desc.n_row
-                    current_tile.n_col = self.tile_desc.n_col
-                    mm_stride = self.ranges[1]
-
-                if is_reduction and is_transposed:
-                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE
-                    current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
-                    vlane_stride = current_tile.get_vlane_stride()
-                elif is_reduction and not is_transposed:
-                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_ROW_WISE
-                    current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE
-                    vlane_stride = current_tile.get_vlane_stride()
-                elif not is_reduction and is_transposed:
-                    # Transposed case
-                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE
-                    current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE
-                    vlane_stride = current_tile.get_vlane_stride()
-                else: # not is_reduction and not is_transpose
-                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE if self.tile_desc.vector_lane_axis else mlir_common.MLIRTile.TILE_ROW_WISE
-                    current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
-                    vlane_stride = current_tile.get_vlane_stride()
-            else:
-                # Broadcast pattern
-                current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_ROW_WISE
-                mm_stride = 0
-                if cv[0][0] == 0:
-                    current_tile.tile_layout = mlir_common.MLIRTile.TILE_COL_WISE if self.tile_desc.vector_lane_axis else mlir_common.MLIRTile.TILE_ROW_WISE
-                    current_tile.n_row = self.tile_desc.n_row
-                    current_tile.n_col = self.tile_desc.n_col
-                    vlane_stride = current_tile.get_vlane_stride()
-                else: # cv[1][0] == 0
-                    current_tile.n_row = self.tile_desc.n_col
-                    current_tile.n_col = self.tile_desc.n_row
-                    vlane_stride = current_tile.get_cols_per_lane()
-                    if not is_reduction:
-                        current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE
-                        vlane_stride = current_tile.n_col if self.tile_desc.vector_lane_axis else vlane_stride
-        elif len(cv) == 3:
-            current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case
-            mm_stride = cv[-1][0]
-            # When current_tile.n_col stride is 1, we can access row vector
-            if mm_stride == 1:
-                current_tile.n_row = 1
-                current_tile.n_col = self.tile_desc.get_tile_size()
-            # if current_tile.n_col stride is not 1, we have to access in a column vector
-            else:
-                current_tile.n_row = self.tile_desc.get_tile_size()
-                current_tile.n_col = 1
-            vlane_stride = current_tile.get_tile_size_per_lane()
-        else:
-            raise NotImplementedError()
-
-        #assert(not (dtype==torch.bool and vlane_stride < 8))
-        vlane_split_axis = int(current_tile.tile_per_lane_layout == mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE)
-        return vlane_split_axis, vlane_stride, [current_tile.n_row, current_tile.n_col], tile_size_per_lane
-
     def get_dma_info(self, name, index, index_var, broadcast=True, buffer=None): # Need more argument?
         """
         A tile descriptor exists that is configured on a kernel group
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index fc836e92..3bf31310 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -242,52 +242,6 @@ def get_vlane_stride(self):
     def div_round_up(size, round_val):
         return (size + round_val - 1) // round_val
 
-class MLIRTile():
-    TILE_ROW_WISE = 0
-    TILE_COL_WISE = 1
-    TILE_PER_LANE_ROW_WISE = 2
-    TILE_PER_LANE_COL_WISE = 3
-    def __init__(self, n_row, n_col, vector_lane, used_vector_lane=None) -> None:
-        self.n_row = n_row
-        self.n_col = n_col
-        self.vector_lane = vector_lane
-        if used_vector_lane is None:
-            self.used_vector_lane = self.vector_lane
-        else:
-            self.used_vector_lane = used_vector_lane
-        self.tile_per_lane_layout = self.TILE_PER_LANE_ROW_WISE # How a given tile per lane is stored
-        self.tile_layout = self.TILE_ROW_WISE # How a given tile is stored per lane
-        self.vector_lane_axis = (self.n_col//self.used_vector_lane) > 0 #(0: Col major, 1: Row major)
-
-    def get_tile_size(self):
-        return self.n_row * self.n_col
-
-    def get_rows_per_lane(self):
-        if self.n_row % self.used_vector_lane != 0 and self.n_row > 1:
-            print(f"[Warning] n_row({self.n_row}) % vector_lane({self.used_vector_lane}) != 0")
-        return self.div_round_up(self.n_row, self.used_vector_lane)
-
-    def get_cols_per_lane(self):
-        if self.n_col % self.used_vector_lane != 0 and self.n_col > 1:
-            print(f"[Warning] n_col({self.n_col}) % vector_lane({self.used_vector_lane}) != 0")
-        return self.div_round_up(self.n_col, self.used_vector_lane)
-
-    def get_tile_size_per_lane(self):
-        if self.get_tile_size() % self.used_vector_lane != 0:
-            print(f"[Warning] n_col({self.n_col}) % vector_lane({self.used_vector_lane}) != 0")
-        return self.div_round_up(self.get_tile_size(), self.used_vector_lane)
-
-    def get_vlane_stride(self):
-        if self.tile_layout == self.TILE_ROW_WISE:
-            vlane_stride = self.get_tile_size_per_lane()
-        else:
-            vlane_stride = self.get_cols_per_lane()
-        return vlane_stride
-
-    @staticmethod
-    def div_round_up(size, round_val):
-        return (size + round_val - 1) // round_val
-
 class MLIRWrapperKenrelGroup(cpp.KernelGroup):
     def __init__(self):
         super().__init__()
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index a2310d8d..75d52332 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -17,7 +17,7 @@
 from torch._inductor.utils import IndentedBuffer
 
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
-from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo, MLIRTile
+from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
 from . import mlir_common

From 1b0796aa39df7cadfbdcfa75785378f1a285e0df Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 17 Feb 2025 16:38:10 +0000
Subject: [PATCH 127/432] [Frontend] Support 3D transpose + reduction store
 codegen

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index c821f36d..d91923fa 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -965,7 +965,7 @@ def store_reduction(self, name, index, value):
         index_var = self.parse_indices(index, buffer=self.reductions_suffix)
 
         # Tile is always reuduced in inner loop
-        local_tile_desc, index_var = self.get_dma_info(name, index, index_var, broadcast=False, buffer=self.reductions_suffix)
+        local_tile_desc, index_var = self.get_dma_info(name, index, index_var, broadcast=False, store_reduction=True, buffer=self.reductions_suffix)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -1082,7 +1082,7 @@ def codegen_nodes(self, nodes, kernel_name):
             write_atomic(gem5_write_path, self.gem5_header.getvalue())
         return src_code
 
-    def get_dma_info(self, name, index, index_var, broadcast=True, buffer=None): # Need more argument?
+    def get_dma_info(self, name, index, index_var, broadcast=True, store_reduction=False, buffer=None): # Need more argument?
         """
         A tile descriptor exists that is configured on a kernel group
         DMA desc should be adjusted according to buffer.
@@ -1138,7 +1138,7 @@ def get_dma_info(self, name, index, index_var, broadcast=True, buffer=None): # N
             local_tile_desc.vlane_stride = kg_tile_desc.get_dim_size(local_dims[0])
         # Case 3. Tile is 2-D tile
         elif len(local_dims) == 2:
-            is_reduction = self.reduction_depth == 1
+            is_reduction = self.reduction_depth == 1 and not store_reduction
             if is_reduction:
                 local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [1, 0])
                 local_tile_desc.vlane_split_axis = local_vlane_split_axis
@@ -1149,9 +1149,9 @@ def get_dma_info(self, name, index, index_var, broadcast=True, buffer=None): # N
                 local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 3. Tile is 3-D tile
         elif len(local_dims) == 3:
-            is_reduction = self.reduction_depth < 3
+            is_reduction = self.reduction_depth < 3 and not store_reduction
             if is_reduction:
-                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [1, 2, 0])
+                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [2, 1, 0])
                 local_tile_desc.vlane_split_axis = local_vlane_split_axis
                 local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
             else:

From a72eeec58db6ddb32fc5c8c4ae8428e4f966fa15 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 17 Feb 2025 18:00:49 +0000
Subject: [PATCH 128/432] [Backendsim] Add check for unused tags

---
 PyTorchSimBackend/include/Core.h            |  3 +-
 PyTorchSimBackend/include/TMA.h             | 35 ++++++++++++++++++---
 PyTorchSimBackend/include/TileGraphParser.h |  2 +-
 PyTorchSimBackend/src/Core.cc               | 12 +++++--
 PyTorchSimBackend/src/Simulator.cc          |  6 +++-
 5 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/PyTorchSimBackend/include/Core.h b/PyTorchSimBackend/include/Core.h
index 30a404f9..dfc81686 100644
--- a/PyTorchSimBackend/include/Core.h
+++ b/PyTorchSimBackend/include/Core.h
@@ -14,7 +14,7 @@
 class Core {
  public:
   Core(uint32_t id, SimulationConfig config);
-  ~Core() = default;
+  ~Core()=default;
   virtual bool running();
   virtual bool can_issue(const std::shared_ptr<Tile>& op);
   virtual void issue(std::shared_ptr<Tile> tile);
@@ -27,6 +27,7 @@ class Core {
   virtual void pop_memory_request();
   virtual mem_fetch* top_memory_request() { return _request_queue.front(); }
   virtual void push_memory_response(mem_fetch* response);
+  void check_tag() { _tma.check_table(); }
 
   std::queue<std::shared_ptr<Instruction>>& get_compute_pipeline(int compute_type);
   enum {
diff --git a/PyTorchSimBackend/include/TMA.h b/PyTorchSimBackend/include/TMA.h
index f2ea3943..5d08a882 100644
--- a/PyTorchSimBackend/include/TMA.h
+++ b/PyTorchSimBackend/include/TMA.h
@@ -25,18 +25,45 @@ class TMA {
   bool empty() { return _current_inst==nullptr; }
   void register_tag(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
-      tag_table[subgraph_id] = std::map<std::pair<std::string, std::vector<int>>, bool>();
+      tag_table[subgraph_id] = std::map<std::pair<std::string, std::vector<int>>, uint32_t>();
       waiters[subgraph_id] = std::map<std::pair<std::string, std::vector<int>>, std::vector<std::shared_ptr<Instruction>>>();
     }
-    tag_table[subgraph_id][key] = false;
+    tag_table[subgraph_id][key] = 0;
     waiters[subgraph_id][key] = std::vector<std::shared_ptr<Instruction>>();
   }
   void set_tag_finish(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
       throw std::runtime_error("Subgraph does not exist in tag_table");
     }
-    tag_table[subgraph_id][key] = true;
+    tag_table[subgraph_id][key] = 1;
   }
+
+  void mark_tag_used(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+    if (tag_table.find(subgraph_id) == tag_table.end()) {
+      throw std::runtime_error("Subgraph does not exist in tag_table");
+    } else if (!tag_table[subgraph_id][key]) {
+      throw std::runtime_error("Tag is not ready but freed");
+    }
+    tag_table[subgraph_id][key] += 1;
+  }
+
+  void check_table() {
+    for (const auto& entry: tag_table) {
+      auto subgraph_id = entry.first;
+      for (const auto& tag_key: tag_table[subgraph_id]) {
+        const auto& tag_pair = tag_key.first;
+        const std::string& tag_name = tag_pair.first;
+        const std::vector<int>& tag_values = tag_pair.second;
+        uint32_t value = tag_key.second;
+
+        if (value == 1) {
+          spdlog::warn("[Tag Table][{}] Unused tag found: (name={}, key={}, val={})",
+            subgraph_id, tag_name, fmt::format("[{}]", fmt::join(tag_values, ", ")), value);
+        }
+      }
+    }
+  }
+
   bool tag_key_exist(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     if (subgraph_it == tag_table.end())
@@ -97,7 +124,7 @@ class TMA {
   size_t _tile_idx_stride=1;
   uint32_t _tile_idx;
   bool _finished=true;
-  std::map<int, std::map<std::pair<std::string, std::vector<int>>, bool>> tag_table;
+  std::map<int, std::map<std::pair<std::string, std::vector<int>>, uint32_t>> tag_table;
   std::map<int, std::map<std::pair<std::string, std::vector<int>>, std::vector<std::shared_ptr<Instruction>>>> waiters;
 };
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index a10a2063..f2045cfa 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -97,7 +97,7 @@ class TileGraphParser {
   std::map<std::string, std::vector<uint32_t>> _arg_numa_stride;
   std::map<std::string, std::tuple<int, int, LoopType>> _loop_size_map;
   std::map<std::string, std::string> _tog_meta;
-  std::map<std::pair<std::string, std::vector<int>>, bool> _tag_table;
+  std::map<std::pair<std::string, std::vector<int>>, uint32_t> _tag_table;
 };
 
 class TileComputeNode : public TileNode {
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 245d6618..ffcc04a2 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -26,7 +26,8 @@ bool Core::can_issue(const std::shared_ptr<Tile>& op) {
 void Core::issue(std::shared_ptr<Tile> op) {
   if (op->get_instructions().size()){
     spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}, Free size: {}",
-      _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size(), op->get_instructions().back()->get_free_sram_size());
+      _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size(),
+      op->get_instructions().back()->get_free_sram_size());
   } else {
     spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}",
       _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size());
@@ -132,6 +133,7 @@ void Core::dma_cycle() {
                     fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")),
                     fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", ")));
       for (auto & wait_inst : _tma.get_tag_waiter(instruction->subgraph_id, key)) {
+        _tma.mark_tag_used(instruction->subgraph_id, key);
         finish_instruction(wait_inst);
       }
     }
@@ -275,12 +277,16 @@ void Core::cycle() {
             auto key = std::make_pair(inst->get_addr_name(), inst->get_tag_id());
             bool finished = _tma.get_tag_finish(inst->subgraph_id, key);
             if (finished) {
+              _tma.mark_tag_used(inst->subgraph_id, key);
               finish_instruction(inst);
             } else {
               _tma.register_tag_waiter(inst->subgraph_id, key, inst);
             }
-            spdlog::trace("[Core {}][{}] {} ISSUED", _id, _core_cycle,
-                          opcode_to_string(inst->get_opcode()));
+            spdlog::trace("[Core {}][{}] {} ISSUED,  addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle,
+                            opcode_to_string(inst->get_opcode()), inst->get_addr_name(),
+                            fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")),
+                            fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
+                            fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
             issued = true;
           }
           break;
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 77a721c1..6182ff00 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -23,7 +23,8 @@ Simulator::Simulator(SimulationConfig config)
   _cores.resize(_n_cores);
   for (int core_index = 0; core_index < _n_cores; core_index++) {
     if (config.core_type == CoreType::WS_MESH) {
-      spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB", core_index, config.core_freq , config.sram_size);
+      spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB, Systolic array per core: {}",
+        core_index, config.core_freq , config.sram_size, config.num_systolic_array_per_core);
       _cores.at(core_index) = std::make_unique<Core>(core_index, _config);
     } else if (config.core_type == CoreType::STONNE) {
       spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq);
@@ -207,6 +208,9 @@ void Simulator::cycle() {
       icnt_cycle();
   }
   spdlog::info("Simulation Finished");
+  for (auto &core: _cores) {
+    core->check_tag();
+  }
 }
 
 bool Simulator::running() {

From ef301a5173523cc411862f65b81772bb01a09485 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Tue, 18 Feb 2025 06:29:17 +0000
Subject: [PATCH 129/432] [Frontend] Minor fix for sparse.mm custom op

---
 PyTorchSimFrontend/extension_op.py              | 2 +-
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 --
 PyTorchSimFrontend/mlir/mlir_lowering.py        | 3 +--
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 580e749f..a627663b 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -188,7 +188,7 @@ def calculate_sparsity(tensor):
     backsim = BackendSimulator(backend_path, stonne_config_path)
     result_path = backsim.simulation(onnx_path)
     result = BackendSimulator.get_result_from_file(result_path)
-    out.copy_(a + b)
+    out.copy_(torch.matmul(a.cpu(), b.cpu()))
 
 custom_lib.define("_sparse_mm(Tensor a, Tensor b, Tensor out) -> Tensor")
 custom_lib.impl("_sparse_mm", flexagon_frontend, "PrivateUse1")
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d91923fa..bb6871e7 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -93,8 +93,6 @@ def write_header(self):
                 from {extension_codecache.__name__} import CustomAsyncCompile
                 from torch._inductor.select_algorithm import extern_kernels
 
-                import PyTorchSimFrontend.extension_op
-
                 aten = torch.ops.aten
                 inductor_ops = torch.ops.inductor
                 assert_size_stride = torch._C._dynamo.guards.assert_size_stride
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index 81e6e8ec..f364b546 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -148,9 +148,8 @@ def sparse_addmm(*args, **kwargs):
     print("Custom sparse addmm")
     _, sp_mat1, sp_mat2 = args
     mat1_layout = sp_mat1.layout
-    mat2_layout = sp_mat2.layout
     layout = ir.FlexibleLayout(
-            device=mat1_layout.device, dtype=mat1_layout.dtype, size=[mat1_layout.size[0], mat2_layout.size[1]]  # FIXME: Example code for aten op overwrite by externkernel call
+            device=mat1_layout.device, dtype=mat1_layout.dtype, size=args[0].data.data.data.ranges  # FIXME: Example code for aten op overwrite by externkernel call
         )
     return aten_spmm.bind((sp_mat1, sp_mat2), layout).output_node()
 

From 16980955b6abdeed5e2900d754fcb442533d3973 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 18 Feb 2025 20:40:50 +0000
Subject: [PATCH 130/432] [Backendsim/Stonne] Fix sparse wrapper modeling

---
 .../configs/stonne_c1_simple_noc_tpuv3.json   | 10 ++---
 PyTorchSimBackend/extern/stonneCore           |  2 +-
 PyTorchSimBackend/include/SparseCore.h        |  5 +++
 PyTorchSimBackend/src/SparseCore.cc           | 43 ++++++++++++++++---
 PyTorchSimFrontend/extension_op.py            | 27 ++++++++----
 5 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
index b23cac74..8bce391d 100644
--- a/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
@@ -5,21 +5,21 @@
   "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
-  "num_systolic_array_per_core" : 2,
 
   "dram_type" : "ramulator2",
   "dram_freq" : 940,
   "dram_channels": 8,
-  "dram_req_size": 32,
+  "dram_req_size": 16,
   "dram_latency" : 10,
-  "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_size" : 32,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
+
   "icnt_type" : "simple",
   "icnt_latency" : 7,
   "icnt_freq" : 7000,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m2.icnt",
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt",
 
   "precision" : 4,
   "scheduler" : "simple",
diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index 3fd4145e..a045aa27 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit 3fd4145eaadb986700f0dc1cc000edc31df2440f
+Subproject commit a045aa2718e9f6a48fd2e8320e3b547d294ddac2
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index c446331f..8f43347e 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -1,3 +1,5 @@
+#include <map>
+#include <vector>
 #include "Core.h"
 #include "sstStonne.h"
 #include "SimpleMem.h"
@@ -18,9 +20,12 @@ class SparseCore : public Core {
   void print_stats() override;
   void print_current_stats() override;
 
+  uint32_t r_port_nr = 1;
+  uint32_t w_port_nr = 1;
 private:
   SST_STONNE::sstStonne *stonneCore;
   /* Interconnect queue */
   std::queue<mem_fetch*> _request_queue;
   std::queue<mem_fetch*> _response_queue;
+  std::map<std::tuple<uint64_t, mem_access_type, mf_type>, std::vector<SimpleMem::Request*>*> request_merge_table;
 };
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 41bd734d..642ecbd7 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -40,12 +40,15 @@ bool SparseCore::can_issue(const std::shared_ptr<Tile>& op) {
 }
 
 void SparseCore::cycle() {
+  _core_cycle++;
   stonneCore->cycle();
 
   /* Send Memory Request */
   while (SimpleMem::Request* req = stonneCore->popRequest()) {
+    uint64_t target_addr =  (req->getAddress() / _config.dram_req_size) * _config.dram_req_size;
     mem_access_type acc_type;
     mf_type type;
+
     switch(req->getcmd()) {
       case SimpleMem::Request::Read:
         acc_type = mem_access_type::GLOBAL_ACC_R;
@@ -59,21 +62,49 @@ void SparseCore::cycle() {
         spdlog::error("[SparseCore] Invalid request type from core");
         return;
     }
-    mem_fetch* req_wrapper = new mem_fetch(req->getAddress(), acc_type, type, _config.dram_req_size, -1, req);
+    req->request_time = _core_cycle;
+    std::tuple<uint64_t, mem_access_type, mf_type> key = std::make_tuple(target_addr, acc_type, type);
+    if (request_merge_table.find(key) == request_merge_table.end())
+      request_merge_table[key] = new std::vector<SimpleMem::Request*> ();
+    request_merge_table[key]->push_back(req);
+  }
+
+  uint32_t nr_request = 0;  // same unsigned type as r_port_nr, avoids a signed/unsigned compare
+  for (auto it = request_merge_table.begin(); it != request_merge_table.end(); ) {
+    uint64_t address;
+    mem_access_type acc_type;
+    mf_type type;
+    std::tie(address, acc_type, type) = it->first;
+    mem_fetch* req_wrapper = new mem_fetch(address, acc_type, type, _config.dram_req_size, -1, it->second);
+    _request_queue.push(req_wrapper);
+    it = request_merge_table.erase(it);  // erase() hands back the next valid iterator
+    // Stop once r_port_nr merged requests were issued this cycle (always issues at least one).
+    if (++nr_request >= r_port_nr)
+      break;
   }
 
-  /* Send Memory Response */
-  while (!_response_queue.empty()) {
+  // Send Memory Response
+  if (!_response_queue.empty()) {
     mem_fetch* resp_wrapper = _response_queue.front();
-    SimpleMem::Request* resp = static_cast<SimpleMem::Request*>(resp_wrapper->get_custom_data());
+    std::vector<SimpleMem::Request*>* resps = static_cast<std::vector<SimpleMem::Request*>*>(resp_wrapper->get_custom_data());
+
+    SimpleMem::Request* resp = resps->front();
+
+    spdlog::debug("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}", \
+             _core_cycle, _core_cycle - resp->request_time, resp->getAddress(), int(resp_wrapper->get_access_type()), int(resp_wrapper->get_type()), _config.dram_req_size);
+
     resp->setReply();
     stonneCore->pushResponse(resp);
-    _response_queue.pop();
-    delete resp_wrapper;
+    resps->erase(resps->begin());
+    if (resps->empty()) {
+      delete resps;
+      delete resp_wrapper;
+      _response_queue.pop();
+    }
   }
 
   if (stonneCore->isFinished()) {
+    stonneCore->finish();
     std::shared_ptr<Tile> target_tile = _tiles.front();
     target_tile->set_status(Tile::Status::FINISH);
     _finished_tiles.push(target_tile);
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index a627663b..09d5feb0 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -3,12 +3,13 @@
 import math
 import struct
 import torch
+import numpy as np
 from torch._inductor.select_algorithm import ExternKernelChoice
 from AsmParser.tog_generator import tog_generator
 from torch._inductor.codecache import write
 from PyTorchSimFrontend.extension_codecache import get_write_path
 from PyTorchSimFrontend import extension_config
-from Simulator.simulator import BackendSimulator
+from Simulator.simulator import BackendSimulator, TORCH_TO_NUMPY
 
 class MLIRExternKernelChoice(ExternKernelChoice):
     def call_name(self):
@@ -115,7 +116,12 @@ def calculate_sparsity(tensor):
         print(f"File does not exist: {value_path}")
 
     dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N)
-
+    mem_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini')
+    a_row_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in')
+    a_col_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in')
+    b_row_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in')
+    b_col_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in')
+    c_result = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/result.out')
     graph = {
         0: {
             "node_id": 0,
@@ -155,7 +161,7 @@ def calculate_sparsity(tensor):
 
             # Memory Initialization & File Paths
             "stonne_mem_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini'),
-            "stonne_mem_matrix_c_file_name": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/result.out'),
+            "stonne_mem_matrix_c_file_name": c_result,
 
             # Memory Addresses
             "stonne_matrix_a_dram_address": dram_a_address,
@@ -163,10 +169,10 @@ def calculate_sparsity(tensor):
             "stonne_matrix_c_dram_address": dram_c_address,
 
             # CSR & Bitmap Initialization
-            "stonne_rowpointer_matrix_a_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in'),
-            "stonne_colpointer_matrix_a_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in'),
-            "stonne_rowpointer_matrix_b_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in'),
-            "stonne_colpointer_matrix_b_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in'),
+            "stonne_rowpointer_matrix_a_init": a_row_init,
+            "stonne_colpointer_matrix_a_init": a_col_init,
+            "stonne_rowpointer_matrix_b_init": b_row_init,
+            "stonne_colpointer_matrix_b_init": b_col_init,
         }
     }
     source_code = "graph = " + str(graph)
@@ -188,7 +194,12 @@ def calculate_sparsity(tensor):
     backsim = BackendSimulator(backend_path, stonne_config_path)
     result_path = backsim.simulation(onnx_path)
     result = BackendSimulator.get_result_from_file(result_path)
-    out.copy_(torch.matmul(a.cpu(), b.cpu()))
+
+    # Load result data
+    with open(c_result, 'rb') as f:
+        np_array = np.fromfile(f, dtype=TORCH_TO_NUMPY[out.dtype])
+        src_tensor = torch.as_strided(torch.from_numpy(np_array), out.size(), out.stride())
+        out.copy_(src_tensor.to(dtype=out.dtype))
 
 custom_lib.define("_sparse_mm(Tensor a, Tensor b, Tensor out) -> Tensor")
 custom_lib.impl("_sparse_mm", flexagon_frontend, "PrivateUse1")

From 0b793e3e7fc66cfc1d821e4aedac6e9b65104a27 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 19 Feb 2025 04:04:55 +0000
Subject: [PATCH 131/432] [BackendSim] Heterogeneous core implement

---
 .../configs/heterogeneous_c1_simple_noc.json  | 32 ++++++++++++++++
 PyTorchSimBackend/include/SimulationConfig.h  |  3 +-
 PyTorchSimBackend/include/Simulator.h         |  1 +
 PyTorchSimBackend/src/Common.cc               | 38 +++++++++++--------
 PyTorchSimBackend/src/Simulator.cc            |  8 ++--
 5 files changed, 60 insertions(+), 22 deletions(-)
 create mode 100644 PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json

diff --git a/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
new file mode 100644
index 00000000..21d045a3
--- /dev/null
+++ b/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
@@ -0,0 +1,32 @@
+{
+  "num_cores" : 2,
+  "num_sp_cores" : 1,
+  "stonne_config_path" : "/root/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
+  "core_freq" : 940,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 1,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" : 940,
+  "dram_channels": 32,
+  "dram_req_size": 32,
+  "dram_latency" : 10,
+  "dram_size" : 32,
+  "dram_nbl" : 1,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
+
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 7000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
+
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 1,
+  "partition": {
+    "core_0":0,
+    "core_1":0
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h
index bbdd27c3..3d655b63 100644
--- a/PyTorchSimBackend/include/SimulationConfig.h
+++ b/PyTorchSimBackend/include/SimulationConfig.h
@@ -5,7 +5,7 @@
 
 using json = nlohmann::json;
 
-enum class CoreType { WS_MESH, STONNE };
+enum class CoreType { WS_MESH, STONNE, HETEROGENEOUS };
 
 enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 };
 
@@ -18,6 +18,7 @@ struct SimulationConfig {
   CoreType core_type = CoreType::WS_MESH;
   std::string stonne_config_path;
   uint32_t num_cores;
+  uint32_t num_sp_cores;
   uint32_t core_freq;
   uint32_t sram_size;
   uint32_t core_print_interval = 0;
diff --git a/PyTorchSimBackend/include/Simulator.h b/PyTorchSimBackend/include/Simulator.h
index f00513ac..2e294616 100644
--- a/PyTorchSimBackend/include/Simulator.h
+++ b/PyTorchSimBackend/include/Simulator.h
@@ -42,6 +42,7 @@ class Simulator {
   uint32_t get_dest_node(mem_fetch *access);
   SimulationConfig _config;
   uint32_t _n_cores;
+  uint32_t _n_sp_cores;
   uint32_t _noc_node_per_core;
   uint32_t _n_memories;
   uint32_t _memory_req_size;
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index 60da8942..bafe2fce 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -18,28 +18,34 @@ SimulationConfig initialize_config(json config) {
   SimulationConfig parsed_config;
 
   /* Core configs */
-  if (config.contains("core_type")) {
-    if ((std::string)config["core_type"] == "ws_mesh")
-      parsed_config.core_type = CoreType::WS_MESH;
-    else if ((std::string)config["core_type"] == "stonne"){
-      parsed_config.core_type = CoreType::STONNE;
-      if (config.contains("stonne_config_path"))
-        parsed_config.stonne_config_path = config["stonne_config_path"];
-      else
-        throw std::runtime_error("Stonne config path is missing");
-    } else
-      throw std::runtime_error(fmt::format("Not implemented dram type {} ",
-                                          (std::string)config["core_type"]));
-  } else {
-    parsed_config.core_type = CoreType::WS_MESH;
-  }
+  if (config.contains("num_sp_cores"))
+    parsed_config.num_sp_cores = config["num_sp_cores"];
+  else
+    parsed_config.num_sp_cores = 0;
   parsed_config.num_cores = config["num_cores"];
   parsed_config.core_freq = config["core_freq"];
   parsed_config.sram_size = config["sram_size"];
-  if (config.contains("num_systolic_array_per_core"))
+  if (config.contains("num_systolic_array_per_core")) {
     parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"];
+    if (parsed_config.num_cores == parsed_config.num_sp_cores)
+      spdlog::warn("Systolic array is not used in sparse core");
+  }
   parsed_config.core_print_interval = get_config_value<uint32_t>(config, "core_print_interval");
 
+  if (parsed_config.num_sp_cores) {
+    parsed_config.core_type = CoreType::STONNE;
+    if (parsed_config.num_cores > parsed_config.num_sp_cores)
+      parsed_config.core_type = CoreType::HETEROGENEOUS;
+    else if (parsed_config.num_cores < parsed_config.num_sp_cores)
+      throw std::runtime_error("Core number should be larger or equal to sparse core number");
+
+    if (config.contains("stonne_config_path"))
+      parsed_config.stonne_config_path = config["stonne_config_path"];
+    else
+      throw std::runtime_error("Stonne config path is missing");
+  } else if (parsed_config.num_sp_cores == 0)
+    parsed_config.core_type = CoreType::WS_MESH;
+
   /* DRAM config */
   if ((std::string)config["dram_type"] == "simple")
     parsed_config.dram_type = DramType::SIMPLE;
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 6182ff00..02a0546d 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -12,6 +12,7 @@ Simulator::Simulator(SimulationConfig config)
   _slot_id = 0;
   _max_slot = 2;
   _n_cores = config.num_cores;
+  _n_sp_cores = config.num_sp_cores;
   _n_memories = config.dram_channels;
   _memory_req_size = config.dram_req_size;
   _noc_node_per_core = config.icnt_node_per_core;
@@ -22,16 +23,13 @@ Simulator::Simulator(SimulationConfig config)
   // Create core objects
   _cores.resize(_n_cores);
   for (int core_index = 0; core_index < _n_cores; core_index++) {
-    if (config.core_type == CoreType::WS_MESH) {
+    if (core_index < _n_cores-_n_sp_cores) {
       spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB, Systolic array per core: {}",
         core_index, config.core_freq , config.sram_size, config.num_systolic_array_per_core);
       _cores.at(core_index) = std::make_unique<Core>(core_index, _config);
-    } else if (config.core_type == CoreType::STONNE) {
+    } else {
       spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq);
       _cores.at(core_index) = std::make_unique<SparseCore>(core_index, _config);
-    } else {
-      spdlog::error("[Configuration] Invalid core type...!");
-      exit(EXIT_FAILURE);
     }
   }
 

From b3fa7c5a08aa15b2e1381cf113061f2a7d4262d4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 04:44:38 +0000
Subject: [PATCH 132/432] [Backendsim] Heterogeneous config modification

---
 .../configs/heterogeneous_c1_simple_noc.json  |  4 +-
 PyTorchSimBackend/include/SimulationConfig.h  |  5 +--
 PyTorchSimBackend/src/Common.cc               | 45 ++++++++++---------
 PyTorchSimBackend/src/Simulator.cc            |  8 ++--
 4 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
index 21d045a3..8fad4829 100644
--- a/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
+++ b/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
@@ -1,6 +1,6 @@
 {
+  "core_type" : ["ws_mesh", "stonne"],
   "num_cores" : 2,
-  "num_sp_cores" : 1,
   "stonne_config_path" : "/root/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
   "core_freq" : 940,
   "sram_size" : 65536,
@@ -20,7 +20,7 @@
   "icnt_type" : "simple",
   "icnt_latency" : 7,
   "icnt_freq" : 7000,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt",
 
   "precision" : 4,
   "scheduler" : "simple",
diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h
index 3d655b63..713ac008 100644
--- a/PyTorchSimBackend/include/SimulationConfig.h
+++ b/PyTorchSimBackend/include/SimulationConfig.h
@@ -5,7 +5,7 @@
 
 using json = nlohmann::json;
 
-enum class CoreType { WS_MESH, STONNE, HETEROGENEOUS };
+enum class CoreType { WS_MESH, STONNE };
 
 enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 };
 
@@ -15,10 +15,9 @@ enum class L2CacheType { NOCACHE, READONLY };
 
 struct SimulationConfig {
   /* Core config */
-  CoreType core_type = CoreType::WS_MESH;
+  std::vector<CoreType> core_type;
   std::string stonne_config_path;
   uint32_t num_cores;
-  uint32_t num_sp_cores;
   uint32_t core_freq;
   uint32_t sram_size;
   uint32_t core_print_interval = 0;
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index bafe2fce..5d2a6ece 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -18,33 +18,36 @@ SimulationConfig initialize_config(json config) {
   SimulationConfig parsed_config;
 
   /* Core configs */
-  if (config.contains("num_sp_cores"))
-    parsed_config.num_sp_cores = config["num_sp_cores"];
-  else
-    parsed_config.num_sp_cores = 0;
   parsed_config.num_cores = config["num_cores"];
+  if (config.contains("core_type")) {
+    std::vector<std::string> core_types = config["core_type"].get<std::vector<std::string>>();
+
+    if (core_types.size() != parsed_config.num_cores)
+      throw std::runtime_error("Mismatch between num_cores and core_type list size");
+
+    for (const auto& core_type : core_types) {
+      if (core_type == "ws_mesh") {
+        parsed_config.core_type.push_back(CoreType::WS_MESH);
+      } else if (core_type == "stonne") {
+        parsed_config.core_type.push_back(CoreType::STONNE);
+      } else {
+        throw std::runtime_error(fmt::format("Not implemented core type: {}", core_type));
+      }
+    }
+  } else {
+    /* Used WS as default */
+    for (int i=0; i<parsed_config.num_cores; i++)
+      parsed_config.core_type.push_back(CoreType::WS_MESH);
+  }
   parsed_config.core_freq = config["core_freq"];
   parsed_config.sram_size = config["sram_size"];
-  if (config.contains("num_systolic_array_per_core")) {
+  if (config.contains("num_systolic_array_per_core"))
     parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"];
-    if (parsed_config.num_cores == parsed_config.num_sp_cores)
-      spdlog::warn("Systolic array is not used in sparse core");
-  }
   parsed_config.core_print_interval = get_config_value<uint32_t>(config, "core_print_interval");
 
-  if (parsed_config.num_sp_cores) {
-    parsed_config.core_type = CoreType::STONNE;
-    if (parsed_config.num_cores > parsed_config.num_sp_cores)
-      parsed_config.core_type = CoreType::HETEROGENEOUS;
-    else if (parsed_config.num_cores < parsed_config.num_sp_cores)
-      throw std::runtime_error("Core number should be larger or equal to sparse core number");
-
-    if (config.contains("stonne_config_path"))
-      parsed_config.stonne_config_path = config["stonne_config_path"];
-    else
-      throw std::runtime_error("Stonne config path is missing");
-  } else if (parsed_config.num_sp_cores == 0)
-    parsed_config.core_type = CoreType::WS_MESH;
+  /* Stonne config */ 
+  if (config.contains("stonne_config_path"))
+    parsed_config.stonne_config_path = config["stonne_config_path"];
 
   /* DRAM config */
   if ((std::string)config["dram_type"] == "simple")
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 02a0546d..2d8a86d1 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -12,7 +12,6 @@ Simulator::Simulator(SimulationConfig config)
   _slot_id = 0;
   _max_slot = 2;
   _n_cores = config.num_cores;
-  _n_sp_cores = config.num_sp_cores;
   _n_memories = config.dram_channels;
   _memory_req_size = config.dram_req_size;
   _noc_node_per_core = config.icnt_node_per_core;
@@ -23,13 +22,16 @@ Simulator::Simulator(SimulationConfig config)
   // Create core objects
   _cores.resize(_n_cores);
   for (int core_index = 0; core_index < _n_cores; core_index++) {
-    if (core_index < _n_cores-_n_sp_cores) {
+    if (config.core_type[core_index] == CoreType::WS_MESH) {
       spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB, Systolic array per core: {}",
         core_index, config.core_freq , config.sram_size, config.num_systolic_array_per_core);
       _cores.at(core_index) = std::make_unique<Core>(core_index, _config);
-    } else {
+    } else if(config.core_type[core_index] == CoreType::STONNE) {
       spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq);
       _cores.at(core_index) = std::make_unique<SparseCore>(core_index, _config);
+    } else {
+      throw std::runtime_error(fmt::format("Not implemented Core type {} ",
+                                          (int)config.core_type[core_index]));
     }
   }
 

From 68f13bcc40f02eb235140803c534ed1058a05906 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 19 Feb 2025 05:12:53 +0000
Subject: [PATCH 133/432] [Frontend] Dryrun support for sparse.mm

---
 PyTorchSimFrontend/extension_op.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 09d5feb0..f02772a5 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -5,6 +5,7 @@
 import torch
 import numpy as np
 from torch._inductor.select_algorithm import ExternKernelChoice
+from torch._inductor.codecache import get_hash
 from AsmParser.tog_generator import tog_generator
 from torch._inductor.codecache import write
 from PyTorchSimFrontend.extension_codecache import get_write_path
@@ -176,6 +177,7 @@ def calculate_sparsity(tensor):
         }
     }
     source_code = "graph = " + str(graph)
+    torch.ops.extension_op._sparse_mm.future = get_hash(source_code)
 
     write_path = get_write_path(source_code)
     key, raw_tog_path = write(source_code, "py", specified_dir=write_path)
@@ -187,6 +189,11 @@ def calculate_sparsity(tensor):
         offset=0,
         vector_lane=0
     )
+
+    is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
+    if is_dryrun:
+        return
+
     onnx_path = os.path.join(write_path, "tile_graph.onnx")
     #attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")
     backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
@@ -203,4 +210,4 @@ def calculate_sparsity(tensor):
 
 custom_lib.define("_sparse_mm(Tensor a, Tensor b, Tensor out) -> Tensor")
 custom_lib.impl("_sparse_mm", flexagon_frontend, "PrivateUse1")
-custom_lib.impl("_sparse_mm", flexagon_frontend, "AutogradPrivateUse1")
+custom_lib.impl("_sparse_mm", flexagon_frontend, "AutogradPrivateUse1")
\ No newline at end of file

From 166b7511a4137657e64f78179aa7fe1e82f38dd1 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 19 Feb 2025 05:22:25 +0000
Subject: [PATCH 134/432] [Frontend] Get DRYRUN, EAGER_MODE env param at the
 running point

---
 PyTorchSimFrontend/extension_codecache.py     | 2 +-
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 3 ++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    | 7 +++++--
 Simulator/simulator.py                        | 3 ++-
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index ee4e8c17..67845c72 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -328,7 +328,7 @@ def dummy_simulator(*args, **kwargs):
         def dryrun_simulator(*args, **kwargs):
             key = future.result()
 
-        is_dryrun = extension_config.CONFIG_BACKENDSIM_DRYRUN
+        is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
         target_simulator = dryrun_simulator if is_dryrun else dummy_simulator
         target_simulator.arg_attributes = arg_attributes
         target_simulator.future = future
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index a267c93f..bb6ee2a0 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -658,6 +658,7 @@ def render(self,
         return code
 
     def outer_func_render(self, kernel_name, input_args):
+        eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False))
         options = dict(
             KERNEL_NAME=kernel_name,
             FUNC_NAME=self.function_name,
@@ -670,7 +671,7 @@ def outer_func_render(self, kernel_name, input_args):
             MULTI_TILE=self.is_multi_tile(self.input_shape[1]),
             SINGLE_BATCH=self.is_single_batch(self.input_shape[0]),
             VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
-            BACKENDSIM_EAGER_MODE=extension_config.CONFIG_BACKENDSIM_EAGER_MODE,
+            BACKENDSIM_EAGER_MODE=eager_mode,
             HASH_VALUE=self.hash_value
         )
         code = self._template_from_string(WRAPPER_TEMPLATE).render(**options)
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index dca37f42..87a05302 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -1,3 +1,4 @@
+import os
 import math
 from PyTorchSimFrontend import extension_config
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
@@ -84,7 +85,8 @@ def codegen_nodes(self, nodes):
         ex_kernel.call_kernel(kernel_name)
         _, args, _, _ = ex_kernel.args.mlir_argdefs()
         args = ", ".join(args)
-        if (extension_config.CONFIG_BACKENDSIM_EAGER_MODE):
+        eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False))
+        if (eager_mode):
             V.graph.wrapper_code.writeline(
                 f"yield ({kernel_name}, ({args}))"
             )
@@ -176,7 +178,8 @@ def codegen_template(self, template_node, epilogue_nodes):
         V.graph.removed_buffers |= kernel.removed_buffers
         _, args, _, _ = self.kernel_group.args.mlir_argdefs()
         args = ", ".join(args)
-        if (extension_config.CONFIG_BACKENDSIM_EAGER_MODE):
+        eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False))
+        if (eager_mode):
             target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name
             V.graph.wrapper_code.writeline(
                 f"yield ({target_kernel_name}, ({args}))"
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index b6292f07..c5184872 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -147,7 +147,8 @@ def show_progress():
         gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)]
         try:
             # Create progress thread
-            if not extension_config.CONFIG_BACKENDSIM_DRYRUN:
+            is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
+            if is_dryrun:
                 print("[Gem5Simulator] cmd> ", " ".join(gem5_cmd))
                 finished = False
                 progress_thread = threading.Thread(target=show_progress)

From 6a96c487d2125be795f6cade10015afe2e8d7152 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 19 Feb 2025 05:24:34 +0000
Subject: [PATCH 135/432] [Frontend] Small fix for kernel launch by sparse.mm

---
 Scheduler/scheduler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index f1f0b35a..1aa8f814 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -230,7 +230,7 @@ def finish_model(self, model : SchedulerDNNModel, output : torch.Tensor):
             self.finish_req_dict[req] = RequestReturn(RequestReturn.FINISHED)
 
     def prepare_launch_kernel(self, kernel, inputs):
-        key = kernel.future.result()
+        key = kernel.future.result() if hasattr(kernel.future, "result") else kernel.future
         result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key))
         onnx_path = os.path.join(result_path, "tile_graph.onnx")
 

From 7576134cde67c89326cf197b60dc2299c8bff8f8 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 19 Feb 2025 05:43:50 +0000
Subject: [PATCH 136/432] [Test] Add test for sparse.mm

---
 tests/test_sparse_core.py    | 52 ++++++++++++++++++++++++++++++++++++
 tests/test_spmm_scheduler.py | 35 ++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 tests/test_sparse_core.py
 create mode 100644 tests/test_spmm_scheduler.py

diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py
new file mode 100644
index 00000000..b54b8be2
--- /dev/null
+++ b/tests/test_sparse_core.py
@@ -0,0 +1,52 @@
+import torch
+import torch.nn as nn
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+class MLP(nn.Module):
+    def __init__(self, input_size, hidden_size, output_size):
+        super(MLP, self).__init__()
+        # self.fc1 = nn.Linear(input_size, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, output_size)
+        # self.relu = nn.ReLU()
+
+        # bias_mean = -0.7
+        # bias_std = 0.5
+        # self.fc1.bias.data = torch.normal(mean=bias_mean, std=bias_std, size=self.fc1.bias.shape)
+
+    def forward(self, x):
+        # x = self.fc1(x)
+        # x = self.relu(x)
+        x = torch.sparse.mm(x, self.fc2.weight.T) + self.fc2.bias
+        return x
+
+def test_sparse_mlp(device, batch=32, input_size=128, hidden_size=128, output_size=128):
+    torch.manual_seed(5462)
+    mlp = MLP(input_size, hidden_size, output_size)
+    mlp = mlp.to(device=device)
+    input = torch.randn(batch, input_size)
+    x1 = input.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(mlp)
+    res = opt_fn(x1)
+
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_sparse_mlp(device, 32, 128, 128, 128)
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
new file mode 100644
index 00000000..c1c2cb9d
--- /dev/null
+++ b/tests/test_spmm_scheduler.py
@@ -0,0 +1,35 @@
+import os
+import sys
+import torch
+
+sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+
+from test_sparse_core import MLP as model1
+
+
+target_model1 = model1(16, 16, 16).eval()
+
+# Init scheduler
+scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE)
+# Register compiled model
+
+opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))
+SchedulerDNNModel.register_model("mlp", opt_model1)
+
+# Init input data
+model_input1 = torch.randn(16, 16)
+
+# Init request
+new_request1 = Request("mlp", [model_input1], [], request_queue_idx=0)
+
+
+# Add request to scheduler
+scheduler.add_request(new_request1, request_time=0)
+
+# Run scheduler
+while not scheduler.is_finished():
+    scheduler.schedule()
+
+print("Done")
\ No newline at end of file

From e915fa5bbaa457111fc479d98b8616fe0ddb5796 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 05:24:35 +0000
Subject: [PATCH 137/432] [Scheduler] Fix scheduler example path

---
 tests/test_scheduler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 6eb2f0e0..cec07fb3 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -2,10 +2,10 @@
 import sys
 import torch
 from torchvision.models import resnet18 as model1
+from test_transformer import DecoderBlock as model2
 
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-from test_extension_backend import DecoderBlock as model2
 
 target_model1 = model1().eval()
 target_model2 = model2(768, 12).eval()

From 96c5fc8a3e46a10bc312ceedc8fcf2e38ac06ecc Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 05:44:01 +0000
Subject: [PATCH 138/432] [Scheduler] Fix eagermode option for convolution

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 4 +++-
 PyTorchSimFrontend/mlir/mlir_lowering.py      | 2 +-
 Simulator/simulator.py                        | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index bb6ee2a0..763acb50 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -467,7 +467,9 @@ def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif
     t_{{ OUT }} = {{ OUT }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
     {% endif -%}
     {{ KERNEL_NAME }}(t_{{ INPUT }}, t_{{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, t_{{ OUT }})
-
+    {% if BACKENDSIM_EAGER_MODE %}
+    yield ({{KERNEL_NAME}}, (t_{{ INPUT }}, t_{{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, t_{{ OUT }}))
+    {% endif %}
     # Transpose back
     {%- if SINGLE_BATCH %}
     {{ OUT }}.copy_(t_{{ OUT }}.permute(0, 3, 1, 2).contiguous()) # (BATCH, O_H, O_W, O_C) -> (BATCH, O_C, O_H, O_W)
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index f364b546..79fec9ae 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -158,4 +158,4 @@ def sparse_addmm(*args, **kwargs):
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
 lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
 lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
-# lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # TODO: maxpool shpuld be implemeneted through llir
\ No newline at end of file
+lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # TODO: maxpool shpuld be implemeneted through llir
\ No newline at end of file
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index c5184872..92400dde 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -148,7 +148,7 @@ def show_progress():
         try:
             # Create progress thread
             is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
-            if is_dryrun:
+            if not is_dryrun:
                 print("[Gem5Simulator] cmd> ", " ".join(gem5_cmd))
                 finished = False
                 progress_thread = threading.Thread(target=show_progress)

From 2a7db1aa32f5cf707e10051c9da7c4d1d136d0b1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 08:28:49 +0000
Subject: [PATCH 139/432] [Scheduler] Support externcall eager mode

---
 .../configs/stonne_c1_simple_noc_tpuv3.json        |  7 +++----
 PyTorchSimFrontend/extension_op.py                 |  6 +++---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py    |  1 +
 Scheduler/scheduler.py                             | 14 ++++++++++----
 tests/test_spmm_scheduler.py                       |  3 ++-
 5 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
index 8bce391d..56f84eb6 100644
--- a/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
@@ -1,5 +1,5 @@
 {
-  "core_type" : "stonne",
+  "core_type" : ["stonne"],
   "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
   "num_cores" : 1,
   "core_freq" : 940,
@@ -23,9 +23,8 @@
 
   "precision" : 4,
   "scheduler" : "simple",
-  "num_partition" : 2,
+  "num_partition" : 1,
   "partition": {
-    "core_0":0,
-    "core_1":0
+    "core_0":0
   }
 }
\ No newline at end of file
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index f02772a5..ca396f94 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -14,7 +14,7 @@
 
 class MLIRExternKernelChoice(ExternKernelChoice):
     def call_name(self):
-        return f"torch.ops.extension_op.{self.name}"
+        return f"yield from flexagon_frontend"
 
 custom_lib = torch.library.Library("extension_op", "DEF")
 
@@ -80,7 +80,6 @@ def generate_outer_product_matrix(a, b, M, K, N):
     return 0, address_matrix_b, address_matrix_c
 
 def flexagon_frontend(a, b, out):
-    print("FLEXAGON FRONTEND")
     M = a.shape[0]
     N = b.shape[1]
     K = b.shape[0]
@@ -190,11 +189,12 @@ def calculate_sparsity(tensor):
         vector_lane=0
     )
 
+    onnx_path = os.path.join(write_path, "tile_graph.onnx")
     is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
     if is_dryrun:
+        yield (onnx_path, "")
         return
 
-    onnx_path = os.path.join(write_path, "tile_graph.onnx")
     #attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")
     backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
     stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json'
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index bb6871e7..47b823e4 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -91,6 +91,7 @@ def write_header(self):
 
                 from torch import device, empty, empty_strided
                 from {extension_codecache.__name__} import CustomAsyncCompile
+                from PyTorchSimFrontend.extension_op import flexagon_frontend
                 from torch._inductor.select_algorithm import extern_kernels
 
                 aten = torch.ops.aten
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 1aa8f814..9e9dc434 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -245,9 +245,11 @@ def launch_kernel(self, current_cycle, partion_idx=0):
         result = self.select_kernel(partion_idx)
         if result == self.SELECT_NOTHING:
             return None
-
         kernel, inputs = result
-        onnx_path, attribute_path = self.prepare_launch_kernel(kernel, inputs)
+        if not isinstance(kernel, str):
+            onnx_path, attribute_path = self.prepare_launch_kernel(kernel, inputs)
+        else:
+            onnx_path, attribute_path = kernel, inputs
         self.partition_state[partion_idx] = self.PARTITION_BUSY
         return self.backend_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx)
 
@@ -267,6 +269,10 @@ def select_kernel(self, partition_idx):
             try:
                 kernel, inputs = next(target_model)
 
+                # For extern call
+                if isinstance(kernel, str):
+                    return kernel, inputs
+
                 # For convolution...
                 if not hasattr(kernel, "future"):
                     nested_gen = kernel(*inputs)
@@ -331,7 +337,7 @@ def select_kernel(self, partition_idx):
 class Scheduler:
     FIFO_ENGINE = 0
     RR_ENGINE = 1
-    def __init__(self, num_request_queue=1, engine_select=FIFO_ENGINE) -> None:
+    def __init__(self, num_request_queue=1, engine_select=FIFO_ENGINE, backend_config=extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) -> None:
         self.current_time = 0
         self.max_batch = 1
         self.num_request_queue = num_request_queue
@@ -341,7 +347,7 @@ def __init__(self, num_request_queue=1, engine_select=FIFO_ENGINE) -> None:
         self.finish_queue : List[Request] = []
 
         backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
-        self.backend_simulator = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG)
+        self.backend_simulator = BackendSimulator(backend_path, backend_config)
         self.backend_simulator.interactive_simulation()
         if engine_select == Scheduler.FIFO_ENGINE:
             self.execution_engine = FIFOExecutionEngine(self.backend_simulator, self.num_request_queue)
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
index c1c2cb9d..7920857d 100644
--- a/tests/test_spmm_scheduler.py
+++ b/tests/test_spmm_scheduler.py
@@ -12,7 +12,8 @@
 target_model1 = model1(16, 16, 16).eval()
 
 # Init scheduler
-scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE)
+scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
+                      backend_config="/workspace/PyTorchSim/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json")
 # Register compiled model
 
 opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))

From 924252cdde2b7a56c271138009b8772d0db60df9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 11:25:00 +0000
Subject: [PATCH 140/432] [Scheduler] Support hetero core scheduling

---
 AsmParser/onnx_utility.py                       |  6 +++++-
 AsmParser/tog_generator.py                      |  4 ++--
 PyTorchSimBackend/include/SparseCore.h          |  2 +-
 PyTorchSimBackend/include/TileGraph.h           |  1 +
 PyTorchSimBackend/include/scheduler/Scheduler.h |  3 ++-
 PyTorchSimBackend/src/Simulator.cc              |  2 +-
 PyTorchSimBackend/src/SparseCore.cc             | 11 ++++++++++-
 PyTorchSimBackend/src/TileGraphParser.cc        |  3 +++
 PyTorchSimBackend/src/scheduler/Scheduler.cc    |  6 ++++--
 PyTorchSimFrontend/extension_op.py              |  9 ++++++---
 tests/test_spmm_scheduler.py                    |  4 +++-
 11 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/AsmParser/onnx_utility.py b/AsmParser/onnx_utility.py
index 0ee0fda0..19e5b5f8 100644
--- a/AsmParser/onnx_utility.py
+++ b/AsmParser/onnx_utility.py
@@ -152,7 +152,7 @@ def connect_nodes(parent, child):
     child.add_parent(parent)
     parent.add_child(child)
 
-def dump_onnx_graph(name, node_list, sa_size, origin_info="dummy_tile_graph"):
+def dump_onnx_graph(name, node_list, sa_size, origin_info="dummy_tile_graph", stonneGraph=False):
     graph_def = onnx.helper.make_graph(
         inputs=[],
         outputs=[],
@@ -164,6 +164,10 @@ def dump_onnx_graph(name, node_list, sa_size, origin_info="dummy_tile_graph"):
     meta = model_def.metadata_props.add()
     meta.key = "systolic_size"
     meta.value = str(sa_size)
+
+    meta = model_def.metadata_props.add()
+    meta.key = "stonneGraph"
+    meta.value = str(int(stonneGraph))
     onnx.save(model_def, name)
 
 if __name__ == "__main__":
diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 6a665b7b..80b7b895 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -198,7 +198,7 @@ def parse_graph(self):
             connect_nodes(prev_node, end_node)
             prev_node = end_node
 
-    def generate_tile_graph(self, name="tile_graph", cycle_list=list, offset=int, vector_lane=int):
+    def generate_tile_graph(self, name="tile_graph", cycle_list=list, offset=int, vector_lane=int, stonneGraph=False):
         node_list = list(self.node_dict.values())[1:]
         if len(node_list):
             node_list[0].set_parent([])
@@ -215,7 +215,7 @@ def generate_tile_graph(self, name="tile_graph", cycle_list=list, offset=int, ve
 
         origin_info = "_".join(map(str, self.origins))
         onnx_node_list = [node.to_onnx() for node in node_list] # Exclude root node
-        dump_onnx_graph(name, onnx_node_list, vector_lane, origin_info)
+        dump_onnx_graph(name, onnx_node_list, vector_lane, origin_info, stonneGraph=stonneGraph)
 
 if __name__ == "__main__":
     t = tog_generator()
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index 8f43347e..fab0dd16 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -19,7 +19,7 @@ class SparseCore : public Core {
   void push_memory_response(mem_fetch* response) override;
   void print_stats() override;
   void print_current_stats() override;
-
+  std::shared_ptr<Tile> pop_finished_tile() override;
   uint32_t r_port_nr = 1;
   uint32_t w_port_nr = 1;
 private:
diff --git a/PyTorchSimBackend/include/TileGraph.h b/PyTorchSimBackend/include/TileGraph.h
index d17ee4b3..6bd281d5 100644
--- a/PyTorchSimBackend/include/TileGraph.h
+++ b/PyTorchSimBackend/include/TileGraph.h
@@ -63,6 +63,7 @@ class TileGraph {
   std::string get_name() { return _name; }
   void set_arrival_time(cycle_type arrival_time) { _arrival_time = arrival_time; }
   cycle_type get_arrival_time() { return _arrival_time; }
+  bool StonneGraph = false;
 
   class Iterator {
    public:
diff --git a/PyTorchSimBackend/include/scheduler/Scheduler.h b/PyTorchSimBackend/include/scheduler/Scheduler.h
index 21567547..39ab7576 100644
--- a/PyTorchSimBackend/include/scheduler/Scheduler.h
+++ b/PyTorchSimBackend/include/scheduler/Scheduler.h
@@ -3,6 +3,7 @@
 #include "Tile.h"
 #include "Common.h"
 #include "TileGraph.h"
+#include "SimulationConfig.h"
 
 class Scheduler {
  public:
@@ -12,7 +13,7 @@ class Scheduler {
 
   /* For other schedulers */
   virtual std::shared_ptr<Tile> get_tile(int core_id=0, int slot_id=0);
-  virtual const std::shared_ptr<Tile> peek_tile(int core_id=0, int slot_id=0);
+  virtual const std::shared_ptr<Tile> peek_tile(int core_id=0, int slot_id=0, CoreType ctype=CoreType::WS_MESH);
   virtual bool empty();
   virtual bool empty(int core_id);
   virtual void refresh_status();
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 2d8a86d1..b4637513 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -80,7 +80,7 @@ void Simulator::core_cycle() {
 
     // Issue new tile to core
     for (int i=0; i<_max_slot; i++, _slot_id=(_slot_id + 1) % _max_slot) {
-      const std::shared_ptr<Tile> tile = get_partition_scheduler(core_id)->peek_tile(core_id, _slot_id);
+      const std::shared_ptr<Tile> tile = get_partition_scheduler(core_id)->peek_tile(core_id, _slot_id, _config.core_type[core_id]);
       if (tile->get_status() != Tile::Status::EMPTY && _cores[core_id]->can_issue(tile))  {
         if (tile->get_status() == Tile::Status::INITIALIZED) {
           _cores[core_id]->issue(std::move(get_partition_scheduler(core_id)->get_tile(core_id, _slot_id)));
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 642ecbd7..88ff10d7 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -103,7 +103,7 @@ void SparseCore::cycle() {
     }
   }
 
-  if (stonneCore->isFinished()) {
+  if (stonneCore->isFinished() && _tiles.size()) {
     stonneCore->finish();
     std::shared_ptr<Tile> target_tile = _tiles.front();
     target_tile->set_status(Tile::Status::FINISH);
@@ -130,4 +130,13 @@ void SparseCore::print_stats() {
 
 void SparseCore::print_current_stats() {
   print_stats();
+}
+
+std::shared_ptr<Tile> SparseCore::pop_finished_tile() {
+  // Pop the front of _finished_tiles; an EMPTY placeholder tile is built only when nothing is queued.
+  if (_finished_tiles.empty())
+    return std::make_unique<Tile>(Tile(Tile::Status::EMPTY));
+  std::shared_ptr<Tile> result = std::move(_finished_tiles.front());
+  _finished_tiles.pop();
+  return result;
 }
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index a5922fa1..3ed4e439 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -712,6 +712,9 @@ TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
   }
 
   _tile_graph = std::make_unique<TileGraph>(TileGraph(onnx_path, graph_name));
+  if (std::stoi(this->getMetaByName("stonneGraph")))
+    _tile_graph->StonneGraph=true;
+
   /* Generate subgraph */
   if (_loop_nodes.empty()) {
     spdlog::warn("[TileGraphParser] Null Kernel \"{}\"", onnx_path);
diff --git a/PyTorchSimBackend/src/scheduler/Scheduler.cc b/PyTorchSimBackend/src/scheduler/Scheduler.cc
index 5bf2f6ee..bb5d29cf 100644
--- a/PyTorchSimBackend/src/scheduler/Scheduler.cc
+++ b/PyTorchSimBackend/src/scheduler/Scheduler.cc
@@ -11,10 +11,12 @@ void Scheduler::schedule_graph(std::unique_ptr<TileGraph> tile_graph) {
   refresh_status();
 }
 
-const std::shared_ptr<Tile> Scheduler::peek_tile(int core_id, int slot_id) {
+const std::shared_ptr<Tile> Scheduler::peek_tile(int core_id, int slot_id, CoreType ctype) {
   if (_tile_graph.empty() || _tile_graph.at(0)->get_arrival_time() > *_core_cycle)
     return std::make_unique<Tile>(Tile(Tile::Status::EMPTY));
-  return _tile_graph.at(0)->peek_tile(core_id, slot_id);
+  if ((!_tile_graph.at(0)->StonneGraph && ctype == CoreType::WS_MESH) || (_tile_graph.at(0)->StonneGraph && ctype == CoreType::STONNE))
+    return _tile_graph.at(0)->peek_tile(core_id, slot_id);
+  return std::make_unique<Tile>(Tile(Tile::Status::EMPTY));
 }
 
 std::shared_ptr<Tile> Scheduler::get_tile(int core_id, int slot_id) {
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index ca396f94..0a33c4dd 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -14,7 +14,10 @@
 
 class MLIRExternKernelChoice(ExternKernelChoice):
     def call_name(self):
-        return f"yield from flexagon_frontend"
+        is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
+        if is_dryrun:
+            return f"yield from flexagon_frontend"
+        return f"torch.ops.extension_op.{self.name}"
 
 custom_lib = torch.library.Library("extension_op", "DEF")
 
@@ -176,7 +179,6 @@ def calculate_sparsity(tensor):
         }
     }
     source_code = "graph = " + str(graph)
-    torch.ops.extension_op._sparse_mm.future = get_hash(source_code)
 
     write_path = get_write_path(source_code)
     key, raw_tog_path = write(source_code, "py", specified_dir=write_path)
@@ -186,7 +188,8 @@ def calculate_sparsity(tensor):
         os.path.join(write_path, "tile_graph.onnx"),
         cycle_list=[0],
         offset=0,
-        vector_lane=0
+        vector_lane=0,
+        stonneGraph=True
     )
 
     onnx_path = os.path.join(write_path, "tile_graph.onnx")
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
index 7920857d..7c6b76f8 100644
--- a/tests/test_spmm_scheduler.py
+++ b/tests/test_spmm_scheduler.py
@@ -13,7 +13,7 @@
 
 # Init scheduler
 scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
-                      backend_config="/workspace/PyTorchSim/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json")
+                      backend_config="/workspace/PyTorchSim/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json")
 # Register compiled model
 
 opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))
@@ -24,10 +24,12 @@
 
 # Init request
 new_request1 = Request("mlp", [model_input1], [], request_queue_idx=0)
+new_request2 = Request("mlp", [model_input1], [], request_queue_idx=0)
 
 
 # Add request to scheduler
 scheduler.add_request(new_request1, request_time=0)
+scheduler.add_request(new_request2, request_time=100)
 
 # Run scheduler
 while not scheduler.is_finished():

From 9732588ae7f667332f0b69f74a745eed40ee4efa Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 19 Feb 2025 16:01:55 +0000
Subject: [PATCH 141/432] [Fix] Config parameters

---
 PyTorchSimFrontend/extension_config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 6f5b3e00..0b04babe 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -34,9 +34,9 @@
 # Backendsim config
 CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG',
                                         default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json')
-CONFIG_BACKENDSIM_SPIKE_ONLY = bool(os.environ.get("BACKENDSIM_SPIKE_ONLY", False))
-CONFIG_BACKENDSIM_EAGER_MODE = bool(os.environ.get("BACKENDSIM_EAGER_MODE", default=False))
-CONFIG_BACKENDSIM_DRYRUN = bool(int(os.environ.get('BACKENDSIM_DRYRUN', default=0)))
+CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False))
+CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False))
+CONFIG_BACKENDSIM_DRYRUN = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
 CONFIG_BACKENDSIM_DEBUG_LEVEL = os.environ.get("BACKENDSIM_DEBUG_LEVEL", "")
 
 # GEM5 config

From 0f38e0f38eac441faa535c124e5406e2d4217260 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 18 Feb 2025 08:22:08 +0000
Subject: [PATCH 142/432] [Frontend] CONV preload optimization

---
 AsmParser/tog_generator.py                    |  7 +++++--
 PyTorchSimFrontend/extension_codecache.py     | 10 +++++++---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 20 +++++++++----------
 PyTorchSimFrontend/mlir/mlir_template.py      |  2 +-
 4 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 80b7b895..6d519ce8 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -198,8 +198,9 @@ def parse_graph(self):
             connect_nodes(prev_node, end_node)
             prev_node = end_node
 
-    def generate_tile_graph(self, name="tile_graph", cycle_list=list, offset=int, vector_lane=int, stonneGraph=False):
+    def generate_tile_graph(self, name="tile_graph", cycle_list=list, x_offset=int, w_offset=int, vector_lane=int, stonneGraph=False):
         node_list = list(self.node_dict.values())[1:]
+        is_preload = True # FIXME: first systolic array node is preload
         if len(node_list):
             node_list[0].set_parent([])
             for iter_node in self.node_dict.values():
@@ -211,7 +212,9 @@ def generate_tile_graph(self, name="tile_graph", cycle_list=list, offset=int, ve
                         iter_node.torchsim_cycle = 10
                     # FIXME.
                     if iter_node.torchsim_compute_type == 1:
-                        iter_node.torchsim_overlapping_cycle = iter_node.torchsim_cycle - offset
+                        offset = w_offset if is_preload else x_offset
+                        is_preload = False
+                        iter_node.torchsim_overlapping_cycle = max(iter_node.torchsim_cycle - offset, 0)
 
         origin_info = "_".join(map(str, self.origins))
         onnx_node_list = [node.to_onnx() for node in node_list] # Exclude root node
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 67845c72..660481d5 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -200,15 +200,19 @@ def load(cls, source_code,
         cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), " ".join(array_size), vectorlane_size)
 
         # Create TOG
-        offset = vectorlane_size
+        w_offset, x_offset = vectorlane_size, vectorlane_size
         if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
-            offset = kwargs['loop_size'][-3]
+            x_offset = kwargs['loop_size'][-3]
+        if kwargs['loop_size'] is not None and kwargs['loop_size'][-1] < vectorlane_size:
+            w_offset = kwargs['loop_size'][-1]
+        w_offset -= x_offset
         tile_graph_generator = tog_generator(origins)
         tile_graph_generator.load_file(raw_tog_path)
         tile_graph_generator.generate_tile_graph(
             os.path.join(write_path, "tile_graph.onnx"),
             cycle_list=cycle_list,
-            offset=offset, # FIXME.
+            x_offset=x_offset, # FIXME.
+            w_offset=w_offset, # FIXME.
             vector_lane=vectorlane_size
         )
         return key
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 763acb50..e4c4cb02 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -101,15 +101,15 @@
                     : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
                 affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                   affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                    %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                    %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                     affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                       affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
                         %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                         %tile_i_w = affine.apply #map_I_W(%tile_o_w, %tile_k_w)
                         %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_i_w)
-                        %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
                         %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
                         %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                        %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                         %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                               outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
@@ -193,15 +193,15 @@
                     : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
                 affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                   affine.for %tile_k_w = 0 to 1 {
+                    %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                    %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                     affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                       affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
                         %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                         %tile_i_w = affine.apply #map_I_W(%tile_o_w, %tile_k_w)
                         %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_i_w)
-                        %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
                         %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
                         %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                        %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                         %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                               outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
@@ -303,14 +303,14 @@
                   : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
               affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                 affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                  %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                   affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                     affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
                       %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                       %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
-                      %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
                       %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
                       %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                      %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                       %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                       linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                             outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
@@ -415,14 +415,14 @@
                   : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
               affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                 affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                  %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                   affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                     affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
                       %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                       %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
-                      %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
                       %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
                       %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                      %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                       %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                       linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                             outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
@@ -584,7 +584,6 @@ def render(self,
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
           SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-          BATCH = TILE_M # For TOG latency (heuristic)
           x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H, TILE_K)
           y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
           x_spad_size = TILE_I_W * TILE_I_H * TILE_K
@@ -599,9 +598,8 @@ def render(self,
           y_spad_size = TILE_O_H * TILE_M * TILE_N
           SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-          BATCH = TILE_M # For TOG latency (heuristic)
 
-        kernel.loop_size = [K_H, K_W, O_H, O_W, BATCH, O_C, I_C]
+        kernel.loop_size = [TILE_M, TILE_N, TILE_K]
 
         # FIXME: transposed inputs not supported
         # W_transposed = self.is_transposed(W)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 75d52332..a21f3732 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -120,7 +120,7 @@ def gemm_combination_mapping(self, M, N, K):
         max_spad_size = spad_size // 2 # double buffer
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
-        k_pad_factor = self.vector_lane if K > self.vector_lane else 8
+        k_pad_factor = self.vector_lane if K > self.vector_lane else 1
         M_padded = ((M + m_pad_factor - 1) // m_pad_factor) * m_pad_factor
         N_padded = ((N + n_pad_factor - 1) // n_pad_factor) * n_pad_factor
         K_padded = ((K + k_pad_factor - 1) // k_pad_factor) * k_pad_factor

From 728a73bc8ebea3abe3b5ea14fca07a0a3bd2dfa3 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 18 Feb 2025 10:21:24 +0000
Subject: [PATCH 143/432] [Script] ResNet Experiments Scripts

---
 experiments/resnet18.py | 42 +++++++++++++++++++++++++++++++++++++++++
 experiments/resnet50.py | 42 +++++++++++++++++++++++++++++++++++++++++
 scripts/end2end.sh      | 35 ++++++++++++++++++++++++++++++++++
 3 files changed, 119 insertions(+)
 create mode 100644 experiments/resnet18.py
 create mode 100644 experiments/resnet50.py
 create mode 100755 scripts/end2end.sh

diff --git a/experiments/resnet18.py b/experiments/resnet18.py
new file mode 100644
index 00000000..a27b005d
--- /dev/null
+++ b/experiments/resnet18.py
@@ -0,0 +1,42 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+import argparse
+import subprocess
+import datetime
+
+def run_resnet(device, batch):
+    """Build a BasicBlock ResNet with layers [1, 1, 0, 0], compile it, and run one forward pass on `device`."""
+    from torchvision.models import resnet
+    model = resnet._resnet(resnet.BasicBlock, [1, 1, 0, 0], weights=None, progress=False).eval()
+    model.to(device, memory_format=torch.channels_last)
+    x = torch.randn(batch, 3, 224, 224).to(device=device)
+    x = x.to(device=device, memory_format=torch.channels_last)
+    torch.compile(dynamic=False)(model)(x)  # forward result is unused
+    print("ResNet18 Simulation Done")
+
+if __name__ == "__main__":
+    import os
+    import sys
+    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    sys.path.append(base_dir)  # lets the Scheduler package below be imported from base_dir
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--batch', type=int, default=1)
+    parser.add_argument('--dump_path', type=str, default='results')
+    args = parser.parse_args()
+    batch = args.batch
+    result_path = os.path.join(base_dir, args.dump_path, f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    # setting environment variables
+    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    # only timing simulation
+    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
+    os.environ.pop('BACKENDSIM_SPIKE_ONLY', None)
+
+    # imported after the environment variables above are set
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    run_resnet(device, batch)
+    # compute cycles with shell script; argv list (no shell) keeps paths with spaces intact
+    subprocess.run([f"{base_dir}/scripts/end2end.sh", result_path])
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
new file mode 100644
index 00000000..ff4f0215
--- /dev/null
+++ b/experiments/resnet50.py
@@ -0,0 +1,42 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+import argparse
+import subprocess
+import datetime
+
+def run_resnet(device, batch):
+    """Build torchvision's resnet50 with default arguments, compile it, and run one forward pass on `device`."""
+    from torchvision.models import resnet50
+    model = resnet50().eval()
+    model.to(device, memory_format=torch.channels_last)
+    x = torch.randn(batch, 3, 224, 224).to(device=device)
+    x = x.to(device=device, memory_format=torch.channels_last)
+    torch.compile(dynamic=False)(model)(x)  # forward result is unused
+    print("ResNet50 Simulation Done")
+
+if __name__ == "__main__":
+    import os
+    import sys
+    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    sys.path.append(base_dir)  # lets the Scheduler package below be imported from base_dir
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--batch', type=int, default=1)
+    parser.add_argument('--dump_path', type=str, default='results')
+    args = parser.parse_args()
+    batch = args.batch
+    result_path = os.path.join(base_dir, args.dump_path, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    # setting environment variables
+    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    # only timing simulation
+    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
+    os.environ.pop('BACKENDSIM_SPIKE_ONLY', None)
+
+    # imported after the environment variables above are set
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    run_resnet(device, batch)
+    # compute cycles with shell script; argv list (no shell) keeps paths with spaces intact
+    subprocess.run([f"{base_dir}/scripts/end2end.sh", result_path])
diff --git a/scripts/end2end.sh b/scripts/end2end.sh
new file mode 100755
index 00000000..81095bd5
--- /dev/null
+++ b/scripts/end2end.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Sum the last reported "Total cycle" value of every file found under any
+# backendsim_result directory below the given base path.
+#
+# Usage: end2end.sh <result_dir>
+
+# Base directory
+BASE_PATH=$1 # Input as the first argument
+if [[ -z "$BASE_PATH" ]]; then
+  echo "usage: $0 <result_dir>" >&2
+  exit 1
+fi
+
+# Initialize the total cycle sum
+total_sum=0
+
+# Find all backendsim_result folders
+mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result")
+
+# Iterate over each backendsim_result folder
+for backend_folder in "${backend_folders[@]}"; do
+  # Find all files within the backendsim_result folder
+  mapfile -t files < <(find "$backend_folder" -type f)
+
+  for file in "${files[@]}"; do
+    # Keep only the number from the last "Total cycle <N>" line; -n/p makes sed
+    # print nothing for non-matching lines, so stray text never reaches $(( )).
+    total_cycle=$(sed -nE 's/.*Total cycle ([0-9]+).*/\1/p' "$file" | tail -n 1)
+    if [[ -n "$total_cycle" ]]; then
+      total_sum=$((total_sum + total_cycle))
+    fi
+  done
+done
+
+echo "total end2end cycle: $total_sum"
\ No newline at end of file

From dd35c1f19f518130220f9004fead844b63c16973 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 19 Feb 2025 04:28:12 +0000
Subject: [PATCH 144/432] [Frontend] GEMM & CONV optimize

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  |  4 +-
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 61 +++++++++----------
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  2 +-
 PyTorchSimFrontend/mlir/mlir_template.py      | 28 ++++-----
 4 files changed, 45 insertions(+), 50 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 85520658..ca690bf2 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -95,8 +95,8 @@ def render(self,
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
         B, M, N, K = X.get_size()[0], X.get_size()[1], W.get_size()[2], X.get_size()[2]
-        TILE_M, TILE_N, TILE_K = kernel.gemmini_gemm_mapping(M, N, K)
-        kernel.loop_size = [M, N, K]
+        TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
+        kernel.loop_size = [TILE_M, TILE_N, TILE_K]
 
         W_transposed = self.is_transposed(W)
         X_transposed = self.is_transposed(X)
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index e4c4cb02..299b69c0 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -133,15 +133,14 @@
 
 MULTI_TILE_CONV_TEMPLATE = r"""
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ I_C }} + d2 * {{ I_C * (I_W + 2 * PADDING_W) }} + d3)> // input (I_H, BATCH, I_W, I_C)
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ I_C * STRIDE_W }} + d2 * {{ I_C * (I_W + 2 * PADDING_W) }} + d3)> // input (I_H, BATCH, I_W, I_C)
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
 #map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
-#map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
 #offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(1 * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
-#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W * TILE_M, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
+#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
 #offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
 
-memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 
@@ -153,7 +152,7 @@
   %vstride = arith.constant 1 : index
   %input_axis = arith.constant 3 : index
   %weight_axis = arith.constant 2 : index
-  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
   %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %output_buffer = memref.get_global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
@@ -179,37 +178,33 @@
           affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
           {%- endif %}
           affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
-            affine.for %k_w = 0 to {{ 1 }} step {{ 1 }} {
-              affine.for %tile_k = 0 to {{ TILE_K }} step {{ TILE_K }} {
-                %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
-                %index_i_w = affine.apply #map_I_W(%o_w, %k_w)
-                %index1 = affine.apply #map1(%index_i_h, %index_i_w, %tile_m, %tile_k) // input index
-                %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
-                // Load input matrix
-                memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
-                // Load kernel matrix
-                memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
-                affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
-                  affine.for %tile_k_w = 0 to 1 {
-                    %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
-                    %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                    affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
-                      affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
-                        %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
-                        %tile_i_w = affine.apply #map_I_W(%tile_o_w, %tile_k_w)
-                        %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_i_w)
-                        %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                        %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                        %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                        linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                              outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                      } { inner_loop=true }
+            affine.for %tile_k = 0 to {{ I_C * K_W }} step {{ TILE_K }} {
+              %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
+              %index1 = affine.apply #map1(%index_i_h, %o_w, %tile_m, %tile_k) // input index
+              %index2 = affine.apply #map2(%k_h, %c0, %tile_k, %tile_n) // weight index
+              // Load input matrix
+              memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
+                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_O_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
+              // Load kernel matrix
+              memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
+                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+              affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
+                affine.for %tile_k_w = 0 to 1 {
+                  %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                  affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
+                    affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
+                      %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
+                      %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_o_w)
+                      %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                     } { inner_loop=true }
                   } { inner_loop=true }
                 } { inner_loop=true }
-              } { accumulation_loop=true }
+              } { inner_loop=true }
             } { accumulation_loop=true }
           } { accumulation_loop=true }
           // Store output matrix
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 9426a2e7..07604d61 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -110,7 +110,7 @@ def render(self,
         else:
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
             template = GEMM_TEMPLATE
-        kernel.loop_size =[M, N, K]
+        kernel.loop_size =[TILE_M, TILE_N, TILE_K]
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index a21f3732..49cc6c56 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -121,29 +121,29 @@ def gemm_combination_mapping(self, M, N, K):
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
         k_pad_factor = self.vector_lane if K > self.vector_lane else 1
+        K = max(K, 8)
         M_padded = ((M + m_pad_factor - 1) // m_pad_factor) * m_pad_factor
         N_padded = ((N + n_pad_factor - 1) // n_pad_factor) * n_pad_factor
         K_padded = ((K + k_pad_factor - 1) // k_pad_factor) * k_pad_factor
+        indexI, indexJ, indexK = (M_padded // self.vector_lane, N_padded // self.vector_lane, K_padded // self.vector_lane)
 
         max_used_spad_size = 0
         mapping = (self.vector_lane, self.vector_lane, self.vector_lane)
-        tile_M_range = range(self.vector_lane, M_padded + 1, self.vector_lane) if M > self.vector_lane else [M_padded]
-        tile_N_range = range(self.vector_lane, N_padded + 1, self.vector_lane) if N > self.vector_lane else [N_padded]
-        tile_K_range = range(self.vector_lane, K_padded + 1, self.vector_lane) if K > self.vector_lane else [K_padded]
-        for tile_M in tile_M_range:
-            for tile_N in tile_N_range:
-                for tile_K in tile_K_range:
+        tile_M_range = sympy.divisors(indexI) if M > self.vector_lane else [1]
+        tile_N_range = sympy.divisors(indexJ) if N > self.vector_lane else [1]
+        tile_K_range = sympy.divisors(indexK) if K > self.vector_lane else [1]
+        maximize_i_j = 1 # reuse weight
+        for k in tile_K_range:
+            tile_K = k * self.vector_lane if K > self.vector_lane else K_padded
+            for j in tile_N_range:
+                tile_N = j * self.vector_lane if N > self.vector_lane else N_padded
+                for i in tile_M_range:
+                    tile_M = i * self.vector_lane if M > self.vector_lane else M_padded
                     used_spad_size = (tile_M * tile_K + tile_K * tile_N + tile_M * tile_N) * self.precision
-                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size:
+                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and maximize_i_j <= tile_M * tile_N:
                         max_used_spad_size = used_spad_size
+                        maximize_i_j = tile_M * tile_N
                         mapping = (tile_M, tile_N, tile_K)
-
-        Outer_M = math.ceil(M_padded / mapping[0])
-        Outer_N = math.ceil(N_padded / mapping[1])
-        Outer_K = math.ceil(K_padded / mapping[2])
-
-        # split mapping equally to avoid unnecessary padding
-        mapping = (M_padded // Outer_M, N_padded // Outer_N, K_padded // Outer_K)
         return mapping
 
     def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation):

From b6b47e3edf04cab7bdd42f7ced71fdf32b122d78 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 19 Feb 2025 04:28:59 +0000
Subject: [PATCH 145/432] [Script] Bert experiments script

---
 experiments/BERT.py | 131 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 experiments/BERT.py

diff --git a/experiments/BERT.py b/experiments/BERT.py
new file mode 100644
index 00000000..35b3f3aa
--- /dev/null
+++ b/experiments/BERT.py
@@ -0,0 +1,131 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+import math
+import copy
+
+import argparse
+import subprocess
+import datetime
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):  # Compare a device result against a CPU reference tensor.
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):  # out is copied to the CPU before comparing
+        print("-" * len(message))  # banner rule sized to the message
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())  # mismatch: dump both tensors for inspection
+        print("cpu out: ", cpu_out)
+        exit(1)  # stop the script with a non-zero status
+
+def clones(module, N):
+    "Return a torch.nn.ModuleList holding N deep copies of module."
+    return torch.nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
+
+class my_MultiheadAttention(torch.nn.Module):  # Multi-head attention built from four d_model x d_model Linear layers.
+    def __init__(self, h, d_model, dropout=0.1):  # NOTE: dropout is accepted but not used anywhere in this class
+        super(my_MultiheadAttention, self).__init__()
+        assert d_model % h == 0
+        # We assume d_v always equals d_k
+        self.d_k = d_model // h
+        self.h = h
+        self.linears = clones(torch.nn.Linear(d_model, d_model), 4)  # three input projections + one output projection
+        self.attn = None
+
+    def attention(self, query, key, value):
+        d_k = query.size(-1)
+        # Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.
+
+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
+        p_attn = scores.softmax(dim=-1)
+        # Return both the attended values and the attention probabilities.
+        return torch.matmul(p_attn, value), p_attn
+
+    def forward(self, query, key, value):
+        # 1) Do all the linear projections in batch from d_model => h x d_k
+        query, key, value = [
+            lin(x).view(-1, self.h, self.d_k).transpose(0, 1).contiguous()
+            for lin, x in zip(self.linears, (query, key, value))
+        ]
+
+        # 2) Apply attention on all the projected vectors in batch.
+        # The computation is inlined here rather than calling self.attention(),
+        # so self.attn is not updated by forward().
+
+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
+        p_attn = scores.softmax(dim=-1)
+        x = torch.matmul(p_attn, value)
+        # 3) "Concat" using a view and apply a final linear.
+        x = (
+            x.transpose(0, 1)
+            .contiguous()
+            .view(-1, self.h * self.d_k)
+        )
+        del query
+        del key
+        del value
+        return self.linears[-1](x)  # the fourth Linear is the output projection
+
+class DecoderBlock(torch.nn.Module):  # Self-attention followed by a two-layer feed-forward net, each with residual add + LayerNorm.
+    def __init__(self, embed_dim, num_heads):
+        super(DecoderBlock, self).__init__()
+        self.multihead_attn = my_MultiheadAttention(num_heads, embed_dim)
+        self.layer_norm = torch.nn.LayerNorm(embed_dim)  # a single LayerNorm instance, applied twice in forward()
+        self.ffn1 = torch.nn.Linear(embed_dim, embed_dim*4)  # feed-forward expands to 4 * embed_dim
+        self.act = torch.nn.ReLU()
+        self.ffn2 = torch.nn.Linear(embed_dim*4, embed_dim)  # and projects back to embed_dim
+
+    def forward(self, x):
+        result = self.multihead_attn(x, x, x)  # self-attention: x serves as query, key and value
+        result = self.layer_norm(result+x)  # residual add, then normalize
+
+        ffn1_result = self.ffn1(result)
+        act_result = self.act(ffn1_result)
+        ffn2_result = self.ffn2(act_result)
+        return self.layer_norm(ffn2_result + result)  # second residual add goes through the same LayerNorm
+
+def run_BERT(device, size, input_seq, validation):  # Build one DecoderBlock for the given size, compile it and run it on device.
+    hidden_dim = {'base': 768, 'large': 1024, 'xlarge': 2048}  # size must be one of these keys
+    embedding_size = {'base': 768, 'large': 1024, 'xlarge': 2048}
+    heads = {'base': 12, 'large': 16, 'xlarge': 32} # hidden/64 https://arxiv.org/pdf/1909.11942
+    cpu_query = torch.randn(input_seq, hidden_dim[size])  # random input of shape (input_seq, hidden)
+    decoder_block = DecoderBlock(embedding_size[size], heads[size])
+    cpu_res = decoder_block(cpu_query)  # CPU reference, computed before the module is moved to device
+
+    query = cpu_query.clone().to(device=device)
+    decoder_block.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(decoder_block)
+    res = opt_fn(query)
+
+    if validation:  # compare against the CPU reference only when requested
+        test_result(f"BERT-{size} Forwrad", res, cpu_res)
+    print(f"BERT-{size} Simulation Done")
+
+if __name__ == "__main__":
+    import os
+    import sys
+    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    sys.path.append(base_dir)  # lets the Scheduler package under base_dir be imported below
+    args = argparse.ArgumentParser()
+    args.add_argument('--size', type=str, default='base')
+    args.add_argument('--dump_path', type=str, default='results')
+    args.add_argument('--input_size', type=int, default=512)
+    args.add_argument('--validation', type=int, default=0)
+    args = args.parse_args()
+    size = args.size
+    input_seq = args.input_size
+    result_path = os.path.join(base_dir, args.dump_path, f"BERT_{size}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    # setting environment variables
+    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    # only timing simulation
+    os.environ['TORCHSIM_VALIDATION_MODE'] = str(args.validation)
+    if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
+        del os.environ['BACKENDSIM_SPIKE_ONLY']
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    run_BERT(device, size, input_seq, args.validation)
+    # compute cycles with shell script (argv list, no shell: paths with spaces or metacharacters are passed verbatim)
+    subprocess.run(["bash", f"{base_dir}/scripts/end2end.sh", result_path])

From bad477c369d72da5d6070be0ce92d9f1a4d5f53b Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 19 Feb 2025 04:55:38 +0000
Subject: [PATCH 146/432] [Fix] prevent negative w_offset

---
 PyTorchSimFrontend/extension_codecache.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 660481d5..ae38ef75 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -205,7 +205,7 @@ def load(cls, source_code,
             x_offset = kwargs['loop_size'][-3]
         if kwargs['loop_size'] is not None and kwargs['loop_size'][-1] < vectorlane_size:
             w_offset = kwargs['loop_size'][-1]
-        w_offset -= x_offset
+        w_offset = max(w_offset - x_offset, 0)
         tile_graph_generator = tog_generator(origins)
         tile_graph_generator.load_file(raw_tog_path)
         tile_graph_generator.generate_tile_graph(

From 77e03556461827838bc3cec2b699b2da286a9c1f Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 19 Feb 2025 10:47:15 +0000
Subject: [PATCH 147/432] [Script] GEMM & CONV script

---
 experiments/conv.py | 64 +++++++++++++++++++++++++++++++++++++++++++++
 experiments/gemm.py | 62 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+)
 create mode 100644 experiments/conv.py
 create mode 100644 experiments/gemm.py

diff --git a/experiments/conv.py b/experiments/conv.py
new file mode 100644
index 00000000..74dd7f06
--- /dev/null
+++ b/experiments/conv.py
@@ -0,0 +1,64 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+import argparse
+import subprocess
+import datetime
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):  # Compare a device result against a CPU reference tensor.
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):  # out is copied to the CPU before comparing
+        print("-" * len(message))  # banner rule sized to the message
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())  # mismatch: dump both tensors for inspection
+        print("cpu out: ", cpu_out)
+        exit(1)  # stop the script with a non-zero status
+
+def run_conv2d(device, batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding, validation):  # Compile and run one Conv2d on device.
+    def custom_conv2d(a, b, bias):  # a: input, b: weight, bias: bias vector; channel counts are re-derived from the tensors
+        i_c = a.shape[1]
+        o_c = b.shape[0]
+        conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=False)
+        conv2d.weight = torch.nn.Parameter(b)
+        conv2d.bias = torch.nn.Parameter(bias)  # bias is attached here although the layer was created with bias=False
+        return conv2d(a)
+    torch.manual_seed(0)
+    conv_input = torch.randn(batch_size, i_c, i_h, i_w).to(memory_format=torch.channels_last, device=device)
+    conv_kernel = torch.randn(o_c, i_c, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device)
+    conv_bias = torch.randn(o_c).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(custom_conv2d)
+    res = opt_fn(conv_input, conv_kernel, conv_bias)
+    out = custom_conv2d(conv_input.cpu(), conv_kernel.cpu(), conv_bias.cpu())  # CPU reference from the uncompiled function
+    if validation:
+        test_result("CONV Forward", res, out)  # compare with the CPU reference (the name `y` was never defined here)
+    print(f"CONV {batch_size}_{i_h}_{i_w}_{i_c}_{o_c}_{kernel_size}_{stride}_{padding} (B_H_W_I_C_O_C_K_S_P) Simulation Done")
+
+if __name__ == "__main__":
+    import os
+    import sys
+    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    sys.path.append(base_dir)  # lets the Scheduler package under base_dir be imported below
+    args = argparse.ArgumentParser()
+    args.add_argument('--size', nargs='+', type=int, default=[8, 28, 28, 128, 128, 3, 1, 1], help='B H W I_C O_C K S P')
+    args.add_argument('--dump_path', type=str, default='results')
+    args.add_argument('--validation', type=int, default=0)
+    args = args.parse_args()
+    size = args.size  # indexed 0..7 below, so eight values are expected
+    size_str = "_".join([str(i) for i in size])
+    result_path = os.path.join(base_dir, args.dump_path, f"CONV_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    # setting environment variables
+    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    # only timing simulation
+    os.environ['TORCHSIM_VALIDATION_MODE'] = str(args.validation)
+    if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
+        del os.environ['BACKENDSIM_SPIKE_ONLY']
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    run_conv2d(device, size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], args.validation)
+    # compute cycles with shell script (argv list, no shell: paths with spaces or metacharacters are passed verbatim)
+    subprocess.run(["bash", f"{base_dir}/scripts/end2end.sh", result_path])
diff --git a/experiments/gemm.py b/experiments/gemm.py
new file mode 100644
index 00000000..72363cf2
--- /dev/null
+++ b/experiments/gemm.py
@@ -0,0 +1,62 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+import argparse
+import subprocess
+import datetime
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):  # Compare a device result against a CPU reference tensor.
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):  # out is copied to the CPU before comparing
+        print("-" * len(message))  # banner rule sized to the message
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())  # mismatch: dump both tensors for inspection
+        print("cpu out: ", cpu_out)
+        exit(1)  # stop the script with a non-zero status
+
+def run_matmul(device, input_size, hidden_size, output_size, validation):  # Compile and run one (M x K) @ (K x N) matmul on device.
+    def custom_matmul(a, b):
+        return torch.matmul(a, b)
+    torch.manual_seed(0)  # fixed seed so the random operands are reproducible
+    input = torch.randn(input_size, hidden_size)  # M x K operand (note: shadows the builtin input inside this function)
+    weight = torch.randn(hidden_size, output_size)  # K x N operand
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(custom_matmul)
+    res = opt_fn(x1, w1)
+    y = custom_matmul(x2, w2)  # CPU reference from the uncompiled function
+    if validation:  # compare against the CPU reference only when requested
+        test_result("Matmul Forward", res, y)
+    print(f"GEMM {input_size}x{hidden_size}x{output_size} (MxKxN) Simulation Done")
+
+if __name__ == "__main__":
+    import os
+    import sys
+    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    sys.path.append(base_dir)  # lets the Scheduler package under base_dir be imported below
+    args = argparse.ArgumentParser()
+    args.add_argument('--size', nargs='+', type=int, default=[128, 128, 128], help='M K N')
+    args.add_argument('--dump_path', type=str, default='results')
+    args.add_argument('--validation', type=int, default=0)
+    args = args.parse_args()
+    size = args.size  # indexed 0..2 below, so three values are expected
+    size_str = "x".join([str(i) for i in size])
+    result_path = os.path.join(base_dir, args.dump_path, f"GEMM_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    # setting environment variables
+    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    # only timing simulation
+    os.environ['TORCHSIM_VALIDATION_MODE'] = str(args.validation)
+    if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
+        del os.environ['BACKENDSIM_SPIKE_ONLY']
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    run_matmul(device, size[0], size[1], size[2], args.validation)
+    # compute cycles with shell script (argv list, no shell: paths with spaces or metacharacters are passed verbatim)
+    subprocess.run(["bash", f"{base_dir}/scripts/end2end.sh", result_path])

From 3ab24a531b5ae93494da2dcd6197d0ef6af3c95a Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 19 Feb 2025 13:20:20 +0000
Subject: [PATCH 148/432] [Fix] BMM spad_per_lane fix

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py | 6 +++---
 tests/test_bmm.py                            | 4 +++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index ca690bf2..35919ef6 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -126,9 +126,9 @@ def render(self,
         code = self._template_from_string(BMM_TEMPLATE).render(**kernel.render_options)
         kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
 
-        self.header = f"float X_spad[{TILE_M * TILE_K // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{TILE_K * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{TILE_M * TILE_N // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
+        self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
+        self.header += f"float Y_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
diff --git a/tests/test_bmm.py b/tests/test_bmm.py
index 73831c5c..1114c750 100644
--- a/tests/test_bmm.py
+++ b/tests/test_bmm.py
@@ -49,4 +49,6 @@ def bmm(a, b, bias):
     test_BMM(device, 2, 256, 128, 256)
     test_BMM(device, 2, 128, 256, 256)
     test_BMM(device, 2, 256, 256, 128)
-    test_BMM(device, 4, 256, 256, 256)
\ No newline at end of file
+    test_BMM(device, 4, 256, 256, 256)
+    test_BMM(device, 12, 512, 512, 64)
+    test_BMM(device, 16, 512, 512, 64)
\ No newline at end of file

From b45e70a383c8a5703bee2468f3a3e2fa6a176e8f Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 19 Feb 2025 16:34:19 +0000
Subject: [PATCH 149/432] [Testcase] Attention

---
 tests/test_transformer.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index 1cb1fd50..5959f511 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -93,7 +93,7 @@ def test_DecoderBlock(device):
 
     test_result("Decoder Block Forwrad", res, cpu_res)
 
-def test_Attention(device):
+def test_Attention(device, head=16, seq=512, d_k=64):
     def attention(query, key, value):
         import math
         d_k = query.size(-1)
@@ -102,9 +102,9 @@ def attention(query, key, value):
         return torch.matmul(p_attn, value), p_attn
 
     torch.manual_seed(0)
-    query = torch.randn(16, 128).to(device=device)
-    key = torch.randn(16, 128).to(device=device)
-    value = torch.randn(16, 128).to(device=device)
+    query = torch.randn(head, seq, d_k).to(device=device)
+    key = torch.randn(head, seq, d_k).to(device=device)
+    value = torch.randn(head, seq, d_k).to(device=device)
 
     opt_fn = torch.compile(dynamic=False)(attention)
     res, p_attn = opt_fn(query, key, value)
@@ -133,3 +133,4 @@ def test_MHA(device, num_heads=12, embed_dim=768):
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
     test_DecoderBlock(device)
+    # test_Attention(device, head=16, seq=512, d_k=64)

From 7c08012939aa43917890dd91a6a70546d1b759d8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 17:34:02 +0000
Subject: [PATCH 150/432] [Scheduler] Convert time domain msec and cycle

---
 Scheduler/scheduler.py | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 9e9dc434..1d019582 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -1,7 +1,6 @@
 from typing import List
 import os
-import sys
-import json
+import numpy as np
 import torch
 from pathlib import Path
 import importlib.util
@@ -23,6 +22,19 @@ def import_module_from_path(module_name, path):
 
     return module
 
+def poisson_request_generator(lambda_requests, max_msec_time=None):
+    current_time = 0.0 # msec
+
+    yield 0
+    while max_msec_time is None or current_time < max_msec_time:
+        inter_arrival_time = np.random.exponential(scale=1000 / lambda_requests)
+        current_time += inter_arrival_time
+
+        if max_msec_time is not None and current_time > max_msec_time:
+            break
+
+        yield current_time
+
 class Request:
     """ Each request has model name, it's own id, and requested time. """
     request_id = 0
@@ -335,11 +347,12 @@ def select_kernel(self, partition_idx):
         return self.SELECT_NOTHING
 
 class Scheduler:
+
     FIFO_ENGINE = 0
     RR_ENGINE = 1
-    def __init__(self, num_request_queue=1, engine_select=FIFO_ENGINE, backend_config=extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) -> None:
+    def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, backend_config=extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) -> None:
         self.current_time = 0
-        self.max_batch = 1
+        self.max_batch = max_batch
         self.num_request_queue = num_request_queue
         self.request_queue : List[List[Request]] = []
         for i in range(self.num_request_queue):
@@ -448,8 +461,8 @@ def schedule(self):
 
         # Need to forward the time until next_arrival_time
         if self.execution_engine.is_idle():
-            reason = self.backend_simulator.until(next_time)
-            self.current_time = self.backend_simulator.cycle()
+            reason = self.backend_simulator.until(self.msec_to_cycle(next_time))
+            self.current_time = self.cycle_to_msec(self.backend_simulator.cycle())
         else:
             self.run(next_time)
         return
@@ -458,7 +471,7 @@ def run(self, until_time):
         def execute_cycle():
             for i in range(self.execution_engine.num_partion):
                 if self.execution_engine.partition_state[i] == ExecutionEngine.PARTITION_IDLE:
-                    ret = self.execution_engine.launch_kernel(self.current_time, i)
+                    ret = self.execution_engine.launch_kernel(self.msec_to_cycle(self.current_time), i)
 
             self.check_finish_request()
             # Check if the stop condition is met
@@ -466,8 +479,8 @@ def execute_cycle():
                 return -1
 
             # Schedule jobs and update the current time
-            result = self.backend_simulator.until(until_time)
-            self.current_time = self.backend_simulator.cycle()
+            result = self.backend_simulator.until(self.msec_to_cycle(until_time))
+            self.current_time = self.cycle_to_msec(self.backend_simulator.cycle())
 
             if result != -1:
                 # Kernel is finished. So set idle state
@@ -504,5 +517,9 @@ def cycle_to_msec(self, cycle):
         return cycle / (freq  / 1000)
 
     def msec_to_cycle(self, msec):
+        # We treat -1 as special time
+        if (msec == -1):
+            return msec
+
         freq = self.backend_simulator.get_core_freq()
-        return msec * (freq / 1000)
\ No newline at end of file
+        return int(msec * (freq / 1000))
\ No newline at end of file

From 71ad1d1437879cbb3a624b15fa94e3b5f11368a0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 17:44:21 +0000
Subject: [PATCH 151/432] [Test] Add batching scenario test

---
 tests/test_scheduler_batching.py | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 tests/test_scheduler_batching.py

diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py
new file mode 100644
index 00000000..47fad69c
--- /dev/null
+++ b/tests/test_scheduler_batching.py
@@ -0,0 +1,33 @@
+import os
+import sys
+import torch
+from torchvision.models import resnet18 as model1
+
+sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator
+
+target_model1 = model1().eval()
+
+# Init scheduler
+scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE)
+# Register compiled model
+opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last))
+SchedulerDNNModel.register_model("resnet18", opt_model1)
+
+# Generate time stamp
+for request_time in poisson_request_generator(500, 30): # 0.5 request / msec, max 30 msec
+    # Init input data
+    model_input1 = torch.randn(1, 3, 224, 224)
+
+    # Init request
+    new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0)
+
+    # Add request to scheduler
+    print("[Reqest] Resnet18 request time: ", request_time)
+    scheduler.add_request(new_request1, request_time=request_time)
+
+# Run scheduler
+while not scheduler.is_finished():
+    scheduler.schedule()
+
+print("Done")
\ No newline at end of file

From 7b275ba668093fcaa443f19e5a8e51a55f2ade03 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 19 Feb 2025 19:27:19 +0000
Subject: [PATCH 152/432] [Test] Sparse.mm test setting

---
 PyTorchSimFrontend/extension_op.py |  3 +-
 tests/test_sparse_core.py          | 33 +++++++-------
 tests/test_spmm_scheduler.py       | 69 ++++++++++++++++++++++--------
 3 files changed, 69 insertions(+), 36 deletions(-)

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 0a33c4dd..6d294441 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -187,7 +187,8 @@ def calculate_sparsity(tensor):
     tile_graph_generator.generate_tile_graph(
         os.path.join(write_path, "tile_graph.onnx"),
         cycle_list=[0],
-        offset=0,
+        x_offset=0,
+        w_offset=0,
         vector_lane=0,
         stonneGraph=True
     )
diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py
index b54b8be2..60cce8e5 100644
--- a/tests/test_sparse_core.py
+++ b/tests/test_sparse_core.py
@@ -2,6 +2,7 @@
 import torch.nn as nn
 import torch._dynamo
 import torch.utils.cpp_extension
+import torch.nn.utils.prune as prune
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     message = f"|{name} Test Passed|"
@@ -15,27 +16,27 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         exit(1)
 
 class MLP(nn.Module):
-    def __init__(self, input_size, hidden_size, output_size):
+    def __init__(self, input_size=16, hidden_size=16, output_size=16, sparsity_fc1=0, sparsity_fc2=0):
         super(MLP, self).__init__()
-        # self.fc1 = nn.Linear(input_size, hidden_size)
-        self.fc2 = nn.Linear(hidden_size, output_size)
-        # self.relu = nn.ReLU()
+        self.fc1 = nn.Linear(input_size, hidden_size, bias=False)
+        self.fc2 = nn.Linear(hidden_size, output_size, bias=False)
 
-        # bias_mean = -0.7
-        # bias_std = 0.5
-        # self.fc1.bias.data = torch.normal(mean=bias_mean, std=bias_std, size=self.fc1.bias.shape)
+        prune.l1_unstructured(self.fc1, name="weight", amount=sparsity_fc1)
+        prune.l1_unstructured(self.fc2, name="weight", amount=sparsity_fc2)
+
+        prune.remove(self.fc1, "weight")
+        prune.remove(self.fc2, "weight")
 
     def forward(self, x):
-        # x = self.fc1(x)
-        # x = self.relu(x)
-        x = torch.sparse.mm(x, self.fc2.weight.T) + self.fc2.bias
+        x = torch.sparse.mm(x, self.fc1.weight.T)
+        x = torch.sparse.mm(x, self.fc2.weight.T)
         return x
 
-def test_sparse_mlp(device, batch=32, input_size=128, hidden_size=128, output_size=128):
-    torch.manual_seed(5462)
-    mlp = MLP(input_size, hidden_size, output_size)
+def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, output_size=128, bias_shift=0):
+    torch.manual_seed(0)
+    mlp = MLP(input_size, hidden_size, output_size, bias_shift)
     mlp = mlp.to(device=device)
-    input = torch.randn(batch, input_size)
+    input = torch.randn(batch_size, input_size)
     x1 = input.to(device=device)
     opt_fn = torch.compile(dynamic=False)(mlp)
     res = opt_fn(x1)
@@ -45,8 +46,8 @@ def test_sparse_mlp(device, batch=32, input_size=128, hidden_size=128, output_si
     import os
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim'))
-
     from Scheduler.scheduler import ExecutionEngine
+
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_sparse_mlp(device, 32, 128, 128, 128)
+    test_sparse_mlp(device)
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
index 7c6b76f8..462b1be0 100644
--- a/tests/test_spmm_scheduler.py
+++ b/tests/test_spmm_scheduler.py
@@ -2,37 +2,68 @@
 import sys
 import torch
 
+args = sys.argv
+# if len(args) == 6:
+#     batch_size = int(args[1])
+#     input_size = int(args[2])
+#     hidden_size = int(args[3])
+#     output_size = int(args[4])
+#     w1_sparsity = int(args[5]) * 0.01
+#     w2_sparsity = int(args[5]) * 0.01
+# else:
+#     print("Usage: python test_sparse_core.py <batch_size> <input_size> <hidden_size> <output_size> <bias_shift>")
+#     exit(1)
+
+batch_size = 16
+input_size = 16
+hidden_size = 16
+output_size = 16
+w1_sparsity = 0.1
+w2_sparsity = 0.7
+
+print("batch_size: ", batch_size)
+print("input_size: ", input_size)
+print("hidden_size: ", hidden_size)
+print("output_size: ", output_size)
+print("w1_sparsity: ", w1_sparsity)
+print("w2_sparsity: ", w2_sparsity)
+
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
 
 from test_sparse_core import MLP as model1
+from test_transformer import DecoderBlock as model2
 
+with torch.no_grad():
+    target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity).eval()
+    target_model2 = model2(768, 12).eval()
 
-target_model1 = model1(16, 16, 16).eval()
-
-# Init scheduler
-scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
-                      backend_config="/workspace/PyTorchSim/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json")
-# Register compiled model
+    # Init scheduler
+    scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
+                        backend_config="/root/workspace/PyTorchSim/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json")
+    # Register compiled model
 
-opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))
-SchedulerDNNModel.register_model("mlp", opt_model1)
+    opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))
+    opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device()))
+    SchedulerDNNModel.register_model("mlp", opt_model1)
+    SchedulerDNNModel.register_model("bert", opt_model2)
 
-# Init input data
-model_input1 = torch.randn(16, 16)
+    # Init input data
+    model_input1 = torch.randn(batch_size, input_size)
+    model_input2 = torch.randn(512, 768)
 
-# Init request
-new_request1 = Request("mlp", [model_input1], [], request_queue_idx=0)
-new_request2 = Request("mlp", [model_input1], [], request_queue_idx=0)
+    # Init request
+    new_request1 = Request("mlp", [model_input1], [], request_queue_idx=1)
+    new_request2 = Request("bert", [model_input2], [], request_queue_idx=0)
 
 
-# Add request to scheduler
-scheduler.add_request(new_request1, request_time=0)
-scheduler.add_request(new_request2, request_time=100)
+    # Add request to scheduler
+    scheduler.add_request(new_request1, request_time=0)
+    scheduler.add_request(new_request2, request_time=0)
 
-# Run scheduler
-while not scheduler.is_finished():
-    scheduler.schedule()
+    # Run scheduler
+    while not scheduler.is_finished():
+        scheduler.schedule()
 
 print("Done")
\ No newline at end of file

From 6f582c393245932096c8bd81c9ddbb1ffdc63665 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 19:29:10 +0000
Subject: [PATCH 153/432] [Scheduler] Fix batching test case

---
 Scheduler/scheduler.py           | 35 ++++++++++++++++++--------------
 tests/test_scheduler_batching.py |  2 +-
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 1d019582..f0129696 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -59,9 +59,6 @@ def allocate_id(self):
         Request.request_id += 1
         return allocated_id
 
-    def is_arrived(self, current_time):
-        return current_time >= self.arrival_time
-
     def set_start(self, start_time):
         self.state = self.RUNNING
         self.start_time.append(start_time)
@@ -351,7 +348,7 @@ class Scheduler:
     FIFO_ENGINE = 0
     RR_ENGINE = 1
     def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, backend_config=extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) -> None:
-        self.current_time = 0
+        self.current_cycle = 0
         self.max_batch = max_batch
         self.num_request_queue = num_request_queue
         self.request_queue : List[List[Request]] = []
@@ -372,7 +369,7 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE,
 
     def add_request(self, request: Request, request_time=-1):
         """register model at timestamp time"""
-        request_time = self.current_time if request_time == -1 else request_time
+        request_time = self.current_time() if request_time == -1 else request_time
         request.arrival_time = request_time
         self.request_queue[request.request_queue_idx].append(request)
 
@@ -385,7 +382,8 @@ def select(self, request_queue_idx=0) -> List[Request]:
         if not self.request_queue[request_queue_idx]:
             return candidate_req
         for req in self.request_queue[request_queue_idx]:
-            if req.is_arrived(self.current_time) and req.state == Request.QUEUED:
+
+            if self.msec_to_cycle(req.arrival_time) <= self.current_cycle and req.state == Request.QUEUED:
                 candidate_req.append(req)
 
                 # Stop batching
@@ -413,7 +411,7 @@ def nearest_next_reqeust_time(self):
         return nearest_req, nearest_arrival_time
 
     def finish_request(self, req : Request):
-        req.set_finished(self.current_time)
+        req.set_finished(self.current_time())
 
         # Free resources
         req.free_memory()
@@ -431,13 +429,14 @@ def per_schedule(self, request_queue_idx):
         if not request_list:
             return False
 
+        print(f"[Request issue] partition: {request_queue_idx} batch size: {len(request_list)}")
         for req in request_list:
-            req.set_start(self.current_time)
-
+            req.set_start(self.current_time())
+            print(f"[Request-{req.id} issue] partition: {req.request_queue_idx} "
+                f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}")
         # Submit batched request
         self.execution_engine.submit(request_list, request_queue_idx)
-        print(f"[Request-{req.id} issue] partition: {req.request_queue_idx} "
-              f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}")
+
         return True
 
     def check_finish_request(self):
@@ -462,7 +461,7 @@ def schedule(self):
         # Need to forward the time until next_arrival_time
         if self.execution_engine.is_idle():
             reason = self.backend_simulator.until(self.msec_to_cycle(next_time))
-            self.current_time = self.cycle_to_msec(self.backend_simulator.cycle())
+            self.current_cycle = self.backend_simulator.cycle()
         else:
             self.run(next_time)
         return
@@ -471,7 +470,7 @@ def run(self, until_time):
         def execute_cycle():
             for i in range(self.execution_engine.num_partion):
                 if self.execution_engine.partition_state[i] == ExecutionEngine.PARTITION_IDLE:
-                    ret = self.execution_engine.launch_kernel(self.msec_to_cycle(self.current_time), i)
+                    ret = self.execution_engine.launch_kernel(self.current_cycle, i)
 
             self.check_finish_request()
             # Check if the stop condition is met
@@ -480,7 +479,7 @@ def execute_cycle():
 
             # Schedule jobs and update the current time
             result = self.backend_simulator.until(self.msec_to_cycle(until_time))
-            self.current_time = self.cycle_to_msec(self.backend_simulator.cycle())
+            self.current_cycle = self.backend_simulator.cycle()
 
             if result != -1:
                 # Kernel is finished. So set idle state
@@ -488,6 +487,9 @@ def execute_cycle():
 
             return result
 
+        if self.current_cycle >= self.msec_to_cycle(until_time):
+            until_time = -1
+
         if until_time == -1:
             while not self.execution_engine.is_idle():
                 result = execute_cycle()
@@ -496,7 +498,7 @@ def execute_cycle():
                     break
 
         else:
-            while self.current_time <= until_time and not self.execution_engine.is_idle():
+            while self.current_cycle <= self.msec_to_cycle(until_time) and not self.execution_engine.is_idle():
                 result = execute_cycle()
                 # if result is not -1, schedule new request
                 if result == -1:
@@ -512,6 +514,9 @@ def is_request_queue_empty(self):
     def is_finished(self):
         return self.is_request_queue_empty() and self.execution_engine.is_idle()
 
+    def current_time(self):
+        return self.cycle_to_msec(self.current_cycle)
+
     def cycle_to_msec(self, cycle):
         freq = self.backend_simulator.get_core_freq()
         return cycle / (freq  / 1000)
diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py
index 47fad69c..4c842ab6 100644
--- a/tests/test_scheduler_batching.py
+++ b/tests/test_scheduler_batching.py
@@ -11,7 +11,7 @@
 # Init scheduler
 scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE)
 # Register compiled model
-opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last))
+opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
 SchedulerDNNModel.register_model("resnet18", opt_model1)
 
 # Generate time stamp

From 9f79b6fb66693e2ae95b0f05ed99d7fd16ef7804 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 20:47:02 +0000
Subject: [PATCH 154/432] [Frontend/sparse] add dummy attribute path to extern
 call

---
 PyTorchSimFrontend/extension_op.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 6d294441..1cbdc29c 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -194,9 +194,10 @@ def calculate_sparsity(tensor):
     )
 
     onnx_path = os.path.join(write_path, "tile_graph.onnx")
+    attribute_path = os.path.join(write_path, "attributes")
     is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
     if is_dryrun:
-        yield (onnx_path, "")
+        yield (onnx_path, attribute_path)
         return
 
     #attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")

From ffe2808fce82a6053aa02154116725c55c467e63 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Feb 2025 21:20:28 +0000
Subject: [PATCH 155/432] [Backendsim/Stonne] Set port count

---
 PyTorchSimBackend/src/SparseCore.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 88ff10d7..6ef153b3 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -10,6 +10,8 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
   unsigned int dn_width = stonneConfig.m_SDMemoryCfg.port_width;
   unsigned int rn_bw = stonneConfig.m_SDMemoryCfg.n_write_ports;
   unsigned int rn_width = stonneConfig.m_SDMemoryCfg.port_width;
+  r_port_nr = dn_bw;
+  w_port_nr = rn_bw;
 
   double compute_throughput = static_cast<double>(num_ms) * core_freq / 1e3; // FLOPs/sec
   double dn_bandwidth = static_cast<double>(dn_bw) * dn_width * core_freq * 1e6 / 8.0 / 1e9; // GB/s
@@ -84,7 +86,8 @@ void SparseCore::cycle() {
   }
 
   // Send Memory Response
-  if (!_response_queue.empty()) {
+  nr_request = 0;
+  while (!_response_queue.empty()) {
     mem_fetch* resp_wrapper = _response_queue.front();
     std::vector<SimpleMem::Request*>* resps = static_cast<std::vector<SimpleMem::Request*>*>(resp_wrapper->get_custom_data());
 
@@ -101,6 +104,8 @@ void SparseCore::cycle() {
       delete resp_wrapper;
       _response_queue.pop();
     }
+    if (nr_request++ > r_port_nr);
+      break;
   }
 
   if (stonneCore->isFinished() && _tiles.size()) {

From 6301f707b3c359845a7d8026899b11d37c5bd482 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 20 Feb 2025 04:29:26 +0000
Subject: [PATCH 156/432] [Scheduler] Don't select kernel until partition is
 idle

---
 Scheduler/scheduler.py           |  8 ++++--
 Simulator/simulator.py           |  2 +-
 tests/test_scheduler_batching.py | 45 ++++++++++++++++++--------------
 3 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index f0129696..0b0a61e1 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -425,15 +425,19 @@ def finish_request(self, req : Request):
               f"response time: {response_time} tbt_time: {tbt_time}")
 
     def per_schedule(self, request_queue_idx):
+        # Wait partition is idle
+        if not self.execution_engine.is_partition_idle(request_queue_idx):
+            return False
+
         request_list = self.select(request_queue_idx)
         if not request_list:
             return False
 
-        print(f"[Request issue] partition: {request_queue_idx} batch size: {len(request_list)}")
+        print(f"[Request issue] partition: {request_queue_idx} batch size: {len(request_list)}", flush=True)
         for req in request_list:
             req.set_start(self.current_time())
             print(f"[Request-{req.id} issue] partition: {req.request_queue_idx} "
-                f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}")
+                f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}", flush=True)
         # Submit batched request
         self.execution_engine.submit(request_list, request_queue_idx)
 
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 92400dde..08a582f1 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -256,7 +256,7 @@ def send_command(self, command):
         if self.process:
             try:
                 if not extension_config.CONFIG_BACKENDSIM_DRYRUN:
-                    print(command)
+                    print(command, flush=True)
                 self.process.stdin.write(command + '\n')
                 self.process.stdin.flush()
                 ret = self.process.stderr.readline().strip()
diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py
index 4c842ab6..1c32803d 100644
--- a/tests/test_scheduler_batching.py
+++ b/tests/test_scheduler_batching.py
@@ -2,32 +2,39 @@
 import sys
 import torch
 from torchvision.models import resnet18 as model1
+import argparse
 
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator
 
-target_model1 = model1().eval()
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Poisson Request Generator (ms)")
+    parser.add_argument("lambda_requests", nargs="?", type=int, help="Average requests per second (λ)", default=2000)
+    parser.add_argument("max_time", nargs="?", type=int, help="Maximum simulation time in milliseconds", default=30)
 
-# Init scheduler
-scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE)
-# Register compiled model
-opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
-SchedulerDNNModel.register_model("resnet18", opt_model1)
+    args = parser.parse_args()
+    target_model1 = model1().eval()
 
-# Generate time stamp
-for request_time in poisson_request_generator(500, 30): # 0.5 request / msec, max 30 msec
-    # Init input data
-    model_input1 = torch.randn(1, 3, 224, 224)
+    # Init scheduler
+    scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, backend_config="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json")
+    # Register compiled model
+    opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
+    SchedulerDNNModel.register_model("resnet18", opt_model1)
 
-    # Init request
-    new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0)
+    # Generate time stamp
+    for request_time in poisson_request_generator(args.lambda_requests, args.max_time):
+        # Init input data
+        model_input1 = torch.randn(1, 3, 224, 224)
 
-    # Add request to scheduler
-    print("[Reqest] Resnet18 request time: ", request_time)
-    scheduler.add_request(new_request1, request_time=request_time)
+        # Init request
+        new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0)
 
-# Run scheduler
-while not scheduler.is_finished():
-    scheduler.schedule()
+        # Add request to scheduler
+        print("[Reqest] Resnet18 request time: ", request_time, flush=True)
+        scheduler.add_request(new_request1, request_time=request_time)
 
-print("Done")
\ No newline at end of file
+    # Run scheduler
+    while not scheduler.is_finished():
+        scheduler.schedule()
+
+    print("Done", file=sys.stderr)
\ No newline at end of file

From 501ec9da17436bfbdff707c805ed4d77c541c602 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 20 Feb 2025 05:27:03 +0000
Subject: [PATCH 157/432] [Frontend] Functionality support for sparse.mm dryrun
 mode

---
 PyTorchSimFrontend/extension_op.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 1cbdc29c..1d24a24e 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -93,12 +93,12 @@ def calculate_sparsity(tensor):
         sparsity_ratio = zero_elements / total_elements * 100
         return math.ceil(sparsity_ratio.item())
 
-    x_sparsity = calculate_sparsity(a)
-    w_sparsity = calculate_sparsity(b)
+    w_sparsity = calculate_sparsity(a)
+    x_sparsity = calculate_sparsity(b)
+    print(f"A Sparsity: {w_sparsity}")
+    print(f"B Sparsity: {x_sparsity}")
     assert(x_sparsity >= 0 and x_sparsity < 100)
     assert(w_sparsity >= 0 and w_sparsity < 100)
-    print(f"A Sparsity: {x_sparsity}")
-    print(f"B Sparsity: {w_sparsity}")
 
     # Generating inputs
     dir_path = os.path.join(
@@ -197,6 +197,7 @@ def calculate_sparsity(tensor):
     attribute_path = os.path.join(write_path, "attributes")
     is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
     if is_dryrun:
+        out.copy_(torch.matmul(b.cpu(), a.cpu()))
         yield (onnx_path, attribute_path)
         return
 

From 32ac00bcb68edf325876a2bb2245e7ead977737b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 20 Feb 2025 06:07:47 +0000
Subject: [PATCH 158/432] [Backend/Stonne] use more ports in the stonneCore

---
 PyTorchSimBackend/extern/stonneCore |  2 +-
 PyTorchSimBackend/src/SparseCore.cc | 34 ++++++++++++++++-------------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index a045aa27..7f0a62c9 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit a045aa2718e9f6a48fd2e8320e3b547d294ddac2
+Subproject commit 7f0a62c965ab6afb57a4ac47f1f9ebaa2e151b07
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 6ef153b3..c5fc9c9d 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -19,9 +19,9 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
 
   spdlog::info("[Config/StonneCore {}] Compute Throughput: {:.2f} GFLOPs/sec", id, compute_throughput);
   spdlog::info("[Config/StonneCore {}] Distribution Network Bandwidth: {:.2f} GB/s ({} ports x {} bits)",
-             id, dn_bandwidth, dn_bw, dn_width);
+             id, dn_bandwidth, r_port_nr, dn_width);
   spdlog::info("[Config/StonneCore {}] Reduction Network Bandwidth: {:.2f} GB/s ({} ports x {} bits)",
-             id, rn_bandwidth, rn_bw, rn_width);
+             id, rn_bandwidth, w_port_nr, rn_width);
 };
 
 SparseCore::~SparseCore() { delete stonneCore; }
@@ -72,17 +72,21 @@ void SparseCore::cycle() {
   }
 
   int nr_request = 0;
-  for (auto& req_pair : request_merge_table) {
-    uint64_t address;
-    mem_access_type acc_type;
-    mf_type type;
-    std::tie(address, acc_type, type) = req_pair.first;
-    mem_fetch* req_wrapper = new mem_fetch(address, acc_type, type, _config.dram_req_size, -1, req_pair.second);
-    _request_queue.push(req_wrapper);
-    request_merge_table.erase(req_pair.first);
-
-    if (nr_request++ > r_port_nr);
+  while (!request_merge_table.empty() && nr_request <= r_port_nr) {
+    for (auto& req_pair : request_merge_table) {
+      uint64_t address;
+      mem_access_type acc_type;
+      mf_type type;
+      std::tie(address, acc_type, type) = req_pair.first;
+      mem_fetch* req_wrapper = new mem_fetch(address, acc_type, type, _config.dram_req_size, -1, req_pair.second);
+      _request_queue.push(req_wrapper);
+      request_merge_table.erase(req_pair.first);
+
+      spdlog::debug("[SparseCore][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \
+              _core_cycle, req_wrapper->get_addr(), int(req_wrapper->get_access_type()), int(req_wrapper->get_type()), _config.dram_req_size, nr_request);
+      nr_request++;
       break;
+    }
   }
 
   // Send Memory Response
@@ -93,8 +97,8 @@ void SparseCore::cycle() {
 
     SimpleMem::Request* resp = resps->front();
 
-    spdlog::debug("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}", \
-             _core_cycle, _core_cycle - resp->request_time, resp->getAddress(), int(resp_wrapper->get_access_type()), int(resp_wrapper->get_type()), _config.dram_req_size);
+    spdlog::debug("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \
+             _core_cycle, _core_cycle - resp->request_time, resp->getAddress(), int(resp_wrapper->get_access_type()), int(resp_wrapper->get_type()), _config.dram_req_size, nr_request);
 
     resp->setReply();
     stonneCore->pushResponse(resp);
@@ -104,7 +108,7 @@ void SparseCore::cycle() {
       delete resp_wrapper;
       _response_queue.pop();
     }
-    if (nr_request++ > r_port_nr);
+    if (nr_request++ > w_port_nr)
       break;
   }
 

From d98eddcdee3b9398ea68b2bc5613d726bd01dc82 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 20 Feb 2025 07:17:32 +0000
Subject: [PATCH 159/432] [Fix] Sparse.mm shape fix

---
 PyTorchSimFrontend/extension_op.py       |  2 +-
 PyTorchSimFrontend/mlir/mlir_lowering.py |  4 ++-
 tests/test_sparse_core.py                | 41 +++++++++++++++++++++---
 tests/test_spmm_scheduler.py             | 16 +++++----
 4 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 1d24a24e..f162afb9 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -197,7 +197,7 @@ def calculate_sparsity(tensor):
     attribute_path = os.path.join(write_path, "attributes")
     is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
     if is_dryrun:
-        out.copy_(torch.matmul(b.cpu(), a.cpu()))
+        out.copy_(torch.matmul(a.cpu(), b.cpu()))
         yield (onnx_path, attribute_path)
         return
 
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index 79fec9ae..bc735df8 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -148,8 +148,10 @@ def sparse_addmm(*args, **kwargs):
     print("Custom sparse addmm")
     _, sp_mat1, sp_mat2 = args
     mat1_layout = sp_mat1.layout
+    out_range = args[0].data.data.data.ranges
+    size = [out_range[i] for i in args[0].data.dims]
     layout = ir.FlexibleLayout(
-            device=mat1_layout.device, dtype=mat1_layout.dtype, size=args[0].data.data.data.ranges  # FIXME: Example code for aten op overwrite by externkernel call
+            device=mat1_layout.device, dtype=mat1_layout.dtype, size=size  # FIXME: Example code for aten op overwrite by externkernel call
         )
     return aten_spmm.bind((sp_mat1, sp_mat2), layout).output_node()
 
diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py
index 60cce8e5..3d368175 100644
--- a/tests/test_sparse_core.py
+++ b/tests/test_sparse_core.py
@@ -28,13 +28,43 @@ def __init__(self, input_size=16, hidden_size=16, output_size=16, sparsity_fc1=0
         prune.remove(self.fc2, "weight")
 
     def forward(self, x):
-        x = torch.sparse.mm(x, self.fc1.weight.T)
-        x = torch.sparse.mm(x, self.fc2.weight.T)
+        x = torch.sparse.mm(x, self.fc1.weight)
+        x = torch.sparse.mm(x, self.fc2.weight)
         return x
 
-def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, output_size=128, bias_shift=0):
+class SparseMLP(nn.Module):
+    def __init__(self, input_size=16, hidden_size=16, output_size=16, sparsity_fc1=0, sparsity_fc2=0, device="cpu"):
+        super(SparseMLP, self).__init__()
+
+        self.weight1 = torch.empty(input_size, hidden_size, requires_grad=False)
+        self.weight2 = torch.empty(hidden_size, output_size, requires_grad=False)
+
+        nn.init.xavier_uniform_(self.weight1)
+        nn.init.xavier_uniform_(self.weight2)
+
+        self._apply_pruning(self.weight1, sparsity_fc1)
+        self._apply_pruning(self.weight2, sparsity_fc2)
+
+        self.weight1 = self.weight1.to(device=device)
+        self.weight2 = self.weight2.to(device=device)
+
+        print(f"WEIGHT1 SHAPE > {self.weight1.shape}")  # (input_size, hidden_size)
+        print(f"WEIGHT2 SHAPE > {self.weight2.shape}")  # (hidden_size, output_size)
+
+    def _apply_pruning(self, tensor, sparsity):
+        mask = torch.rand_like(tensor) > sparsity
+        tensor *= mask
+
+    def forward(self, x):
+        x = torch.sparse.mm(x, self.weight1)
+        x = torch.sparse.mm(x, self.weight2)
+        return x
+
+
+def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, output_size=128):
     torch.manual_seed(0)
-    mlp = MLP(input_size, hidden_size, output_size, bias_shift)
+    # mlp = MLP(input_size, hidden_size, output_size)
+    mlp = SparseMLP(input_size, hidden_size, output_size, device)
     mlp = mlp.to(device=device)
     input = torch.randn(batch_size, input_size)
     x1 = input.to(device=device)
@@ -50,4 +80,5 @@ def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, outp
 
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_sparse_mlp(device)
+    test_sparse_mlp(device, batch_size=8, input_size=16, hidden_size=32, output_size=64)
+    
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
index 462b1be0..2f27c602 100644
--- a/tests/test_spmm_scheduler.py
+++ b/tests/test_spmm_scheduler.py
@@ -15,9 +15,9 @@
 #     exit(1)
 
 batch_size = 16
-input_size = 16
-hidden_size = 16
-output_size = 16
+input_size = 32
+hidden_size = 64
+output_size = 128
 w1_sparsity = 0.1
 w2_sparsity = 0.7
 
@@ -32,18 +32,20 @@
 
 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
 
-from test_sparse_core import MLP as model1
+from test_sparse_core import SparseMLP as model1
 from test_transformer import DecoderBlock as model2
 
 with torch.no_grad():
-    target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity).eval()
-    target_model2 = model2(768, 12).eval()
+
 
     # Init scheduler
     scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
                         backend_config="/root/workspace/PyTorchSim/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json")
-    # Register compiled model
 
+    target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval()
+    target_model2 = model2(768, 12).eval()
+
+    # Register compiled model
     opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))
     opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device()))
     SchedulerDNNModel.register_model("mlp", opt_model1)

From c16974389a230e66e7c93b0588c90045c5a95a56 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 20 Feb 2025 06:22:13 +0000
Subject: [PATCH 160/432] [Test] Add batch axis in the transformer test

---
 tests/test_transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index 5959f511..f153d0e6 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -73,7 +73,7 @@ def __init__(self, embed_dim, num_heads):
         self.ffn2 = torch.nn.Linear(embed_dim*4, embed_dim)
 
     def forward(self, x):
-        result = self.multihead_attn(x, x, x)
+        result = self.multihead_attn(x, x, x).reshape(x.shape)
         result = self.layer_norm(result+x)
 
         ffn1_result = self.ffn1(result)
@@ -82,7 +82,7 @@ def forward(self, x):
         return self.layer_norm(ffn2_result + result)
 
 def test_DecoderBlock(device):
-    cpu_query = torch.randn(512, 768)
+    cpu_query = torch.randn(1, 512, 768)
     decoder_block = DecoderBlock(768, 12)
     cpu_res = decoder_block(cpu_query)
 

From 38a149fa84e9dc71c7192a949c59bd4102e8b0b5 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 20 Feb 2025 18:56:35 +0000
Subject: [PATCH 161/432] [Fix] BMM with small size

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 35919ef6..64b0c7e1 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -31,7 +31,7 @@
   %tag0 = memref.alloc() : memref<1xi32>
   %tag1 = memref.alloc() : memref<1xi32>
   %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
   %c0 = arith.constant 0 : index
   {{- kernel.def_local_vars() }}
   affine.for %b=0 to {{ B }} {
@@ -45,16 +45,16 @@
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
           , %vstride : memref<
         {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1, sram_stride=[{{ kernel.vector_lane }}, 1] }
+          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1 , {{ TILE_M }}] }
         {%- else -%}
-        affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
+        affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
         affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
           %index0 = affine.apply #map0(%b, %t_m, %t_k)
           %index1 = affine.apply #map1(%b, %t_k, %t_n)
           memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
-             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
           memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
-             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1, sram_stride=[1, 1]}
+             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
           linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
@@ -97,6 +97,8 @@ def render(self,
         B, M, N, K = X.get_size()[0], X.get_size()[1], W.get_size()[2], X.get_size()[2]
         TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
         kernel.loop_size = [TILE_M, TILE_N, TILE_K]
+        SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
 
         W_transposed = self.is_transposed(W)
         X_transposed = self.is_transposed(X)
@@ -111,6 +113,8 @@ def render(self,
             TILE_M=TILE_M,
             TILE_N=TILE_N,
             TILE_K=TILE_K,
+            SUB_TILE_M=SUB_TILE_M,
+            SUB_TILE_N=SUB_TILE_N,
             DATA_STYPE="f32",
             DATA_SIZE=4,
             X = X,

From 6e4cb99f5a4e6c048ec626e438f795728eccb38b Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 20 Feb 2025 18:57:42 +0000
Subject: [PATCH 162/432] [TOG] overlap cycle fixed

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 28 +++++++++++--------
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  3 +-
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 299b69c0..15015a90 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -132,6 +132,7 @@
 """
 
 MULTI_TILE_CONV_TEMPLATE = r"""
+// Multi Channel Tile Conv2D kernel
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ I_C * STRIDE_W }} + d2 * {{ I_C * (I_W + 2 * PADDING_W) }} + d3)> // input (I_H, BATCH, I_W, I_C)
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
@@ -561,7 +562,19 @@ def render(self,
         w_spad_size = TILE_K_W * TILE_K_H * TILE_K * TILE_N
         y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
         conv_template = CONV_TEMPLATE
-        if self.is_multi_tile(I_C):
+        TOG_latency = BATCH if TILE_M > BATCH else TILE_M
+        if self.is_single_batch(BATCH) and self.stride[0] != 1:
+          conv_template = SINGLE_BATCH_CONV_STRIDE_TEMPLATE
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
+          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
+          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_I_H * TILE_M, TILE_K)
+          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N)
+          x_spad_size = TILE_K_W * TILE_I_H * TILE_M * TILE_K
+          y_spad_size = TILE_O_H * TILE_M * TILE_N
+          SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
+          SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+          TOG_latency = O_W if TILE_M > O_W else TILE_M
+        elif self.is_multi_tile(I_C):
           conv_template = MULTI_TILE_CONV_TEMPLATE
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation)
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
@@ -583,18 +596,9 @@ def render(self,
           y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
           x_spad_size = TILE_I_W * TILE_I_H * TILE_K
           y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
-        elif self.is_single_batch(BATCH) and self.stride[0] != 1:
-          conv_template = SINGLE_BATCH_CONV_STRIDE_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
-          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_I_H * TILE_M, TILE_K)
-          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N)
-          x_spad_size = TILE_K_W * TILE_I_H * TILE_M * TILE_K
-          y_spad_size = TILE_O_H * TILE_M * TILE_N
-          SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
-          SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+          TOG_latency = O_W if TILE_M > O_W else TILE_M
 
-        kernel.loop_size = [TILE_M, TILE_N, TILE_K]
+        kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
 
         # FIXME: transposed inputs not supported
         # W_transposed = self.is_transposed(W)
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 07604d61..15b257e0 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -110,7 +110,8 @@ def render(self,
         else:
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
             template = GEMM_TEMPLATE
-        kernel.loop_size =[TILE_M, TILE_N, TILE_K]
+        TOG_latency = M if TILE_M > M else TILE_M
+        kernel.loop_size =[TOG_latency, TILE_N, TILE_K]
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
 

From 49838f6be8765ed6752aac9d8df3f50e91b1cea0 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 20 Feb 2025 19:01:01 +0000
Subject: [PATCH 163/432] [Testcase] add test case

---
 tests/test_transformer.py | 11 ++++++-----
 tests/test_view3D_2D.py   | 12 ++++++++++++
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index f153d0e6..4c542ddb 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -81,9 +81,9 @@ def forward(self, x):
         ffn2_result = self.ffn2(act_result)
         return self.layer_norm(ffn2_result + result)
 
-def test_DecoderBlock(device):
-    cpu_query = torch.randn(1, 512, 768)
-    decoder_block = DecoderBlock(768, 12)
+def test_DecoderBlock(device, head=12, embed_dim=768, input_seq=512):
+    cpu_query = torch.randn(1, input_seq, embed_dim)
+    decoder_block = DecoderBlock(embed_dim, head)
     cpu_res = decoder_block(cpu_query)
 
     query = cpu_query.clone().to(device=device)
@@ -112,9 +112,9 @@ def attention(query, key, value):
     cpu_res, cpu_p_attn = attention(query.cpu(), key.cpu(), value.cpu())
     test_result("Attention Forward", res, cpu_res)
 
-def test_MHA(device, num_heads=12, embed_dim=768):
+def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
     MHA = my_MultiheadAttention(num_heads, embed_dim)
-    cpu_query = torch.randn(512, 768)
+    cpu_query = torch.randn(input_seq, embed_dim)
     cpu_res = MHA(cpu_query, cpu_query, cpu_query)
 
     query = cpu_query.clone().to(device=device)
@@ -134,3 +134,4 @@ def test_MHA(device, num_heads=12, embed_dim=768):
     device = module.custom_device()
     test_DecoderBlock(device)
     # test_Attention(device, head=16, seq=512, d_k=64)
+    # test_MHA(device, num_heads=12, embed_dim=768)
diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py
index 7b754131..f943e20e 100644
--- a/tests/test_view3D_2D.py
+++ b/tests/test_view3D_2D.py
@@ -24,6 +24,17 @@ def view3D_2D(a):
     out = view3D_2D(cpu_input)
     test_result("view 3D->2D", res, out)
 
+def test_view2D_3D(device, size=(512, 768), h=12, d_k=64):
+    def view2D_3D(a):
+        return a.view(-1, h, d_k).transpose(0, 1).contiguous()
+    torch.manual_seed(0)
+    cpu_input = torch.randn(size)
+    input = cpu_input.clone().to(device=device)
+    opt_fn = torch.compile(dynamic=False)(view2D_3D)
+    res = opt_fn(input)
+    out = view2D_3D(cpu_input)
+    test_result("view 2D->3D", res, out)
+
 if __name__ == "__main__":
     import os
     import sys
@@ -34,4 +45,5 @@ def view3D_2D(a):
     device = module.custom_device()
     test_view3D_2D(device)
     test_view3D_2D(device, [12, 512, 64])
+    test_view2D_3D(device, size=(512, 1024), h=16, d_k=64)
 

From efe66204e00b63e2ca690d72e327de76dd9c2afe Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 20 Feb 2025 19:01:55 +0000
Subject: [PATCH 164/432] [Config] using PyTorchSim config

---
 PyTorchSimFrontend/extension_config.py |  4 ++--
 PyTorchSimFrontend/mlir/mlir_common.py | 14 +++++---------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 0b04babe..9b91b787 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -2,7 +2,7 @@
 import tempfile
 
 # Hardware info config
-CONFIG_VECTOR_LANE = 128
+CONFIG_VECTOR_LANE = int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128))
 CONFIG_SPAD_INFO = {
   "spad_vaddr" : 0xD0000000,
   "spad_paddr" : 0xD0000000,
@@ -33,7 +33,7 @@
 
 # Backendsim config
 CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG',
-                                        default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json')
+                                        default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
 CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False))
 CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False))
 CONFIG_BACKENDSIM_DRYRUN = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 3bf31310..af38612c 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -254,15 +254,11 @@ def set_tile_info(self, tile_desc : MLIRMultiDimTile):
 class BaseMLIRHardwareInfo():
     def __init__(self):
         # Default HW setting
-        self.vector_lane = 128
-        self.spad_info = {
-            "spad_vaddr" : 0xD0000000,
-            "spad_paddr" : 0x2000000000,
-            "spad_size" : 128 << 10 # 128KB per Lane
-        }
-        self.precision = 4 # 32bit
-        self.num_cores = 1
-        self.vlen = 32 // self.precision # 256bits / 32bits = 8 [elements]
+        self.vector_lane = extension_config.CONFIG_VECTOR_LANE
+        self.spad_info = extension_config.CONFIG_SPAD_INFO
+        self.precision = extension_config.CONFIG_PRECISION
+        self.num_cores = extension_config.CONFIG_NUM_CORES
+        self.vlen = extension_config.CONFIG_VLEN
 
 class BaseMLIRKernel(common.Kernel, BaseMLIRHardwareInfo):
     newvar_prefix = "%"

From 49f9ba01d665cd575c192394f737d6e7a8577f96 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 20 Feb 2025 10:11:39 +0000
Subject: [PATCH 165/432] [Test] Add script for sparse multi tenant

---
 stonne_experiment/run.sh         |  23 ++++++
 tests/test_scheduler_batching.py |   3 +-
 tests/test_spmm_scheduler.py     | 126 +++++++++++++++----------------
 3 files changed, 87 insertions(+), 65 deletions(-)
 create mode 100755 stonne_experiment/run.sh

diff --git a/stonne_experiment/run.sh b/stonne_experiment/run.sh
new file mode 100755
index 00000000..000d0134
--- /dev/null
+++ b/stonne_experiment/run.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+BATCH_SIZE=128
+INPUT_SIZE=128
+HIDDEN_SIZE=128
+OUTPUT_SIZE=128
+OUTPUT_DIR="sparse_mt_results"
+
+mkdir -p "$OUTPUT_DIR"
+
+for w1 in $(seq 0.0 0.2 1.0); do
+    OUTPUT_FILE="${OUTPUT_DIR}/flops_w1_${w1}_w2_${w1}.txt"
+    python3 ${TORCHSIM_DIR}/tests/test_spmm_scheduler.py \
+        --batch_size $BATCH_SIZE \
+        --input_size $INPUT_SIZE \
+        --hidden_size $HIDDEN_SIZE \
+        --output_size $OUTPUT_SIZE \
+        --w1_sparsity $w1 \
+        --w2_sparsity $w1 > "$OUTPUT_FILE" &
+    echo "Started: w1=$w1, w2=$w2 (Output: $OUTPUT_FILE)"
+done
+wait
+echo "All processes completed!"
diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py
index 1c32803d..f3b54159 100644
--- a/tests/test_scheduler_batching.py
+++ b/tests/test_scheduler_batching.py
@@ -6,6 +6,7 @@
 
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator
+CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Poisson Request Generator (ms)")
@@ -16,7 +17,7 @@
     target_model1 = model1().eval()
 
     # Init scheduler
-    scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, backend_config="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json")
+    scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json")
     # Register compiled model
     opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
     SchedulerDNNModel.register_model("resnet18", opt_model1)
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
index 2f27c602..10f3c681 100644
--- a/tests/test_spmm_scheduler.py
+++ b/tests/test_spmm_scheduler.py
@@ -1,71 +1,69 @@
 import os
 import sys
 import torch
-
-args = sys.argv
-# if len(args) == 6:
-#     batch_size = int(args[1])
-#     input_size = int(args[2])
-#     hidden_size = int(args[3])
-#     output_size = int(args[4])
-#     w1_sparsity = int(args[5]) * 0.01
-#     w2_sparsity = int(args[5]) * 0.01
-# else:
-#     print("Usage: python test_sparse_core.py <batch_size> <input_size> <hidden_size> <output_size> <bias_shift>")
-#     exit(1)
-
-batch_size = 16
-input_size = 32
-hidden_size = 64
-output_size = 128
-w1_sparsity = 0.1
-w2_sparsity = 0.7
-
-print("batch_size: ", batch_size)
-print("input_size: ", input_size)
-print("hidden_size: ", hidden_size)
-print("output_size: ", output_size)
-print("w1_sparsity: ", w1_sparsity)
-print("w2_sparsity: ", w2_sparsity)
-
+import argparse
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-
 from test_sparse_core import SparseMLP as model1
 from test_transformer import DecoderBlock as model2
-
-with torch.no_grad():
-
-
-    # Init scheduler
-    scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
-                        backend_config="/root/workspace/PyTorchSim/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json")
-
-    target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval()
-    target_model2 = model2(768, 12).eval()
-
-    # Register compiled model
-    opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))
-    opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device()))
-    SchedulerDNNModel.register_model("mlp", opt_model1)
-    SchedulerDNNModel.register_model("bert", opt_model2)
-
-    # Init input data
-    model_input1 = torch.randn(batch_size, input_size)
-    model_input2 = torch.randn(512, 768)
-
-    # Init request
-    new_request1 = Request("mlp", [model_input1], [], request_queue_idx=1)
-    new_request2 = Request("bert", [model_input2], [], request_queue_idx=0)
-
-
-    # Add request to scheduler
-    scheduler.add_request(new_request1, request_time=0)
-    scheduler.add_request(new_request2, request_time=0)
-
-    # Run scheduler
-    while not scheduler.is_finished():
-        scheduler.schedule()
-
-print("Done")
\ No newline at end of file
+CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="")
+    parser.add_argument("--batch_size", type=int, default=128, help="Batch size")
+    parser.add_argument("--input_size", type=int, default=128, help="Input layer size")
+    parser.add_argument("--hidden_size", type=int, default=128, help="Hidden layer size")
+    parser.add_argument("--output_size", type=int, default=128, help="Output layer size")
+    parser.add_argument("--w1_sparsity", type=float, default=0.5, help="Sparsity of first layer weights (0 to 1)")
+    parser.add_argument("--w2_sparsity", type=float, default=0.5, help="Sparsity of second layer weights (0 to 1)")
+    args = parser.parse_args()
+
+    batch_size = args.batch_size
+    input_size = args.input_size
+    hidden_size = args.hidden_size
+    output_size = args.output_size
+    w1_sparsity = args.w1_sparsity
+    w2_sparsity = args.w2_sparsity
+
+    print("batch_size: ", batch_size)
+    print("input_size: ", input_size)
+    print("hidden_size: ", hidden_size)
+    print("output_size: ", output_size)
+    print("w1_sparsity: ", w1_sparsity)
+    print("w2_sparsity: ", w2_sparsity)
+
+    with torch.no_grad():
+        # Init scheduler
+        scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
+                            backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json")
+
+        target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval()
+        target_model2 = model2(768, 12).eval()
+
+        # Register compiled model
+        opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))
+        opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device()))
+        SchedulerDNNModel.register_model("mlp", opt_model1)
+        SchedulerDNNModel.register_model("bert", opt_model2)
+
+        # Init input data
+        model_input1 = torch.randn(batch_size, input_size)
+        model_input2 = torch.randn(1, 512, 768)
+
+        # Init request
+        new_request1 = Request("mlp", [model_input1], [], request_queue_idx=1)
+        new_request2 = Request("bert", [model_input2], [], request_queue_idx=0)
+
+
+        # Add request to scheduler
+        scheduler.add_request(new_request1, request_time=0)
+        scheduler.add_request(new_request1, request_time=0)
+        scheduler.add_request(new_request1, request_time=0)
+        scheduler.add_request(new_request1, request_time=0)
+        scheduler.add_request(new_request1, request_time=0)
+        scheduler.add_request(new_request1, request_time=0)
+        scheduler.add_request(new_request2, request_time=0)
+
+        # Run scheduler
+        while not scheduler.is_finished():
+            scheduler.schedule()
\ No newline at end of file

From 2d49aeeae792f6fe636a406dc02dfb527cca19bd Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 20 Feb 2025 20:27:27 +0000
Subject: [PATCH 166/432] [Frontend/Stonne] Add inner product config (WIP)

---
 PyTorchSimBackend/include/TileGraphParser.h |   6 +
 PyTorchSimFrontend/extension_op.py          | 217 ++++++++++++++++++--
 stonne_experiment/run.sh                    |   6 +-
 3 files changed, 212 insertions(+), 17 deletions(-)

diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index f2045cfa..c9d16a6d 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -282,6 +282,12 @@ class TileStonneNode : public TileNode {
           desc.rowpointer_matrix_b_init = attribute.s();
       } else if (attribute.name() == "torchsim_stonne_colpointer_matrix_b_init") {
           desc.colpointer_matrix_b_init = attribute.s();
+      } else if (attribute.name() == "torchsim_bitmap_matrix_a_init") {
+          desc.bitmap_matrix_a_init = attribute.s();
+      } else if (attribute.name() == "torchsim_bitmap_matrix_b_init") {
+          desc.bitmap_matrix_b_init = attribute.s();
+      }  else if (attribute.name() == "torchsim_mem_matrix_c_file_name") {
+          desc.mem_matrix_c_file_name = attribute.s();
       } else {
           spdlog::warn("[TileStonneNode] Unrecognized attribute: {}", attribute.name());
       }
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index f162afb9..dacfa699 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -2,6 +2,8 @@
 import subprocess
 import math
 import struct
+from datetime import datetime
+import random
 import torch
 import numpy as np
 from torch._inductor.select_algorithm import ExternKernelChoice
@@ -21,21 +23,22 @@ def call_name(self):
 
 custom_lib = torch.library.Library("extension_op", "DEF")
 
-def generate_outer_product_matrix(a, b, M, K, N):
+def generate_outer_product_matrix(a, b, M, K, N, prefix):
     # Generating matrix A
     data_width = 4
     a_cpu = a.cpu()
     b_cpu = b.cpu()
     value_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini')
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_mem.ini')
     rowA_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in')
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_rowpointerA.in')
     colA_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in')
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_colpointerA.in')
     rowB_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in')
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_rowpointerB.in')
     colB_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in')
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_colpointerB.in')
+
     with open(value_pointer, "w") as fd, open(rowA_pointer, "w") as rpA, open(colA_pointer, "w") as cpA, open(rowB_pointer, "w") as rpB, open(colB_pointer, "w") as cpB:
         #generating matrixA
         n_nonzeros=0
@@ -82,6 +85,71 @@ def generate_outer_product_matrix(a, b, M, K, N):
         address_matrix_c=address_matrix_b+(n_nonzeros*data_width)
     return 0, address_matrix_b, address_matrix_c
 
+def generate_inner_product_matrix(a, b, M, K, N, file_name, in_file_bitmap_a, in_file_bitmap_b):  # dumps packed non-zero values to file_name and 0/1 bitmaps of A and B to the two bitmap files
+    data_width = 4
+    a_cpu = a.cpu()
+    b_cpu = b.cpu()
+    matrixA_size=int(M*K)
+    matrixB_size=int(N*K)
+    matrixC_size=int(M*N)
+
+    random.seed(a=0, version=2)
+
+    address_matrix_a = 0
+    with open(file_name, "w") as fd, open(in_file_bitmap_a, "w") as fbA, open(in_file_bitmap_b, "w") as fbB:
+        #generating matrixA
+        n_nonzeros=0
+        for m in range(M):  # Row major
+            for k in range(K):
+                is_sparse = a_cpu[m,k]
+                if(not torch.isclose(is_sparse, torch.zeros(1), atol=1e-1)):  # non-zero element: bitmap bit 1 and value stored
+                    if((m==(M-1)) and (k==(K-1))):
+                        fbA.write(str(1))
+                    else:
+                        fbA.write(str(1)+","); #writing a 1 in bitmap
+                    ba = bytearray(struct.pack(">f", float(is_sparse)))  # generating list of bytes
+                    my_int = int.from_bytes(ba, "big")
+                    fd.write(str(my_int))
+                    fd.write(",")
+                    n_nonzeros+=1
+                else:
+                    if((m==(M-1)) and (k==(K-1))): # this is to insert a comma
+                        fbA.write(str(0))
+                        # note no data element is inserted in this case
+                    else:
+                        # note no data element is inserted in this case
+                        fbA.write(str(0)+",")
+
+        address_matrix_b=n_nonzeros*data_width
+        #Generating matrix B
+        n_nonzeros=0
+        bitmapB=list(range(0,matrixB_size))
+        for n in range(0,N):  # Row major
+            for k in range(0,K):
+                is_sparse = b_cpu[k,n]
+                if(not torch.isclose(is_sparse, torch.zeros(1), atol=1e-1)):  # value is generated (non-zero element)
+                    bitmapB[k*N+n]=1
+                    ba = bytearray(struct.pack(">f", float(is_sparse)))  # generating list of bytes
+                    my_int = int.from_bytes(ba, "big")
+                    fd.write(str(my_int))
+                    fd.write(",")
+                    n_nonzeros+=1
+                else:
+                    # no data element is inserted in this case
+                    bitmapB[k*N+n]=0; #writing a 0
+        # writing the bitmapB in the appropiate order
+        for i in range(0, matrixB_size):
+            fbB.write(str(bitmapB[i]))
+            if(i < (matrixB_size-1)):
+                fbB.write(",")
+        
+        fd.write(str(0)) # Adding a final 0 to the memory which will never be used. This is just to avoid having a last comma.
+        address_matrix_c=address_matrix_b+(n_nonzeros*data_width)
+    print("Offset matrix A: "+str(address_matrix_a))
+    print("Offset matrix B: "+str(address_matrix_b))
+    print("Offset matrix C: "+str(address_matrix_c))
+    return address_matrix_a, matrixA_size, matrixA_size+matrixB_size  # NOTE(review): dense element counts, not the byte offsets printed above - confirm which the backend expects
+
 def flexagon_frontend(a, b, out):
     M = a.shape[0]
     N = b.shape[1]
@@ -93,6 +161,7 @@ def calculate_sparsity(tensor):
         sparsity_ratio = zero_elements / total_elements * 100
         return math.ceil(sparsity_ratio.item())
 
+    prefix = datetime.now().strftime("%m%d%H%M%S%f")
     w_sparsity = calculate_sparsity(a)
     x_sparsity = calculate_sparsity(b)
     print(f"A Sparsity: {w_sparsity}")
@@ -109,7 +178,7 @@ def calculate_sparsity(tensor):
 
     value_path = os.path.join(
         extension_config.CONFIG_TORCHSIM_DIR,
-        'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini'
+        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_mem.ini'
     )
 
     if os.path.exists(value_path):
@@ -118,13 +187,13 @@ def calculate_sparsity(tensor):
     else:
         print(f"File does not exist: {value_path}")
 
-    dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N)
-    mem_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini')
-    a_row_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerA.in')
-    a_col_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerA.in')
-    b_row_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_rowpointerB.in')
-    b_col_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_colpointerB.in')
-    c_result = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/result.out')
+    dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N, prefix)
+    mem_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_mem.ini')
+    a_row_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_rowpointerA.in')
+    a_col_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_colpointerA.in')
+    b_row_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_rowpointerB.in')
+    b_col_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_colpointerB.in')
+    c_result = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_result.out')
     graph = {
         0: {
             "node_id": 0,
@@ -214,6 +283,126 @@ def calculate_sparsity(tensor):
         src_tensor = torch.as_strided(torch.from_numpy(np_array), out.size(), out.stride())
         out.copy_(src_tensor.to(dtype=out.dtype))
 
+def flexagon_frontend2(a, b, out):
+    M = a.shape[0]
+    N = b.shape[1]
+    K = b.shape[0]
+
+    def calculate_sparsity(tensor):
+        total_elements = tensor.numel()
+        zero_elements = torch.sum(tensor.cpu() == 0)
+        sparsity_ratio = zero_elements / total_elements * 100
+        return math.ceil(sparsity_ratio.item())
+
+    prefix = ""# datetime.now().strftime("%d_%H_%M_%f")
+    w_sparsity = calculate_sparsity(a)
+    x_sparsity = calculate_sparsity(b)
+    print(f"A Sparsity: {w_sparsity}")
+    print(f"B Sparsity: {x_sparsity}")
+    assert(x_sparsity >= 0 and x_sparsity < 100)
+    assert(w_sparsity >= 0 and w_sparsity < 100)
+    target_path = 'PyTorchSimBackend/extern/stonneCore/tests/innerproduct'
+    # Generating inputs
+    dir_path = os.path.join(
+        extension_config.CONFIG_TORCHSIM_DIR,
+        target_path
+    )
+    os.makedirs(dir_path, exist_ok=True)
+
+    file_name = os.path.join(
+        extension_config.CONFIG_TORCHSIM_DIR,
+        f'{dir_path}/{prefix}_bitmapSpMSpM_gemm_mem.ini'
+    )
+
+    in_file_bitmap_a = f"{dir_path}/{prefix}_bitmapSpMSpM_file_bitmapA_"+str(M)+"_"+str(N)+"_"+str(K)+".in"
+    in_file_bitmap_b = f"{dir_path}/{prefix}_bitmapSpMSpM_file_bitmapB_"+str(M)+"_"+str(N)+"_"+str(K)+".in"
+    c_result = f'{dir_path}/{prefix}_result.out'
+    dram_a_address, dram_b_address, dram_c_address = generate_inner_product_matrix(a, b, M, N, K, file_name, in_file_bitmap_a, in_file_bitmap_b)
+
+    graph = {
+        0: {
+            "node_id": 0,
+            "node_name": "root",
+            "node_type": 0,
+            "parents": [],
+            "children": [1]
+        },
+        1: {
+            "node_id": 1,
+            "node_name": "loopNode",
+            "node_type": 2,
+            "parents": [0],
+            "children": [2],
+            "loop_index": "loop_arg000",
+            "loop_start": 0,
+            "loop_end": 1,
+            "loop_step": 1,
+            "loop_type": "outer_loop"
+        },
+        2: {
+            "node_id": 2,
+            "node_name": "stonneNode",
+            "node_type": 5,
+            "parents": [1],
+            "children": [],
+            # Operation Type
+            "stonne_operation": "bitmapSpMSpM",
+
+            # GEMM Parameters
+            "stonne_GEMM_K": K,
+            "stonne_GEMM_N": N,
+            "stonne_GEMM_M": M,
+
+            # Memory Initialization & File Paths
+            "stonne_mem_init": file_name,
+            "stonne_mem_matrix_c_file_name": c_result,
+
+            # Memory Addresses
+            "stonne_matrix_a_dram_address": dram_a_address,
+            "stonne_matrix_b_dram_address": dram_b_address,
+            "stonne_matrix_c_dram_address": dram_c_address,
+
+            # CSR & Bitmap Initialization
+            "stonne_bitmap_matrix_a_init" : in_file_bitmap_a,
+            "stonne_bitmap_matrix_b_init" : in_file_bitmap_b,
+        }
+    }
+    source_code = "graph = " + str(graph)
+
+    write_path = get_write_path(source_code)
+    key, raw_tog_path = write(source_code, "py", specified_dir=write_path)
+    tile_graph_generator = tog_generator(["flexagon_matmul"])
+    tile_graph_generator.load_file(raw_tog_path)
+    tile_graph_generator.generate_tile_graph(
+        os.path.join(write_path, "tile_graph.onnx"),
+        cycle_list=[0],
+        x_offset=0,
+        w_offset=0,
+        vector_lane=0,
+        stonneGraph=True
+    )
+
+    onnx_path = os.path.join(write_path, "tile_graph.onnx")
+    attribute_path = os.path.join(write_path, "attributes")
+    is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
+    if is_dryrun:
+        out.copy_(torch.matmul(a.cpu(), b.cpu()))
+        yield (onnx_path, attribute_path)
+        return
+
+    #attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")
+    backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
+    stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json'
+    backsim = BackendSimulator(backend_path, stonne_config_path)
+    result_path = backsim.simulation(onnx_path)
+    result = BackendSimulator.get_result_from_file(result_path)
+
+    # Load result data
+    with open(c_result, 'rb') as f:
+        np_array = np.fromfile(f, dtype=TORCH_TO_NUMPY[out.dtype])
+        src_tensor = torch.as_strided(torch.from_numpy(np_array), out.size(), out.stride())
+        out.copy_(src_tensor.to(dtype=out.dtype))
+
 custom_lib.define("_sparse_mm(Tensor a, Tensor b, Tensor out) -> Tensor")
 custom_lib.impl("_sparse_mm", flexagon_frontend, "PrivateUse1")
 custom_lib.impl("_sparse_mm", flexagon_frontend, "AutogradPrivateUse1")
\ No newline at end of file
diff --git a/stonne_experiment/run.sh b/stonne_experiment/run.sh
index 000d0134..19a35df2 100755
--- a/stonne_experiment/run.sh
+++ b/stonne_experiment/run.sh
@@ -8,16 +8,16 @@ OUTPUT_DIR="sparse_mt_results"
 
 mkdir -p "$OUTPUT_DIR"
 
-for w1 in $(seq 0.0 0.2 1.0); do
+for w1 in $(seq 0.1 0.3 1.0); do
     OUTPUT_FILE="${OUTPUT_DIR}/flops_w1_${w1}_w2_${w1}.txt"
+    echo "Started: w1=$w1, w2=$w1 (Output: $OUTPUT_FILE)"
     python3 ${TORCHSIM_DIR}/tests/test_spmm_scheduler.py \
         --batch_size $BATCH_SIZE \
         --input_size $INPUT_SIZE \
         --hidden_size $HIDDEN_SIZE \
         --output_size $OUTPUT_SIZE \
         --w1_sparsity $w1 \
-        --w2_sparsity $w1 > "$OUTPUT_FILE" &
-    echo "Started: w1=$w1, w2=$w2 (Output: $OUTPUT_FILE)"
+        --w2_sparsity $w1 > "$OUTPUT_FILE"
 done
 wait
 echo "All processes completed!"

From f5b58846b0c042b5adb92554e94930b4500671b9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 20 Feb 2025 21:27:54 +0000
Subject: [PATCH 167/432] [Backend/Stonne] Support multiple stonne engine in a
 chip

---
 ...uv3.json => stonne_big_c1_simple_noc.json} |   2 +
 .../configs/stonne_little_c4_simple_noc.json  |  32 ++++
 PyTorchSimBackend/include/SimulationConfig.h  |   2 +
 PyTorchSimBackend/include/SparseCore.h        |   6 +-
 PyTorchSimBackend/src/Common.cc               |   4 +
 PyTorchSimBackend/src/SparseCore.cc           | 163 +++++++++++-------
 6 files changed, 148 insertions(+), 61 deletions(-)
 rename PyTorchSimBackend/configs/{stonne_c1_simple_noc_tpuv3.json => stonne_big_c1_simple_noc.json} (93%)
 create mode 100644 PyTorchSimBackend/configs/stonne_little_c4_simple_noc.json

diff --git a/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
similarity index 93%
rename from PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
rename to PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
index 56f84eb6..fbe605e8 100644
--- a/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
@@ -5,6 +5,8 @@
   "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
+  "num_stonne_per_core" : 64,
+  "num_stonne_port" : 16,
 
   "dram_type" : "ramulator2",
   "dram_freq" : 940,
diff --git a/PyTorchSimBackend/configs/stonne_little_c4_simple_noc.json b/PyTorchSimBackend/configs/stonne_little_c4_simple_noc.json
new file mode 100644
index 00000000..c094597b
--- /dev/null
+++ b/PyTorchSimBackend/configs/stonne_little_c4_simple_noc.json
@@ -0,0 +1,32 @@
+{
+  "core_type" : ["stonne"],
+  "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
+  "num_cores" : 4,
+  "core_freq" : 940,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+  "num_stonne_per_core" : 16,
+  "num_stonne_port" : 4,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" : 940,
+  "dram_channels": 8,
+  "dram_req_size": 16,
+  "dram_latency" : 10,
+  "dram_size" : 32,
+  "dram_nbl" : 2,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
+
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 7000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt",
+
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 1,
+  "partition": {
+    "core_0":0
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h
index 713ac008..390d8d93 100644
--- a/PyTorchSimBackend/include/SimulationConfig.h
+++ b/PyTorchSimBackend/include/SimulationConfig.h
@@ -22,6 +22,8 @@ struct SimulationConfig {
   uint32_t sram_size;
   uint32_t core_print_interval = 0;
   uint32_t num_systolic_array_per_core = 1;
+  uint32_t num_stonne_per_core = 1;
+  uint32_t num_stonne_port = 1;
 
   /* DRAM config */
   DramType dram_type;
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index fab0dd16..8fc60804 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -22,8 +22,12 @@ class SparseCore : public Core {
   std::shared_ptr<Tile> pop_finished_tile() override;
   uint32_t r_port_nr = 1;
   uint32_t w_port_nr = 1;
+  uint32_t nr_cores = 1;
 private:
-  SST_STONNE::sstStonne *stonneCore;
+  uint32_t rr_idx = 0;
+  std::vector<bool> coreBusy;
+  std::vector<SST_STONNE::sstStonne*> stonneCores;
+  std::vector<std::vector<std::shared_ptr<Tile>>> percore_tiles;
   /* Interconnect queue */
   std::queue<mem_fetch*> _request_queue;
   std::queue<mem_fetch*> _response_queue;
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index 5d2a6ece..f084a420 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -43,6 +43,10 @@ SimulationConfig initialize_config(json config) {
   parsed_config.sram_size = config["sram_size"];
   if (config.contains("num_systolic_array_per_core"))
     parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"];
+  if (config.contains("num_stonne_per_core"))
+    parsed_config.num_stonne_per_core = config["num_stonne_per_core"];
+   if (config.contains("num_stonne_port"))
+    parsed_config.num_stonne_port = config["num_stonne_port"];
   parsed_config.core_print_interval = get_config_value<uint32_t>(config, "core_print_interval");
 
   /* Stonne config */ 
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index c5fc9c9d..5e503de4 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -1,74 +1,122 @@
 #include "SparseCore.h"
 
 SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) {
-  stonneCore = new SST_STONNE::sstStonne(config.stonne_config_path);
-  stonneCore->init(1);
-  Config stonneConfig = stonneCore->getStonneConfig();
+  /* Init stonne cores*/
+  nr_cores = config.num_stonne_per_core;
+  coreBusy.resize(nr_cores);
+  percore_tiles.resize(nr_cores);
+  for (int i=0; i<nr_cores; i++) {
+    SST_STONNE::sstStonne* core = new SST_STONNE::sstStonne(config.stonne_config_path);
+    core->init(1);
+    stonneCores.push_back(core);
+    coreBusy.at(i) = false;
+    percore_tiles.at(i) = std::vector<std::shared_ptr<Tile>>();
+  }
+
+  Config stonneConfig = stonneCores.at(0)->getStonneConfig();
   unsigned int core_freq = config.core_freq; // MHz;
   unsigned int num_ms = stonneConfig.m_MSNetworkCfg.ms_size;
-  unsigned int dn_bw = stonneConfig.m_SDMemoryCfg.n_read_ports;
-  unsigned int dn_width = stonneConfig.m_SDMemoryCfg.port_width;
-  unsigned int rn_bw = stonneConfig.m_SDMemoryCfg.n_write_ports;
-  unsigned int rn_width = stonneConfig.m_SDMemoryCfg.port_width;
-  r_port_nr = dn_bw;
-  w_port_nr = rn_bw;
+  r_port_nr = config.num_stonne_port;
+  w_port_nr = config.num_stonne_port;
 
   double compute_throughput = static_cast<double>(num_ms) * core_freq / 1e3; // FLOPs/sec
-  double dn_bandwidth = static_cast<double>(dn_bw) * dn_width * core_freq * 1e6 / 8.0 / 1e9; // GB/s
-  double rn_bandwidth = static_cast<double>(rn_bw) * rn_width * core_freq * 1e6 / 8.0 / 1e9; // GB/s
-
-  spdlog::info("[Config/StonneCore {}] Compute Throughput: {:.2f} GFLOPs/sec", id, compute_throughput);
-  spdlog::info("[Config/StonneCore {}] Distribution Network Bandwidth: {:.2f} GB/s ({} ports x {} bits)",
-             id, dn_bandwidth, r_port_nr, dn_width);
-  spdlog::info("[Config/StonneCore {}] Reduction Network Bandwidth: {:.2f} GB/s ({} ports x {} bits)",
-             id, rn_bandwidth, w_port_nr, rn_width);
+  double dn_bandwidth = static_cast<double>(r_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s
+  double rn_bandwidth = static_cast<double>(w_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s
+  for (int i=0; i<nr_cores; i++) {
+    spdlog::info("[Config/StonneCore {}][{}] Compute Throughput: {:.2f} GFLOPs/sec", id, i, compute_throughput);
+    spdlog::info("[Config/StonneCore {}][{}] Distribution Network Bandwidth: {:.2f} GB/s",
+                id, i, dn_bandwidth, r_port_nr);
+    spdlog::info("[Config/StonneCore {}][{}] Reduction Network Bandwidth: {:.2f} GB/s",
+                id, i, rn_bandwidth, w_port_nr);
+  }
 };
 
-SparseCore::~SparseCore() { delete stonneCore; }
+SparseCore::~SparseCore() {
+  for (auto& core : stonneCores){
+    delete core;
+  }
+}
 
 bool SparseCore::running() {
-  return !_request_queue.empty() || !_response_queue.empty() || _tiles.size();
+  bool is_running = !_request_queue.empty() || !_response_queue.empty();
+  for (auto& tile_vec : percore_tiles)
+    is_running |= tile_vec.size();
+  return is_running;
 }
 
 void SparseCore::issue(std::shared_ptr<Tile> tile) {
+  int32_t selected_core_idx = -1;
+  for (int i=0; i<nr_cores; i++) {
+    int32_t core_idx = (rr_idx + i) % nr_cores; // probe engines round-robin, starting at rr_idx
+    if (!coreBusy.at(core_idx)) { // test the same engine that will be selected
+      selected_core_idx = core_idx;
+      rr_idx = (selected_core_idx + 1) % nr_cores;
+      break;
+    }
+  }
+  if (selected_core_idx == -1) {
+    spdlog::error("[StonneCore {}] Faield to issue tile", _id);
+    exit(1);
+  }
+  spdlog::info("[StonneCore {}][{}] issued new tile", _id, selected_core_idx);
   SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(tile->get_custom_data());
-  stonneCore->setup(*opDesc);
-  stonneCore->init(1);
-  _tiles.push_back(tile);
+  stonneCores.at(selected_core_idx)->setup(*opDesc);
+  stonneCores.at(selected_core_idx)->init(1);
+  percore_tiles.at(selected_core_idx).push_back(tile);
+  coreBusy.at(selected_core_idx) = true;
 };
 
 bool SparseCore::can_issue(const std::shared_ptr<Tile>& op) {
-  return !running() && op->is_stonne_tile();
+  bool idle_exist = false;
+  for (bool flag : coreBusy) {
+    idle_exist |= !flag;
+  }
+  return idle_exist && op->is_stonne_tile();
 }
 
 void SparseCore::cycle() {
   _core_cycle++;
-  stonneCore->cycle();
-
-  /* Send Memory Request */
-  while (SimpleMem::Request* req = stonneCore->popRequest()) {
-    uint64_t target_addr =  (req->getAddress() / _config.dram_req_size) * _config.dram_req_size;
-    mem_access_type acc_type;
-    mf_type type;
-
-    switch(req->getcmd()) {
-      case SimpleMem::Request::Read:
-        acc_type = mem_access_type::GLOBAL_ACC_R;
-        type = mf_type::READ_REQUEST;
-        break;
-      case SimpleMem::Request::Write:
-        acc_type = mem_access_type::GLOBAL_ACC_W;
-        type = mf_type::WRITE_REQUEST;
-        break;
-      default:
-        spdlog::error("[SparseCore] Invalid request type from core");
-        return;
+  uint32_t stonne_core_id = 0;
+  for (auto& stonneCore : stonneCores) {
+    stonneCore->cycle();
+
+    /* Send Memory Request */
+    while (SimpleMem::Request* req = stonneCore->popRequest()) {
+      uint64_t target_addr =  (req->getAddress() / _config.dram_req_size) * _config.dram_req_size;
+      mem_access_type acc_type;
+      mf_type type;
+
+      switch(req->getcmd()) {
+        case SimpleMem::Request::Read:
+          acc_type = mem_access_type::GLOBAL_ACC_R;
+          type = mf_type::READ_REQUEST;
+          break;
+        case SimpleMem::Request::Write:
+          acc_type = mem_access_type::GLOBAL_ACC_W;
+          type = mf_type::WRITE_REQUEST;
+          break;
+        default:
+          spdlog::error("[SparseCore] Invalid request type from core");
+          return;
+      }
+      req->request_time = _core_cycle;
+      req->stonneId = stonne_core_id;
+      std::tuple<uint64_t, mem_access_type, mf_type> key = std::make_tuple(target_addr, acc_type, type);
+      if (request_merge_table.find(key) == request_merge_table.end())
+        request_merge_table[key] = new std::vector<SimpleMem::Request*> ();
+      request_merge_table[key]->push_back(req);
     }
-    req->request_time = _core_cycle;
-    std::tuple<uint64_t, mem_access_type, mf_type> key = std::make_tuple(target_addr, acc_type, type);
-    if (request_merge_table.find(key) == request_merge_table.end())
-      request_merge_table[key] = new std::vector<SimpleMem::Request*> ();
-    request_merge_table[key]->push_back(req);
+
+    if (coreBusy.at(stonne_core_id) && stonneCore->isFinished()) {
+      stonneCore->finish();
+
+      std::shared_ptr<Tile> target_tile = percore_tiles.at(stonne_core_id).front();
+      target_tile->set_status(Tile::Status::FINISH);
+      _finished_tiles.push(target_tile);
+      percore_tiles.at(stonne_core_id).erase(percore_tiles.at(stonne_core_id).begin());
+      coreBusy.at(stonne_core_id) = false;
+    }
+    stonne_core_id++;
   }
 
   int nr_request = 0;
@@ -82,8 +130,8 @@ void SparseCore::cycle() {
       _request_queue.push(req_wrapper);
       request_merge_table.erase(req_pair.first);
 
-      spdlog::debug("[SparseCore][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \
-              _core_cycle, req_wrapper->get_addr(), int(req_wrapper->get_access_type()), int(req_wrapper->get_type()), _config.dram_req_size, nr_request);
+      spdlog::debug("[SparseCore][{}][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \
+              _core_cycle, stonne_core_id, req_wrapper->get_addr(), int(req_wrapper->get_access_type()), int(req_wrapper->get_type()), _config.dram_req_size, nr_request);
       nr_request++;
       break;
     }
@@ -98,10 +146,10 @@ void SparseCore::cycle() {
     SimpleMem::Request* resp = resps->front();
 
     spdlog::debug("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \
-             _core_cycle, _core_cycle - resp->request_time, resp->getAddress(), int(resp_wrapper->get_access_type()), int(resp_wrapper->get_type()), _config.dram_req_size, nr_request);
+            _core_cycle, _core_cycle - resp->request_time, resp->getAddress(), int(resp_wrapper->get_access_type()), int(resp_wrapper->get_type()), _config.dram_req_size, nr_request);
 
     resp->setReply();
-    stonneCore->pushResponse(resp);
+    stonneCores.at(resp->stonneId)->pushResponse(resp);
     resps->erase(resps->begin());
     if (resps->empty()) {
       delete resps;
@@ -112,13 +160,7 @@ void SparseCore::cycle() {
       break;
   }
 
-  if (stonneCore->isFinished() && _tiles.size()) {
-    stonneCore->finish();
-    std::shared_ptr<Tile> target_tile = _tiles.front();
-    target_tile->set_status(Tile::Status::FINISH);
-    _finished_tiles.push(target_tile);
-    _tiles.erase(_tiles.begin());
-  }
+
 }
 
 bool SparseCore::has_memory_request() {
@@ -134,7 +176,8 @@ void SparseCore::push_memory_response(mem_fetch* response) {
 }
 
 void SparseCore::print_stats() {
-  stonneCore->printStats();
+  for (auto stonneCore : stonneCores)
+    stonneCore->printStats();
 }
 
 void SparseCore::print_current_stats() {

From 7eb2becf6ad5e8927a5f524b7463d0a17800d590 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 21 Feb 2025 01:52:14 +0000
Subject: [PATCH 168/432] [Backendsim] Add missing burst length configuration

---
 PyTorchSimBackend/include/Dram.h             | 1 +
 PyTorchSimBackend/include/SimulationConfig.h | 3 ++-
 PyTorchSimBackend/src/Common.cc              | 2 ++
 PyTorchSimBackend/src/Dram.cc                | 3 ++-
 PyTorchSimFrontend/extension_op.py           | 4 +++-
 5 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimBackend/include/Dram.h b/PyTorchSimBackend/include/Dram.h
index 137a9811..e600bbfb 100644
--- a/PyTorchSimBackend/include/Dram.h
+++ b/PyTorchSimBackend/include/Dram.h
@@ -32,6 +32,7 @@ class Dram {
   SimulationConfig _config;
   CacheConfig _m_cache_config;
   uint32_t _n_ch;
+  uint32_t _n_bl;
   uint32_t _n_partitions;
   uint32_t _n_ch_per_partition;
   uint32_t _req_size;
diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h
index 390d8d93..a6e7adef 100644
--- a/PyTorchSimBackend/include/SimulationConfig.h
+++ b/PyTorchSimBackend/include/SimulationConfig.h
@@ -32,6 +32,7 @@ struct SimulationConfig {
   uint32_t dram_channels;
   uint32_t dram_req_size;
   uint32_t dram_latency;
+  uint32_t dram_nbl = 1;
   uint32_t dram_print_interval;
   std::string dram_config_path;
 
@@ -64,6 +65,6 @@ struct SimulationConfig {
   }
 
   float max_dram_bandwidth() {
-    return dram_freq * dram_channels * dram_req_size / 1000; // GB/s
+    return dram_freq * dram_channels * dram_req_size / dram_nbl / 1000; // GB/s
   }
 };
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index f084a420..e160a83b 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -73,6 +73,8 @@ SimulationConfig initialize_config(json config) {
     parsed_config.dram_req_size = config["dram_req_size"];
   if (config.contains("dram_print_interval"))
     parsed_config.dram_print_interval = config["dram_print_interval"];
+  if(config.contains("dram_nbl"))
+    parsed_config.dram_nbl = config["dram_nbl"];
   if (config.contains("dram_num_partitions"))
     parsed_config.dram_num_partitions = config["dram_num_partitions"];
 
diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc
index f59d359c..76b349c5 100644
--- a/PyTorchSimBackend/src/Dram.cc
+++ b/PyTorchSimBackend/src/Dram.cc
@@ -14,6 +14,7 @@ uint32_t Dram::get_channel_id(mem_fetch* access) {
 Dram::Dram(SimulationConfig config, cycle_type* core_cycle) {
   _core_cycles = core_cycle;
   _n_ch = config.dram_channels;
+  _n_bl = config.dram_nbl;
   _req_size = config.dram_req_size;
   _n_partitions = config.dram_num_partitions;
   _n_ch_per_partition = _n_ch / _n_partitions;
@@ -53,7 +54,7 @@ DramRamulator2::DramRamulator2(SimulationConfig config, cycle_type* core_cycle)
   _mem.resize(_n_ch);
   for (int ch = 0; ch < _n_ch; ch++) {
     _mem[ch] = std::make_unique<Ramulator2>(
-      ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, 1);
+      ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, config.dram_nbl);
   }
   _tx_log2 = log2(_req_size);
   _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2;
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index dacfa699..5744868d 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -318,7 +318,9 @@ def calculate_sparsity(tensor):
     in_file_bitmap_b = f"{dir_path}/{prefix}_bitmapSpMSpM_file_bitmapB_"+str(M)+"_"+str(N)+"_"+str(K)+".in"
     c_result = f'{dir_path}/{prefix}_result.out'
     dram_a_address, dram_b_address, dram_c_address = generate_inner_product_matrix(a, b, M, N, K, file_name, in_file_bitmap_a, in_file_bitmap_b)
-
+    dram_a_address = a.data_ptr()
+    dram_b_address = b.data_ptr()
+    dram_c_address = out.data_ptr()
     graph = {
         0: {
             "node_id": 0,

From 90c226569f95917a2bb818f7258209b485f5cdda Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 21 Feb 2025 03:00:57 +0000
Subject: [PATCH 169/432] [Backend] Run simulate when escaping the interactive
 mode

---
 PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json | 9 ++++-----
 PyTorchSimBackend/include/Simulator.h                   | 2 +-
 PyTorchSimBackend/src/main.cc                           | 1 +
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
index fbe605e8..c4ce3408 100644
--- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
+++ b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
@@ -5,16 +5,15 @@
   "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
-  "num_stonne_per_core" : 64,
-  "num_stonne_port" : 16,
+  "num_stonne_per_core" : 4,
+  "num_stonne_port" : 64,
 
   "dram_type" : "ramulator2",
   "dram_freq" : 940,
-  "dram_channels": 8,
+  "dram_channels": 4,
   "dram_req_size": 16,
   "dram_latency" : 10,
-  "dram_size" : 32,
-  "dram_nbl" : 2,
+  "dram_nbl" : 1,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
 
diff --git a/PyTorchSimBackend/include/Simulator.h b/PyTorchSimBackend/include/Simulator.h
index 2e294616..4d9defd1 100644
--- a/PyTorchSimBackend/include/Simulator.h
+++ b/PyTorchSimBackend/include/Simulator.h
@@ -32,8 +32,8 @@ class Simulator {
   int get_partition_id(int core_id) { return _config.partiton_map[core_id]; }
   std::unique_ptr<Scheduler>& get_partition_scheduler(int core_id) { return _partition_scheduler.at(get_partition_id(core_id)); }
   void print_core_stat();
- private:
   void cycle();
+ private:
   void core_cycle();
   void dram_cycle();
   void icnt_cycle();
diff --git a/PyTorchSimBackend/src/main.cc b/PyTorchSimBackend/src/main.cc
index 7974aab2..867188c1 100644
--- a/PyTorchSimBackend/src/main.cc
+++ b/PyTorchSimBackend/src/main.cc
@@ -97,6 +97,7 @@ void interactive_mode(Simulator* simulator) {
     if (isDryRun)
       std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> ";
   }
+  simulator->cycle();
   if (simulator->get_core_cycle()==0)
     simulator->until(0);
   simulator->print_core_stat();

From cad7cea4e27e1db338df71e4f8b6883f8fc80ae8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 21 Feb 2025 03:07:13 +0000
Subject: [PATCH 170/432] [Backendsim] Use different addresses to generate
 mem traffic; this is only for experiment...

---
 .../configs/heterogeneous_c1_simple_noc.json        |  4 ++--
 PyTorchSimBackend/src/SparseCore.cc                 |  2 +-
 PyTorchSimFrontend/extension_op.py                  |  4 ++--
 tests/test_spmm_scheduler.py                        | 13 ++++---------
 4 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
index 8fad4829..8a066925 100644
--- a/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
+++ b/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
@@ -1,7 +1,7 @@
 {
-  "core_type" : ["ws_mesh", "stonne"],
+  "core_type" : ["stonne", "ws_mesh"],
   "num_cores" : 2,
-  "stonne_config_path" : "/root/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
+  "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
   "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 5e503de4..2c93977c 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -60,7 +60,7 @@ void SparseCore::issue(std::shared_ptr<Tile> tile) {
   }
   spdlog::info("[StonneCore {}][{}] issued new tile", _id, selected_core_idx);
   SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(tile->get_custom_data());
-  stonneCores.at(selected_core_idx)->setup(*opDesc);
+  stonneCores.at(selected_core_idx)->setup(*opDesc, 0x1000000 * selected_core_idx); // FIXME. To avoid same address
   stonneCores.at(selected_core_idx)->init(1);
   percore_tiles.at(selected_core_idx).push_back(tile);
   coreBusy.at(selected_core_idx) = true;
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 5744868d..59130fd1 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -210,7 +210,7 @@ def calculate_sparsity(tensor):
             "children": [2],
             "loop_index": "loop_arg000",
             "loop_start": 0,
-            "loop_end": 1,
+            "loop_end": 4,  # FIXME. this is a trick that generate multiple tile.
             "loop_step": 1,
             "loop_type": "outer_loop"
         },
@@ -337,7 +337,7 @@ def calculate_sparsity(tensor):
             "children": [2],
             "loop_index": "loop_arg000",
             "loop_start": 0,
-            "loop_end": 1,
+            "loop_end": 64,
             "loop_step": 1,
             "loop_type": "outer_loop"
         },
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
index 10f3c681..b25012ed 100644
--- a/tests/test_spmm_scheduler.py
+++ b/tests/test_spmm_scheduler.py
@@ -35,7 +35,7 @@
     with torch.no_grad():
         # Init scheduler
         scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
-                            backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json")
+                            backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json")
 
         target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval()
         target_model2 = model2(768, 12).eval()
@@ -51,18 +51,13 @@
         model_input2 = torch.randn(1, 512, 768)
 
         # Init request
-        new_request1 = Request("mlp", [model_input1], [], request_queue_idx=1)
-        new_request2 = Request("bert", [model_input2], [], request_queue_idx=0)
+        new_request1 = Request("mlp", [model_input1], [], request_queue_idx=0)
+        #new_request2 = Request("mlp", [model_input2], [], request_queue_idx=0)
 
 
         # Add request to scheduler
         scheduler.add_request(new_request1, request_time=0)
-        scheduler.add_request(new_request1, request_time=0)
-        scheduler.add_request(new_request1, request_time=0)
-        scheduler.add_request(new_request1, request_time=0)
-        scheduler.add_request(new_request1, request_time=0)
-        scheduler.add_request(new_request1, request_time=0)
-        scheduler.add_request(new_request2, request_time=0)
+        #scheduler.add_request(new_request2, request_time=0)
 
         # Run scheduler
         while not scheduler.is_finished():

From a9b5a0554a23b43c7056e91392c34ffd379b4034 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 21 Feb 2025 03:46:32 +0000
Subject: [PATCH 171/432] [Scheduler] Fix interactive mode to wait until
 TOGSim terminates

---
 .../configs/heterogeneous_c1_simple_noc.json  | 32 -------------------
 PyTorchSimBackend/src/main.cc                 |  2 +-
 Scheduler/scheduler.py                        |  5 ++-
 Simulator/simulator.py                        | 15 +++++++++
 4 files changed, 20 insertions(+), 34 deletions(-)
 delete mode 100644 PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json

diff --git a/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
deleted file mode 100644
index 8a066925..00000000
--- a/PyTorchSimBackend/configs/heterogeneous_c1_simple_noc.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "core_type" : ["stonne", "ws_mesh"],
-  "num_cores" : 2,
-  "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
-  "core_freq" : 940,
-  "sram_size" : 65536,
-  "core_print_interval" : 10000,
-  "num_systolic_array_per_core" : 1,
-
-  "dram_type" : "ramulator2",
-  "dram_freq" : 940,
-  "dram_channels": 32,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_size" : 32,
-  "dram_nbl" : 1,
-  "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency" : 7,
-  "icnt_freq" : 7000,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt",
-
-  "precision" : 4,
-  "scheduler" : "simple",
-  "num_partition" : 1,
-  "partition": {
-    "core_0":0,
-    "core_1":0
-  }
-}
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/main.cc b/PyTorchSimBackend/src/main.cc
index 867188c1..84afe2ce 100644
--- a/PyTorchSimBackend/src/main.cc
+++ b/PyTorchSimBackend/src/main.cc
@@ -89,7 +89,7 @@ void interactive_mode(Simulator* simulator) {
       cycle_type current_cycle = simulator->get_core_cycle();
       std::cerr << "Current cycle: " << current_cycle << std::endl;
     }else if (token == "quit") {
-      spdlog::info("Exiting BackendSim.");
+      std::cerr << "Quit" << std::endl;
       break;
     } else {
       spdlog::error("Error: unknown command {} Available commands are: launch, until, quit.", token);
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 0b0a61e1..feb7e973 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -516,7 +516,10 @@ def is_request_queue_empty(self):
         return result
 
     def is_finished(self):
-        return self.is_request_queue_empty() and self.execution_engine.is_idle()
+        if self.is_request_queue_empty() and self.execution_engine.is_idle():
+            self.backend_simulator.wait()
+            return True
+        return False
 
     def current_time(self):
         return self.cycle_to_msec(self.current_cycle)
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 08a582f1..c5b1ef87 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -250,7 +250,17 @@ def interactive_simulation(self):
     def stop(self):
         if self.process:
             self.process.terminate()
+            self.process.wait()
             self.process = None
+            print("[BackendSimulator] Simulator stopped.")
+
+    def wait(self):
+        if self.process:
+            print("[BackendSimulator] Waiting for simulation to complete...")
+            self.quit()
+            self.process.wait()
+            self.process = None
+            print("[BackendSimulator] Simulation completed.")
 
     def send_command(self, command):
         if self.process:
@@ -285,6 +295,11 @@ def until(self, until_cycle):
         ret = self.send_command(command)
         return int(ret.split(" ")[-1])
 
+    def quit(self):
+        command = "quit"
+        ret = self.send_command(command)
+        return
+
     def create_attribute_file(self, attribute_path, inputs, **kwargs):
         address_info = {}
         json_content = {}

From 41ec1da7292516081f2a2d00e7804a93adbef824 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 21 Feb 2025 10:35:00 +0000
Subject: [PATCH 172/432] [Backendsim/Stonne] Add sparse core time stamp and
 stat

---
 .../configs/stonne_big_c1_simple_noc.json     | 11 +++----
 PyTorchSimBackend/src/SparseCore.cc           | 30 +++++++++++++++----
 PyTorchSimFrontend/extension_op.py            |  2 +-
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
index c4ce3408..ef8d86cc 100644
--- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
+++ b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
@@ -2,20 +2,21 @@
   "core_type" : ["stonne"],
   "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
   "num_cores" : 1,
-  "core_freq" : 940,
+  "core_freq" : 700,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
-  "num_stonne_per_core" : 4,
+  "num_stonne_per_core" : 8,
   "num_stonne_port" : 64,
 
   "dram_type" : "ramulator2",
-  "dram_freq" : 940,
-  "dram_channels": 4,
+  "dram_freq" : 700,
+  "dram_channels": 16,
   "dram_req_size": 16,
   "dram_latency" : 10,
+  "dram_size" : 32,
   "dram_nbl" : 1,
   "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
 
   "icnt_type" : "simple",
   "icnt_latency" : 7,
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 2c93977c..3891d9dd 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -5,10 +5,11 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
   nr_cores = config.num_stonne_per_core;
   coreBusy.resize(nr_cores);
   percore_tiles.resize(nr_cores);
+  stonneCores.resize(nr_cores);
   for (int i=0; i<nr_cores; i++) {
     SST_STONNE::sstStonne* core = new SST_STONNE::sstStonne(config.stonne_config_path);
-    core->init(1);
-    stonneCores.push_back(core);
+    stonneCores.at(i) = core;
+    stonneCores.at(i)->init(1);
     coreBusy.at(i) = false;
     percore_tiles.at(i) = std::vector<std::shared_ptr<Tile>>();
   }
@@ -58,6 +59,11 @@ void SparseCore::issue(std::shared_ptr<Tile> tile) {
     spdlog::error("[StonneCore {}] Faield to issue tile", _id);
     exit(1);
   }
+  //delete stonneCores.at(selected_core_idx);
+  //SST_STONNE::sstStonne* core = new SST_STONNE::sstStonne(_config.stonne_config_path);
+  //stonneCores.at(selected_core_idx) = core;
+  stonneCores.at(selected_core_idx)->init(1);
+
   spdlog::info("[StonneCore {}][{}] issued new tile", _id, selected_core_idx);
   SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(tile->get_custom_data());
   stonneCores.at(selected_core_idx)->setup(*opDesc, 0x1000000 * selected_core_idx); // FIXME. To avoid same address
@@ -159,8 +165,9 @@ void SparseCore::cycle() {
     if (nr_request++ > w_port_nr)
       break;
   }
-
-
+  if(_config.core_print_interval && _core_cycle % _config.core_print_interval == 0) {
+    print_current_stats();
+  }
 }
 
 bool SparseCore::has_memory_request() {
@@ -176,8 +183,19 @@ void SparseCore::push_memory_response(mem_fetch* response) {
 }
 
 void SparseCore::print_stats() {
-  for (auto stonneCore : stonneCores)
-    stonneCore->printStats();
+  //for (auto stonneCore : stonneCores)
+  //  stonneCore->printStats();
+  MSwitchStats accum;
+  spdlog::info("========= Sparse Core stat =========");
+  spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle);
+  for (size_t i = 0; i < stonneCores.size(); ++i) {
+    MSwitchStats stats = stonneCores.at(i)->getMSStats();
+    accum += stats;
+    spdlog::info("Stonne Core [{}][{}] : n_multiplications: {} ",
+                 _id, i, stats.n_multiplications);
+  }
+  spdlog::info("Stonne Core [{}] : total_multiplications: {} ",
+                 _id, accum.n_multiplications);
 }
 
 void SparseCore::print_current_stats() {
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 59130fd1..4114bce1 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -210,7 +210,7 @@ def calculate_sparsity(tensor):
             "children": [2],
             "loop_index": "loop_arg000",
             "loop_start": 0,
-            "loop_end": 4,  # FIXME. this is a trick that generate multiple tile.
+            "loop_end": 8,  # FIXME. this is a trick that generate multiple tile.
             "loop_step": 1,
             "loop_type": "outer_loop"
         },

From dec03fb2b165320269a0f4b6c4df0c65b952a5b0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 21 Feb 2025 15:37:19 +0000
Subject: [PATCH 173/432] [Experiment] Add heterogeneous config and script

---
 .../configs/heterogeneous_c2_simple_noc.json  | 32 ++++++++
 .../configs/stonne_big_c1_simple_noc.json     |  5 +-
 ...c_ws_128x128_c1_simple_noc_tpuv2_half.json | 27 +++++++
 stonne_experiment/run.sh                      | 23 +-----
 tests/test_hetro.py                           | 77 +++++++++++++++++++
 5 files changed, 141 insertions(+), 23 deletions(-)
 create mode 100644 PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json
 create mode 100644 tests/test_hetro.py

diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json
new file mode 100644
index 00000000..6f3049ac
--- /dev/null
+++ b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json
@@ -0,0 +1,32 @@
+{
+  "core_type" : ["stonne", "ws_mesh"],
+  "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
+  "num_cores" : 2,
+  "core_freq" : 700,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+  "num_stonne_per_core" : 8,
+  "num_stonne_port" : 64,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" : 700,
+  "dram_channels": 16,
+  "dram_req_size": 32,
+  "dram_latency" : 10,
+  "dram_nbl" : 1,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 7000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt",
+
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 2,
+  "partition": {
+    "core_0":0,
+    "core_1":1
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
index ef8d86cc..30f3e216 100644
--- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
+++ b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
@@ -10,10 +10,9 @@
 
   "dram_type" : "ramulator2",
   "dram_freq" : 700,
-  "dram_channels": 16,
-  "dram_req_size": 16,
+  "dram_channels": 8,
+  "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_size" : 32,
   "dram_nbl" : 1,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json
new file mode 100644
index 00000000..2ecb7b64
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json
@@ -0,0 +1,27 @@
+{
+  "num_cores" : 1,
+  "core_freq" : 700,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" : 700,
+  "dram_channels": 8,
+  "dram_req_size": 32,
+  "dram_latency" : 10,
+  "dram_nbl" : 1,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+ 
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 7000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
+ 
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 1,
+  "partition": {
+    "core_0": 0
+  }
+}
\ No newline at end of file
diff --git a/stonne_experiment/run.sh b/stonne_experiment/run.sh
index 19a35df2..3586c670 100755
--- a/stonne_experiment/run.sh
+++ b/stonne_experiment/run.sh
@@ -1,23 +1,6 @@
 #!/bin/bash
+python3 ../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.json --mode 0 > hetero/big_sparse.log
+python3 ../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv2_half.json --mode 1 > hetero/big.log
+python3 ../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.json --mode 2 > hetero/hetero.log
 
-BATCH_SIZE=128
-INPUT_SIZE=128
-HIDDEN_SIZE=128
-OUTPUT_SIZE=128
-OUTPUT_DIR="sparse_mt_results"
-
-mkdir -p "$OUTPUT_DIR"
-
-for w1 in $(seq 0.1 0.3 1.0); do
-    OUTPUT_FILE="${OUTPUT_DIR}/flops_w1_${w1}_w2_${w1}.txt"
-    echo "Started: w1=$w1, w2=$w1 (Output: $OUTPUT_FILE)"
-    python3 ${TORCHSIM_DIR}/tests/test_spmm_scheduler.py \
-        --batch_size $BATCH_SIZE \
-        --input_size $INPUT_SIZE \
-        --hidden_size $HIDDEN_SIZE \
-        --output_size $OUTPUT_SIZE \
-        --w1_sparsity $w1 \
-        --w2_sparsity $w1 > "$OUTPUT_FILE"
-done
-wait
 echo "All processes completed!"
diff --git a/tests/test_hetro.py b/tests/test_hetro.py
new file mode 100644
index 00000000..f2ae6ba7
--- /dev/null
+++ b/tests/test_hetro.py
@@ -0,0 +1,77 @@
+import os
+import sys
+import torch
+import argparse
+sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+from test_stonne import sparse_matmul
+
+def custom_matmul(a, b):
+    return torch.matmul(a, b)
+torch.manual_seed(0)
+CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="")
+    parser.add_argument("--M", type=int, default=128, help="Batch size")
+    parser.add_argument("--N", type=int, default=128, help="Input layer size")
+    parser.add_argument("--K", type=int, default=128, help="Hidden layer size")
+    parser.add_argument("--sparsity", type=float, default=0.9, help="Output layer size")
+    parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.json", help="Output layer size")
+    parser.add_argument("--mode", type=int, default=0, help="Output layer size")
+    args = parser.parse_args()
+
+    M = args.M
+    N = args.N
+    K = args.K
+    sparsity = args.sparsity
+    mode = args.mode
+    config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}"
+
+    print("M: ", M)
+    print("N: ", N)
+    print("K: ", K)
+    print("sparsity: ", sparsity)
+
+    with torch.no_grad():
+        # Init scheduler
+        scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
+                            backend_config=config_path)
+
+        # Register compiled model
+        opt_model1 = torch.compile(custom_matmul)
+        opt_model2 = torch.compile(sparse_matmul)
+        SchedulerDNNModel.register_model("matmul", opt_model1)
+        SchedulerDNNModel.register_model("spmm", opt_model2)
+
+        # Init input data
+        for i in range(4):
+            dense_input1 = torch.randn(M, K)
+            dense_input2 = torch.randn(K, N)
+
+            sparse_input1 = torch.randn(128, 128)
+            sparse_input2 = torch.randn(128, 128)
+            mask1 = torch.randn(sparse_input1.shape) > sparsity
+            mask2 = torch.randn(sparse_input2.shape) > sparsity
+
+            sparse_input1 = sparse_input1 * mask1
+            sparse_input2 = sparse_input2 * mask2
+
+            # Init request
+            if mode == 0:
+                new_request1 = Request("spmm", [sparse_input1, sparse_input2], [], request_queue_idx=0)
+                scheduler.add_request(new_request1, request_time=0)
+            elif mode == 1:
+                new_request2 = Request("matmul", [dense_input1, dense_input2], [], request_queue_idx=0)
+                scheduler.add_request(new_request2, request_time=0)
+            elif mode == 2:
+                new_request1 = Request("spmm", [sparse_input1, sparse_input2], [], request_queue_idx=0)
+                new_request2 = Request("matmul", [dense_input1, dense_input2], [], request_queue_idx=1)
+
+                # Add request to scheduler
+                scheduler.add_request(new_request1, request_time=0)
+                scheduler.add_request(new_request2, request_time=0)
+
+        # Run scheduler
+        while not scheduler.is_finished():
+            scheduler.schedule()
\ No newline at end of file

From 06c943dfa795def8183f2bac304c1db8c3cebd95 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 24 Feb 2025 07:15:40 +0000
Subject: [PATCH 174/432] [Experiment/BlockSparse] Upload modifications and
 checkpoint

---
 .../configs/stonne_little_c4_simple_noc.json  | 32 --------
 .../systolic_ws_8x8_c1_12G_simple_noc.json    | 24 ++++++
 .../systolic_ws_8x8_c1_24G_simple_noc.json    | 24 ++++++
 .../systolic_ws_8x8_c1_48G_simple_noc.json    | 24 ++++++
 .../systolic_ws_8x8_c2_12G_simple_noc.json    | 25 +++++++
 .../systolic_ws_8x8_c2_24G_simple_noc.json    | 24 ++++++
 .../systolic_ws_8x8_c2_48G_simple_noc.json    | 24 ++++++
 PyTorchSimBackend/src/Core.cc                 | 22 ++++--
 PyTorchSimFrontend/extension_codecache.py     |  7 +-
 PyTorchSimFrontend/extension_config.py        | 13 +++-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  6 +-
 Simulator/simulator.py                        |  4 +-
 sparsity/parse.py                             | 74 +++++++++++++++++++
 sparsity/run.sh                               | 51 +++++++++++++
 tests/test_sparsity.py                        |  3 +-
 tests/test_spmm_scheduler.py                  |  6 +-
 16 files changed, 313 insertions(+), 50 deletions(-)
 delete mode 100644 PyTorchSimBackend/configs/stonne_little_c4_simple_noc.json
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json
 create mode 100644 sparsity/parse.py
 create mode 100755 sparsity/run.sh

diff --git a/PyTorchSimBackend/configs/stonne_little_c4_simple_noc.json b/PyTorchSimBackend/configs/stonne_little_c4_simple_noc.json
deleted file mode 100644
index c094597b..00000000
--- a/PyTorchSimBackend/configs/stonne_little_c4_simple_noc.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "core_type" : ["stonne"],
-  "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
-  "num_cores" : 4,
-  "core_freq" : 940,
-  "sram_size" : 65536,
-  "core_print_interval" : 10000,
-  "num_stonne_per_core" : 16,
-  "num_stonne_port" : 4,
-
-  "dram_type" : "ramulator2",
-  "dram_freq" : 940,
-  "dram_channels": 8,
-  "dram_req_size": 16,
-  "dram_latency" : 10,
-  "dram_size" : 32,
-  "dram_nbl" : 2,
-  "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency" : 7,
-  "icnt_freq" : 7000,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt",
-
-  "precision" : 4,
-  "scheduler" : "simple",
-  "num_partition" : 1,
-  "partition": {
-    "core_0":0
-  }
-}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json
new file mode 100644
index 00000000..8aee751b
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json
@@ -0,0 +1,24 @@
+{
+  "num_cores" : 1,
+  "core_freq" : 1000,
+  "sram_size" : 256,
+  "core_print_interval" : 100000,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" :800,
+  "dram_channels": 1,
+  "dram_req_size": 64,
+  "dram_latency" : 10,
+  "dram_size" : 16,
+  "dram_nbl" : 4,
+  "dram_print_interval": 100000,
+  "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml",
+ 
+  "icnt_type" : "simple",
+  "icnt_latency" : 1,
+  "icnt_freq" : 8000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt",
+ 
+  "precision" : 4,
+  "scheduler" : "simple"
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json
new file mode 100644
index 00000000..37e18b35
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json
@@ -0,0 +1,24 @@
+{
+  "num_cores" : 1,
+  "core_freq" : 1000,
+  "sram_size" : 256,
+  "core_print_interval" : 100000,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" :800,
+  "dram_channels": 2,
+  "dram_req_size": 64,
+  "dram_latency" : 10,
+  "dram_size" : 16,
+  "dram_nbl" : 4,
+  "dram_print_interval": 100000,
+  "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml",
+ 
+  "icnt_type" : "simple",
+  "icnt_latency" : 1,
+  "icnt_freq" : 8000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt",
+ 
+  "precision" : 4,
+  "scheduler" : "simple"
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json
new file mode 100644
index 00000000..49225d77
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json
@@ -0,0 +1,24 @@
+{
+  "num_cores" : 1,
+  "core_freq" : 1000,
+  "sram_size" : 256,
+  "core_print_interval" : 100000,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" :800,
+  "dram_channels": 4,
+  "dram_req_size": 64,
+  "dram_latency" : 10,
+  "dram_size" : 16,
+  "dram_nbl" : 4,
+  "dram_print_interval": 100000,
+  "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml",
+ 
+  "icnt_type" : "simple",
+  "icnt_latency" : 1,
+  "icnt_freq" : 8000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt",
+ 
+  "precision" : 4,
+  "scheduler" : "simple"
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json
new file mode 100644
index 00000000..f76fec32
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json
@@ -0,0 +1,25 @@
+{
+  "core_type" : ["ws_mesh","ws_mesh"],
+  "num_cores" : 2,
+  "core_freq" : 1000,
+  "sram_size" : 256,
+  "core_print_interval" : 100000,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" :800,
+  "dram_channels": 1,
+  "dram_req_size": 64,
+  "dram_latency" : 10,
+  "dram_size" : 16,
+  "dram_nbl" : 4,
+  "dram_print_interval": 100000,
+  "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml",
+ 
+  "icnt_type" : "simple",
+  "icnt_latency" : 1,
+  "icnt_freq" : 8000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m4.icnt",
+ 
+  "precision" : 4,
+  "scheduler" : "simple"
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json
new file mode 100644
index 00000000..7571b830
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json
@@ -0,0 +1,24 @@
+{
+  "num_cores" : 2,
+  "core_freq" : 1000,
+  "sram_size" : 256,
+  "core_print_interval" : 100000,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" :800,
+  "dram_channels": 2,
+  "dram_req_size": 64,
+  "dram_latency" : 10,
+  "dram_size" : 16,
+  "dram_nbl" : 4,
+  "dram_print_interval": 100000,
+  "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml",
+ 
+  "icnt_type" : "simple",
+  "icnt_latency" : 1,
+  "icnt_freq" : 8000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m8.icnt",
+ 
+  "precision" : 4,
+  "scheduler" : "simple"
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json
new file mode 100644
index 00000000..be163336
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json
@@ -0,0 +1,24 @@
+{
+  "num_cores" : 2,
+  "core_freq" : 1000,
+  "sram_size" : 256,
+  "core_print_interval" : 100000,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" :800,
+  "dram_channels": 4,
+  "dram_req_size": 64,
+  "dram_latency" : 10,
+  "dram_size" : 16,
+  "dram_nbl" : 4,
+  "dram_print_interval": 100000,
+  "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml",
+ 
+  "icnt_type" : "simple",
+  "icnt_latency" : 1,
+  "icnt_freq" : 8000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt",
+ 
+  "precision" : 4,
+  "scheduler" : "simple"
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index ffcc04a2..3ad96f65 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -263,12 +263,22 @@ void Core::cycle() {
               inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle;
               inst->bubble_cycle = bubble_cycle;
             }
-            spdlog::trace("[Core {}][{}] {}-{} ISSUED, finsh at {}", _id, _core_cycle,
-                          opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle);
-            target_pipeline.push(inst);
-            issued = true;
-            if (inst->get_compute_type()) {
-              _stat_gemm_inst++;
+            if (inst->get_compute_cycle() == 0) {
+              spdlog::trace("[Core {}][{}] {} SKIPPED", _id, _core_cycle,
+                            opcode_to_string(inst->get_opcode()));
+              inst->finish_instruction();
+              static_cast<Tile*>(inst->get_owner())->inc_finished_inst();
+              _stat_tot_sa_inst.at(static_cast<size_t>(inst->get_opcode()))++;
+              auto it = instructions.begin() + j; // Iterator to the element at index j, erased below
+              instructions.erase(it);
+            } else {
+              spdlog::trace("[Core {}][{}] {}-{} ISSUED, finsh at {}", _id, _core_cycle,
+                            opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle);
+              target_pipeline.push(inst);
+              issued = true;
+              if (inst->get_compute_type()) {
+                _stat_gemm_inst++;
+              }
             }
           }
           break;
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index ae38ef75..437f2a01 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -302,6 +302,10 @@ def task():
                                           tile_size=tile_size, spad_info=spad_info, origins=origins, **kwargs)
             return key
         future = self.submit(task)
+        if "loop_size" in kwargs:
+            loop_size = kwargs["loop_size"]
+        else:
+            loop_size = []
         def dummy_simulator(*args, **kwargs):
             # Wait for compilation
             key = future.result()
@@ -324,7 +328,8 @@ def dummy_simulator(*args, **kwargs):
             attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")
             backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
             backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG)
-            attribute_path = backsim.create_attribute_file(attribute_path, args, tile_size=tile_size)
+            backsim.vectorlane_size = vectorlane_size
+            attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size)
             result_path = backsim.simulation(onnx_path, attribute_path)
             result = BackendSimulator.get_result_from_file(result_path)
             return result
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 9b91b787..f290e8e7 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -1,11 +1,12 @@
 import os
+import sys
 import tempfile
 
 # Hardware info config
 CONFIG_VECTOR_LANE = int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128))
 CONFIG_SPAD_INFO = {
   "spad_vaddr" : 0xD0000000,
-  "spad_paddr" : 0xD0000000,
+  "spad_paddr" : 0x2000000000,
   "spad_size" : 128 << 10
 }
 CONFIG_PRECISION = 4 # 32bit
@@ -13,8 +14,6 @@
 CONFIG_VLEN = 32 // CONFIG_PRECISION # 256bits / 32bits = 8 [elements]
 
 # Tile size config
-CONFIG_TILE_ROW = int(os.environ.get("TORCHSIM_TILE_ROW", default=-1))
-CONFIG_TILE_COL = int(os.environ.get("TORCHSIM_TILE_COL", default=-1))
 CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
 
 # DUMP PATH
@@ -42,4 +41,10 @@
 # GEM5 config
 CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt")
 CONFIG_GEM5_SCRIPT_PATH = os.environ.get('GEM5_SCRIPT_PATH',
-                                  default=f"{CONFIG_TORCHSIM_DIR}/gem5_script/script_systolic.py")
\ No newline at end of file
+                                  default=f"{CONFIG_TORCHSIM_DIR}/gem5_script/script_systolic.py")
+
+# For block sparse
+CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0))
+CONFIG_FORCE_TILE_M = int(os.environ.get("TORCHSIM_FORCE_TIME_M", default=sys.maxsize))
+CONFIG_FORCE_TILE_N = int(os.environ.get("TORCHSIM_FORCE_TIME_N", default=sys.maxsize))
+CONFIG_FORCE_TILE_K = int(os.environ.get("TORCHSIM_FORCE_TIME_K", default=sys.maxsize))
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 15b257e0..95e5bcc5 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -8,6 +8,7 @@
 from torch._inductor.ir import ReinterpretView
 from torch._inductor.codecache import write_atomic
 import PyTorchSimFrontend.extension_codecache as extension_codecache
+from PyTorchSimFrontend import extension_config
 
 GEMM_TEMPLATE = r"""
 {% if X_transposed %}#map0 = affine_map<(d0, d1) -> (d1 * {{ M }} + d0)>{% else %}#map0 = affine_map<(d0, d1) -> (d0 * {{ K }} + d1)>{% endif %}
@@ -86,7 +87,7 @@ def is_transposed(self, node):
                 if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
                     return True
                 else:
-                  raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
+                    raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
         return False
 
     def render(self,
@@ -110,6 +111,9 @@ def render(self,
         else:
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
             template = GEMM_TEMPLATE
+        TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)  # Optional caps; the sys.maxsize default leaves a tile unchanged
+        TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
+        TILE_K = min(extension_config.CONFIG_FORCE_TILE_K, TILE_K)
         TOG_latency = M if TILE_M > M else TILE_M
         kernel.loop_size =[TOG_latency, TILE_N, TILE_K]
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index c5b1ef87..c38e3f9c 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -311,12 +311,12 @@ def create_attribute_file(self, attribute_path, inputs, **kwargs):
             address_info[f"arg{idx}"] = tensor.data_ptr()
         json_content["address_info"] = address_info
 
-        if "tile_size" in kwargs and len(kwargs['tile_size'])==3 and kwargs['tile_size'][0] != 1:
+        if extension_config.CONFIG_BLOCK_SPARSE and "loop_size" in kwargs and len(kwargs['loop_size'])==3 and kwargs['loop_size'][0] != 1:
             # GEMM
             import copy
             zero_skip = {}
             input, weight = inputs[:2]
-            M, N, K = kwargs['tile_size']
+            M, N, K = kwargs['loop_size']
 
             padded_input = copy.deepcopy(input.cpu())
             padded_weight = copy.deepcopy(weight.cpu())
diff --git a/sparsity/parse.py b/sparsity/parse.py
new file mode 100644
index 00000000..7b15e156
--- /dev/null
+++ b/sparsity/parse.py
@@ -0,0 +1,70 @@
+import argparse
+import itertools
+import os
+import subprocess
+
+# Sparsity levels whose logs are read from the input directory, one file per level.
+SPARSITY_LEVELS = ("0.0", "0.2", "0.4", "0.6", "0.8")
+
+def get_stored_paths(log_file):
+    """Return the path tokens that follow a "stored" marker in log_file.
+
+    Lines are selected with grep; for each one containing the exact token
+    "stored", the token two positions later is taken as the path and any
+    surrounding double quotes are stripped.
+    """
+    stored_paths = []
+    try:
+        result = subprocess.run(["grep", "stored", log_file], capture_output=True, text=True)
+    except (OSError, ValueError) as e:
+        print(f"Error reading stored paths: {e}")
+        return stored_paths
+    for line in result.stdout.splitlines():
+        parts = line.split(" ")
+        if "stored" not in parts:
+            continue
+        index = parts.index("stored")
+        # The path is read from index + 2, so that is the slot that must exist.
+        if index + 2 < len(parts):
+            stored_paths.append(parts[index + 2].strip('"'))
+    return stored_paths
+
+def get_last_total_cycle(file_path):
+    """Return the last token of the last "Total cycle" line in file_path, or None."""
+    try:
+        result = subprocess.run(["grep", "Total cycle", file_path], capture_output=True, text=True)
+    except (OSError, ValueError) as e:
+        print(f"Error reading total cycle from {file_path}: {e}")
+        return None
+    lines = result.stdout.splitlines()
+    if not lines:
+        return None
+    return lines[-1].split()[-1]
+
+def main(log_file):
+    """Return the total-cycle strings of every existing result file named in log_file."""
+    cycles = []
+    for path in get_stored_paths(log_file):
+        print(path)
+        if not os.path.exists(path):
+            print(f"{path}: File does not exist")
+            continue
+        total_cycle = get_last_total_cycle(path)
+        if total_cycle:
+            cycles.append(total_cycle)
+        else:
+            print(f"{path}: No Total cycle found")
+    return cycles
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Extract Total Cycle from stored paths.")
+    parser.add_argument("log_file", type=str,
+                        help="Directory holding one log per sparsity level (0.0, 0.2, ..., 0.8)")
+    args = parser.parse_args()
+    if os.path.exists(args.log_file):
+        rows = [main(os.path.join(args.log_file, level)) for level in SPARSITY_LEVELS]
+        # One output line per stored result, listing its cycles across the sparsity levels.
+        for column in itertools.zip_longest(*rows, fillvalue="NA"):
+            print(" ".join(column))
+    else:
+        print(f"Log file {args.log_file} not found.")
diff --git a/sparsity/run.sh b/sparsity/run.sh
new file mode 100755
index 00000000..5644b768
--- /dev/null
+++ b/sparsity/run.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Sweep tests/test_sparsity.py over several sparsity levels for each backend config.
+# Each run's stdout is written to <output dir>/<sparsity>.
+export TORCHSIM_DUMP_PATH=$(pwd)/result
+export BLOCK_SPARSE=1
+export TORCHSIM_FORCE_TIME_M=32
+
+CONFIG_DIR="/workspace/PyTorchSim/PyTorchSimBackend/configs"
+TEST_SCRIPT="/workspace/PyTorchSim/tests/test_sparsity.py"
+SPARSITIES="0.0 0.2 0.4 0.6 0.8"
+
+# run_sweep <output dir> <backend config file name>
+run_sweep() {
+    OUTPUT_DIR="$1"
+    export TORCHSIM_CONFIG="${CONFIG_DIR}/$2"
+    # The redirections below fail if the output directory does not exist yet.
+    mkdir -p "${OUTPUT_DIR}"
+    for sparsity in ${SPARSITIES}; do
+        python3 "${TEST_SCRIPT}" --sparsity "${sparsity}" > "${OUTPUT_DIR}/${sparsity}"
+    done
+}
+
+run_sweep "12GB"       "systolic_ws_8x8_c1_12G_simple_noc.json"
+run_sweep "24GB"       "systolic_ws_8x8_c1_24G_simple_noc.json"
+run_sweep "48GB"       "systolic_ws_8x8_c1_48G_simple_noc.json"
+run_sweep "12GB_2core" "systolic_ws_8x8_c2_12G_simple_noc.json"
+run_sweep "24GB_2core" "systolic_ws_8x8_c2_24G_simple_noc.json"
+run_sweep "48GB_2core" "systolic_ws_8x8_c2_48G_simple_noc.json"
diff --git a/tests/test_sparsity.py b/tests/test_sparsity.py
index c72dbb98..b3945520 100644
--- a/tests/test_sparsity.py
+++ b/tests/test_sparsity.py
@@ -83,7 +83,6 @@ def test_mlp_inf(device, batch_size=64, input_size=64, hidden_size=32, output_si
     test_result("MLP Forward", y, cpu_y)
 
 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser(description="Count zeros in tensors from command-line arguments.")
     parser.add_argument(
         "--sparsity",
@@ -102,4 +101,4 @@ def test_mlp_inf(device, batch_size=64, input_size=64, hidden_size=32, output_si
     device = module.custom_device()
 
     #test_dec_inf(device, sparsity=args.sparsity, block=args.block)
-    test_mlp_inf(device, batch_size=64, input_size=784, hidden_size=512, output_size=256, sparsity=args.sparsity, block=args.block)
+    test_mlp_inf(device, batch_size=32, input_size=784, hidden_size=512, output_size=256, sparsity=args.sparsity, block=args.block)
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
index b25012ed..73bbdbae 100644
--- a/tests/test_spmm_scheduler.py
+++ b/tests/test_spmm_scheduler.py
@@ -16,6 +16,7 @@
     parser.add_argument("--output_size", type=int, default=128, help="Output layer size")
     parser.add_argument("--w1_sparsity", type=float, default=0.5, help="Sparsity of first layer weights (0 to 1)")
     parser.add_argument("--w2_sparsity", type=float, default=0.5, help="Sparsity of second layer weights (0 to 1)")
+    parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.json", help="Backend config file name under PyTorchSimBackend/configs")
     args = parser.parse_args()
 
     batch_size = args.batch_size
@@ -24,6 +25,7 @@
     output_size = args.output_size
     w1_sparsity = args.w1_sparsity
     w2_sparsity = args.w2_sparsity
+    config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}"
 
     print("batch_size: ", batch_size)
     print("input_size: ", input_size)
@@ -35,7 +37,7 @@
     with torch.no_grad():
         # Init scheduler
         scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
-                            backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json")
+                            backend_config=config_path)
 
         target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval()
         target_model2 = model2(768, 12).eval()
@@ -52,7 +54,7 @@
 
         # Init request
         new_request1 = Request("mlp", [model_input1], [], request_queue_idx=0)
-        #new_request2 = Request("mlp", [model_input2], [], request_queue_idx=0)
+        #new_request2 = Request("bert", [model_input2], [], request_queue_idx=1)
 
 
         # Add request to scheduler

From 103fbe9a71524a192f3cf392d0621d16f70f9a6b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 24 Feb 2025 07:32:01 +0000
Subject: [PATCH 175/432] [Submodule] Update submodule version

---
 PyTorchSimBackend/extern/ramulator2 | 2 +-
 PyTorchSimBackend/extern/stonneCore | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/extern/ramulator2 b/PyTorchSimBackend/extern/ramulator2
index 00efad33..5f86be78 160000
--- a/PyTorchSimBackend/extern/ramulator2
+++ b/PyTorchSimBackend/extern/ramulator2
@@ -1 +1 @@
-Subproject commit 00efad33121408dcd3443465835649b120080395
+Subproject commit 5f86be78ad4cb8d56e44cdcf43285e1a68b73450
diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index 7f0a62c9..f26ed01b 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit 7f0a62c965ab6afb57a4ac47f1f9ebaa2e151b07
+Subproject commit f26ed01bab76f961837f559eb9c9459e57722927

From 31b9b273106d0c9fe5fa461a426b47f2de3c6009 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 24 Feb 2025 07:54:43 +0000
Subject: [PATCH 176/432] [Experiments] ResNet18 fix & core cycle

---
 experiments/resnet18.py |  4 ++--
 scripts/end2end.sh      | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index a27b005d..98e2597f 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -7,8 +7,8 @@
 import datetime
 
 def run_resnet(device, batch):
-    from torchvision.models import resnet
-    model = resnet._resnet(resnet.BasicBlock, [1, 1, 0, 0], weights=None, progress=False).eval()
+    from torchvision.models import resnet18
+    model = resnet18().eval()
     model.to(device, memory_format=torch.channels_last)
     input = torch.randn(batch, 3, 224, 224).to(device=device)
     x1 = input.to(device=device, memory_format=torch.channels_last)
diff --git a/scripts/end2end.sh b/scripts/end2end.sh
index 81095bd5..13755867 100755
--- a/scripts/end2end.sh
+++ b/scripts/end2end.sh
@@ -5,7 +5,7 @@ BASE_PATH=$1 # Input as the first argument
 
 # Initialize the total cycle sum
 total_sum=0
-
+total_core=0
 # Find all backendsim_result folders
 mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result")
 
@@ -22,14 +22,29 @@ for backend_folder in "${backend_folders[@]}"; do
     # Extract the last line containing "Total cycle"
     total_cycle=$(grep "Total cycle" "$file" | tail -n 1 | sed -E 's/.*Total cycle ([0-9]+).*/\1/')
     # echo "total_cycle: $total_cycle"
+    active_cycles=($(grep -o 'active cycle [0-9]*' "$file" | awk '{print $3}'))
+    num_cycles=${#active_cycles[@]}
+    if [ "$num_cycles" -ge 3 ]; then
+        core_cycle=${active_cycles[$((num_cycles-3))]}
+    else
+        # Clear the value so the previous file's cycle is not added again below
+        core_cycle=""
+        echo "Error: cannot find core active cycle"
+    fi
 
     if [[ -n "$total_cycle" ]]; then
       # Add the total cycle to the total sum
       # echo "Adding $total_cycle to total_sum"
       total_sum=$((total_sum + total_cycle))
     fi
+    if [[ -n "$core_cycle" ]]; then
+      # Add the core active cycle to the core sum
+      # echo "Adding $core_cycle to total_core"
+      total_core=$((total_core + core_cycle))
+    fi
   done
 done
 
 # Print the total cycle sum
-echo "total end2end cycle: $total_sum"
\ No newline at end of file
+echo "total end2end cycle: $total_sum"
+echo "total core cycle: $total_core"
\ No newline at end of file

From 24130f761b1e4d1424caa06dbadd80bb217d7994 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 24 Feb 2025 11:33:11 +0000
Subject: [PATCH 177/432] [Submodule] Update stonne module path

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index f65e5f2b..34e143d0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -15,4 +15,4 @@
 	url = https://github.com/PSAL-POSTECH/ramulator2
 [submodule "PyTorchSimBackend/extern/stonneCore"]
 	path = PyTorchSimBackend/extern/stonneCore
-	url = https://github.com/PSAL-POSTECH/stonne_core.git
+	url = git@github.com:PSAL-POSTECH/stonne_core.git

From 01d83a1c8f405f7e7bd9ce272999db3d5462ed69 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 24 Feb 2025 11:42:06 +0000
Subject: [PATCH 178/432] Revert "[Submodule] Update stonne module path"

This reverts commit 24130f761b1e4d1424caa06dbadd80bb217d7994.
---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 34e143d0..f65e5f2b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -15,4 +15,4 @@
 	url = https://github.com/PSAL-POSTECH/ramulator2
 [submodule "PyTorchSimBackend/extern/stonneCore"]
 	path = PyTorchSimBackend/extern/stonneCore
-	url = git@github.com:PSAL-POSTECH/stonne_core.git
+	url = https://github.com/PSAL-POSTECH/stonne_core.git

From 80a3a8bd5c3248c620fe9799da1951121c0ff016 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 27 Feb 2025 07:01:04 +0000
Subject: [PATCH 179/432] [CI] Rename relu test to activation

---
 .github/workflows/docker-image.yml | 12 ++++++------
 .github/workflows/pull-request.yml | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 2b420ff8..0935e36a 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -80,18 +80,18 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_add.py
 
-  test_relu:
-    name: Run test_relu.py
+  test_activation:
+    name: Run test_activation.py
     runs-on: self-hosted
     needs: build
     steps:
-      - name: Run test_relu.py
+      - name: Run test_activation.py
         run: |
-          echo "Running test_relu.py"
+          echo "Running test_activation.py"
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
-            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_relu.py
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_activation.py
 
   test_batchnorm:
     name: Run test_batchnorm.py
@@ -361,7 +361,7 @@ jobs:
             test_matmul, test_reduce, test_softmax,
             test_transpose2D, test_view3D_2D, test_layernorm,
             test_mlp, test_resnet, test_transformer, test_transpose3D,
-            test_sparsity, test_relu, test_pool, test_perceptron,
+            test_sparsity, test_activation, test_pool, test_perceptron,
             test_fusion, test_moe]
     steps:
       - name: Checkout code
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 003a0d01..92703c64 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -80,18 +80,18 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_add.py
 
-  test_relu:
-    name: Run test_relu.py
+  test_activation:
+    name: Run test_activation.py
     runs-on: self-hosted
     needs: build
     steps:
-      - name: Run test_relu.py
+      - name: Run test_activation.py
         run: |
-          echo "Running test_relu.py"
+          echo "Running test_activation.py"
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
-            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_relu.py
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_activation.py
 
   test_batchnorm:
     name: Run test_batchnorm.py
@@ -361,7 +361,7 @@ jobs:
             test_matmul, test_reduce, test_softmax,
             test_transpose2D, test_view3D_2D, test_layernorm,
             test_mlp, test_resnet, test_transformer, test_transpose3D,
-            test_sparsity, test_relu, test_pool, test_perceptron,
+            test_sparsity, test_activation, test_pool, test_perceptron,
             test_fusion, test_moe]
     steps:
       - name: Checkout code

From 2d52244b7403a392dd0d307a534422ad7e88e879 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 27 Feb 2025 07:10:58 +0000
Subject: [PATCH 180/432] [Frontend] Allow squeezed case for gemm template

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 95e5bcc5..cccc48a7 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -86,6 +86,9 @@ def is_transposed(self, node):
             if node.layout.stride != node.data.layout.stride:
                 if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
                     return True
+                elif len(node.layout.stride) < len(node.data.layout.stride) and node.layout.stride == node.data.layout.stride[-len(node.layout.stride):]:
+                    # Squeezed case
+                    return False
                 else:
                     raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
         return False

From 735353acc7c5bcaa09518adf3793ce2d5ca44e74 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 28 Feb 2025 10:18:23 +0000
Subject: [PATCH 181/432] [Frontend/Fusion] Separate fusion cases

Currently, we support template + activation fusion. However, we found
that there are new fusion opportunities, namely the fusion of
elementwise operations. To support them, we have to manage the buffers
that are affected by a fusion. There are two types of fusion: vertical
and horizontal. In the case of vertical fusion, intermediate buffers
are removed. For horizontal fusion, we avoid loading the same buffer
multiple times.

In addition, set proper tile info for scalar operations.
---
 PyTorchSimFrontend/mlir/mlir_common.py     | 14 +++++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 54 +++++++++++++---------
 2 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index af38612c..6b7a13ce 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -408,6 +408,14 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
                 tile_size[-i] = target_range
                 if remains:
                     tile_size[-i] += vlane_stride - remains
+        # Handle scalar case
+        if len(tile_size)==1 and tile_size[0] == 1:
+            vlane_stride = 1
+            tile_size[0] = 1
+        # Adjust tile size
+        used_vlane = min((tile_size[len(vars) - 1] + vlane_stride - 1) // vlane_stride, self.vector_lane)
+        padded_size = used_vlane * vlane_stride
+        tile_size[len(vars) - 1] = ((tile_size[len(vars) - 1] + padded_size - 1) // padded_size) * padded_size
 
         # Select tile info.
         # Note: Kernel Group have to share same tile desc for fusion
@@ -563,7 +571,11 @@ def load(name: str, index: sympy.Expr):
                 store_cache = self.cse.store_cache
                 if name in store_cache:
                     return store_cache[name]
-                return self.load(name, index)
+                key = name+str(index)
+                if key not in self.cse.cache:
+                    result = self.load(name, index)
+                    self.cse.cache[key] = result
+                return self.cse.cache[key]
 
             @staticmethod
             def store(name, index, value, mode=None):
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 87a05302..9996efa0 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -25,7 +25,7 @@ def _set_flush_status(self, status: bool):
         self._ready_to_flush = status
 
     def can_fuse_vertical(self, node1, node2):
-        return self.can_fuse_horizontal(node1, node2) and not node1.is_reduction()
+        return self.can_fuse_horizontal(node1, node2)
 
     def can_fuse_horizontal(self, node1, node2):
         _, (vars1, reduce1) = node1.group
@@ -35,36 +35,48 @@ def can_fuse_horizontal(self, node1, node2):
         if node1.is_reduction() or node2.is_reduction():
             return False
 
-        # Convolution is currently not supported
-        if not isinstance(node1, FusedSchedulerNode) and node1.node.origin_node is not None and hasattr(node1.node.origin_node.target, "_name") and node1.node.origin_node.target._name == 'aten::convolution':
-            return False
+        # Can't fuse two template node
+        nr_template = 0
+        for node in node1.get_nodes() + node2.get_nodes():
+            if node.is_template():
+                nr_template += 1
 
-        if not isinstance(node2, FusedSchedulerNode) and node2.node.origin_node is not None and hasattr(node2.node.origin_node.target, "_name") and node2.node.origin_node.target._name == 'aten::convolution':
+        if nr_template > 1:
             return False
 
-        if not isinstance(node1, FusedSchedulerNode) and not isinstance(node2, FusedSchedulerNode):
+        # Check template node fusion
+        if node1.is_template() or node2.is_template():
             # Different layout is not supported
-            if node1.node.layout.dtype != node2.node.layout.dtype:
+            if node1.get_nodes()[0].node.layout.dtype != node2.get_nodes()[0].node.layout.dtype:
                 return False
 
-            # Different size is not supported for non-template node
-            if  not node1.is_template() and (node1._sizes[0] != node2._sizes[0]):
+            # Convolution is currently not supported
+            if node1.is_template() and node1.get_nodes()[0].node.origin_node is not None and hasattr(node1.get_nodes()[0].node.origin_node.target, "_name") and node1.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
                 return False
 
-        if vars1 == vars2 and reduce1 == reduce2:
-            return True
-        if reduce1 == () and vars1 == vars2 + reduce2:
-            return True
+            if node2.is_template() and node2.get_nodes()[0].node.origin_node is not None and hasattr(node2.get_nodes()[0].node.origin_node.target, "_name") and node2.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
+                return False
 
-        #TODO: Temporary solution determining the fusion condition similar to CPP/OpenMP
-        v1_total = math.prod(vars1) if len(vars1) else 0
-        v2_total = math.prod(vars2) if len(vars2) else 0
-        r1_total = math.prod(reduce1) if len(reduce1) else 0
-        r2_total = math.prod(reduce2) if len(reduce2) else 0
-        if reduce1 == () \
-            and v1_total == (v2_total + r2_total):
-            return True
+            v1_total = math.prod(vars1) if len(vars1) else 0
+            v2_total = math.prod(vars2) if len(vars2) else 0
+            if v1_total != v2_total:
+                return False
 
+            has_depedency = False
+            template_node = node1 if node1.is_template() else node2
+            act_node = node2 if node1.is_template() else node1
+            for write_buf in template_node.read_writes.writes:
+                has_depedency = has_depedency or (write_buf in act_node.read_writes.reads)
+            return has_depedency
+        return False
+        # TODO. Support elementwise fusion
+        # Check elementwise fusion
+        if vars1 == vars2 and reduce1 == reduce2:
+            vertical_buf = (node1.read_writes.writes & node2.read_writes.reads) | (node2.read_writes.writes & node1.read_writes.reads)
+            for vbuf in vertical_buf:
+                # FIXME. Assume that all the users are fusioned.
+                V.graph.removed_buffers.add(vbuf.name)
+            return True
         return False
 
     def group_fn(self, sizes):

From 256a4dc408bd50c63bb4754ddc23102c38cb0a28 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 5 Mar 2025 03:49:16 +0000
Subject: [PATCH 182/432] [Fix] GEMM minimum tile size

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 4 ++--
 PyTorchSimFrontend/mlir/mlir_template.py      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index cccc48a7..dbe1c4b9 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -115,8 +115,8 @@ def render(self,
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
             template = GEMM_TEMPLATE
         TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)
-        TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_M)
-        TILE_K = min(extension_config.CONFIG_FORCE_TILE_K, TILE_M)
+        TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
+        TILE_K = min(extension_config.CONFIG_FORCE_TILE_K, TILE_K)
         TOG_latency = M if TILE_M > M else TILE_M
         kernel.loop_size =[TOG_latency, TILE_N, TILE_K]
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 49cc6c56..53287e43 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -120,7 +120,7 @@ def gemm_combination_mapping(self, M, N, K):
         max_spad_size = spad_size // 2 # double buffer
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
-        k_pad_factor = self.vector_lane if K > self.vector_lane else 1
+        k_pad_factor = self.vector_lane if K > self.vector_lane else 8
         K = max(K, 8)
         M_padded = ((M + m_pad_factor - 1) // m_pad_factor) * m_pad_factor
         N_padded = ((N + n_pad_factor - 1) // n_pad_factor) * n_pad_factor

From 78083a12cf3f0fc457d02cae0efbed977f8b1cee Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 5 Mar 2025 05:55:42 +0000
Subject: [PATCH 183/432] [Frontend] erf & tanh lowering are allowed

erf and tanh are handled by the SFU.

Note: spike, gem5, and llvm-project must also be pulled for this change.
---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 gem5_script/script_systolic.py                  | 2 ++
 tests/test_activation.py                        | 4 ++--
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 47b823e4..1362ddfb 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -278,7 +278,7 @@ def erf(x, *args, var_info=None, **kwargs):
         tile_size = op_type[0]
         dtype = op_type[1]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.erf %{x} : {shape}', [tile_size, dtype] # TODO: erf lowering pass is not implemented
+        return f'math.erf %{x} : {shape}', [tile_size, dtype]
 
     @staticmethod
     def tanh(operand, *args, var_info=None, **kwargs):
diff --git a/gem5_script/script_systolic.py b/gem5_script/script_systolic.py
index 89052f27..83c2ab39 100644
--- a/gem5_script/script_systolic.py
+++ b/gem5_script/script_systolic.py
@@ -56,6 +56,8 @@ class SparseAccelerator(MinorFU):
 class SpecialFunctionUnit(MinorFU):
     opClasses = minorMakeOpClassSet([
         "CustomMatMulvexp",
+        "CustomMatMulverf",
+        "CustomMatMulvtanh",
         ])
     opLat = 10
 
diff --git a/tests/test_activation.py b/tests/test_activation.py
index 97b77cac..86052828 100644
--- a/tests/test_activation.py
+++ b/tests/test_activation.py
@@ -87,5 +87,5 @@ def test_SwiGLU(device, size=(128, 128)):
     test_sigmoid(device, (128, 128))
     test_SiLU(device, (128, 128))
     test_SwiGLU(device, (128, 128))
-    # test_GeLU(device, (128, 128))
-    # test_GeLU(device, (128, 128), approximate='tanh')
+    test_GeLU(device, (128, 128))
+    test_GeLU(device, (128, 128), approximate='tanh')

From 4c6d96d8400c862cd27b36d4939a09189773578e Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 5 Mar 2025 13:03:35 +0000
Subject: [PATCH 184/432] [Fix] mapping depend on fusion node

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  3 ++-
 PyTorchSimFrontend/mlir/mlir_template.py      | 14 ++++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index dbe1c4b9..f00c77d7 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -108,11 +108,12 @@ def render(self,
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
         M, N, K = X.get_size()[0], W.get_size()[1], X.get_size()[1]
+        n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
         if (M == 0) or (N == 0) or (K == 0):
             TILE_M, TILE_N, TILE_K = 0, 0, 0
             template = EMPTY_TEMPLATE
         else:
-            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node)
             template = GEMM_TEMPLATE
         TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)
         TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 53287e43..0b739102 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -115,9 +115,11 @@ def gemmini_gemm_mapping(self, M, N, K):
 
         return inner_I, inner_J, inner_K
 
-    def gemm_combination_mapping(self, M, N, K):
-        spad_size = self.spad_info["spad_size"] * self.vector_lane
+    def gemm_combination_mapping(self, M, N, K, n_extra_node=0):
+        spad_size_per_lane = self.spad_info["spad_size"]
+        spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
+        max_spad_per_lane = spad_size_per_lane // 2 # double buffer
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
         k_pad_factor = self.vector_lane if K > self.vector_lane else 8
@@ -139,8 +141,12 @@ def gemm_combination_mapping(self, M, N, K):
                 tile_N = j * self.vector_lane if N > self.vector_lane else N_padded
                 for i in tile_M_range:
                     tile_M = i * self.vector_lane if M > self.vector_lane else M_padded
-                    used_spad_size = (tile_M * tile_K + tile_K * tile_N + tile_M * tile_N) * self.precision
-                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and maximize_i_j <= tile_M * tile_N:
+                    used_spad_size = (tile_M * tile_K + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * self.precision
+                    weight_size_per_lane = self.get_spad_size_per_lane(tile_K, tile_N)
+                    input_size_per_lane = self.get_spad_size_per_lane(tile_M, tile_K)
+                    output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N)
+                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and maximize_i_j <= tile_M * tile_N:
                         max_used_spad_size = used_spad_size
                         maximize_i_j = tile_M * tile_N
                         mapping = (tile_M, tile_N, tile_K)

From 7f7cbc8ece8a6e4dcb1aa1ebae6b9a31f0103125 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 5 Mar 2025 13:34:06 +0000
Subject: [PATCH 185/432] [Fusion] DeferredLine support for store codegen

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 8 ++++----
 PyTorchSimFrontend/mlir/mlir_common.py          | 2 ++
 PyTorchSimFrontend/mlir/mlir_scheduling.py      | 7 +------
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 47b823e4..aa8678af 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -851,12 +851,12 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
 
         line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
-        self.cse.generate(self.stores, line, assignment = False)
+        self.stores.writeline(common.DeferredLine(name, line))
 
         # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
-        self.cse.generate(self.stores, code, assignment = False)
+        self.stores.writeline(common.DeferredLine(name, code))
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
         argmax_or_argmin = reduction_type in {"argmax", "argmin"}
@@ -1012,14 +1012,14 @@ def store_reduction(self, name, index, value):
             shape = f", {shape}" if self.buffer_types[name][1] > 1 else ""
 
         line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
-        self.cse.generate(self.reductions_suffix, line, assignment = False)
+        self.reductions_suffix.writeline(common.DeferredLine(name, line))
 
         # MVOUT Encoding
 
         # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
-        self.cse.generate(self.reductions_suffix, code, assignment = False)
+        self.reductions_suffix.writeline(common.DeferredLine(name, code))
 
     def codegen_global_init(self):
         return self.global_vars
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 6b7a13ce..cd558f1c 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -439,6 +439,8 @@ def codegen_nodes(self, nodes, kernel_name):
         with self as kernel:
             for node in nodes:
                 node.run(vars, reduction_vars)
+        V.graph.removed_buffers |= self.removed_buffers
+        # V.graph.inplaced_to_remove |= self.inplaced_to_remove
         src_code = self.codegen_kernel(kernel_name=kernel_name)
         self.meta_kernel()
         return src_code
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 9996efa0..593257d4 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -68,14 +68,9 @@ def can_fuse_horizontal(self, node1, node2):
             for write_buf in template_node.read_writes.writes:
                 has_depedency = has_depedency or (write_buf in act_node.read_writes.reads)
             return has_depedency
-        return False
-        # TODO. Support elementwise fusion
+
         # Check elementwise fusion
         if vars1 == vars2 and reduce1 == reduce2:
-            vertical_buf = (node1.read_writes.writes & node2.read_writes.reads) | (node2.read_writes.writes & node1.read_writes.reads)
-            for vbuf in vertical_buf:
-                # FIXME. Assume that all the users are fusioned.
-                V.graph.removed_buffers.add(vbuf.name)
             return True
         return False
 

From 616b30bad72acd91b3d53b4a20c56c12edd3de45 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 5 Mar 2025 13:51:28 +0000
Subject: [PATCH 186/432] [Backend/Stonne] Add trace primitive for stonne core

---
 PyTorchSimBackend/extern/stonneCore    |  2 +-
 PyTorchSimBackend/include/SparseCore.h |  5 +++++
 PyTorchSimBackend/src/SparseCore.cc    | 19 ++++++++++++++++++-
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index f26ed01b..f0a6e0fd 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit f26ed01bab76f961837f559eb9c9459e57722927
+Subproject commit f0a6e0fdb742a63b3221a032a0a2b97789a6b5ef
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index 8fc60804..d2a2ba6b 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -20,12 +20,17 @@ class SparseCore : public Core {
   void print_stats() override;
   void print_current_stats() override;
   std::shared_ptr<Tile> pop_finished_tile() override;
+  uint32_t num_ms = 1;
   uint32_t r_port_nr = 1;
   uint32_t w_port_nr = 1;
   uint32_t nr_cores = 1;
 private:
   uint32_t rr_idx = 0;
   std::vector<bool> coreBusy;
+  std::vector<int> traceCoreStatus;
+  std::vector<int> traceCoreCycle;
+  std::set<uint64_t> traceLoadTraffic; // To trace dma traffic
+  std::set<uint64_t> traceStoreTraffic; // To trace dma traffic
   std::vector<SST_STONNE::sstStonne*> stonneCores;
   std::vector<std::vector<std::shared_ptr<Tile>>> percore_tiles;
   /* Interconnect queue */
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 3891d9dd..1c42a879 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -4,6 +4,8 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
   /* Init stonne cores*/
   nr_cores = config.num_stonne_per_core;
   coreBusy.resize(nr_cores);
+  traceCoreStatus.resize(nr_cores);
+  traceCoreCycle.resize(nr_cores);
   percore_tiles.resize(nr_cores);
   stonneCores.resize(nr_cores);
   for (int i=0; i<nr_cores; i++) {
@@ -11,12 +13,14 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
     stonneCores.at(i) = core;
     stonneCores.at(i)->init(1);
     coreBusy.at(i) = false;
+    traceCoreStatus.at(i) = 0;
+    traceCoreCycle.at(i) = 0;
     percore_tiles.at(i) = std::vector<std::shared_ptr<Tile>>();
   }
 
   Config stonneConfig = stonneCores.at(0)->getStonneConfig();
   unsigned int core_freq = config.core_freq; // MHz;
-  unsigned int num_ms = stonneConfig.m_MSNetworkCfg.ms_size;
+  num_ms = stonneConfig.m_MSNetworkCfg.ms_size;
   r_port_nr = config.num_stonne_port;
   w_port_nr = config.num_stonne_port;
 
@@ -85,6 +89,17 @@ void SparseCore::cycle() {
   uint32_t stonne_core_id = 0;
   for (auto& stonneCore : stonneCores) {
     stonneCore->cycle();
+    int new_status = stonneCore->getMCFSMStats();
+    int compute_cycle = stonneCore->getMSStats().n_multiplications;
+    if (traceCoreStatus.at(stonne_core_id) != new_status) {
+      spdlog::trace("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}",
+        _id, _core_cycle, traceCoreStatus.at(stonne_core_id), new_status,
+        traceLoadTraffic.size(), traceStoreTraffic.size(), (compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms);
+      traceCoreStatus.at(stonne_core_id) = new_status;
+      traceCoreCycle.at(stonne_core_id) = compute_cycle;
+      traceLoadTraffic.clear();
+      traceStoreTraffic.clear();
+    }
 
     /* Send Memory Request */
     while (SimpleMem::Request* req = stonneCore->popRequest()) {
@@ -96,10 +111,12 @@ void SparseCore::cycle() {
         case SimpleMem::Request::Read:
           acc_type = mem_access_type::GLOBAL_ACC_R;
           type = mf_type::READ_REQUEST;
+          traceLoadTraffic.insert(target_addr);
           break;
         case SimpleMem::Request::Write:
           acc_type = mem_access_type::GLOBAL_ACC_W;
           type = mf_type::WRITE_REQUEST;
+          traceStoreTraffic.insert(target_addr);
           break;
         default:
           spdlog::error("[SparseCore] Invalid request type from core");

From fd58ac397d9140a11d934797a2addcee0d667d4b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 5 Mar 2025 13:53:47 +0000
Subject: [PATCH 187/432] [Frontend] Support eager/graph mode for sparse
 external call

---
 PyTorchSimFrontend/extension_op.py            | 213 ++++--------------
 .../mlir/mlir_codegen_backend.py              |   2 +-
 2 files changed, 42 insertions(+), 173 deletions(-)

diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 4114bce1..9f74c53c 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -18,26 +18,27 @@ class MLIRExternKernelChoice(ExternKernelChoice):
     def call_name(self):
         is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
         if is_dryrun:
-            return f"yield from flexagon_frontend"
+            return f"yield from sparse_mm_dummy_stonne_outer"
         return f"torch.ops.extension_op.{self.name}"
 
 custom_lib = torch.library.Library("extension_op", "DEF")
 
-def generate_outer_product_matrix(a, b, M, K, N, prefix):
+def calculate_sparsity(tensor):
+    total_elements = tensor.numel()
+    zero_elements = torch.sum(tensor.cpu() == 0)
+    sparsity_ratio = zero_elements / total_elements * 100
+    return math.ceil(sparsity_ratio.item())
+
+def generate_outer_product_matrix(a, b, M, K, N, prefix, dir_path):
     # Generating matrix A
     data_width = 4
     a_cpu = a.cpu()
     b_cpu = b.cpu()
-    value_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_mem.ini')
-    rowA_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_rowpointerA.in')
-    colA_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_colpointerA.in')
-    rowB_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_rowpointerB.in')
-    colB_pointer = os.path.join(extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_colpointerB.in')
+    value_pointer = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_mem.ini')
+    rowA_pointer = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_rowpointerA.in')
+    colA_pointer = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_colpointerA.in')
+    rowB_pointer = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_rowpointerB.in')
+    colB_pointer = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_colpointerB.in')
 
     with open(value_pointer, "w") as fd, open(rowA_pointer, "w") as rpA, open(colA_pointer, "w") as cpA, open(rowB_pointer, "w") as rpB, open(colB_pointer, "w") as cpB:
         #generating matrixA
@@ -150,16 +151,8 @@ def generate_inner_product_matrix(a, b, M, K, N, file_name, in_file_bitmap_a, in
     print("Offset matrix C: "+str(address_matrix_c))
     return address_matrix_a, matrixA_size, matrixA_size+matrixB_size
 
-def flexagon_frontend(a, b, out):
-    M = a.shape[0]
-    N = b.shape[1]
-    K = b.shape[0]
-
-    def calculate_sparsity(tensor):
-        total_elements = tensor.numel()
-        zero_elements = torch.sum(tensor.cpu() == 0)
-        sparsity_ratio = zero_elements / total_elements * 100
-        return math.ceil(sparsity_ratio.item())
+def prepare_outer_product_matrix(a, b, out):
+    M, K, N = a.shape[0], b.shape[0], b.shape[1]
 
     prefix = datetime.now().strftime("%m%d%H%M%S%f")
     w_sparsity = calculate_sparsity(a)
@@ -175,25 +168,14 @@ def calculate_sparsity(tensor):
         'PyTorchSimBackend/extern/stonneCore/tests/outerproduct'
     )
     os.makedirs(dir_path, exist_ok=True)
-
-    value_path = os.path.join(
-        extension_config.CONFIG_TORCHSIM_DIR,
-        f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}outerproduct_gemm_mem.ini'
-    )
-
-    if os.path.exists(value_path):
-        os.remove(value_path)
-        print(f"Deleted: {value_path}")
-    else:
-        print(f"File does not exist: {value_path}")
-
-    dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N, prefix)
-    mem_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_mem.ini')
-    a_row_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_rowpointerA.in')
-    a_col_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_colpointerA.in')
-    b_row_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_rowpointerB.in')
-    b_col_init = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_outerproduct_gemm_colpointerB.in')
-    c_result = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, f'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/{prefix}_result.out')
+    mem_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_mem.ini')
+    dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N, prefix, dir_path)
+    value_path = os.path.join(dir_path, f'{prefix}outerproduct_gemm_mem.ini')
+    a_row_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_rowpointerA.in')
+    a_col_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_colpointerA.in')
+    b_row_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_rowpointerB.in')
+    b_col_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_colpointerB.in')
+    c_result = os.path.join(dir_path, f'{prefix}_result.out')
     graph = {
         0: {
             "node_id": 0,
@@ -261,150 +243,37 @@ def calculate_sparsity(tensor):
         vector_lane=0,
         stonneGraph=True
     )
-
     onnx_path = os.path.join(write_path, "tile_graph.onnx")
     attribute_path = os.path.join(write_path, "attributes")
-    is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
-    if is_dryrun:
-        out.copy_(torch.matmul(a.cpu(), b.cpu()))
-        yield (onnx_path, attribute_path)
-        return
+    return onnx_path, attribute_path, c_result
+
+
+def sparse_mm_stonne_outer(a, b, out):
+    onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out)
 
-    #attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")
     backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
-    stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json'
+    stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json'
     backsim = BackendSimulator(backend_path, stonne_config_path)
     result_path = backsim.simulation(onnx_path)
-    result = BackendSimulator.get_result_from_file(result_path)
+    BackendSimulator.get_result_from_file(result_path)
 
     # Load result data
-    with open(c_result, 'rb') as f:
+    with open(c_result_path, 'rb') as f:
         np_array = np.fromfile(f, dtype=TORCH_TO_NUMPY[out.dtype])
         src_tensor = torch.as_strided(torch.from_numpy(np_array), out.size(), out.stride())
         out.copy_(src_tensor.to(dtype=out.dtype))
 
-def flexagon_frontend2(a, b, out):
-    M = a.shape[0]
-    N = b.shape[1]
-    K = b.shape[0]
-
-    def calculate_sparsity(tensor):
-        total_elements = tensor.numel()
-        zero_elements = torch.sum(tensor.cpu() == 0)
-        sparsity_ratio = zero_elements / total_elements * 100
-        return math.ceil(sparsity_ratio.item())
-
-    prefix = ""# datetime.now().strftime("%d_%H_%M_%f")
-    w_sparsity = calculate_sparsity(a)
-    x_sparsity = calculate_sparsity(b)
-    print(f"A Sparsity: {w_sparsity}")
-    print(f"B Sparsity: {x_sparsity}")
-    assert(x_sparsity >= 0 and x_sparsity < 100)
-    assert(w_sparsity >= 0 and w_sparsity < 100)
-    target_path = 'PyTorchSimBackend/extern/stonneCore/tests/innerproduct'
-    # Generating inputs
-    dir_path = os.path.join(
-        extension_config.CONFIG_TORCHSIM_DIR,
-        target_path
-    )
-    os.makedirs(dir_path, exist_ok=True)
-
-    file_name = os.path.join(
-        extension_config.CONFIG_TORCHSIM_DIR,
-        f'{dir_path}/{prefix}_bitmapSpMSpM_gemm_mem.ini'
-    )
-
-    in_file_bitmap_a = f"{dir_path}/{prefix}_bitmapSpMSpM_file_bitmapA_"+str(M)+"_"+str(N)+"_"+str(K)+".in"
-    in_file_bitmap_b = f"{dir_path}/{prefix}_bitmapSpMSpM_file_bitmapB_"+str(M)+"_"+str(N)+"_"+str(K)+".in"
-    c_result = f'{dir_path}/{prefix}_result.out'
-    dram_a_address, dram_b_address, dram_c_address = generate_inner_product_matrix(a, b, M, N, K, file_name, in_file_bitmap_a, in_file_bitmap_b)
-    dram_a_address = a.data_ptr()
-    dram_b_address = b.data_ptr()
-    dram_c_address = out.data_ptr()
-    graph = {
-        0: {
-            "node_id": 0,
-            "node_name": "root",
-            "node_type": 0,
-            "parents": [],
-            "children": [1]
-        },
-        1: {
-            "node_id": 1,
-            "node_name": "loopNode",
-            "node_type": 2,
-            "parents": [0],
-            "children": [2],
-            "loop_index": "loop_arg000",
-            "loop_start": 0,
-            "loop_end": 64,
-            "loop_step": 1,
-            "loop_type": "outer_loop"
-        },
-        2: {
-            "node_id": 2,
-            "node_name": "stonneNode",
-            "node_type": 5,
-            "parents": [1],
-            "children": [],
-            # Operation Type
-            "stonne_operation": "bitmapSpMSpM",
-
-            # GEMM Parameters
-            "stonne_GEMM_K": K,
-            "stonne_GEMM_N": N,
-            "stonne_GEMM_M": M,
-
-            # Memory Initialization & File Paths
-            "stonne_mem_init": file_name,
-            "stonne_mem_matrix_c_file_name": c_result,
-
-            # Memory Addresses
-            "stonne_matrix_a_dram_address": dram_a_address,
-            "stonne_matrix_b_dram_address": dram_b_address,
-            "stonne_matrix_c_dram_address": dram_c_address,
-
-            # CSR & Bitmap Initialization
-            "stonne_bitmap_matrix_a_init" : in_file_bitmap_a,
-            "stonne_bitmap_matrix_b_init" : in_file_bitmap_b,
-        }
-    }
-    source_code = "graph = " + str(graph)
-
-    write_path = get_write_path(source_code)
-    key, raw_tog_path = write(source_code, "py", specified_dir=write_path)
-    tile_graph_generator = tog_generator(["flexagon_matmul"])
-    tile_graph_generator.load_file(raw_tog_path)
-    tile_graph_generator.generate_tile_graph(
-        os.path.join(write_path, "tile_graph.onnx"),
-        cycle_list=[0],
-        x_offset=0,
-        w_offset=0,
-        vector_lane=0,
-        stonneGraph=True
-    )
-
-    onnx_path = os.path.join(write_path, "tile_graph.onnx")
-    attribute_path = os.path.join(write_path, "attributes")
-    is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
-    if is_dryrun:
-        out.copy_(torch.matmul(a.cpu(), b.cpu()))
-        yield (onnx_path, attribute_path)
-        return
-
-    #attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")
-    backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
-    stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_c1_simple_noc_tpuv3.json'
-    backsim = BackendSimulator(backend_path, stonne_config_path)
-    result_path = backsim.simulation(onnx_path)
-    result = BackendSimulator.get_result_from_file(result_path)
+def sparse_mm_dummy_stonne_outer(a, b, out):
+    onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out)
+    out.copy_(torch.matmul(a.cpu(), b.cpu()))
+    yield (onnx_path, attribute_path)
 
     # Load result data
-    with open(c_result, 'rb') as f:
-        np_array = np.fromfile(f, dtype=TORCH_TO_NUMPY[out.dtype])
-        src_tensor = torch.as_strided(torch.from_numpy(np_array), out.size(), out.stride())
-        out.copy_(src_tensor.to(dtype=out.dtype))
+    # with open(c_result_path, 'rb') as f:
+    #     np_array = np.fromfile(f, dtype=TORCH_TO_NUMPY[out.dtype])
+    #     src_tensor = torch.as_strided(torch.from_numpy(np_array), out.size(), out.stride())
+    #     out.copy_(src_tensor.to(dtype=out.dtype))
 
 custom_lib.define("_sparse_mm(Tensor a, Tensor b, Tensor out) -> Tensor")
-custom_lib.impl("_sparse_mm", flexagon_frontend, "PrivateUse1")
-custom_lib.impl("_sparse_mm", flexagon_frontend, "AutogradPrivateUse1")
\ No newline at end of file
+custom_lib.impl("_sparse_mm", sparse_mm_stonne_outer, "PrivateUse1")
+custom_lib.impl("_sparse_mm", sparse_mm_stonne_outer, "AutogradPrivateUse1")
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d651a480..2b70f4a0 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -91,7 +91,7 @@ def write_header(self):
 
                 from torch import device, empty, empty_strided
                 from {extension_codecache.__name__} import CustomAsyncCompile
-                from PyTorchSimFrontend.extension_op import flexagon_frontend
+                from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer
                 from torch._inductor.select_algorithm import extern_kernels
 
                 aten = torch.ops.aten

From 2e3759844e409c23cc03f59e12db86a584190a9e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 6 Mar 2025 05:41:36 +0000
Subject: [PATCH 188/432] [Backend/Stonne] Dump trace primitive for stonne core

---
 AsmParser/onnx_utility.py                   |   1 +
 PyTorchSimBackend/extern/stonneCore         |   2 +-
 PyTorchSimBackend/include/SparseCore.h      |  44 +++++-
 PyTorchSimBackend/include/TileGraphParser.h |   2 +
 PyTorchSimBackend/src/SparseCore.cc         |  69 ++++++++--
 PyTorchSimBackend/src/TileGraphParser.cc    |   1 +
 PyTorchSimFrontend/extension_op.py          | 142 ++++++++++----------
 7 files changed, 181 insertions(+), 80 deletions(-)

diff --git a/AsmParser/onnx_utility.py b/AsmParser/onnx_utility.py
index 19e5b5f8..35e4fa79 100644
--- a/AsmParser/onnx_utility.py
+++ b/AsmParser/onnx_utility.py
@@ -147,6 +147,7 @@ def __init__(self, tile_info, node_id=0):
         self.torchsim_stonne_colpointer_matrix_a_init = tile_info.get("stonne_colpointer_matrix_a_init", "")
         self.torchsim_stonne_rowpointer_matrix_b_init = tile_info.get("stonne_rowpointer_matrix_b_init", "")
         self.torchsim_stonne_colpointer_matrix_b_init = tile_info.get("stonne_colpointer_matrix_b_init", "")
+        self.torchsim_trace_path = tile_info.get("stonne_trace_path", "")
 
 def connect_nodes(parent, child):
     child.add_parent(parent)
diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index f0a6e0fd..b7475e0e 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit f0a6e0fdb742a63b3221a032a0a2b97789a6b5ef
+Subproject commit b7475e0eb98ad1d116e4ba8ebc1807b94961b30c
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index d2a2ba6b..f03045ac 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -5,6 +5,41 @@
 #include "SimpleMem.h"
 #include "Config.h"
 
+class TraceNode {
+private:
+  int node_id;
+  int node_type;
+  std::string node_name;
+  std::set<uint64_t> address_set;
+  int compute_cycle;
+
+public:
+  TraceNode(int id, std::string name, int type, int cycle = 0)
+      : node_id(id), node_name(name), node_type(type), compute_cycle(cycle) {}
+  void setAddress(std::set<uint64_t> addr_set) { address_set = addr_set; }
+  friend std::ostream& operator<<(std::ostream& os, const TraceNode& node) {
+    os << "  " << node.node_id << ": {\n"
+        << "    \"node_id\": " << node.node_id << ",\n"
+        << "    \"node_name\": " << std::quoted(node.node_name) << ",\n"
+        << "    \"node_type\": " << node.node_type << ",\n"
+        << "    \"parents\": [0],\n"
+        << "    \"trace_address\": [";
+
+    bool first = true;
+    for (uint64_t addr : node.address_set) {
+      if (!first)
+        os << ", ";
+      os << addr;
+      first = false;
+    }
+
+    os << "],\n"
+        << "    \"trace_compute_cycle\": " << node.compute_cycle << "\n"
+        << "  }";
+    return os;
+  }
+};
+
 class SparseCore : public Core {
 public:
   SparseCore(uint32_t id, SimulationConfig config);
@@ -20,6 +55,7 @@ class SparseCore : public Core {
   void print_stats() override;
   void print_current_stats() override;
   std::shared_ptr<Tile> pop_finished_tile() override;
+  void dumpTrace(int stonne_core_id, const std::string& path);
   uint32_t num_ms = 1;
   uint32_t r_port_nr = 1;
   uint32_t w_port_nr = 1;
@@ -29,12 +65,14 @@ class SparseCore : public Core {
   std::vector<bool> coreBusy;
   std::vector<int> traceCoreStatus;
   std::vector<int> traceCoreCycle;
-  std::set<uint64_t> traceLoadTraffic; // To trace dma traffic
-  std::set<uint64_t> traceStoreTraffic; // To trace dma traffic
+  std::vector<std::vector<TraceNode>> traceNodeList;
+  std::vector<std::set<uint64_t>> traceLoadTraffic; // To trace dma traffic
+  std::vector<std::set<uint64_t>> traceStoreTraffic; // To trace dma traffic
   std::vector<SST_STONNE::sstStonne*> stonneCores;
   std::vector<std::vector<std::shared_ptr<Tile>>> percore_tiles;
   /* Interconnect queue */
   std::queue<mem_fetch*> _request_queue;
   std::queue<mem_fetch*> _response_queue;
   std::map<std::tuple<uint64_t, mem_access_type, mf_type>, std::vector<SimpleMem::Request*>*> request_merge_table;
-};
\ No newline at end of file
+};
+
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index c9d16a6d..3ae418ec 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -288,6 +288,8 @@ class TileStonneNode : public TileNode {
           desc.bitmap_matrix_b_init = attribute.s();
       }  else if (attribute.name() == "torchsim_mem_matrix_c_file_name") {
           desc.mem_matrix_c_file_name = attribute.s();
+      }  else if (attribute.name() == "torchsim_trace_path") {
+          desc.trace_path = attribute.s();
       } else {
           spdlog::warn("[TileStonneNode] Unrecognized attribute: {}", attribute.name());
       }
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 1c42a879..eb7a7357 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -6,6 +6,9 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
   coreBusy.resize(nr_cores);
   traceCoreStatus.resize(nr_cores);
   traceCoreCycle.resize(nr_cores);
+  traceNodeList.resize(nr_cores);
+  traceLoadTraffic.resize(nr_cores);
+  traceStoreTraffic.resize(nr_cores);
   percore_tiles.resize(nr_cores);
   stonneCores.resize(nr_cores);
   for (int i=0; i<nr_cores; i++) {
@@ -67,6 +70,7 @@ void SparseCore::issue(std::shared_ptr<Tile> tile) {
   //SST_STONNE::sstStonne* core = new SST_STONNE::sstStonne(_config.stonne_config_path);
   //stonneCores.at(selected_core_idx) = core;
   stonneCores.at(selected_core_idx)->init(1);
+  traceNodeList.at(selected_core_idx).clear();
 
   spdlog::info("[StonneCore {}][{}] issued new tile", _id, selected_core_idx);
   SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(tile->get_custom_data());
@@ -92,13 +96,28 @@ void SparseCore::cycle() {
     int new_status = stonneCore->getMCFSMStats();
     int compute_cycle = stonneCore->getMSStats().n_multiplications;
     if (traceCoreStatus.at(stonne_core_id) != new_status) {
-      spdlog::trace("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}",
+      spdlog::info("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}",
         _id, _core_cycle, traceCoreStatus.at(stonne_core_id), new_status,
-        traceLoadTraffic.size(), traceStoreTraffic.size(), (compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms);
+        traceLoadTraffic.at(stonne_core_id).size(), traceStoreTraffic.at(stonne_core_id).size(), (compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms);
+      if (traceLoadTraffic.at(stonne_core_id).size()) {
+        TraceNode load_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "load", 1);
+        load_node.setAddress(traceLoadTraffic.at(stonne_core_id));
+        traceNodeList.at(stonne_core_id).push_back(load_node);
+      }
+      if ((compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms) {
+        TraceNode compute_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "compute", 0, (compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms);
+        traceNodeList.at(stonne_core_id).push_back(compute_node);
+      }
+      if (traceStoreTraffic.at(stonne_core_id).size()) {
+        TraceNode store_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "store", 0);
+        store_node.setAddress(traceStoreTraffic.at(stonne_core_id));
+        traceNodeList.at(stonne_core_id).push_back(store_node);
+      }
+
       traceCoreStatus.at(stonne_core_id) = new_status;
       traceCoreCycle.at(stonne_core_id) = compute_cycle;
-      traceLoadTraffic.clear();
-      traceStoreTraffic.clear();
+      traceLoadTraffic.at(stonne_core_id).clear();
+      traceStoreTraffic.at(stonne_core_id).clear();
     }
 
     /* Send Memory Request */
@@ -111,12 +130,12 @@ void SparseCore::cycle() {
         case SimpleMem::Request::Read:
           acc_type = mem_access_type::GLOBAL_ACC_R;
           type = mf_type::READ_REQUEST;
-          traceLoadTraffic.insert(target_addr);
+          traceLoadTraffic.at(stonne_core_id).insert(target_addr);
           break;
         case SimpleMem::Request::Write:
           acc_type = mem_access_type::GLOBAL_ACC_W;
           type = mf_type::WRITE_REQUEST;
-          traceStoreTraffic.insert(target_addr);
+          traceStoreTraffic.at(stonne_core_id).insert(target_addr);
           break;
         default:
           spdlog::error("[SparseCore] Invalid request type from core");
@@ -132,8 +151,11 @@ void SparseCore::cycle() {
 
     if (coreBusy.at(stonne_core_id) && stonneCore->isFinished()) {
       stonneCore->finish();
-
       std::shared_ptr<Tile> target_tile = percore_tiles.at(stonne_core_id).front();
+      SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(target_tile->get_custom_data());
+      if (opDesc->trace_path != "")
+        dumpTrace(stonne_core_id, opDesc->trace_path);
+
       target_tile->set_status(Tile::Status::FINISH);
       _finished_tiles.push(target_tile);
       percore_tiles.at(stonne_core_id).erase(percore_tiles.at(stonne_core_id).begin());
@@ -226,4 +248,35 @@ std::shared_ptr<Tile> SparseCore::pop_finished_tile() {
     _finished_tiles.pop();
   }
   return result;
-}
\ No newline at end of file
+}
+
+void SparseCore::dumpTrace(int stonne_core_id, const std::string& path) {
+  std::ofstream outFile(path);
+  if (!outFile) {
+    spdlog::error("[StonneCore] Failed to make trace dump file to \"{}\"", path);
+    return;
+  }
+  // Static nodes for the graph
+  outFile << "graph = {\n 0: {\n"
+          << "    \"node_id\": 0,\n"
+          << "    \"node_name\": \"root\",\n"
+          << "    \"node_type\": 0,\n"
+          << "    \"parents\": [],\n"
+          << "    \"children\": [1]\n"
+          << "  },\n"
+          << "  1: {\n"
+          << "    \"node_id\": 1,\n"
+          << "    \"node_name\": \"loopNode\",\n"
+          << "    \"node_type\": 2,\n"
+          << "    \"parents\": [0],\n"
+          << "    \"children\": [2]\n"
+          << "  },\n";
+
+  // Output traceNodeList
+  for (size_t i = 0; i < traceNodeList.at(stonne_core_id).size(); ++i) {
+      if (i != 0) outFile << ",\n";
+      outFile << traceNodeList.at(stonne_core_id)[i];
+  }
+  outFile << "\n}" << std::endl;
+  spdlog::info("[StonneCore] Success to save trace dump file to \"{}\"", path);
+}
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 3ed4e439..a75c1914 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -261,6 +261,7 @@ void TileStonneNode::print_node() {
   spdlog::debug("{} colpointer_matrix_a_init: {}", spaces, desc.colpointer_matrix_a_init);
   spdlog::debug("{} rowpointer_matrix_b_init: {}", spaces, desc.rowpointer_matrix_b_init);
   spdlog::debug("{} colpointer_matrix_b_init: {}", spaces, desc.colpointer_matrix_b_init);
+  spdlog::debug("{} trace_path: {}", spaces, desc.trace_path);
 }
 
 void TileMemoryWaitNode::print_node() {
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 9f74c53c..32519200 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -14,6 +14,35 @@
 from PyTorchSimFrontend import extension_config
 from Simulator.simulator import BackendSimulator, TORCH_TO_NUMPY
 
+graph_template = {
+    0: {
+        "node_id": 0,
+        "node_name": "root",
+        "node_type": 0,
+        "parents": [],
+        "children": [1]
+    },
+    1: {
+        "node_id": 1,
+        "node_name": "loopNode",
+        "node_type": 2,
+        "parents": [0],
+        "children": [2],
+        "loop_index": "loop_arg000",
+        "loop_start": 0,
+        "loop_end": 8,  # FIXME. this is a trick that generate multiple tile.
+        "loop_step": 1,
+        "loop_type": "outer_loop"
+    },
+    2: {
+        "node_id": 2,
+        "node_name": "stonneNode",
+        "node_type": 5,
+        "parents": [1],
+        "children": [],
+    }
+}
+
 class MLIRExternKernelChoice(ExternKernelChoice):
     def call_name(self):
         is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
@@ -162,76 +191,53 @@ def prepare_outer_product_matrix(a, b, out):
     assert(x_sparsity >= 0 and x_sparsity < 100)
     assert(w_sparsity >= 0 and w_sparsity < 100)
 
-    # Generating inputs
-    dir_path = os.path.join(
-        extension_config.CONFIG_TORCHSIM_DIR,
-        'PyTorchSimBackend/extern/stonneCore/tests/outerproduct'
-    )
-    os.makedirs(dir_path, exist_ok=True)
-    mem_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_mem.ini')
-    dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N, prefix, dir_path)
-    value_path = os.path.join(dir_path, f'{prefix}outerproduct_gemm_mem.ini')
-    a_row_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_rowpointerA.in')
-    a_col_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_colpointerA.in')
-    b_row_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_rowpointerB.in')
-    b_col_init = os.path.join(dir_path, f'{prefix}_outerproduct_gemm_colpointerB.in')
-    c_result = os.path.join(dir_path, f'{prefix}_result.out')
-    graph = {
-        0: {
-            "node_id": 0,
-            "node_name": "root",
-            "node_type": 0,
-            "parents": [],
-            "children": [1]
-        },
-        1: {
-            "node_id": 1,
-            "node_name": "loopNode",
-            "node_type": 2,
-            "parents": [0],
-            "children": [2],
-            "loop_index": "loop_arg000",
-            "loop_start": 0,
-            "loop_end": 8,  # FIXME. this is a trick that generate multiple tile.
-            "loop_step": 1,
-            "loop_type": "outer_loop"
-        },
-        2: {
-            "node_id": 2,
-            "node_name": "stonneNode",
-            "node_type": 5,
-            "parents": [1],
-            "children": [],
-            # Operation Type
-            "stonne_operation": "outerProductGEMM",
+    graph = dict(graph_template)
+    meta_data = {
+        # Operation Type
+        "stonne_operation": "outerProductGEMM",
+
+        # GEMM Parameters
+        "stonne_GEMM_K": K,
+        "stonne_GEMM_N": N,
+        "stonne_GEMM_M": M,
+        "a_hash" : hash(a.cpu().numpy().tobytes()),
+        "b_hash" : hash(b.cpu().numpy().tobytes()),
+    }
+    graph[2].update(meta_data)
+
+    # Create write path
+    write_path = get_write_path(str(graph))
+    os.makedirs(write_path, exist_ok=True)
 
-            # GEMM Parameters
-            "stonne_GEMM_K": K,
-            "stonne_GEMM_N": N,
-            "stonne_GEMM_M": M,
-            "stonne_GEMM_T_K": 4,	# Currently fixed
-            "stonne_GEMM_T_N": 1,	# Currently fixed
-            "stonne_GEMM_T_M": 1,
+    # Generating inputs
+    dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N, prefix, write_path)
+    mem_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_mem.ini')
+    a_row_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_rowpointerA.in')
+    a_col_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_colpointerA.in')
+    b_row_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_rowpointerB.in')
+    b_col_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_colpointerB.in')
+    c_result = os.path.join(write_path, f'{prefix}_result.out')
 
-            # Memory Initialization & File Paths
-            "stonne_mem_init": os.path.join(extension_config.CONFIG_TORCHSIM_DIR, 'PyTorchSimBackend/extern/stonneCore/tests/outerproduct/outerproduct_gemm_mem.ini'),
-            "stonne_mem_matrix_c_file_name": c_result,
+    meta_data = {
+        # Memory Initialization & File Paths
+        "stonne_mem_init": mem_init,
+        "stonne_mem_matrix_c_file_name": c_result,
 
-            # Memory Addresses
-            "stonne_matrix_a_dram_address": dram_a_address,
-            "stonne_matrix_b_dram_address": dram_b_address,
-            "stonne_matrix_c_dram_address": dram_c_address,
+        # Memory Addresses
+        "stonne_matrix_a_dram_address": dram_a_address,
+        "stonne_matrix_b_dram_address": dram_b_address,
+        "stonne_matrix_c_dram_address": dram_c_address,
 
-            # CSR & Bitmap Initialization
-            "stonne_rowpointer_matrix_a_init": a_row_init,
-            "stonne_colpointer_matrix_a_init": a_col_init,
-            "stonne_rowpointer_matrix_b_init": b_row_init,
-            "stonne_colpointer_matrix_b_init": b_col_init,
-        }
+        # CSR & Bitmap Initialization
+        "stonne_rowpointer_matrix_a_init": a_row_init,
+        "stonne_colpointer_matrix_a_init": a_col_init,
+        "stonne_rowpointer_matrix_b_init": b_row_init,
+        "stonne_colpointer_matrix_b_init": b_col_init,
     }
-    source_code = "graph = " + str(graph)
+    graph[2].update(meta_data)
 
-    write_path = get_write_path(source_code)
+    graph[2]["stonne_trace_path"] = os.path.join(write_path, "trace.py")
+    source_code = "graph = " + str(graph)
     key, raw_tog_path = write(source_code, "py", specified_dir=write_path)
     tile_graph_generator = tog_generator(["flexagon_matmul"])
     tile_graph_generator.load_file(raw_tog_path)
@@ -258,10 +264,10 @@ def sparse_mm_stonne_outer(a, b, out):
     BackendSimulator.get_result_from_file(result_path)
 
     # Load result data
-    with open(c_result_path, 'rb') as f:
-        np_array = np.fromfile(f, dtype=TORCH_TO_NUMPY[out.dtype])
-        src_tensor = torch.as_strided(torch.from_numpy(np_array), out.size(), out.stride())
-        out.copy_(src_tensor.to(dtype=out.dtype))
+    #with open(c_result_path, 'rb') as f:
+    #    np_array = np.fromfile(f, dtype=TORCH_TO_NUMPY[out.dtype])
+    #    src_tensor = torch.as_strided(torch.from_numpy(np_array), out.size(), out.stride())
+    #    out.copy_(src_tensor.to(dtype=out.dtype))
 
 def sparse_mm_dummy_stonne_outer(a, b, out):
     onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out)

From 7d17090787fedbe9f8679128ed9fda1b24ce10c7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 6 Mar 2025 06:32:18 +0000
Subject: [PATCH 189/432] [Frontend+Backend/Stonne] Support stonne trace tog
 generation

---
 AsmParser/onnx_utility.py              | 15 +++++
 AsmParser/tog_generator.py             | 13 +++-
 PyTorchSimBackend/include/SparseCore.h |  1 +
 PyTorchSimBackend/src/SparseCore.cc    | 13 ++--
 PyTorchSimFrontend/extension_op.py     | 86 ++++++++++++++++----------
 5 files changed, 89 insertions(+), 39 deletions(-)

diff --git a/AsmParser/onnx_utility.py b/AsmParser/onnx_utility.py
index 35e4fa79..c0a7af40 100644
--- a/AsmParser/onnx_utility.py
+++ b/AsmParser/onnx_utility.py
@@ -149,6 +149,21 @@ def __init__(self, tile_info, node_id=0):
         self.torchsim_stonne_colpointer_matrix_b_init = tile_info.get("stonne_colpointer_matrix_b_init", "")
         self.torchsim_trace_path = tile_info.get("stonne_trace_path", "")
 
+class stonne_trace_compute_node(node):
+    def __init__(self, cycle=0, node_id=0):
+        super().__init__(node_id)
+        self.torchsim_trace_compute_cycle = cycle
+
+class stonne_trace_store_node(node):
+    def __init__(self, addr_list=list(), node_id=0):
+        super().__init__(node_id)
+        self.torchsim_trace_address = addr_list
+
+class stonne_trace_load_node(node):
+    def __init__(self, addr_list=list(), node_id=0):
+        super().__init__(node_id)
+        self.torchsim_trace_address = addr_list
+
 def connect_nodes(parent, child):
     child.add_parent(parent)
     parent.add_child(child)
diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 6d519ce8..88f63f72 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -6,8 +6,10 @@
 
 if __name__ == "__main__":
     from onnx_utility import node, loop_index_node, loop_end_node, load_node, store_node, memory_wait_node, compute_node, connect_nodes, dump_onnx_graph
+    from onnx_utility import stonne_node, stonne_trace_compute_node, stonne_trace_load_node, stonne_trace_store_node
 else:
-    from AsmParser.onnx_utility import node, loop_index_node, loop_end_node, load_node, store_node, memory_wait_node, compute_node, stonne_node, connect_nodes, dump_onnx_graph
+    from AsmParser.onnx_utility import node, loop_index_node, loop_end_node, load_node, store_node, memory_wait_node, compute_node, connect_nodes, dump_onnx_graph
+    from AsmParser.onnx_utility import stonne_node, stonne_trace_compute_node, stonne_trace_load_node, stonne_trace_store_node
 
 
 def import_module_from_path(module_name, path):
@@ -32,6 +34,9 @@ class tog_generator:
     DMANodeKind = 3
     DMAWaitNodeKind = 4
     StonneNodeKind = 5
+    StonneTraceCompute= 6
+    StonneTraceLoad = 7
+    StonneTraceStore = 8
     def __init__(self, origins="Unknown") -> None:
         self.module_name = "tile_operation_graph"
         self.module = None
@@ -107,6 +112,12 @@ def _create_node(self, dump_data):
             new_node = memory_wait_node(tile_info, node_id=node_id)
         elif node_type == self.StonneNodeKind:
             new_node = stonne_node(dump_data, node_id=node_id)
+        elif node_type == self.StonneTraceCompute:
+            new_node = stonne_trace_compute_node(dump_data['trace_compute_cycle'], node_id=node_id)
+        elif node_type == self.StonneTraceLoad:
+            new_node = stonne_trace_load_node(dump_data['trace_address'], node_id=node_id)
+        elif node_type == self.StonneTraceStore:
+            new_node = stonne_trace_store_node(dump_data['trace_address'], node_id=node_id)
         else:
             print("Unexpected node_type :", node_type)
             exit(1)
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index f03045ac..33723131 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -14,6 +14,7 @@ class TraceNode {
   int compute_cycle;
 
 public:
+  enum TraceType {StonneTraceCompute=6, StonneTraceLoad=7, StonneTraceStore=8};
   TraceNode(int id, std::string name, int type, int cycle = 0)
       : node_id(id), node_name(name), node_type(type), compute_cycle(cycle) {}
   void setAddress(std::set<uint64_t> addr_set) { address_set = addr_set; }
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index eb7a7357..533bf1ee 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -100,16 +100,16 @@ void SparseCore::cycle() {
         _id, _core_cycle, traceCoreStatus.at(stonne_core_id), new_status,
         traceLoadTraffic.at(stonne_core_id).size(), traceStoreTraffic.at(stonne_core_id).size(), (compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms);
       if (traceLoadTraffic.at(stonne_core_id).size()) {
-        TraceNode load_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "load", 1);
+        TraceNode load_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "load", TraceNode::StonneTraceLoad);
         load_node.setAddress(traceLoadTraffic.at(stonne_core_id));
         traceNodeList.at(stonne_core_id).push_back(load_node);
       }
       if ((compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms) {
-        TraceNode compute_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "compute", 0, (compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms);
+        TraceNode compute_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "compute", TraceNode::StonneTraceCompute, (compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms);
         traceNodeList.at(stonne_core_id).push_back(compute_node);
       }
       if (traceStoreTraffic.at(stonne_core_id).size()) {
-        TraceNode store_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "store", 0);
+        TraceNode store_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "store", TraceNode::StonneTraceStore);
         store_node.setAddress(traceStoreTraffic.at(stonne_core_id));
         traceNodeList.at(stonne_core_id).push_back(store_node);
       }
@@ -269,7 +269,12 @@ void SparseCore::dumpTrace(int stonne_core_id, const std::string& path) {
           << "    \"node_name\": \"loopNode\",\n"
           << "    \"node_type\": 2,\n"
           << "    \"parents\": [0],\n"
-          << "    \"children\": [2]\n"
+          << "    \"children\": [2],\n"
+          << "    \"loop_index\": \"loop_arg000\",\n"
+          << "    \"loop_start\": 0,\n"
+          << "    \"loop_end\": 8,\n"
+          << "    \"loop_step\": 1,\n"
+          << "    \"loop_type\": \"outer_loop\""
           << "  },\n";
 
   // Output traceNodeList
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 32519200..6950375d 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -6,6 +6,7 @@
 import random
 import torch
 import numpy as np
+import hashlib
 from torch._inductor.select_algorithm import ExternKernelChoice
 from torch._inductor.codecache import get_hash
 from AsmParser.tog_generator import tog_generator
@@ -200,8 +201,8 @@ def prepare_outer_product_matrix(a, b, out):
         "stonne_GEMM_K": K,
         "stonne_GEMM_N": N,
         "stonne_GEMM_M": M,
-        "a_hash" : hash(a.cpu().numpy().tobytes()),
-        "b_hash" : hash(b.cpu().numpy().tobytes()),
+        "a_hash" : hashlib.sha512(a.cpu().numpy().tobytes()).hexdigest(),
+        "b_hash" : hashlib.sha512(b.cpu().numpy().tobytes()).hexdigest(),
     }
     graph[2].update(meta_data)
 
@@ -210,48 +211,65 @@ def prepare_outer_product_matrix(a, b, out):
     os.makedirs(write_path, exist_ok=True)
 
     # Generating inputs
-    dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N, prefix, write_path)
     mem_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_mem.ini')
     a_row_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_rowpointerA.in')
     a_col_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_colpointerA.in')
     b_row_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_rowpointerB.in')
     b_col_init = os.path.join(write_path, f'{prefix}_outerproduct_gemm_colpointerB.in')
     c_result = os.path.join(write_path, f'{prefix}_result.out')
+    trace_path = os.path.join(write_path, "trace.py")
 
-    meta_data = {
-        # Memory Initialization & File Paths
-        "stonne_mem_init": mem_init,
-        "stonne_mem_matrix_c_file_name": c_result,
+    if not os.path.isfile(trace_path):
+        dram_a_address, dram_b_address, dram_c_address = generate_outer_product_matrix(a, b, M, K, N, prefix, write_path)
+        meta_data = {
+            # Memory Initialization & File Paths
+            "stonne_mem_init": mem_init,
+            "stonne_mem_matrix_c_file_name": c_result,
 
-        # Memory Addresses
-        "stonne_matrix_a_dram_address": dram_a_address,
-        "stonne_matrix_b_dram_address": dram_b_address,
-        "stonne_matrix_c_dram_address": dram_c_address,
+            # Memory Addresses
+            "stonne_matrix_a_dram_address": dram_a_address,
+            "stonne_matrix_b_dram_address": dram_b_address,
+            "stonne_matrix_c_dram_address": dram_c_address,
 
-        # CSR & Bitmap Initialization
-        "stonne_rowpointer_matrix_a_init": a_row_init,
-        "stonne_colpointer_matrix_a_init": a_col_init,
-        "stonne_rowpointer_matrix_b_init": b_row_init,
-        "stonne_colpointer_matrix_b_init": b_col_init,
-    }
-    graph[2].update(meta_data)
+            # CSR & Bitmap Initialization
+            "stonne_rowpointer_matrix_a_init": a_row_init,
+            "stonne_colpointer_matrix_a_init": a_col_init,
+            "stonne_rowpointer_matrix_b_init": b_row_init,
+            "stonne_colpointer_matrix_b_init": b_col_init,
+            "stonne_trace_path": trace_path
+        }
+        graph[2].update(meta_data)
+
+        source_code = "graph = " + str(graph)
+        key, raw_tog_path = write(source_code, "py", specified_dir=write_path)
+        tile_graph_generator = tog_generator(["flexagon_matmul"])
+        tile_graph_generator.load_file(raw_tog_path)
+        tile_graph_generator.generate_tile_graph(
+            os.path.join(write_path, "tile_graph.onnx"),
+            cycle_list=[0],
+            x_offset=0,
+            w_offset=0,
+            vector_lane=0,
+            stonneGraph=True
+        )
+        onnx_path = os.path.join(write_path, "tile_graph.onnx")
+        attribute_path = os.path.join(write_path, "attributes")
+        return onnx_path, attribute_path, c_result
+    else: # Use trace file to generate onnx graph
+        tile_graph_generator = tog_generator(["flexagon_matmul"])
+        tile_graph_generator.load_file(trace_path)
+        tile_graph_generator.generate_tile_graph(
+            os.path.join(write_path, "trace_tile_graph.onnx"),
+            cycle_list=[0],
+            x_offset=0,
+            w_offset=0,
+            vector_lane=0,
+            stonneGraph=True
+        )
+        onnx_path = os.path.join(write_path, "trace_tile_graph.onnx")
+        attribute_path = os.path.join(write_path, "attributes")
+        return onnx_path, attribute_path, c_result
 
-    graph[2]["stonne_trace_path"] = os.path.join(write_path, "trace.py")
-    source_code = "graph = " + str(graph)
-    key, raw_tog_path = write(source_code, "py", specified_dir=write_path)
-    tile_graph_generator = tog_generator(["flexagon_matmul"])
-    tile_graph_generator.load_file(raw_tog_path)
-    tile_graph_generator.generate_tile_graph(
-        os.path.join(write_path, "tile_graph.onnx"),
-        cycle_list=[0],
-        x_offset=0,
-        w_offset=0,
-        vector_lane=0,
-        stonneGraph=True
-    )
-    onnx_path = os.path.join(write_path, "tile_graph.onnx")
-    attribute_path = os.path.join(write_path, "attributes")
-    return onnx_path, attribute_path, c_result
 
 
 def sparse_mm_stonne_outer(a, b, out):

From 1ab78951a6ee1197ef02260bcbb2c6b70262c56c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 6 Mar 2025 07:47:18 +0000
Subject: [PATCH 190/432] [Backend/stonne] Support trace node parsing

---
 PyTorchSimBackend/include/Instruction.h     |   3 +
 PyTorchSimBackend/include/TileGraphParser.h |  44 +++++++
 PyTorchSimBackend/src/TileGraphParser.cc    | 121 +++++++++++++++++---
 3 files changed, 149 insertions(+), 19 deletions(-)

diff --git a/PyTorchSimBackend/include/Instruction.h b/PyTorchSimBackend/include/Instruction.h
index acf416b0..2a6d66c0 100644
--- a/PyTorchSimBackend/include/Instruction.h
+++ b/PyTorchSimBackend/include/Instruction.h
@@ -75,6 +75,8 @@ class Instruction {
     }
     return address_set;
   }
+  std::vector<addr_type> get_trace_address() { return _trace_address; }
+  void set_trace_address(std::vector<addr_type>& trace_address) { _trace_address = trace_address; }
   size_t get_free_sram_size() { return _free_sram_size; }
   void adjust_dram_address() {
     int offset = std::inner_product(_idx_list.begin(), _idx_list.end(), _stride_list.begin(), 0);
@@ -125,6 +127,7 @@ class Instruction {
   std::vector<int> _tag_key;
   std::vector<int> _accum_tag_idx_list;
   std::vector<int> _loop_size_list;
+  std::vector<addr_type> _trace_address;
   std::string _addr_name;
   int _nr_inner_loop = 0;
   bool _is_async_dma=false;
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index 3ae418ec..6eaa83d2 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -21,6 +21,9 @@ enum class TileType{
   COMPUTE_NODE,
   MEMORY_WAIT_NODE,
   STONNE_NODE,
+  STONNE_TRACE_COMPUTE_NODE,
+  STONNE_TRACE_LOAD_NODE,
+  STONNE_TRACE_STORE_NODE
 };
 
 enum class LoopType {
@@ -299,4 +302,45 @@ class TileStonneNode : public TileNode {
   void print_node() override;
  private:
   SST_STONNE::StonneOpDesc desc;
+};
+
+class TileStonneTraceComputeNode : public TileNode {
+ public:
+  TileStonneTraceComputeNode(onnx::NodeProto& node) : TileNode(node) {
+    for (auto attribute : node.attribute()) {
+      if (attribute.name() == "torchsim_trace_compute_cycle") {
+          _cycle = attribute.i();
+      }
+    }
+  }
+  uint32_t get_cycle() { return _cycle; }
+  void print_node();
+
+ private:
+  uint64_t _cycle;
+};
+
+class TileStonneTraceMemoryNode : public TileNode {
+ public:
+  TileStonneTraceMemoryNode(onnx::NodeProto& node) : TileNode(node) {
+    for (auto attribute : node.attribute()) {
+      if (attribute.name() == "torchsim_trace_address") {
+        trace_address.assign(attribute.ints().begin(), attribute.ints().end());
+      }
+    }
+  }
+  std::vector<uint64_t>& get_address() { return trace_address; }
+  void print_node();
+
+ private:
+  std::vector<uint64_t> trace_address;
+};
+class TileStonneTraceLoadNode : public TileStonneTraceMemoryNode {
+ public:
+  using TileStonneTraceMemoryNode::TileStonneTraceMemoryNode;
+};
+
+class TileStonneTraceStoreNode : public TileStonneTraceMemoryNode {
+ public:
+  using TileStonneTraceMemoryNode::TileStonneTraceMemoryNode;
 };
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index a75c1914..fc2a9392 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -130,6 +130,12 @@ TileType TileNode::get_tile_type(std::string type) {
     return TileType::MEMORY_WAIT_NODE;
   else if (type == "stonne_node")
     return TileType::STONNE_NODE;
+  else if (type == "stonne_trace_compute_node")
+    return TileType::STONNE_TRACE_COMPUTE_NODE;
+  else if (type == "stonne_trace_load_node")
+    return TileType::STONNE_TRACE_LOAD_NODE;
+  else if (type == "stonne_trace_store_node")
+    return TileType::STONNE_TRACE_STORE_NODE;
   spdlog::error("[TileGraphParser] Invalid node type...");
   exit(EXIT_FAILURE);
 }
@@ -271,6 +277,18 @@ void TileMemoryWaitNode::print_node() {
   spdlog::debug("{} tag_stride_list: {}", spaces, fmt::join(_tag_stride_list, ", "));
 }
 
+void TileStonneTraceComputeNode::print_node() {
+  TileNode::print_node();
+  std::string spaces(get_depth(), '\t');
+  spdlog::debug("{} ComputeCycle: {}", spaces, _cycle);
+}
+
+void TileStonneTraceMemoryNode::print_node() {
+  TileNode::print_node();
+  std::string spaces(get_depth(), '\t');
+  spdlog::debug("{} Address: {}", spaces, fmt::join(trace_address, ", "));
+}
+
 TileLoopNode::TileLoopNode(onnx::NodeProto& node) : TileNode(node) {
   for (auto attribute : node.attribute()) {
     if (attribute.name() == "torchsim_start") {
@@ -570,25 +588,78 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       /* Create new tile */
       tile_vec.push_back(child);
     } else if (tile_node->get_type() == TileType::STONNE_NODE) {
-        printIndexMap("[TOGParser] Stonne Node ", iter);
-        std::shared_ptr<TileStonneNode> stonne_node = std::static_pointer_cast<TileStonneNode>(tile_node);
-        /* Lookup given name's address */
-        std::vector<int> iter_list;
-        std::vector<int> tag_list;
-        std::vector<int> tag_stride_list;
-        std::vector<int> accum_tag_list;
-
-        /* Put dummy computation instruction */
-        std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
-          Opcode::COMP, 0,
-          0, 0,
-          std::vector<size_t>(), 0, iter_list,
-          iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
-        );
-        link_map[tile_node] = inst;
-        tile_vec.back()->append_instuction(inst);
-        tile_vec.back()->set_custom_data(stonne_node->getDesc());
-        tile_vec.back()->set_stonne_tile(true);
+      printIndexMap("[TOGParser] Stonne Node ", iter);
+      std::shared_ptr<TileStonneNode> stonne_node = std::static_pointer_cast<TileStonneNode>(tile_node);
+      /* Lookup given name's address */
+      std::vector<int> iter_list;
+      std::vector<int> tag_list;
+      std::vector<int> tag_stride_list;
+      std::vector<int> accum_tag_list;
+
+      /* Put dummy computation instruction */
+      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
+        Opcode::COMP, 0,
+        0, 0,
+        std::vector<size_t>(), 0, iter_list,
+        iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
+      );
+      link_map[tile_node] = inst;
+      tile_vec.back()->append_instuction(inst);
+      tile_vec.back()->set_custom_data(stonne_node->getDesc());
+      tile_vec.back()->set_stonne_tile(true);
+    } else if (tile_node->get_type() == TileType::STONNE_TRACE_COMPUTE_NODE) {
+      std::shared_ptr<TileStonneTraceComputeNode> stonne_node = std::static_pointer_cast<TileStonneTraceComputeNode>(tile_node);
+      /* Lookup given name's address */
+      std::vector<int> iter_list;
+      std::vector<int> tag_list;
+      std::vector<int> tag_stride_list;
+      std::vector<int> accum_tag_list;
+
+      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
+        Opcode::COMP, stonne_node->get_cycle(),
+        0, 0,
+        std::vector<size_t>(), 0, iter_list,
+        iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
+      );
+      link_map[tile_node] = inst;
+      tile_vec.back()->append_instuction(inst);
+      tile_vec.back()->set_stonne_tile(true);
+    } else if (tile_node->get_type() == TileType::STONNE_TRACE_LOAD_NODE) {
+      std::shared_ptr<TileStonneTraceLoadNode> stonne_node = std::static_pointer_cast<TileStonneTraceLoadNode>(tile_node);
+      /* Lookup given name's address */
+      std::vector<int> iter_list;
+      std::vector<int> tag_list;
+      std::vector<int> tag_stride_list;
+      std::vector<int> accum_tag_list;
+
+      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
+        Opcode::MOVIN, 0,
+        0, 0,
+        std::vector<size_t>(), 0, iter_list,
+        iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
+      );
+      inst->set_trace_address(stonne_node->get_address());
+      link_map[tile_node] = inst;
+      tile_vec.back()->append_instuction(inst);
+      tile_vec.back()->set_stonne_tile(true);
+    } else if (tile_node->get_type() == TileType::STONNE_TRACE_STORE_NODE) {
+      std::shared_ptr<TileStonneTraceStoreNode> stonne_node = std::static_pointer_cast<TileStonneTraceStoreNode>(tile_node);
+      /* Lookup given name's address */
+      std::vector<int> iter_list;
+      std::vector<int> tag_list;
+      std::vector<int> tag_stride_list;
+      std::vector<int> accum_tag_list;
+
+      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
+        Opcode::MOVOUT, 0,
+        0, 0,
+        std::vector<size_t>(), 0, iter_list,
+        iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
+      );
+      inst->set_trace_address(stonne_node->get_address());
+      link_map[tile_node] = inst;
+      tile_vec.back()->append_instuction(inst);
+      tile_vec.back()->set_stonne_tile(true);
     }
   }
 
@@ -704,6 +775,18 @@ TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
       std::shared_ptr<TileStonneNode> tile_node = std::make_shared<TileStonneNode>(node_proto);
       /* Register output */
       register_tile(tile_node);
+    } else if (type == TileType::STONNE_TRACE_COMPUTE_NODE) {
+      std::shared_ptr<TileStonneTraceComputeNode> tile_node = std::make_shared<TileStonneTraceComputeNode>(node_proto);
+      /* Register output */
+      register_tile(tile_node);
+    } else if (type == TileType::STONNE_TRACE_LOAD_NODE) {
+      std::shared_ptr<TileStonneTraceLoadNode> tile_node = std::make_shared<TileStonneTraceLoadNode>(node_proto);
+      /* Register output */
+      register_tile(tile_node);
+    } else if (type == TileType::STONNE_TRACE_STORE_NODE) {
+      std::shared_ptr<TileStonneTraceStoreNode> tile_node = std::make_shared<TileStonneTraceStoreNode>(node_proto);
+      /* Register output */
+      register_tile(tile_node);
     }
   }
 

From 4880f7eb29d87bc13b15c735c5a677f81042fd8c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 6 Mar 2025 13:31:19 +0000
Subject: [PATCH 191/432] [Backend/stonne] Introduce trace mode for stonne core

---
 PyTorchSimBackend/include/SparseCore.h |  10 +-
 PyTorchSimBackend/src/SparseCore.cc    | 278 +++++++++++++++++++------
 2 files changed, 220 insertions(+), 68 deletions(-)

diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index 33723131..0eff4c7d 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -49,6 +49,8 @@ class SparseCore : public Core {
   bool can_issue(const std::shared_ptr<Tile>& op) override;
   void issue(std::shared_ptr<Tile> tile) override;
   void cycle() override;
+  void subCoreCycle(uint32_t subcore_id);
+  void stonneCycle(SST_STONNE::sstStonne *&stonneCore, uint32_t stonne_core_id, bool &retFlag);
   bool has_memory_request();
   void pop_memory_request();
   mem_fetch* top_memory_request() { return _request_queue.front(); }
@@ -56,7 +58,12 @@ class SparseCore : public Core {
   void print_stats() override;
   void print_current_stats() override;
   std::shared_ptr<Tile> pop_finished_tile() override;
+  void finish_instruction(std::shared_ptr<Instruction>& inst) override;
   void dumpTrace(int stonne_core_id, const std::string& path);
+  bool isTraceMode(int stonne_core_id) { return traceMode.at(stonne_core_id); }
+  void setTraceMode(int stonne_core_id, bool mode) { traceMode.at(stonne_core_id) = mode; }
+  void checkStatus(uint32_t subcore_id);
+  void registerMemfetch(const std::tuple<uint64_t, mem_access_type, mf_type>& key, std::function<void()> callback);
   uint32_t num_ms = 1;
   uint32_t r_port_nr = 1;
   uint32_t w_port_nr = 1;
@@ -66,6 +73,7 @@ class SparseCore : public Core {
   std::vector<bool> coreBusy;
   std::vector<int> traceCoreStatus;
   std::vector<int> traceCoreCycle;
+  std::vector<bool> traceMode;
   std::vector<std::vector<TraceNode>> traceNodeList;
   std::vector<std::set<uint64_t>> traceLoadTraffic; // To trace dma traffic
   std::vector<std::set<uint64_t>> traceStoreTraffic; // To trace dma traffic
@@ -74,6 +82,6 @@ class SparseCore : public Core {
   /* Interconnect queue */
   std::queue<mem_fetch*> _request_queue;
   std::queue<mem_fetch*> _response_queue;
-  std::map<std::tuple<uint64_t, mem_access_type, mf_type>, std::vector<SimpleMem::Request*>*> request_merge_table;
+  std::map<std::tuple<uint64_t, mem_access_type, mf_type>, mem_fetch*> request_merge_table;
 };
 
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 533bf1ee..17b3f9c4 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -11,6 +11,7 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
   traceStoreTraffic.resize(nr_cores);
   percore_tiles.resize(nr_cores);
   stonneCores.resize(nr_cores);
+  traceMode.resize(nr_cores);
   for (int i=0; i<nr_cores; i++) {
     SST_STONNE::sstStonne* core = new SST_STONNE::sstStonne(config.stonne_config_path);
     stonneCores.at(i) = core;
@@ -66,16 +67,18 @@ void SparseCore::issue(std::shared_ptr<Tile> tile) {
     spdlog::error("[StonneCore {}] Faield to issue tile", _id);
     exit(1);
   }
-  //delete stonneCores.at(selected_core_idx);
-  //SST_STONNE::sstStonne* core = new SST_STONNE::sstStonne(_config.stonne_config_path);
-  //stonneCores.at(selected_core_idx) = core;
   stonneCores.at(selected_core_idx)->init(1);
   traceNodeList.at(selected_core_idx).clear();
 
   spdlog::info("[StonneCore {}][{}] issued new tile", _id, selected_core_idx);
   SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(tile->get_custom_data());
-  stonneCores.at(selected_core_idx)->setup(*opDesc, 0x1000000 * selected_core_idx); // FIXME. To avoid same address
-  stonneCores.at(selected_core_idx)->init(1);
+  bool is_trace_mode = true;
+  if (opDesc) {
+    is_trace_mode = false;
+    stonneCores.at(selected_core_idx)->setup(*opDesc, 0x1000000 * selected_core_idx); // FIXME. To avoid same address
+    stonneCores.at(selected_core_idx)->init(1);
+  }
+  setTraceMode(selected_core_idx, is_trace_mode);
   percore_tiles.at(selected_core_idx).push_back(tile);
   coreBusy.at(selected_core_idx) = true;
 };
@@ -88,37 +91,43 @@ bool SparseCore::can_issue(const std::shared_ptr<Tile>& op) {
   return idle_exist && op->is_stonne_tile();
 }
 
-void SparseCore::cycle() {
-  _core_cycle++;
-  uint32_t stonne_core_id = 0;
-  for (auto& stonneCore : stonneCores) {
+void SparseCore::checkStatus(uint32_t subcore_id) {
+  auto& stonneCore = stonneCores.at(subcore_id);
+  int new_status = stonneCore->getMCFSMStats();
+  int compute_cycle = stonneCore->getMSStats().n_multiplications;
+  if (traceCoreStatus.at(subcore_id) != new_status) {
+    spdlog::trace("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}",
+      _id, _core_cycle, traceCoreStatus.at(subcore_id), new_status,
+      traceLoadTraffic.at(subcore_id).size(), traceStoreTraffic.at(subcore_id).size(), (compute_cycle - traceCoreCycle.at(subcore_id))/num_ms);
+    if (traceLoadTraffic.at(subcore_id).size()) {
+      TraceNode load_node = TraceNode(traceNodeList.at(subcore_id).size()+2, "load", TraceNode::StonneTraceLoad);
+      load_node.setAddress(traceLoadTraffic.at(subcore_id));
+      traceNodeList.at(subcore_id).push_back(load_node);
+    }
+    if ((compute_cycle - traceCoreCycle.at(subcore_id))/num_ms) {
+      TraceNode compute_node = TraceNode(traceNodeList.at(subcore_id).size()+2, "compute", TraceNode::StonneTraceCompute, (compute_cycle - traceCoreCycle.at(subcore_id))/num_ms);
+      traceNodeList.at(subcore_id).push_back(compute_node);
+    }
+    if (traceStoreTraffic.at(subcore_id).size()) {
+      TraceNode store_node = TraceNode(traceNodeList.at(subcore_id).size()+2, "store", TraceNode::StonneTraceStore);
+      store_node.setAddress(traceStoreTraffic.at(subcore_id));
+      traceNodeList.at(subcore_id).push_back(store_node);
+    }
+
+    traceCoreStatus.at(subcore_id) = new_status;
+    traceCoreCycle.at(subcore_id) = compute_cycle;
+    traceLoadTraffic.at(subcore_id).clear();
+    traceStoreTraffic.at(subcore_id).clear();
+  }
+}
+
+void SparseCore::subCoreCycle(uint32_t subcore_id) {
+  if (!traceMode.at(subcore_id)) {
+    auto& stonneCore = stonneCores.at(subcore_id);
     stonneCore->cycle();
-    int new_status = stonneCore->getMCFSMStats();
-    int compute_cycle = stonneCore->getMSStats().n_multiplications;
-    if (traceCoreStatus.at(stonne_core_id) != new_status) {
-      spdlog::info("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}",
-        _id, _core_cycle, traceCoreStatus.at(stonne_core_id), new_status,
-        traceLoadTraffic.at(stonne_core_id).size(), traceStoreTraffic.at(stonne_core_id).size(), (compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms);
-      if (traceLoadTraffic.at(stonne_core_id).size()) {
-        TraceNode load_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "load", TraceNode::StonneTraceLoad);
-        load_node.setAddress(traceLoadTraffic.at(stonne_core_id));
-        traceNodeList.at(stonne_core_id).push_back(load_node);
-      }
-      if ((compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms) {
-        TraceNode compute_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "compute", TraceNode::StonneTraceCompute, (compute_cycle - traceCoreCycle.at(stonne_core_id))/num_ms);
-        traceNodeList.at(stonne_core_id).push_back(compute_node);
-      }
-      if (traceStoreTraffic.at(stonne_core_id).size()) {
-        TraceNode store_node = TraceNode(traceNodeList.at(stonne_core_id).size()+2, "store", TraceNode::StonneTraceStore);
-        store_node.setAddress(traceStoreTraffic.at(stonne_core_id));
-        traceNodeList.at(stonne_core_id).push_back(store_node);
-      }
 
-      traceCoreStatus.at(stonne_core_id) = new_status;
-      traceCoreCycle.at(stonne_core_id) = compute_cycle;
-      traceLoadTraffic.at(stonne_core_id).clear();
-      traceStoreTraffic.at(stonne_core_id).clear();
-    }
+    /* Check FSM status transition */
+    checkStatus(subcore_id);
 
     /* Send Memory Request */
     while (SimpleMem::Request* req = stonneCore->popRequest()) {
@@ -130,53 +139,163 @@ void SparseCore::cycle() {
         case SimpleMem::Request::Read:
           acc_type = mem_access_type::GLOBAL_ACC_R;
           type = mf_type::READ_REQUEST;
-          traceLoadTraffic.at(stonne_core_id).insert(target_addr);
+          traceLoadTraffic.at(subcore_id).insert(target_addr);
           break;
         case SimpleMem::Request::Write:
           acc_type = mem_access_type::GLOBAL_ACC_W;
           type = mf_type::WRITE_REQUEST;
-          traceStoreTraffic.at(stonne_core_id).insert(target_addr);
+          traceStoreTraffic.at(subcore_id).insert(target_addr);
           break;
         default:
           spdlog::error("[SparseCore] Invalid request type from core");
           return;
       }
       req->request_time = _core_cycle;
-      req->stonneId = stonne_core_id;
+      req->stonneId = subcore_id;
       std::tuple<uint64_t, mem_access_type, mf_type> key = std::make_tuple(target_addr, acc_type, type);
-      if (request_merge_table.find(key) == request_merge_table.end())
-        request_merge_table[key] = new std::vector<SimpleMem::Request*> ();
-      request_merge_table[key]->push_back(req);
+      registerMemfetch(key, [this, req, acc_type, type]() {
+        spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \
+              _core_cycle, _core_cycle - req->request_time, req->getAddress(), int(req->getcmd()), _config.dram_req_size);
+        req->setReply();
+        stonneCores.at(req->stonneId)->pushResponse(req);
+      });
     }
 
-    if (coreBusy.at(stonne_core_id) && stonneCore->isFinished()) {
+    /* Finish stonne core */
+    if (coreBusy.at(subcore_id) && stonneCore->isFinished()) {
       stonneCore->finish();
-      std::shared_ptr<Tile> target_tile = percore_tiles.at(stonne_core_id).front();
+      std::shared_ptr<Tile> target_tile = percore_tiles.at(subcore_id).front();
       SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(target_tile->get_custom_data());
       if (opDesc->trace_path != "")
-        dumpTrace(stonne_core_id, opDesc->trace_path);
+        dumpTrace(subcore_id, opDesc->trace_path);
 
       target_tile->set_status(Tile::Status::FINISH);
       _finished_tiles.push(target_tile);
-      percore_tiles.at(stonne_core_id).erase(percore_tiles.at(stonne_core_id).begin());
-      coreBusy.at(stonne_core_id) = false;
+      percore_tiles.at(subcore_id).erase(percore_tiles.at(subcore_id).begin());
+      coreBusy.at(subcore_id) = false;
+    }
+  } else {
+    auto& instructions = percore_tiles.at(subcore_id).front()->get_instructions();
+    /* Finish stonne core */
+    if (coreBusy.at(subcore_id) && instructions.empty()) {
+      std::shared_ptr<Tile> target_tile = percore_tiles.at(subcore_id).front();
+      target_tile->set_status(Tile::Status::FINISH);
+      _finished_tiles.push(target_tile);
+      percore_tiles.at(subcore_id).erase(percore_tiles.at(subcore_id).begin());
+      coreBusy.at(subcore_id) = false;
+      return;
+    }
+
+    /* Check finished computation */
+    auto& target_pipeline = get_compute_pipeline(0);
+    if (!target_pipeline.empty() && target_pipeline.front()->finish_cycle <= _core_cycle) {
+      finish_instruction(target_pipeline.front());
+      target_pipeline.pop();
+    }
+
+    /* Check finished dma operation */
+    for (int i=0; i<_dma_waiting_queue.size(); i++){
+      std::shared_ptr<Instruction>& instruction = _dma_waiting_queue.at(i);
+      /* Pass not finished instruction */
+      if (instruction->get_waiting_request())
+        continue;
+
+      /* Finish DMA read instruction */
+      if (instruction->is_dma_read())
+        finish_instruction(instruction);
+
+      /* Erase the instruction in DMA waiting queue */
+      _dma_waiting_queue.erase(_dma_waiting_queue.begin() + i);
+      i--;
+    }
+
+    /* Peek instruction*/
+    auto& inst = instructions.front();
+    if (!inst->is_ready())
+      return;
+
+    bool issued = false;
+    switch (inst->get_opcode()) {
+      case Opcode::MOVIN:
+        {
+          auto acc_type = mem_access_type::GLOBAL_ACC_R;
+          auto type = mf_type::READ_REQUEST;
+          spdlog::info("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle,
+                        opcode_to_string(inst->get_opcode()));
+          for (auto addr : inst->get_trace_address()) {
+            inst->inc_waiting_request();
+            std::tuple<uint64_t, mem_access_type, mf_type> key = std::make_tuple(addr, acc_type, type);
+            uint64_t current_time = _core_cycle;
+            registerMemfetch(key, [this, inst, addr, current_time, type]() {
+              spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \
+                this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size);
+              inst->dec_waiting_request();
+            });
+          }
+          issued = true;
+          _dma_waiting_queue.push_back(std::move(inst));
+        }
+        break;
+      case Opcode::MOVOUT:
+        {
+          auto acc_type = mem_access_type::GLOBAL_ACC_W;
+          auto type = mf_type::WRITE_REQUEST;
+          spdlog::info("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle,
+                        opcode_to_string(inst->get_opcode()));
+          for (auto addr : inst->get_trace_address()) {
+            inst->inc_waiting_request();
+            std::tuple<uint64_t, mem_access_type, mf_type> key = std::make_tuple(addr, acc_type, type);
+            uint64_t current_time = _core_cycle;
+            registerMemfetch(key, [this, inst, addr, current_time, type]() {
+              spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \
+                this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size);
+              inst->dec_waiting_request();
+            });
+          }
+          issued = true;
+          finish_instruction(inst);
+          _dma_waiting_queue.push_back(std::move(inst));
+        }
+        break;
+      case Opcode::COMP:
+        {
+          auto& target_pipeline = get_compute_pipeline(0);
+          if (target_pipeline.empty())
+            inst->finish_cycle = _core_cycle + inst->get_compute_cycle();
+          else
+            inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle();
+          spdlog::info("[Core {}][{}][{}] {} ISSUED, finsh at {}", _id, subcore_id, _core_cycle,
+                          opcode_to_string(inst->get_opcode()), inst->finish_cycle);
+          target_pipeline.push(inst);
+          issued = true;
+        }
+        break;
+      default:
+        spdlog::error("Undefined instruction opcode type");
+        exit(EXIT_FAILURE);
+    }
+
+    if (issued) {
+      instructions.erase(instructions.begin());
     }
-    stonne_core_id++;
   }
+}
 
+void SparseCore::cycle() {
+  _core_cycle++;
+  /* Handle core cycle*/
+  for (uint32_t subcore_id=0; subcore_id<stonneCores.size(); subcore_id++)
+    subCoreCycle(subcore_id);
+
+  /* Handle memory request/response */
   int nr_request = 0;
   while (!request_merge_table.empty() && nr_request <= r_port_nr) {
     for (auto& req_pair : request_merge_table) {
-      uint64_t address;
-      mem_access_type acc_type;
-      mf_type type;
-      std::tie(address, acc_type, type) = req_pair.first;
-      mem_fetch* req_wrapper = new mem_fetch(address, acc_type, type, _config.dram_req_size, -1, req_pair.second);
-      _request_queue.push(req_wrapper);
+      _request_queue.push(req_pair.second);
       request_merge_table.erase(req_pair.first);
 
       spdlog::debug("[SparseCore][{}][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \
-              _core_cycle, stonne_core_id, req_wrapper->get_addr(), int(req_wrapper->get_access_type()), int(req_wrapper->get_type()), _config.dram_req_size, nr_request);
+              _core_cycle, _id, req_pair.second->get_addr(), int(req_pair.second->get_access_type()), int(req_pair.second->get_type()), _config.dram_req_size, nr_request);
       nr_request++;
       break;
     }
@@ -186,27 +305,22 @@ void SparseCore::cycle() {
   nr_request = 0;
   while (!_response_queue.empty()) {
     mem_fetch* resp_wrapper = _response_queue.front();
-    std::vector<SimpleMem::Request*>* resps = static_cast<std::vector<SimpleMem::Request*>*>(resp_wrapper->get_custom_data());
-
-    SimpleMem::Request* resp = resps->front();
-
-    spdlog::debug("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \
-            _core_cycle, _core_cycle - resp->request_time, resp->getAddress(), int(resp_wrapper->get_access_type()), int(resp_wrapper->get_type()), _config.dram_req_size, nr_request);
-
-    resp->setReply();
-    stonneCores.at(resp->stonneId)->pushResponse(resp);
-    resps->erase(resps->begin());
-    if (resps->empty()) {
-      delete resps;
+    auto* callbacks = static_cast<std::vector<std::function<void()>>*>(resp_wrapper->get_custom_data());
+    if (callbacks->empty()) {
+      delete callbacks;
       delete resp_wrapper;
       _response_queue.pop();
+    } else {
+      (*callbacks).at(0)();
+      callbacks->erase(callbacks->begin());
     }
     if (nr_request++ > w_port_nr)
       break;
   }
-  if(_config.core_print_interval && _core_cycle % _config.core_print_interval == 0) {
+
+  /* Check print stat */
+  if(_config.core_print_interval && _core_cycle % _config.core_print_interval == 0)
     print_current_stats();
-  }
 }
 
 bool SparseCore::has_memory_request() {
@@ -250,6 +364,36 @@ std::shared_ptr<Tile> SparseCore::pop_finished_tile() {
   return result;
 }
 
+void SparseCore::finish_instruction(std::shared_ptr<Instruction>& inst) {
+  if (inst->finished) {
+    spdlog::error("[Core {}][{}] {} FINISHED, inst already finished!!", _id, _core_cycle,
+                  opcode_to_string(inst->get_opcode()));
+    exit(EXIT_FAILURE);
+  }
+  inst->finish_instruction();
+  static_cast<Tile*>(inst->get_owner())->inc_finished_inst();
+  if (inst->get_opcode() == Opcode::COMP) {
+    spdlog::trace("[StonneCore {}][{}] {} FINISHED",
+      _id, _core_cycle, opcode_to_string(inst->get_opcode()));
+  } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) {
+    spdlog::trace("[StonneCore {}][{}] {} FINISHED, free_sram_size: {}", _id, _core_cycle,
+      opcode_to_string(inst->get_opcode()), inst->get_free_sram_size());
+  }
+}
+
+void SparseCore::registerMemfetch(const std::tuple<uint64_t, mem_access_type, mf_type>& key, std::function<void()> callback) {
+  if (request_merge_table.find(key) == request_merge_table.end()) {
+    mem_fetch* req_wrapper = new mem_fetch(std::get<0>(key), std::get<1>(key), std::get<2>(key), _config.dram_req_size, -1);
+
+    auto* callbacks = new std::vector<std::function<void()>>();
+    req_wrapper->set_custom_data(static_cast<void*>(callbacks));
+    request_merge_table[key] = req_wrapper;
+  }
+  mem_fetch* req_wrapper = request_merge_table[key];
+  auto* callbacks = static_cast<std::vector<std::function<void()>>*>(req_wrapper->get_custom_data());
+  callbacks->push_back(callback);
+}
+
 void SparseCore::dumpTrace(int stonne_core_id, const std::string& path) {
   std::ofstream outFile(path);
   if (!outFile) {

From 454dafc99d30be1490a6e31e1af16435616eec39 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 7 Mar 2025 04:31:55 +0000
Subject: [PATCH 192/432] [Backend/stonne] Polish stat feature for stonne core

---
 PyTorchSimBackend/extern/stonneCore    |  2 +-
 PyTorchSimBackend/include/SparseCore.h |  4 +-
 PyTorchSimBackend/include/Tile.h       |  2 +-
 PyTorchSimBackend/src/Core.cc          |  6 +-
 PyTorchSimBackend/src/SparseCore.cc    | 92 ++++++++++++++++----------
 tests/test_stonne.py                   | 50 ++++++++++++++
 6 files changed, 113 insertions(+), 43 deletions(-)
 create mode 100644 tests/test_stonne.py

diff --git a/PyTorchSimBackend/extern/stonneCore b/PyTorchSimBackend/extern/stonneCore
index b7475e0e..97804185 160000
--- a/PyTorchSimBackend/extern/stonneCore
+++ b/PyTorchSimBackend/extern/stonneCore
@@ -1 +1 @@
-Subproject commit b7475e0eb98ad1d116e4ba8ebc1807b94961b30c
+Subproject commit 97804185f00e98e56f74638e4282b9aecab8cfce
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index 0eff4c7d..55b19da3 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -77,11 +77,13 @@ class SparseCore : public Core {
   std::vector<std::vector<TraceNode>> traceNodeList;
   std::vector<std::set<uint64_t>> traceLoadTraffic; // To trace dma traffic
   std::vector<std::set<uint64_t>> traceStoreTraffic; // To trace dma traffic
-  std::vector<SST_STONNE::sstStonne*> stonneCores;
   std::vector<std::vector<std::shared_ptr<Tile>>> percore_tiles;
+  std::vector<SST_STONNE::sstStonne*> stonneCores;
   /* Interconnect queue */
   std::queue<mem_fetch*> _request_queue;
   std::queue<mem_fetch*> _response_queue;
   std::map<std::tuple<uint64_t, mem_access_type, mf_type>, mem_fetch*> request_merge_table;
+  std::vector<MSwitchStats> percore_stat;
+  std::vector<MSwitchStats> percore_total_stat;
 };
 
diff --git a/PyTorchSimBackend/include/Tile.h b/PyTorchSimBackend/include/Tile.h
index d86e62fb..8db245be 100644
--- a/PyTorchSimBackend/include/Tile.h
+++ b/PyTorchSimBackend/include/Tile.h
@@ -54,7 +54,7 @@ class Tile {
   size_t _nr_finished_insts = 0;
   std::deque<std::shared_ptr<Instruction>> _instructions;
   std::vector<std::shared_ptr<Tile>> _child_tiles;
-  void *_custom_data;
+  void *_custom_data=NULL;
   bool _stonne_tile=false;
 };
 
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 3ad96f65..7596c787 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -422,20 +422,18 @@ void Core::print_stats() {
 void Core::print_current_stats() {
   std::vector<float> sa_utilization;
   for (int i=0; i<_num_systolic_array_per_core; i++)
-    sa_utilization.push_back(static_cast<float>(_stat_sa_compute_cycle.at(i) * 100) / _core_cycle);
+    sa_utilization.push_back(static_cast<float>(_stat_sa_compute_cycle.at(i) * 100) / _config.core_print_interval);
   auto level = spdlog::level::info;
   if(_id != 0)
     level = spdlog::level::debug;
 
   spdlog::info("========= Core stat =========");
-  for (int i=0; i<_num_systolic_array_per_core; i++)
-    sa_utilization.push_back(static_cast<float>(_stat_sa_compute_cycle.at(i) * 100) / _core_cycle);
   for (int i=0; i<_num_systolic_array_per_core; i++)
     spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i),
       _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i));
   spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tma_cycle, _stat_tma_idle_cycle);
   spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id,
-    static_cast<float>(_stat_vu_compute_cycle * 100) / _core_cycle, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle);
+    static_cast<float>(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle);
   spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle);
   update_stats();
 }
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 17b3f9c4..5584cad0 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -12,6 +12,8 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
   percore_tiles.resize(nr_cores);
   stonneCores.resize(nr_cores);
   traceMode.resize(nr_cores);
+  percore_stat.resize(nr_cores);
+  percore_total_stat.resize(nr_cores);
   for (int i=0; i<nr_cores; i++) {
     SST_STONNE::sstStonne* core = new SST_STONNE::sstStonne(config.stonne_config_path);
     stonneCores.at(i) = core;
@@ -20,6 +22,8 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config)
     traceCoreStatus.at(i) = 0;
     traceCoreCycle.at(i) = 0;
     percore_tiles.at(i) = std::vector<std::shared_ptr<Tile>>();
+    percore_stat.at(i).reset();
+    percore_total_stat.at(i).reset();
   }
 
   Config stonneConfig = stonneCores.at(0)->getStonneConfig();
@@ -70,7 +74,6 @@ void SparseCore::issue(std::shared_ptr<Tile> tile) {
   stonneCores.at(selected_core_idx)->init(1);
   traceNodeList.at(selected_core_idx).clear();
 
-  spdlog::info("[StonneCore {}][{}] issued new tile", _id, selected_core_idx);
   SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(tile->get_custom_data());
   bool is_trace_mode = true;
   if (opDesc) {
@@ -81,6 +84,7 @@ void SparseCore::issue(std::shared_ptr<Tile> tile) {
   setTraceMode(selected_core_idx, is_trace_mode);
   percore_tiles.at(selected_core_idx).push_back(tile);
   coreBusy.at(selected_core_idx) = true;
+  spdlog::info("[StonneCore {}][{}] issued new tile (trace_mode: {})", _id, selected_core_idx, is_trace_mode);
 };
 
 bool SparseCore::can_issue(const std::shared_ptr<Tile>& op) {
@@ -104,8 +108,8 @@ void SparseCore::checkStatus(uint32_t subcore_id) {
       load_node.setAddress(traceLoadTraffic.at(subcore_id));
       traceNodeList.at(subcore_id).push_back(load_node);
     }
-    if ((compute_cycle - traceCoreCycle.at(subcore_id))/num_ms) {
-      TraceNode compute_node = TraceNode(traceNodeList.at(subcore_id).size()+2, "compute", TraceNode::StonneTraceCompute, (compute_cycle - traceCoreCycle.at(subcore_id))/num_ms);
+    if (_core_cycle - traceCoreCycle.at(subcore_id)) {//((compute_cycle - traceCoreCycle.at(subcore_id))/num_ms) {
+      TraceNode compute_node = TraceNode(traceNodeList.at(subcore_id).size()+2, "compute", TraceNode::StonneTraceCompute, _core_cycle - traceCoreCycle.at(subcore_id));
       traceNodeList.at(subcore_id).push_back(compute_node);
     }
     if (traceStoreTraffic.at(subcore_id).size()) {
@@ -115,7 +119,7 @@ void SparseCore::checkStatus(uint32_t subcore_id) {
     }
 
     traceCoreStatus.at(subcore_id) = new_status;
-    traceCoreCycle.at(subcore_id) = compute_cycle;
+    traceCoreCycle.at(subcore_id) = _core_cycle;
     traceLoadTraffic.at(subcore_id).clear();
     traceStoreTraffic.at(subcore_id).clear();
   }
@@ -188,9 +192,12 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
 
     /* Check finished computation */
     auto& target_pipeline = get_compute_pipeline(0);
-    if (!target_pipeline.empty() && target_pipeline.front()->finish_cycle <= _core_cycle) {
-      finish_instruction(target_pipeline.front());
-      target_pipeline.pop();
+    if (!target_pipeline.empty()) {
+      if (target_pipeline.front()->finish_cycle <= _core_cycle) {
+        finish_instruction(target_pipeline.front());
+        target_pipeline.pop();
+      }
+      percore_stat.at(subcore_id).n_multiplications += num_ms;
     }
 
     /* Check finished dma operation */
@@ -220,7 +227,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
         {
           auto acc_type = mem_access_type::GLOBAL_ACC_R;
           auto type = mf_type::READ_REQUEST;
-          spdlog::info("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle,
+          spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle,
                         opcode_to_string(inst->get_opcode()));
           for (auto addr : inst->get_trace_address()) {
             inst->inc_waiting_request();
@@ -240,7 +247,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
         {
           auto acc_type = mem_access_type::GLOBAL_ACC_W;
           auto type = mf_type::WRITE_REQUEST;
-          spdlog::info("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle,
+          spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle,
                         opcode_to_string(inst->get_opcode()));
           for (auto addr : inst->get_trace_address()) {
             inst->inc_waiting_request();
@@ -264,7 +271,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
             inst->finish_cycle = _core_cycle + inst->get_compute_cycle();
           else
             inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle();
-          spdlog::info("[Core {}][{}][{}] {} ISSUED, finsh at {}", _id, subcore_id, _core_cycle,
+          spdlog::trace("[Core {}][{}][{}] {} ISSUED, finsh at {}", _id, subcore_id, _core_cycle,
                           opcode_to_string(inst->get_opcode()), inst->finish_cycle);
           target_pipeline.push(inst);
           issued = true;
@@ -293,9 +300,9 @@ void SparseCore::cycle() {
     for (auto& req_pair : request_merge_table) {
       _request_queue.push(req_pair.second);
       request_merge_table.erase(req_pair.first);
-
       spdlog::debug("[SparseCore][{}][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \
-              _core_cycle, _id, req_pair.second->get_addr(), int(req_pair.second->get_access_type()), int(req_pair.second->get_type()), _config.dram_req_size, nr_request);
+              _core_cycle, _id, req_pair.second->get_addr(), int(req_pair.second->get_access_type()), int(req_pair.second->get_type()),
+              _config.dram_req_size, nr_request);
       nr_request++;
       break;
     }
@@ -306,15 +313,13 @@ void SparseCore::cycle() {
   while (!_response_queue.empty()) {
     mem_fetch* resp_wrapper = _response_queue.front();
     auto* callbacks = static_cast<std::vector<std::function<void()>>*>(resp_wrapper->get_custom_data());
-    if (callbacks->empty()) {
-      delete callbacks;
-      delete resp_wrapper;
-      _response_queue.pop();
-    } else {
-      (*callbacks).at(0)();
-      callbacks->erase(callbacks->begin());
+    for (int i=0; i<callbacks->size(); i++) {
+      (*callbacks).at(i)();
     }
-    if (nr_request++ > w_port_nr)
+    delete callbacks;
+    delete resp_wrapper;
+    _response_queue.pop();
+    if (++nr_request > w_port_nr)
       break;
   }
 
@@ -335,24 +340,39 @@ void SparseCore::push_memory_response(mem_fetch* response) {
   _response_queue.push(response);
 }
 
-void SparseCore::print_stats() {
-  //for (auto stonneCore : stonneCores)
-  //  stonneCore->printStats();
-  MSwitchStats accum;
+void SparseCore::print_current_stats() {
   spdlog::info("========= Sparse Core stat =========");
-  spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle);
   for (size_t i = 0; i < stonneCores.size(); ++i) {
-    MSwitchStats stats = stonneCores.at(i)->getMSStats();
-    accum += stats;
-    spdlog::info("Stonne Core [{}][{}] : n_multiplications: {} ",
-                 _id, i, stats.n_multiplications);
+    if (!isTraceMode(i)) {
+      MSwitchStats stats = stonneCores.at(i)->getMSStats();
+      stats -= percore_total_stat.at(i);
+      percore_stat.at(i) = stats;
+      percore_total_stat.at(i) = stonneCores.at(i)->getMSStats();
+    } else {
+      percore_total_stat.at(i) += percore_stat.at(i);
+    }
+    cycle_type nr_mul = percore_stat.at(i).n_multiplications;
+    percore_stat.at(i).reset();
+    spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul);
   }
-  spdlog::info("Stonne Core [{}] : total_multiplications: {} ",
-                 _id, accum.n_multiplications);
+  spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle);
 }
 
-void SparseCore::print_current_stats() {
-  print_stats();
+void SparseCore::print_stats() {
+  spdlog::info("========= Sparse Core stat =========");
+  for (size_t i = 0; i < stonneCores.size(); ++i) {
+    if (!isTraceMode(i)) {
+      MSwitchStats stats = stonneCores.at(i)->getMSStats();
+      stats -= percore_total_stat.at(i);
+      percore_stat.at(i) = stats;
+      percore_total_stat.at(i) = stats;
+    } else {
+      percore_total_stat.at(i) += percore_stat.at(i);
+    }
+    cycle_type nr_mul = percore_total_stat.at(i).n_multiplications;
+    spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul);
+  }
+  spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle);
 }
 
 std::shared_ptr<Tile> SparseCore::pop_finished_tile() {
@@ -373,10 +393,10 @@ void SparseCore::finish_instruction(std::shared_ptr<Instruction>& inst) {
   inst->finish_instruction();
   static_cast<Tile*>(inst->get_owner())->inc_finished_inst();
   if (inst->get_opcode() == Opcode::COMP) {
-    spdlog::trace("[StonneCore {}][{}] {} FINISHED",
+    spdlog::info("[StonneCore {}][{}] {} FINISHED",
       _id, _core_cycle, opcode_to_string(inst->get_opcode()));
   } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) {
-    spdlog::trace("[StonneCore {}][{}] {} FINISHED, free_sram_size: {}", _id, _core_cycle,
+    spdlog::info("[StonneCore {}][{}] {} FINISHED, free_sram_size: {}", _id, _core_cycle,
       opcode_to_string(inst->get_opcode()), inst->get_free_sram_size());
   }
 }
@@ -416,7 +436,7 @@ void SparseCore::dumpTrace(int stonne_core_id, const std::string& path) {
           << "    \"children\": [2],\n"
           << "    \"loop_index\": \"loop_arg000\",\n"
           << "    \"loop_start\": 0,\n"
-          << "    \"loop_end\": 8,\n"
+          << "    \"loop_end\": 1,\n"
           << "    \"loop_step\": 1,\n"
           << "    \"loop_type\": \"outer_loop\""
           << "  },\n";
diff --git a/tests/test_stonne.py b/tests/test_stonne.py
new file mode 100644
index 00000000..ae53c1b2
--- /dev/null
+++ b/tests/test_stonne.py
@@ -0,0 +1,50 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+import random
+import numpy as np
+
+random.seed(0)
+np.random.seed(0)
+torch.manual_seed(0)
+
+def apply_pruning(tensor, sparsity):
+    mask = torch.rand_like(tensor) >= sparsity
+    tensor *= mask
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def sparse_matmul(a, b):
+    return torch.sparse.mm(a, b)
+
+def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128):
+    torch.manual_seed(0)
+    input = torch.randn(input_size, hidden_size)
+    weight = torch.randn(hidden_size, output_size)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(sparse_matmul)
+    res = opt_fn(x1, w1)
+    cpu_res = sparse_matmul(input.cpu(), weight.cpu())
+    test_result("spmm", res, cpu_res)
+ 
+ 
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim'))
+ 
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_sparse_mm(device, 512,512,512)#64, 64, 64)
+    # test_sparse_mm("cpu", 128, 64, 32)
\ No newline at end of file

From 74dcf76db78c8d798d12df0bea7ee2c2f37eaac0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 7 Mar 2025 11:37:27 +0000
Subject: [PATCH 193/432] [script] move script

---
 scripts/batch_experiment/avg.py               | 22 ++++++
 scripts/batch_experiment/batch_time.py        | 37 ++++++++++
 scripts/batch_experiment/parse.py             | 35 +++++++++
 .../sparsity_experiment}/run.sh               |  0
 .../stonne_experiment}/run.sh                 |  0
 sparsity/parse.py                             | 74 -------------------
 6 files changed, 94 insertions(+), 74 deletions(-)
 create mode 100644 scripts/batch_experiment/avg.py
 create mode 100644 scripts/batch_experiment/batch_time.py
 create mode 100644 scripts/batch_experiment/parse.py
 rename {sparsity => scripts/sparsity_experiment}/run.sh (100%)
 rename {stonne_experiment => scripts/stonne_experiment}/run.sh (100%)
 delete mode 100644 sparsity/parse.py

diff --git a/scripts/batch_experiment/avg.py b/scripts/batch_experiment/avg.py
new file mode 100644
index 00000000..b91287b6
--- /dev/null
+++ b/scripts/batch_experiment/avg.py
@@ -0,0 +1,22 @@
+import re
+import sys
+
+def parse_log_file(file_path, interval):
+    with open(file_path, "r") as file:
+        index = 0
+        for line in file:
+            if index % interval != 0:
+                index += 1
+                continue
+            index += 1
+            print(line.strip())
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Wrong input")
+        sys.exit(1)
+    
+    log_file = sys.argv[1]
+    interval = int(sys.argv[2])
+    parse_log_file(log_file, interval)
+
diff --git a/scripts/batch_experiment/batch_time.py b/scripts/batch_experiment/batch_time.py
new file mode 100644
index 00000000..9f8778d7
--- /dev/null
+++ b/scripts/batch_experiment/batch_time.py
@@ -0,0 +1,37 @@
+import re
+import sys
+
+def time_to_milliseconds(timestamp):
+    match = re.match(r"\[(\d{4}-\d{2}-\d{2}) (\d{2}):(\d{2}):(\d{2})\.(\d{3})\]", timestamp)
+    if not match:
+        return None
+
+    _, hh, mm, ss, ms = match.groups()
+
+    total_ms = (int(hh) * 3600 + int(mm) * 60 + int(ss)) * 1000 + int(ms)
+    return total_ms
+
+def parse_log_file(file_path):
+    with open(file_path, "r") as file:
+        counter = 0
+        for line in file:
+            if "batch size" in line:
+                print(line.strip())
+                counter = 40  # opens a window: the next 39 lines are scanned for timestamps
+                continue
+            counter -= 1
+            if (counter > 0):
+                time_match = re.search(r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\]", line)
+                if time_match:
+                    timestamp = time_match.group(0)  # "[YYYY-MM-DD HH:MM:SS.sss]" format
+                    time_ms = time_to_milliseconds(timestamp)
+                    print(time_ms)
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Wrong input")
+        sys.exit(1)
+    
+    log_file = sys.argv[1]
+    parse_log_file(log_file)
+
diff --git a/scripts/batch_experiment/parse.py b/scripts/batch_experiment/parse.py
new file mode 100644
index 00000000..dd3e504f
--- /dev/null
+++ b/scripts/batch_experiment/parse.py
@@ -0,0 +1,35 @@
+import re
+import sys
+
+def time_to_milliseconds(timestamp):
+    match = re.match(r"\[(\d{4}-\d{2}-\d{2}) (\d{2}):(\d{2}):(\d{2})\.(\d{3})\]", timestamp)
+    if not match:
+        return None
+
+    _, hh, mm, ss, ms = match.groups()
+
+    total_ms = (int(hh) * 3600 + int(mm) * 60 + int(ss)) * 1000 + int(ms)
+    return total_ms
+
+def parse_log_file(file_path):
+    with open(file_path, "r") as file:
+        for line in file:
+            time_match = re.search(r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\]", line)
+            # Extract the cycle value (e.g. Total cycle 43858000)
+            cycle_match = re.search(r"\[0\] : Total cycle (\d+)", line)
+            
+            if time_match and cycle_match:
+                timestamp = time_match.group(0)  # "[YYYY-MM-DD HH:MM:SS.sss]" format
+                cycle = cycle_match.group(1)  # Cycle value
+                
+                time_ms = time_to_milliseconds(timestamp)
+                print(time_ms, cycle)
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Wrong input")
+        sys.exit(1)
+    
+    log_file = sys.argv[1]
+    parse_log_file(log_file)
+
diff --git a/sparsity/run.sh b/scripts/sparsity_experiment/run.sh
similarity index 100%
rename from sparsity/run.sh
rename to scripts/sparsity_experiment/run.sh
diff --git a/stonne_experiment/run.sh b/scripts/stonne_experiment/run.sh
similarity index 100%
rename from stonne_experiment/run.sh
rename to scripts/stonne_experiment/run.sh
diff --git a/sparsity/parse.py b/sparsity/parse.py
deleted file mode 100644
index 7b15e156..00000000
--- a/sparsity/parse.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import argparse
-import os
-import subprocess
-
-def get_stored_paths(log_file):
-    """Extracts stored file paths from the given log file."""
-    stored_paths = []
-    try:
-        result = subprocess.run(["grep", "stored", log_file], capture_output=True, text=True)
-        for line in result.stdout.splitlines():
-            parts = line.split(" ")
-            if "stored" in parts:
-                index = parts.index("stored")
-                if index + 1 < len(parts):
-                    stored_paths.append(parts[index + 2].strip('"'))
-    except Exception as e:
-        print(f"Error reading stored paths: {e}")
-    return stored_paths
-
-def get_last_total_cycle(file_path):
-    """Extracts the last Total cycle value from the given file."""
-    total_cycle = None
-    try:
-        result = subprocess.run(["grep", "Total cycle", file_path], capture_output=True, text=True)
-        lines = result.stdout.splitlines()
-        if lines:
-            last_line = lines[-1]
-            total_cycle = last_line.split()[-1]  # Extract the last value
-    except Exception as e:
-        print(f"Error reading total cycle from {file_path}: {e}")
-    return total_cycle
-
-def main(log_file):
-    stored_paths = get_stored_paths(log_file)
-    k = []
-    for path in stored_paths:
-        print(path)
-        if os.path.exists(path):
-            total_cycle = get_last_total_cycle(path)
-            if total_cycle:
-                k.append(total_cycle)
-            else:
-                print(f"{path}: No Total cycle found")
-        else:
-            print(f"{path}: File does not exist")
-    return k
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Extract Total Cycle from stored paths.")
-    parser.add_argument("log_file", type=str, help="Path to the log file containing stored paths")
-    args = parser.parse_args()
-    a_l = []
-    b_l = []
-    if os.path.exists(args.log_file):
-        a, b = main(args.log_file + "/0.0")
-        a_l.append(a)
-        b_l.append(b)
-        a, b = main(args.log_file + "/0.2")
-        a_l.append(a)
-        b_l.append(b)
-        a, b = main(args.log_file + "/0.4")
-        a_l.append(a)
-        b_l.append(b)
-        a, b = main(args.log_file + "/0.6")
-        a_l.append(a)
-        b_l.append(b)
-        a, b = main(args.log_file + "/0.8")
-        a_l.append(a)
-        b_l.append(b)
-        print(" ".join(a_l))
-        print(" ".join(b_l))
- 
-    else:
-        print(f"Log file {args.log_file} not found.")

From c0cfb9673de011ac035b1bc4d8baef1f1014acd6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 10 Mar 2025 04:10:58 +0000
Subject: [PATCH 194/432] [Script] cleanup + stonne trace mode experiment
 script

---
 .../configs/stonne_single_c1_simple_noc.json  | 31 ++++++++
 PyTorchSimFrontend/mlir/mlir_lowering.py      |  1 -
 scripts/sparsity_experiment/parse.py          | 74 +++++++++++++++++++
 scripts/stonne_experiment/run.sh              |  6 +-
 scripts/stonne_experiment/run_trace.sh        | 18 +++++
 tests/test_stonne.py                          | 16 ++--
 6 files changed, 137 insertions(+), 9 deletions(-)
 create mode 100644 PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json
 create mode 100644 scripts/sparsity_experiment/parse.py
 create mode 100755 scripts/stonne_experiment/run_trace.sh

diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json
new file mode 100644
index 00000000..2bf376c5
--- /dev/null
+++ b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json
@@ -0,0 +1,31 @@
+{
+  "core_type" : ["stonne"],
+  "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
+  "num_cores" : 1,
+  "core_freq" : 700,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+  "num_stonne_per_core" : 1,
+  "num_stonne_port" : 8,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" : 700,
+  "dram_channels": 8,
+  "dram_req_size": 32,
+  "dram_latency" : 10,
+  "dram_nbl" : 1,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 7000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt",
+
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 1,
+  "partition": {
+    "core_0":0
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index bc735df8..465f35ca 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -145,7 +145,6 @@ def custom_maxpool(
     return mlir_template.generate().output_node(), x # FIXME: x is dummy IRNode, indices are not used in our case
 
 def sparse_addmm(*args, **kwargs):
-    print("Custom sparse addmm")
     _, sp_mat1, sp_mat2 = args
     mat1_layout = sp_mat1.layout
     out_range = args[0].data.data.data.ranges
diff --git a/scripts/sparsity_experiment/parse.py b/scripts/sparsity_experiment/parse.py
new file mode 100644
index 00000000..7b15e156
--- /dev/null
+++ b/scripts/sparsity_experiment/parse.py
@@ -0,0 +1,74 @@
+import argparse
+import os
+import subprocess
+
+def get_stored_paths(log_file):
+    """Return the tokens found two words after 'stored' (quotes stripped) in the lines of log_file that grep matches."""
+    stored_paths = []
+    try:
+        result = subprocess.run(["grep", "stored", log_file], capture_output=True, text=True)
+        for line in result.stdout.splitlines():
+            parts = line.split(" ")
+            if "stored" in parts:
+                index = parts.index("stored")
+                if index + 2 < len(parts):  # guard the index actually read below; a truncated line is skipped
+                    stored_paths.append(parts[index + 2].strip('"'))
+    except Exception as e:
+        print(f"Error reading stored paths: {e}")
+    return stored_paths
+
+def get_last_total_cycle(file_path):
+    """Extracts the last Total cycle value from the given file."""
+    total_cycle = None
+    try:
+        result = subprocess.run(["grep", "Total cycle", file_path], capture_output=True, text=True)
+        lines = result.stdout.splitlines()
+        if lines:
+            last_line = lines[-1]
+            total_cycle = last_line.split()[-1]  # Extract the last value
+    except Exception as e:
+        print(f"Error reading total cycle from {file_path}: {e}")
+    return total_cycle
+
+def main(log_file):
+    stored_paths = get_stored_paths(log_file)
+    k = []
+    for path in stored_paths:
+        print(path)
+        if os.path.exists(path):
+            total_cycle = get_last_total_cycle(path)
+            if total_cycle:
+                k.append(total_cycle)
+            else:
+                print(f"{path}: No Total cycle found")
+        else:
+            print(f"{path}: File does not exist")
+    return k
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Extract Total Cycle from stored paths.")
+    parser.add_argument("log_file", type=str, help="Path to the log file containing stored paths")
+    args = parser.parse_args()
+    a_l = []
+    b_l = []
+    if os.path.exists(args.log_file):
+        a, b = main(args.log_file + "/0.0")
+        a_l.append(a)
+        b_l.append(b)
+        a, b = main(args.log_file + "/0.2")
+        a_l.append(a)
+        b_l.append(b)
+        a, b = main(args.log_file + "/0.4")
+        a_l.append(a)
+        b_l.append(b)
+        a, b = main(args.log_file + "/0.6")
+        a_l.append(a)
+        b_l.append(b)
+        a, b = main(args.log_file + "/0.8")
+        a_l.append(a)
+        b_l.append(b)
+        print(" ".join(a_l))
+        print(" ".join(b_l))
+ 
+    else:
+        print(f"Log file {args.log_file} not found.")
diff --git a/scripts/stonne_experiment/run.sh b/scripts/stonne_experiment/run.sh
index 3586c670..f456658b 100755
--- a/scripts/stonne_experiment/run.sh
+++ b/scripts/stonne_experiment/run.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-python3 ../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.json --mode 0 > hetero/big_sparse.log
-python3 ../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv2_half.json --mode 1 > hetero/big.log
-python3 ../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.json --mode 2 > hetero/hetero.log
+python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.json --mode 0 > hetero/big_sparse.log
+python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv2_half.json --mode 1 > hetero/big.log
+python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.json --mode 2 > hetero/hetero.log
 
 echo "All processes completed!"
diff --git a/scripts/stonne_experiment/run_trace.sh b/scripts/stonne_experiment/run_trace.sh
new file mode 100755
index 00000000..5a4ff890
--- /dev/null
+++ b/scripts/stonne_experiment/run_trace.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+SCRIPT="/workspace/PyTorchSim/tests/test_stonne.py"
+
+SIZES=(32 64 128)
+SPARSITIES=(0.0 0.2 0.4 0.6 0.8)
+
+for sz in "${SIZES[@]}"; do
+    for sparsity in "${SPARSITIES[@]}"; do
+        FILE_PATH=$(python "$SCRIPT" "$sz" "$sparsity" | grep -oP '(?<=stored to ")[^"]+')
+        TOTAL_CYCLE=$(grep -oP '\[.*?\] \[info\] Stonne Core \[0\] : Total cycle \K\d+' "$FILE_PATH" | tail -n 1)
+        echo "Stonne $sz $sparsity $TOTAL_CYCLE"
+
+        FILE_PATH=$(python "$SCRIPT" "$sz" "$sparsity" | grep -oP '(?<=stored to ")[^"]+')
+        TOTAL_CYCLE=$(grep -oP '\[.*?\] \[info\] Stonne Core \[0\] : Total cycle \K\d+' "$FILE_PATH" | tail -n 1)
+        echo "TOG $sz $sparsity $TOTAL_CYCLE"
+    done
+done
\ No newline at end of file
diff --git a/tests/test_stonne.py b/tests/test_stonne.py
index ae53c1b2..f82f833b 100644
--- a/tests/test_stonne.py
+++ b/tests/test_stonne.py
@@ -3,6 +3,7 @@
 import torch.utils.cpp_extension
 import random
 import numpy as np
+import argparse
 
 random.seed(0)
 np.random.seed(0)
@@ -26,25 +27,30 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
 def sparse_matmul(a, b):
     return torch.sparse.mm(a, b)
 
-def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128):
-    torch.manual_seed(0)
+def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128, sparsity=0.0):
     input = torch.randn(input_size, hidden_size)
     weight = torch.randn(hidden_size, output_size)
+    apply_pruning(input, sparsity)
+    apply_pruning(weight, sparsity)
     x1 = input.to(device=device)
     w1 = weight.to(device=device)
     opt_fn = torch.compile(dynamic=False)(sparse_matmul)
     res = opt_fn(x1, w1)
     cpu_res = sparse_matmul(input.cpu(), weight.cpu())
-    test_result("spmm", res, cpu_res)
+    #test_result("spmm", res, cpu_res)
  
  
 if __name__ == "__main__":
     import os
     import sys
+    parser = argparse.ArgumentParser(description="stonne test")
+    parser.add_argument("sz", nargs="?", type=int, help="size", default=64)
+    parser.add_argument("sparsity", nargs="?", type=float, help="%% of zero", default=0.0)
+
+    args = parser.parse_args()
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim'))
  
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_sparse_mm(device, 512,512,512)#64, 64, 64)
-    # test_sparse_mm("cpu", 128, 64, 32)
\ No newline at end of file
+    test_sparse_mm(device, args.sz, args.sz, args.sz, args.sparsity)
\ No newline at end of file

From 1c83e5326dfef8f0527b031d5a074a1b0f17a7f2 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 12 Mar 2025 06:37:02 +0000
Subject: [PATCH 195/432] [Frontend] Implement Index Expr

---
 PyTorchSimFrontend/extension_codecache.py     |  2 +
 .../mlir/mlir_codegen_backend.py              | 92 ++++++++++---------
 PyTorchSimFrontend/mlir/mlir_common.py        |  6 +-
 PyTorchSimFrontend/mlir/mlir_lowering.py      |  3 +-
 tests/test_pool.py                            | 11 +--
 5 files changed, 61 insertions(+), 53 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 437f2a01..fd7c01df 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -50,6 +50,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding \
             -dma-fine-grained='systolic-array-size={vectorlane_size}' \
+            -global-idx='vlen={vlen}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size}" \
             -convert-linalg-to-loops \
@@ -84,6 +85,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding='timing_mode=1' \
             -dma-fine-grained='systolic-array-size={vectorlane_size}' \
+            -global-idx='vlen={vlen}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-tile-operation-graph='vectorlane={vectorlane_size}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 2b70f4a0..cae4df0f 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -252,9 +252,11 @@ def constant(value, src_type, *args, var_info=None, **kwargs):
         # if value represented by e notation, convert to float (ex 1e-3 -> 1.0e-3)
         if "e" in str(value):
             value = float(value)
-        if src_type[0] == "f":
+        if value == float("-inf"):
+            value = "0xFF800000"
+        elif src_type[0] == "f":
             value = format(value, ".20f")
-        if src_type[0] == "i":
+        elif src_type[0] == "i":
             value = int(value)
         return f'arith.constant {value} : {src_type}', [1, src_type]
 
@@ -585,64 +587,40 @@ def where(condition, operand1, operand2, *args, var_info=None, **kwargs):
     @staticmethod
     def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs):
         result = body()
-        val = ops.constant(0.0, "f32", *args, **kwargs)
+        val = ops.constant(other, dtype, *args, **kwargs)
         result = ops.where(mask, result, val)
         return result, var_info[result]
 
     @staticmethod
-    def _index_expr(tile_size, buffer, renamed_expression, vec_size, *args, var_info=None, **kwargs):
-        strides = [1] * len(tile_size)
-        for i in range(len(tile_size) - 2, -1, -1):
-            strides[i] = strides[i + 1] * tile_size[i + 1]
-
-        linear_expression = []
-        for i, stride in enumerate(strides):
-            linear_expression.append(f"d{i}*{stride}")
+    def _index_expr(tile_size, buffer, renamed_expression, index, *args, var_info=None, **kwargs):
+        str_tile_size = [str(dim) for dim in tile_size]
+        shape = "x".join(str_tile_size)
 
         dim = ["%d"+str(i) for i in range(len(tile_size))]
         sym_dim = ["d"+str(i) for i in range(len(tile_size))]
         start_dim = [str(0) for i in tile_size]
         end_dim = [str(i) for i in tile_size]
+        indices = [str(i) for i in index.free_symbols]
 
         affine_map_str = "(" + ", ".join(sym_dim) + ") -> ("
         affine_map_str += sympy.printing.ccode(renamed_expression) + ")"
-
-        affine_map_str2 = "(" + ", ".join(sym_dim) + ") -> ("
-        affine_map_str2 += "+".join(linear_expression) + ")"
-
-        apply_map_var = f"%index_var = affine.apply affine_map<{affine_map_str}>({', '.join(dim)})\n"
-        linear_index_var = f"%buffer_index_var = affine.apply affine_map<{affine_map_str2}>({', '.join(dim)})\n"
+        affine_offset_map = "(d0, d1) -> (d0 + d1)"
+        affine_offset_var = ""
+        offset_vars = dim.copy()
+        for idx in indices:
+            i = int(idx[5:])
+            affine_offset_var += f"%offset{i} = affine.apply affine_map<{affine_offset_map}>(%{idx}, {dim[i]})\n"
+            offset_vars[i] = f"%offset{i}"
+
+        apply_map_var = f"%index_var = affine.apply affine_map<{affine_map_str}>({', '.join(offset_vars)}) {{global_idx=1}}\n"
         broadcast_var = f"%broadcast_var = vector.broadcast %index_var : index to vector<2xindex>\n"
-        affine_store_var = f"affine.vector_store %broadcast_var, %{buffer}[%buffer_index_var] : memref<{vec_size}xindex>, vector<2xindex>\n"
+        broadcast_i64 = f"%broadcast_i64 = arith.index_cast %broadcast_var : vector<2xindex> to vector<2xi64>\n"
+        affine_store_var = f"affine.vector_store %broadcast_i64, %{buffer}[{','.join(dim)}] : memref<{shape}xi64, 1>, vector<2xi64>\n"
 
         result = f"affine.parallel ({','.join(dim)}) = ({','.join(start_dim)}) to ({','.join(end_dim)}) {{\n" + \
-            apply_map_var + linear_index_var + broadcast_var + affine_store_var + f"}}"
+            affine_offset_var + apply_map_var + broadcast_var + broadcast_i64 + affine_store_var + f"}}"
         return result, [None, None]
 
-    @staticmethod
-    def index_expr(operand, *args, var_info=None, tile_desc=None, **kwargs):
-        # Todo. To support index_expr, we have to custom instructions
-        tile_size = tile_desc.get_tile_size()
-        if tile_desc.get_used_vlane() != 1:
-            raise NotImplementedError("Currently index operation is only executable on single vectorlane configuration")
-
-        vec_size = 1
-        for ds in tile_size:
-            vec_size *= ds
-
-        buffer = ops.alloc(vec_size, "index")
-        ret_type = [vec_size, "index"]
-
-        renamed_symbols = {symbol: "d"+str(symbol)[5:] for symbol in operand.free_symbols}
-        renamed_expression = operand.subs(renamed_symbols)
-        if operand not in ExtensionOverrides.index_set:
-            # Register this operand
-            ExtensionOverrides.index_set.add(operand)
-            ops._index_expr(tile_size, buffer, renamed_expression, vec_size)
-
-        result = f"affine.vector_load %{buffer}[0] : memref<{vec_size}xindex>, vector<{vec_size}xindex> // {renamed_expression}"
-        return result, ret_type
-
     @staticmethod
     def index_cast(operand, target_type, *args, var_info=None, **kwrags):
         op_type = var_info[operand]
@@ -735,6 +713,9 @@ def get_padding_type(self):
             for op in ops:
                 if "exp" in op.name: # exponential reduciton case
                     return 1
+        # for op in ops: # TODO: padding has some problem in the case of max_pool
+        #     if "max_pool" in op.args[0].name:
+        #         return 1
         return 0
 
     def convert_index(self, expr, buffer):
@@ -1021,6 +1002,31 @@ def store_reduction(self, name, index, value):
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.reductions_suffix.writeline(common.DeferredLine(name, code))
 
+    def index_expr(self, index, dtype):
+        # Todo. To support index_expr, we have to custom instructions
+        tile_desc = self.kernel_group.tile_desc
+        tile_size = tile_desc.get_tile_size_per_lane()
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        tile_numel_per_lane = tile_desc.get_numel_per_lane()
+        str_tile_size = [str(dim) for dim in tile_size]
+        shape = "x".join(str_tile_size)
+        tile_shape = f"memref<{shape}xi64, 1>"
+
+        # Define scratch pad buffer
+        sram_var, _, _ = self.get_scratchpad_buffer(dtype, "index_buffer", tile_numel_per_lane, tile_shape, self.loads, None, "index_expr") # use same index for reuse spad
+
+        renamed_symbols = {symbol: "d"+str(symbol)[5:] for symbol in index.free_symbols}
+        renamed_expression = index.subs(renamed_symbols)
+        if index not in ExtensionOverrides.index_set:
+            # Register this operand
+            ExtensionOverrides.index_set.add(index)
+            ops._index_expr(tile_size, sram_var, renamed_expression, index)
+
+        line = f"affine.vector_load %{sram_var}[0, 0, 0] : {tile_shape}, vector<{tile_numel_per_lane}x{mlir_dtype}> // {renamed_expression}"
+        out = self.cse.generate(self.compute, line)
+        self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
+        return out
+
     def codegen_global_init(self):
         return self.global_vars
 
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index cd558f1c..1213666a 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -543,7 +543,7 @@ def inner(*args, **kwargs):
                         buf_bounds = self.node_to_bounds.get(
                             fx_node, ValueRanges.unknown()
                         )
-                    code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info, tile_desc=self.kernel_group.tile_desc)
+                    code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info)
                     csevar = self.cse.generate(
                         self.compute,
                         code,
@@ -605,6 +605,10 @@ def store_reduction(name, index, value):
             def reduction(dtype, src_dtype, reduction_type, value):
                 return self.reduction(dtype, src_dtype, reduction_type, value)
 
+            @staticmethod
+            def index_expr(index, dtype):
+                return self.index_expr(index, dtype)
+
             @staticmethod
             def bucketize(
                 values,
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index 465f35ca..8c9c35a7 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -158,5 +158,4 @@ def sparse_addmm(*args, **kwargs):
 lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()})
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
 lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
-lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
-lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # TODO: maxpool shpuld be implemeneted through llir
\ No newline at end of file
+lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
\ No newline at end of file
diff --git a/tests/test_pool.py b/tests/test_pool.py
index 7abcb3e6..e50c700e 100644
--- a/tests/test_pool.py
+++ b/tests/test_pool.py
@@ -13,14 +13,11 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-def test_maxpool(device):
+def test_maxpool(device, b=1, c=64, h=112, w=112):
     torch.manual_seed(0)
-    model = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=1).eval()
+    model = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1).eval()
     model.to(device=device)
-    input = torch.randn(1, 2, 5, 2).to(device=device)
-    input = torch.arange(2*5*2, 0, -1, dtype=torch.float32)
-    input = input.reshape(1,2,5,2)
-    input = input.to(device=device)
+    input = torch.randn(b, c, h, w).to(device=device)
     x1 = input.to(device=device)
     x2 = input.to("cpu")
     opt_fn = torch.compile(dynamic=False)(model)
@@ -49,5 +46,5 @@ def avgpool(a):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_maxpool(device)
+    test_maxpool(device, b=1, c=8, h=16, w=16)
     test_avgpool(device)

From 482a76fdadab9b478e2d797d76c1e43501de71ea Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 10 Mar 2025 06:33:41 +0000
Subject: [PATCH 196/432] [Fix] Force wrapper to use size_t & check SchedulerNode

---
 PyTorchSimFrontend/llvm/llvm_caller_codegen.py |  4 ++--
 PyTorchSimFrontend/mlir/mlir_caller_codegen.py |  4 ++--
 PyTorchSimFrontend/mlir/mlir_scheduling.py     |  2 +-
 PyTorchSimFrontend/mlir/mlir_template.py       | 10 +++++++++-
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
index 4ef03d3f..06c20a45 100644
--- a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
+++ b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
@@ -83,7 +83,7 @@ def generate_args_define(self):
         self.writeline(self.newline)
 
     def generate_load_dump_fn(self):
-        self.writeline(f'{self.newline}int load_arg(void *arg, int size, const char *path) {self.open_bracket}')
+        self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}')
         with self.code.indent():
             self.writeline(f'int fd = open(path, 0x00000000){self.ending}')
             self.writeline(f'if (fd == -1) {self.open_bracket}')
@@ -99,7 +99,7 @@ def generate_load_dump_fn(self):
             self.writeline(f'return 0{self.ending}')
         self.writeline(self.closed_bracket)
 
-        self.writeline(f'{self.newline}int dump_arg(void *arg, int size, const char *path) {self.open_bracket}')
+        self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) {self.open_bracket}')
         with self.code.indent():
             self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}')
             self.writeline(f'if (fd == -1) {self.open_bracket}')
diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
index 12c2cb8a..1b845338 100644
--- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
+++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
@@ -53,7 +53,7 @@ def generate_args_define(self):
         for arg_name, (_, arg_type, arg_size, arg_sizes, arg_stride) in self.arg_attributes:
             if not arg_name in name_set:
                 if self.validation:
-                    self.writeline(f'{DTYPE_TO_C[arg_type]} c_{arg_name}[{arg_size}]{self.ending}')
+                    self.writeline(f'{DTYPE_TO_C[arg_type]} c_{arg_name}[{arg_size}ULL]{self.ending}')
                 else:
                     if torch.is_floating_point(torch.tensor([], dtype=arg_type)):
                         bits = torch.finfo(arg_type).bits
@@ -61,7 +61,7 @@ def generate_args_define(self):
                         bits = 8
                     else:
                         bits = torch.iinfo(arg_type).bits
-                    self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({arg_size * bits // 8}){self.ending}')
+                    self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({arg_size * bits // 8}ULL){self.ending}')
                 name_set.add(arg_name)
         self.writeline(self.newline)
 
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 593257d4..ce68f137 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -4,7 +4,7 @@
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
 from torch._inductor import config
-from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode
+from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode
 from torch._inductor.utils import IndentedBuffer
 from torch._inductor.virtualized import V
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 0b739102..54c634e2 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -19,6 +19,7 @@
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
 from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
+from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode
 
 from . import mlir_common
 
@@ -321,7 +322,10 @@ def def_kernel(
                 self.named_nodes[name] = node
                 self.kernel_group.args.output_buffers[node.get_name()] = name
                 self.store_buffer_names.add(node.get_name())    #TODO: Is this enough not calling store() in mlir_common.py?
-                extra_node[node.get_name()] = node
+                if isinstance(node, SchedulerNode):
+                    extra_node[node.get_name()] = node.node
+                else:
+                    extra_node[node.get_name()] = node
                 self.buffer_names[node.get_name()] = 'Y_buffer'   #TODO: Buffer name fixed
 
         def hook():
@@ -454,6 +458,10 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         if name not in self.buffer_names:
             sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.stores, index_var, index)
             self.buffer_names[name] = sram_var
+        else:
+            zero_cse = self.get_const_cse(0)
+            sram_dims = len(tile_shape.split("x")) - 1
+            sram_index_var = ",".join([f"%{zero_cse}"] * sram_dims)
         sram_var = self.buffer_names[name]
 
         operation = "affine.vector_store" if tile_numel_per_lane > 1 else "affine.store"

From 7f17099eb75dfe0e0ea545b95d08aa8ad9a4ae94 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 11 Mar 2025 05:56:40 +0000
Subject: [PATCH 197/432] [Frontend/Fusion] Use render mechanism to replace
 hook point

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  |  8 +++-
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  3 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  2 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  2 -
 PyTorchSimFrontend/mlir/mlir_template.py      | 42 +++++++++----------
 5 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 64b0c7e1..85efec0a 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -16,6 +16,7 @@
 memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+{{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
   %c_mvin = arith.constant 2 : index
@@ -33,7 +34,7 @@
   %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
   %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
   %c0 = arith.constant 0 : index
-  {{- kernel.def_local_vars() }}
+{{ kernel.def_local_vars() }}
   affine.for %b=0 to {{ B }} {
     affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
       affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
@@ -74,10 +75,13 @@ def is_transposed(self, node):
         if isinstance(node, ReinterpretView):
             # if node.layout.stride != node.data.layout.stride:
             if node.layout.stride[-1] != node.data.layout.stride[-1] or node.layout.stride[-2] != node.data.layout.stride[-2]:
+                squeezed_layout = [s for s in node.layout.stride if s]
                 if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
                     return True
+                elif squeezed_layout == node.data.layout.stride[len(node.data.layout.stride)-len(squeezed_layout):]:
+                    return False
                 else:
-                  raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
+                    raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
         return False
 
     def render(self,
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 15015a90..acb242d1 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -52,6 +52,7 @@
 memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+{{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
   %c_mvin = arith.constant 2 : index
@@ -73,7 +74,7 @@
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-
+{{ kernel.def_local_vars() }}
   affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
     affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
       affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index f00c77d7..4bd90861 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -35,7 +35,7 @@
   %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
   %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
   %c0 = arith.constant 0 : index
-  {{- kernel.def_local_vars() }}
+{{ kernel.def_local_vars() }}
 
   affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index ce68f137..e6e02f0f 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -155,8 +155,6 @@ def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
                 if isinstance(partial_code, str)
                 else partial_code.finalize()
             )
-            src_code = kernel.add_extra_global_vars(src_code)
-            src_code = kernel.add_extra_local_vars(src_code)
         return src_code
 
     def codegen_template(self, template_node, epilogue_nodes):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 54c634e2..a8d117c0 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -4,6 +4,7 @@
 import re
 import math
 import sympy
+from collections import OrderedDict
 
 from typing import List, Optional
 from unittest.mock import patch
@@ -43,7 +44,7 @@ def __init__(self,
         self.outer_func_name = outer_func_name
         self.outer_func_render = outer_func_render
         self.kernel_arg_attributes = kernel_arg_attributes
-        self.render_hooks = dict()
+        self.render_hooks = OrderedDict()
         self.buffer_names = dict()
         self.render_options = dict()
         self.tile_size = []
@@ -355,6 +356,7 @@ def hook():
 
         assert "<STORE_OUTPUT>" not in self.render_hooks
         self.render_hooks["<STORE_OUTPUT>"] = hook
+        self.render_hooks.move_to_end("<STORE_OUTPUT>", last=False) # Force order to be triggered first
         return "<STORE_OUTPUT>"
 
     def def_function(self):
@@ -365,30 +367,28 @@ def def_function(self):
             return None, None
 
     def def_global_vars(self):
-        return "<GLOBAL_VARS>"
-
-    def replace_global_vars(self):
-        return textwrap.indent(self.global_vars.getvalue(), "").strip()
-
-    def add_extra_global_vars(self, code):
         key = "<GLOBAL_VARS>"
-        return code.replace(key, self.replace_global_vars())
+        def hook():
+            return textwrap.indent(self.global_vars.getvalue(), "").strip()
+
+        assert key not in self.render_hooks
+        self.render_hooks[key] = hook
+        return key
 
     def def_local_vars(self):
-        return "<LOCAL_VARS>"
-
-    def replace_local_vars(self):
-        code = IndentedBuffer()
-        code.tabwidth = 2
-        code.splice("\n")
-        with code.indent():
-            code.splice(self.const_buffer)
-            code.splice(self.alloc_buffer)
-        return code.getvalue()
-
-    def add_extra_local_vars(self, code):
         key = "<LOCAL_VARS>"
-        return code.replace(key, self.replace_local_vars())
+        def hook():
+            code = IndentedBuffer()
+            code.tabwidth = 2
+            code.splice("\n")
+            with code.indent():
+                code.splice(self.const_buffer)
+                code.splice(self.alloc_buffer)
+            return code.getvalue()
+
+        assert key not in self.render_hooks
+        self.render_hooks[key] = hook
+        return key
 
     def render(self, template, kwargs):
         # self.render_hooks = {}

From 85f2417dbc05471af707651f76d53dd5be8c4199 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 11 Mar 2025 07:48:26 +0000
Subject: [PATCH 198/432] [Frontend] Add fp/int conversion op

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index cae4df0f..8f79070b 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -220,15 +220,16 @@ def maximum(operand1, operand2, *args, var_info=None, **kwargs):
     def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs):
         src_mlir_dtype = var_info[operand][1]
         tile_size = var_info[operand][0]
-
+        if isinstance(dst_mlir_dtype, torch.dtype):
+            dst_mlir_dtype = mlir_common.DTYPE_TO_MLIR[dst_mlir_dtype]
         dst_bits = int(dst_mlir_dtype[1:])
         src_bits = int(src_mlir_dtype[1:])
         shape = f"vector<{tile_size}x{dst_mlir_dtype}>" if tile_size > 1 else dst_mlir_dtype
         src_shape = f"vector<{tile_size}x{src_mlir_dtype}>" if tile_size > 1 else src_mlir_dtype
         if dst_mlir_dtype[0] == "i" and src_mlir_dtype[0] == "f":
-            raise NotImplementedError("floating point to integer conversion")
+            return f"arith.fptoui%{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
         if dst_mlir_dtype[0] == "f" and src_mlir_dtype[0] == "i":
-            raise NotImplementedError("integer to floating point conversion")
+            return f"arith.uitofp%{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
         if dst_mlir_dtype[0] == "i":
             if dst_bits > src_bits:
                 return f"arith.extui %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]

From 8d3d01424555ea808ae500000e43a400b6c1ec5c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 11 Mar 2025 11:31:46 +0000
Subject: [PATCH 199/432] [Frontend+Fusion] Implement logical ops + fix fusion
 error

---
 .../mlir/mlir_codegen_backend.py              | 47 +++++++++++++++----
 PyTorchSimFrontend/mlir/mlir_common.py        | 11 +++++
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  2 +
 3 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 8f79070b..bb4477dd 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -253,8 +253,6 @@ def constant(value, src_type, *args, var_info=None, **kwargs):
         # if value represented by e notation, convert to float (ex 1e-3 -> 1.0e-3)
         if "e" in str(value):
             value = float(value)
-        if value == float("-inf"):
-            value = "0xFF800000"
         elif src_type[0] == "f":
             value = format(value, ".20f")
         elif src_type[0] == "i":
@@ -538,20 +536,42 @@ def xor(operand1, operand2, *args, var_info=None, **kwargs):
 
 
     @staticmethod
-    def logical_and(operand, *args, var_info=None, **kwargs):
-        raise NotImplementedError("logical_and")
+    def logical_and(operand1, operand2, *args, var_info=None, **kwargs):
+        op_type = var_info[operand1]
+        # Type check & auto cast
+        if op_type[1] != "i1":
+            raise NotImplementedError("Logical operation with not bool data type")
+        return ExtensionOverrides.and_(operand1, operand2, *args, var_info=var_info, **kwargs)
 
     @staticmethod
     def logical_not(operand, *args, var_info=None, **kwargs):
-        raise NotImplementedError("logical_not")
+        op_type = var_info[operand]
+        # Type check & auto cast
+        if op_type[1] != "i1":
+            raise NotImplementedError("Logical operation with not bool data type")
+
+        ret_type = op_type[1]
+        tile_size = op_type[0]
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        const_one = ops.constant(0, "i1")
+        const_one = ops.broadcast(const_one, operand, var_info=var_info)
+        return f'arith.xori %{operand}, %{const_one} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def logical_or(operand, *args, var_info=None, **kwargs):
-        raise NotImplementedError("logical_not")
+    def logical_or(operand1, operand2, *args, var_info=None, **kwargs):
+        op_type = var_info[operand1]
+        # Type check & auto cast
+        if op_type[1] != "i1":
+            raise NotImplementedError("Logical operation with not bool data type")
+        return ExtensionOverrides.or_(operand1, operand2, *args, var_info=var_info, **kwargs)
 
     @staticmethod
-    def logical_xor(operand, *args, var_info=None, **kwargs):
-        raise NotImplementedError("logical_not")
+    def logical_xor(operand1, operand2, *args, var_info=None, **kwargs):
+        op_type = var_info[operand1]
+        # Type check & auto cast
+        if op_type[1] != "i1":
+            raise NotImplementedError("Logical operation with not bool data type")
+        return ExtensionOverrides.xor(operand1, operand2, *args, var_info=var_info, **kwargs)
 
     @staticmethod
     def relu(operand, *args, var_info=None, **kwargs):
@@ -939,6 +959,12 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         return ret_var
 
     def store_reduction(self, name, index, value):
+        # Note: Change cse temporaily
+        # Store reduction can't share cached value stored in cse,
+        # since it is not innermost loop body.
+        tmp_cse = self.cse
+        self.cse = self.reduction_cse
+
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -1003,6 +1029,9 @@ def store_reduction(self, name, index, value):
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.reductions_suffix.writeline(common.DeferredLine(name, code))
 
+        # Restore origin cse
+        self.cse = tmp_cse
+
     def index_expr(self, index, dtype):
         # Todo. To support index_expr, we have to custom instructions
         tile_desc = self.kernel_group.tile_desc
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 1213666a..4a98ad8d 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -60,6 +60,17 @@
     torch.float16,
 ]
 
+MLIR_INF = {
+    "inf" : {
+        "f32" : 0x7F800000,
+        "f64" : 0x7FF0000000000000
+    },
+    "-inf" : {
+        "f32" : 0xFF800000,
+        "f64" : 0xFFF0000000000000
+    }
+}
+
 class ParallelLoopBuffer(IndentedBuffer):
     def indent(self, offset=1, outer_loop=True):
         @contextlib.contextmanager
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index e6e02f0f..3c33726c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -147,6 +147,8 @@ def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
                 tile_desc = kernel.compute_tile_size(epilogue_nodes, vars, reduction_vars)
                 kernel.kernel_group.set_tile_info(tile_desc)
                 kernel.adjust_tile_size()
+            # Flush created varaibles, since template fusion doen't share variable
+            kernel.cse.cache.clear()
             for node in epilogue_nodes:
                 node.codegen((vars, reduction_vars))
         with V.set_kernel_handler(kernel):

From 831fe3184b8b6662fd9f399a50ace0a0ca77a0aa Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 12 Mar 2025 03:09:29 +0000
Subject: [PATCH 200/432] [Test/Mixtral] add model & test script

---
 tests/Mixtral_8x7B/model.py          | 239 +++++++++++++++++++++++++++
 tests/Mixtral_8x7B/test_attention.py | 172 +++++++++++++++++++
 2 files changed, 411 insertions(+)
 create mode 100644 tests/Mixtral_8x7B/model.py
 create mode 100644 tests/Mixtral_8x7B/test_attention.py

diff --git a/tests/Mixtral_8x7B/model.py b/tests/Mixtral_8x7B/model.py
new file mode 100644
index 00000000..6b813bae
--- /dev/null
+++ b/tests/Mixtral_8x7B/model.py
@@ -0,0 +1,239 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch.nn import functional as F
+def multinomial_sample_one_no_sync(
+    probs_sort,
+):  # Does multinomial sampling without a cuda synchronization
+    q = torch.empty_like(probs_sort).exponential_(1)
+    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
+
+
+def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
+    logits = logits / max(temperature, 1e-5)
+
+    if top_k is not None:
+        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+        pivot = v.select(-1, -1).unsqueeze(-1)
+        logits = torch.where(logits < pivot, -float("Inf"), logits)
+    probs = torch.nn.functional.softmax(logits, dim=-1)
+    return probs
+
+
+def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
+    probs = logits_to_probs(logits[0, -1], temperature, top_k)
+    idx_next = multinomial_sample_one_no_sync(probs)
+    return idx_next, probs
+
+
+def find_multiple(n: int, k: int) -> int:
+    if n % k == 0:
+        return n
+    return n + k - (n % k)
+
+@dataclass
+class ModelArgs:
+    block_size: int = 2048
+    vocab_size: int = 32000
+    n_layer: int = 32
+    n_head: int = 32
+    dim: int = 4096
+    intermediate_size: int = None
+    n_local_heads: int = -1
+    head_dim: int = 64
+    rope_base: float = 10000
+    norm_eps: float = 1e-5
+    num_experts: int = 8
+    num_activated_experts: int = 2
+
+    def __post_init__(self):
+        if self.n_local_heads == -1:
+            self.n_local_heads = self.n_head
+        if self.intermediate_size is None:
+            hidden_dim = 4 * self.dim
+            n_hidden = int(2 * hidden_dim / 3)
+            self.intermediate_size = find_multiple(n_hidden, 256)
+        self.head_dim = self.dim // self.n_head
+
+    @classmethod
+    def from_name(cls, name: str):
+        if name in transformer_configs:
+            return cls(**transformer_configs[name])
+        # fuzzy search
+        config = [config for config in transformer_configs if config in str(name).upper() or config in str(name)]
+        assert len(config) == 1, name
+        return cls(**transformer_configs[config[0]])
+
+
+transformer_configs = {
+    "Mixtral-8x7B-v0.1": dict(block_size=32768, n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, rope_base=1000000.0, num_experts=8, num_activated_experts=2),
+}
+
+class KVCache(nn.Module):
+    def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, dtype=torch.bfloat16):
+        super().__init__()
+        cache_shape = (max_batch_size, n_heads, 0, head_dim)
+        self.register_buffer('k_cache', torch.zeros(cache_shape, dtype=dtype))
+        self.register_buffer('v_cache', torch.zeros(cache_shape, dtype=dtype))
+
+    def update(self, k_val, v_val):
+        self.k_cache = torch.cat([self.k_cache, k_val], dim=2)
+        self.v_cache = torch.cat([self.v_cache, v_val], dim=2)
+
+        return self.k_cache, self.v_cache
+
+class Transformer(nn.Module):
+    def __init__(self, config: ModelArgs) -> None:
+        super().__init__()
+        self.config = config
+
+        self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
+        self.layers = nn.ModuleList(TransformerBlock(config) for _ in range(config.n_layer))
+        self.norm = RMSNorm(config.dim, eps=config.norm_eps)
+        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)
+
+        self.freqs_cis: Optional[Tensor] = None
+        self.mask_cache: Optional[Tensor] = None
+        self.max_batch_size = -1
+        self.max_seq_length = -1
+        self.setup_caches(1, 512)
+
+    def setup_caches(self, max_batch_size, max_seq_length):
+        if self.max_seq_length >= max_seq_length and self.max_batch_size >= max_batch_size:
+            return
+        head_dim = self.config.dim // self.config.n_head
+        max_seq_length = find_multiple(max_seq_length, 8)
+        self.max_seq_length = max_seq_length
+        self.max_batch_size = max_batch_size
+        #for b in self.layers:
+        #    b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim)
+
+    def forward(self, x: Tensor, mask, freqs_cis: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
+        for i, layer in enumerate(self.layers):
+            x = layer(x, input_pos, freqs_cis, mask)
+        x = self.norm(x)
+        logits = self.output(x)
+        return logits
+
+    @classmethod
+    def from_name(cls, name: str):
+        return cls(ModelArgs.from_name(name))
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, config: ModelArgs) -> None:
+        super().__init__()
+        self.attention = Attention(config)
+        self.ffn = FeedForward(config)
+        self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
+        self.attention_norm = RMSNorm(config.dim, config.norm_eps)
+
+    def forward(self, x: Tensor, input_pos: Tensor, freqs_cis: Tensor, mask: Tensor) -> Tensor:
+        h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
+        out = h + self.ffn(self.ffn_norm(h))
+        return out
+
+class Attention(nn.Module):
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        assert config.dim % config.n_head == 0
+
+        total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
+        # key, query, value projections for all heads, but in a batch
+        self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False)
+        self.wo = nn.Linear(config.dim, config.dim, bias=False)
+        self.kv_cache = None
+
+        self.n_head = config.n_head
+        self.head_dim = config.head_dim
+        self.n_local_heads = config.n_local_heads
+        self.dim = config.dim
+
+    def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
+        bsz, seqlen, _ = x.shape
+
+        kv_size = self.n_local_heads * self.head_dim
+        q, k, v = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)
+
+        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
+        k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+        v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+
+        # Todo.
+        # q = apply_rotary_emb(q, freqs_cis)
+        # k = apply_rotary_emb(k, freqs_cis)
+
+        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
+
+        if self.kv_cache is not None:
+            k, v = self.kv_cache.update(k, v)
+
+        k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
+        v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
+        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+
+        y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
+
+        y = self.wo(y)
+        return y
+
+
+class FeedForward(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.w1 = nn.Parameter(torch.empty(config.intermediate_size, config.dim))
+        self.w2 = nn.Parameter(torch.empty(config.dim, config.intermediate_size))
+        self.w3 = nn.Parameter(torch.empty(config.intermediate_size, config.dim))
+
+    def forward(self, x) -> Tensor:
+        x1 = F.silu(torch.einsum('bti,oi -> bto', x, self.w1))
+        x3 = torch.einsum('bti, oi -> bto', x, self.w3)
+        out =  torch.einsum('bto, io -> bti', (x1 * x3), self.w2)
+        return out
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def _norm(self, x):
+        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
+
+    def forward(self, x: Tensor) -> Tensor:
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
+
+def precompute_freqs_cis(
+    seq_len: int, n_elem: int, base: int = 10000
+) -> Tensor:
+    freqs = 1.0 / (base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem))
+    t = torch.arange(seq_len, device=freqs.device)
+    freqs = torch.outer(t, freqs)
+    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+    cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
+    return cache.to(dtype=torch.bfloat16)
+
+
+def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
+    xshaped = x.float().reshape(*x.shape[:-1], -1, 2).contiguous()
+    freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2).contiguous()
+    x_out2 = torch.stack(
+        [
+            xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
+            xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
+        ],
+        -1,
+    )
+
+    x_out2 = x_out2.flatten(3)
+    return x_out2.type_as(x)
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
new file mode 100644
index 00000000..c62bb4ca
--- /dev/null
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -0,0 +1,172 @@
+import copy
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+from model import Transformer, TransformerBlock, ModelArgs, FeedForward, KVCache, precompute_freqs_cis, sample
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_prefill(device, prompt_length):
+    # Setup model & model args
+    args = ModelArgs()
+    args.n_head = 8
+    args.n_local_heads = -1
+    args.intermediate_size = None
+    args.dim = 512
+    args.n_layer = 1
+    args.__post_init__()
+    max_batch = 1
+    max_seq = 512
+    head_dim = args.dim // args.n_head
+    model = Transformer(args)
+    model.setup_caches(max_batch, max_seq)
+    model = model.to(device=device)
+
+    # Prepare inputs
+    T = prompt_length
+    prompt = torch.randint(0, 2000, [1, T, args.dim] , dtype=torch.int32)
+    input_pos = torch.arange(0, T)
+    mask = torch.tril(torch.ones(T, T, dtype=torch.bool))
+    freqs_cis = precompute_freqs_cis(args.block_size, args.dim // args.n_head, args.rope_base)[input_pos].to(dtype=torch.float32)
+
+    cpu_prompt = copy.deepcopy(prompt)
+    prompt = prompt.to(device=device)
+    cpu_input_pos = copy.deepcopy(input_pos)
+
+    input_pos = input_pos.to(device=device)
+    cpu_mask = copy.deepcopy(mask)
+    mask = mask.to(device=device)
+    cpu_freqs_cis = copy.deepcopy(freqs_cis)
+    freqs_cis = freqs_cis.to(device=device)
+    cpu_kv_caches = copy.deepcopy(kv_caches)
+    kv_caches = [kv.to(device=device) for kv in kv_caches]
+
+    cpu_model = copy.deepcopy(model).to("cpu")
+    opt_fn = torch.compile(dynamic=False)(model)
+
+    # Run models
+    res = opt_fn(prompt, mask, freqs_cis, input_pos)
+    cpu_res = cpu_model(cpu_prompt, cpu_mask, cpu_freqs_cis, cpu_input_pos)
+    #test_result("Transformer", res, cpu_res)
+
+
+def test_decode(device, prompt_length, nr_tokens):
+    # Setup model & model args
+    args = ModelArgs()
+    args.n_head = 8
+    args.n_local_heads = -1
+    args.intermediate_size = None
+    args.dim = 512
+    args.n_layer = 1
+    args.__post_init__()
+    max_batch = 1
+    max_seq = 512
+    head_dim = args.dim // args.n_head
+    model = Transformer(args)
+    model.setup_caches(max_batch, max_seq)
+    model = model.to(device=device)
+
+    # Prepare inputs
+    T = prompt_length
+    prompt = torch.randint(0, 2000, [1, T, args.dim] , dtype=torch.float32)
+    cpu_prompt = copy.deepcopy(prompt)
+    cpu_model = copy.deepcopy(model).to("cpu")
+    opt_fn = torch.compile(dynamic=False)(model)
+
+    # Prepare KV cache
+    kv_caches = [KVCache(max_batch, max_seq, args.n_head, head_dim, torch.float32) for i in range(args.n_layer)]
+    cpu_kv_caches = copy.deepcopy(kv_caches)
+    kv_caches = [kv.to(device=device) for kv in kv_caches]
+    for idx, b in enumerate(model.layers):
+        b.attention.kv_cache = kv_caches[idx]
+    for idx, b in enumerate(cpu_model.layers):
+        b.attention.kv_cache = cpu_kv_caches[idx]
+
+    for i in range(nr_tokens):
+        input_pos = torch.arange(0, T)
+        mask = torch.tril(torch.ones(T, T, dtype=torch.bool))
+        freqs_cis = precompute_freqs_cis(args.block_size, args.dim // args.n_head, args.rope_base)[input_pos].to(dtype=torch.float32)
+        prompt = prompt.to(device=device)
+        cpu_input_pos = copy.deepcopy(input_pos)
+        input_pos = input_pos.to(device=device)
+        cpu_mask = copy.deepcopy(mask)
+        mask = mask.to(device=device)
+        cpu_freqs_cis = copy.deepcopy(freqs_cis)
+        freqs_cis = freqs_cis.to(device=device)
+
+        # Run models
+        res = opt_fn(prompt, mask, freqs_cis, input_pos)
+        cpu_res = cpu_model(cpu_prompt, cpu_mask, cpu_freqs_cis, cpu_input_pos)
+        new_token = sample(cpu_res.cpu())[0]
+        print(new_token)
+        new_token = cpu_model.tok_embeddings(new_token).unsqueeze(1)
+        cpu_prompt = new_token #torch.cat([cpu_prompt, new_token], dim=1)
+        prompt = cpu_prompt.clone()
+        T = 1
+
+def test_attention(device):
+    args = ModelArgs()
+    args.n_head = 8
+    args.n_local_heads = -1
+    args.intermediate_size = None
+    args.dim = 512
+    args.__post_init__()
+    model = TransformerBlock(args)
+    model = model.to(device=device)
+
+    T = 32
+    prompt = torch.randn([1, T, args.dim] , dtype=torch.float32)
+    input_pos = torch.arange(0, T)
+    cpu_prompt = copy.deepcopy(prompt)
+    prompt = prompt.to(device=device)
+    cpu_input_pos = copy.deepcopy(input_pos)
+    input_pos = input_pos.to(device=device)
+
+    cpu_model = copy.deepcopy(model).to("cpu")
+    opt_fn = torch.compile(dynamic=False)(model)
+    res = opt_fn(prompt, input_pos, None)
+    cpu_res = cpu_model(cpu_prompt, cpu_input_pos, None)
+    test_result("Attention", res, cpu_res)
+
+def test_ffn(device):
+    args = ModelArgs()
+    args.n_head = 8
+    args.n_local_heads = -1
+    args.intermediate_size = None
+    args.dim = 512
+    args.__post_init__()
+    model = FeedForward(args)
+    model = model.to(device=device)
+
+    T = 32
+    prompt = torch.randn([1, T, args.dim] , dtype=torch.float32)
+    cpu_prompt = copy.deepcopy(prompt)
+    prompt = prompt.to(device=device)
+
+    cpu_model = copy.deepcopy(model).to("cpu")
+    opt_fn = torch.compile(dynamic=False)(model)
+    res = opt_fn(prompt)
+    cpu_res = cpu_model(cpu_prompt)
+    test_result("FFN", res, cpu_res)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    #test_prefill(device, prompt=32)
+    test_decode(device, 32, 4)
+    #test_attention(device)
+    #test_ffn(device)

From a9bc9bcff5e050603afbe784608f3ce689a21c67 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 12 Mar 2025 08:36:09 +0000
Subject: [PATCH 201/432] [Frontend] Support nan for arith.const

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 ++
 PyTorchSimFrontend/mlir/mlir_common.py          | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index bb4477dd..c08fe717 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -257,6 +257,8 @@ def constant(value, src_type, *args, var_info=None, **kwargs):
             value = format(value, ".20f")
         elif src_type[0] == "i":
             value = int(value)
+        if "inf" == str(value) or "-inf" == str(value) or "nan" == str(value):
+            value = f"0x{mlir_common.MLIR_INF[str(value)][src_type]:x}"
         return f'arith.constant {value} : {src_type}', [1, src_type]
 
     @staticmethod
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 4a98ad8d..98ac9964 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -68,6 +68,10 @@
     "-inf" : {
         "f32" : 0xFF800000,
         "f64" : 0xFFF0000000000000
+    },
+    "nan" : {
+        "f32" : 0x7FC00000,
+        "f64" : 0x7FF8000000000000
     }
 }
 

From 68219bdf221a9b4ec3708a8ef677b371c8969b00 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 12 Mar 2025 10:58:20 +0000
Subject: [PATCH 202/432] [Frontend/GEMM] Add transpose case for Mistral

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 4bd90861..fd85e39f 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -80,17 +80,26 @@ def __init__(self, input_nodes, layout, input_reorder=None):
 
     def is_transposed(self, node):
         if isinstance(node, ReinterpretView):
+            unsqueezed_layout_stride = [s for s, size in zip(node.layout.stride, node.layout.size) if size > 1]
+            unsqueezed_data_stride = [s for s, size in zip(node.data.layout.stride, node.data.layout.size) if size > 1]
+
             if 0 in node.layout.stride: # [MoE] Temporary solution
                 if node.layout.stride[1] == 0:
                     return True
-            if node.layout.stride != node.data.layout.stride:
+            if len(node.layout.stride) == len(node.data.layout.stride):
                 if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
                     return True
-                elif len(node.layout.stride) < len(node.data.layout.stride) and node.layout.stride == node.data.layout.stride[-len(node.layout.stride):]:
-                    # Squeezed case
-                    return False
                 else:
                     raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
+            elif len(node.layout.stride) < len(node.data.layout.stride):
+                # Squeezed case
+                if node.layout.stride == node.data.layout.stride[-len(node.layout.stride):]:
+                    return False
+                if len(unsqueezed_layout_stride) < len(unsqueezed_data_stride):
+                    if unsqueezed_layout_stride == unsqueezed_data_stride[-len(unsqueezed_layout_stride):]:
+                        return False
+                raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
+
         return False
 
     def render(self,

From 9e99111ff8238b6ef61dbcdfe71871b91bf3a1ea Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 12 Mar 2025 10:59:09 +0000
Subject: [PATCH 203/432] [Frontend] Use MLIR_INF & avoid buf_bounds

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 10 ++++++----
 PyTorchSimFrontend/mlir/mlir_common.py          | 10 +---------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index c08fe717..17034f27 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -118,6 +118,8 @@ def custom_cast(operand, target_type, *args, var_info=None, **kwargs):
 
     @staticmethod
     def binary_elementwise_common(operand1, operand2, var_info):
+        operand1.bounds = operand1.bounds.unknown()
+        operand2.bounds = operand2.bounds.unknown()
         op_type1 = var_info[operand1]
         op_type2 = var_info[operand2]
         # Tile size check
@@ -250,15 +252,15 @@ def constant(value, src_type, *args, var_info=None, **kwargs):
         if isinstance(src_type, torch.dtype):
             src_type = mlir_common.DTYPE_TO_MLIR[src_type]
 
+        if "inf" == str(value) or "-inf" == str(value) or "nan" == str(value):
+            value = f"0x{mlir_common.MLIR_INF[str(value)][src_type]:x}"
         # if value represented by e notation, convert to float (ex 1e-3 -> 1.0e-3)
-        if "e" in str(value):
-            value = float(value)
+        elif "e" in str(value):
+            value = format(float(value), ".20f")
         elif src_type[0] == "f":
             value = format(value, ".20f")
         elif src_type[0] == "i":
             value = int(value)
-        if "inf" == str(value) or "-inf" == str(value) or "nan" == str(value):
-            value = f"0x{mlir_common.MLIR_INF[str(value)][src_type]:x}"
         return f'arith.constant {value} : {src_type}', [1, src_type]
 
     @staticmethod
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 98ac9964..72540385 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -550,19 +550,11 @@ class CSEProxy:
             @staticmethod
             def __getattr__(name: str) -> Callable[..., common.CSEVariable]:  # type: ignore[misc]
                 def inner(*args, **kwargs):
-                    # TritonTemplateKernel has no current_node
-                    buf_bounds = ValueRanges.unknown()
-                    if hasattr(V.interpreter, "current_node"):
-                        fx_node = V.interpreter.current_node
-                        assert isinstance(self.node_to_bounds, dict)
-                        buf_bounds = self.node_to_bounds.get(
-                            fx_node, ValueRanges.unknown()
-                        )
                     code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info)
                     csevar = self.cse.generate(
                         self.compute,
                         code,
-                        bounds=buf_bounds,
+                        bounds=ValueRanges.unknown(),
                         assignment=(ret_info[0] is not None)
                     )
                     if ret_info[0] is not None:

From cb2a088dba0df86c3e06897b509195ae63daee6d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 12 Mar 2025 13:11:46 +0000
Subject: [PATCH 204/432] [Test/Mistral] Use dummy rotary encoder

---
 tests/Mixtral_8x7B/model.py          | 12 +++++++-----
 tests/Mixtral_8x7B/test_attention.py |  2 ++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/Mixtral_8x7B/model.py b/tests/Mixtral_8x7B/model.py
index 6b813bae..ce9cf5bb 100644
--- a/tests/Mixtral_8x7B/model.py
+++ b/tests/Mixtral_8x7B/model.py
@@ -168,8 +168,8 @@ def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optiona
         v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
 
         # Todo.
-        # q = apply_rotary_emb(q, freqs_cis)
-        # k = apply_rotary_emb(k, freqs_cis)
+        q = apply_rotary_emb(q, freqs_cis)
+        k = apply_rotary_emb(k, freqs_cis)
 
         q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
 
@@ -223,10 +223,12 @@ def precompute_freqs_cis(
     cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
     return cache.to(dtype=torch.bfloat16)
 
-
 def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
-    xshaped = x.float().reshape(*x.shape[:-1], -1, 2).contiguous()
-    freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2).contiguous()
+    # FIXME. This is dummy rotary embedding
+    return x*freqs_cis
+
+def apply_rotary_emb2(x: Tensor, freqs_cis: Tensor) -> Tensor:
+    xshaped = x.reshape(*x.shape[:-1], -1, 2)
     x_out2 = torch.stack(
         [
             xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index c62bb4ca..b02e7597 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -100,6 +100,8 @@ def test_decode(device, prompt_length, nr_tokens):
         input_pos = input_pos.to(device=device)
         cpu_mask = copy.deepcopy(mask)
         mask = mask.to(device=device)
+
+        freqs_cis = freqs_cis.view(1, T, 1, -1)
         cpu_freqs_cis = copy.deepcopy(freqs_cis)
         freqs_cis = freqs_cis.to(device=device)
 

From 2c0f409fe4564f51f67f0a0a1245f54e0c127a4c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 12 Mar 2025 13:33:59 +0000
Subject: [PATCH 205/432] [CI] Add mistral test case

---
 .github/workflows/docker-image.yml | 15 ++++++++++++++-
 .github/workflows/pull-request.yml | 15 ++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 0935e36a..b13eda61 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -354,6 +354,19 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/MoE/test_moe.py
 
+  test_mistral:
+    name: Run test_mistral
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_mistral.py
+        run: |
+          echo "Running test_mistral.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py
+
   test_cleanup:
     name: Clean test cases
     runs-on: self-hosted
@@ -362,7 +375,7 @@ jobs:
             test_transpose2D, test_view3D_2D, test_layernorm,
             test_mlp, test_resnet, test_transformer, test_transpose3D,
             test_sparsity, test_activation, test_pool, test_perceptron,
-            test_fusion, test_moe]
+            test_fusion, test_mistral, test_moe]
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 92703c64..88e9cfda 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -354,6 +354,19 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/MoE/test_moe.py
 
+  test_mistral:
+    name: Run test_mistral
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_mistral.py
+        run: |
+          echo "Running test_mistral.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py
+
   test_cleanup:
     name: Clean test cases
     runs-on: self-hosted
@@ -362,7 +375,7 @@ jobs:
             test_transpose2D, test_view3D_2D, test_layernorm,
             test_mlp, test_resnet, test_transformer, test_transpose3D,
             test_sparsity, test_activation, test_pool, test_perceptron,
-            test_fusion, test_moe]
+            test_fusion, test_mistral, test_moe]
     steps:
       - name: Checkout code
         uses: actions/checkout@v3

From a40e1bca9d0248272aa6491d3301589230dda17b Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 13 Mar 2025 13:22:35 +0000
Subject: [PATCH 206/432] [Optimize] index expr

Vector loads should be placed in the self.compute buffer.

If the distance between a load and the compute that uses it is large, register spilling occurs.
---
 .../mlir/mlir_codegen_backend.py              | 68 ++++++++++---------
 PyTorchSimFrontend/mlir/mlir_common.py        |  4 ++
 gem5_script/script_systolic.py                |  1 +
 3 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 17034f27..065669ac 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -616,36 +616,6 @@ def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", n
         result = ops.where(mask, result, val)
         return result, var_info[result]
 
-    @staticmethod
-    def _index_expr(tile_size, buffer, renamed_expression, index, *args, var_info=None, **kwargs):
-        str_tile_size = [str(dim) for dim in tile_size]
-        shape = "x".join(str_tile_size)
-
-        dim = ["%d"+str(i) for i in range(len(tile_size))]
-        sym_dim = ["d"+str(i) for i in range(len(tile_size))]
-        start_dim = [str(0) for i in tile_size]
-        end_dim = [str(i) for i in tile_size]
-        indices = [str(i) for i in index.free_symbols]
-
-        affine_map_str = "(" + ", ".join(sym_dim) + ") -> ("
-        affine_map_str += sympy.printing.ccode(renamed_expression) + ")"
-        affine_offset_map = "(d0, d1) -> (d0 + d1)"
-        affine_offset_var = ""
-        offset_vars = dim.copy()
-        for idx in indices:
-            i = int(idx[5:])
-            affine_offset_var += f"%offset{i} = affine.apply affine_map<{affine_offset_map}>(%{idx}, {dim[i]})\n"
-            offset_vars[i] = f"%offset{i}"
-
-        apply_map_var = f"%index_var = affine.apply affine_map<{affine_map_str}>({', '.join(offset_vars)}) {{global_idx=1}}\n"
-        broadcast_var = f"%broadcast_var = vector.broadcast %index_var : index to vector<2xindex>\n"
-        broadcast_i64 = f"%broadcast_i64 = arith.index_cast %broadcast_var : vector<2xindex> to vector<2xi64>\n"
-        affine_store_var = f"affine.vector_store %broadcast_i64, %{buffer}[{','.join(dim)}] : memref<{shape}xi64, 1>, vector<2xi64>\n"
-
-        result = f"affine.parallel ({','.join(dim)}) = ({','.join(start_dim)}) to ({','.join(end_dim)}) {{\n" + \
-            affine_offset_var + apply_map_var + broadcast_var + broadcast_i64 + affine_store_var + f"}}"
-        return result, [None, None]
-
     @staticmethod
     def index_cast(operand, target_type, *args, var_info=None, **kwrags):
         op_type = var_info[operand]
@@ -825,7 +795,7 @@ def load(self, name: str, index: sympy.Expr):
         operation = "affine.vector_load" if tile_numel_per_lane > 1 else "affine.load"
         shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
         line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
-        out = self.cse.generate(self.loads, line)
+        out = self.cse.generate(self.compute, line)
         self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
         return out
 
@@ -857,7 +827,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
 
         line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
-        self.stores.writeline(common.DeferredLine(name, line))
+        self.stores.writeline(common.DeferredLine(name, line)) # TODO: Should be changed to self.compute?
 
         # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
@@ -1036,6 +1006,38 @@ def store_reduction(self, name, index, value):
         # Restore origin cse
         self.cse = tmp_cse
 
+    def _index_expr(self, tile_size, buffer, renamed_expression, index):
+        str_tile_size = [str(dim) for dim in tile_size]
+        shape = "x".join(str_tile_size)
+
+        dim = ["%d"+str(i) for i in range(len(tile_size))]
+        sym_dim = ["d"+str(i) for i in range(len(tile_size))]
+        start_dim = [str(0) for i in tile_size]
+        end_dim = [str(i) for i in tile_size]
+        indices = [str(i) for i in index.free_symbols]
+
+        affine_map_str = "(" + ", ".join(sym_dim) + ") -> ("
+        affine_map_str += sympy.printing.ccode(renamed_expression) + ")"
+        affine_offset_map = "(d0, d1) -> (d0 + d1)"
+        offset_vars = dim.copy()
+        parallel_map = f"affine.parallel ({','.join(dim)}) = ({','.join(start_dim)}) to ({','.join(end_dim)}) {{"
+        self.loads.writeline(parallel_map)
+        with self.loads.indent():
+            for idx in indices:
+                i = int(idx[5:])
+                self.loads.writeline(f"%offset{i} = affine.apply affine_map<{affine_offset_map}>(%{idx}, {dim[i]})")
+                offset_vars[i] = f"%offset{i}"
+            apply_map = f"affine.apply affine_map<{affine_map_str}>({', '.join(offset_vars)}) {{global_idx=1}}"
+            apply_map_var = self.cse.generate(self.loads, apply_map)
+            broadcast = f"vector.broadcast %{apply_map_var} : index to vector<2xindex>"
+            broadcast_var = self.cse.generate(self.loads, broadcast)
+            cast_i64 = f"arith.index_cast %{broadcast_var} : vector<2xindex> to vector<2xi64>"
+            cast_i64_var = self.cse.generate(self.loads, cast_i64)
+            affine_store = f"affine.vector_store %{cast_i64_var}, %{buffer}[{','.join(dim)}] : memref<{shape}xi64, 1>, vector<2xi64>"
+            res = self.cse.generate(self.loads, affine_store, assignment=False)
+        self.loads.writeline("}")
+        return res
+
     def index_expr(self, index, dtype):
         # Todo. To support index_expr, we have to custom instructions
         tile_desc = self.kernel_group.tile_desc
@@ -1047,7 +1049,7 @@ def index_expr(self, index, dtype):
         tile_shape = f"memref<{shape}xi64, 1>"
 
         # Define scratch pad buffer
-        sram_var, _, _ = self.get_scratchpad_buffer(dtype, "index_buffer", tile_numel_per_lane, tile_shape, self.loads, None, "index_expr") # use same index for reuse spad
+        sram_var, _, _ = self.get_scratchpad_buffer(dtype, "index_buffer", tile_numel_per_lane, tile_shape, self.loads, None, index)
 
         renamed_symbols = {symbol: "d"+str(symbol)[5:] for symbol in index.free_symbols}
         renamed_expression = index.subs(renamed_symbols)
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 72540385..b0a45b4f 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -612,6 +612,10 @@ def store_reduction(name, index, value):
             def reduction(dtype, src_dtype, reduction_type, value):
                 return self.reduction(dtype, src_dtype, reduction_type, value)
 
+            @staticmethod
+            def _index_expr(tile_size, buffer, renamed_expression, index):
+                return self._index_expr(tile_size, buffer, renamed_expression, index)
+
             @staticmethod
             def index_expr(index, dtype):
                 return self.index_expr(index, dtype)
diff --git a/gem5_script/script_systolic.py b/gem5_script/script_systolic.py
index 83c2ab39..53553517 100644
--- a/gem5_script/script_systolic.py
+++ b/gem5_script/script_systolic.py
@@ -160,6 +160,7 @@ class MinorVecConfig(MinorFU):
     opClasses = minorMakeOpClassSet(
         [
             "SimdConfig",
+            "CustomVlaneIdx",
         ]
     )
     opLat = 1

From a75f733be6e97017bfd9881901c920d1a0cd2705 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 13 Mar 2025 13:06:50 +0000
Subject: [PATCH 207/432] [Frontend] Fix logical_not, select operation, and 3D
 reduction

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 065669ac..fd47a9a3 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -557,9 +557,10 @@ def logical_not(operand, *args, var_info=None, **kwargs):
         ret_type = op_type[1]
         tile_size = op_type[0]
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        const_one = ops.constant(0, "i1")
+        const_one = ops.constant(1, "i1")
         const_one = ops.broadcast(const_one, operand, var_info=var_info)
-        return f'arith.xori %{operand}, %{const_one} : {shape}', [tile_size, ret_type]
+        ret = ops.eq(operand,const_one)
+        return ret, [tile_size, var_info[ret]]
 
     @staticmethod
     def logical_or(operand1, operand2, *args, var_info=None, **kwargs):
@@ -1192,7 +1193,7 @@ def get_dma_info(self, name, index, index_var, broadcast=True, store_reduction=F
         elif len(local_dims) == 3:
             is_reduction = self.reduction_depth < 3 and not store_reduction
             if is_reduction:
-                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [2, 1, 0])
+                local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [1, 2, 0])
                 local_tile_desc.vlane_split_axis = local_vlane_split_axis
                 local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
             else:

From d3287fb16ec90b735df519ab3cb81d633fe97651 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 13 Mar 2025 13:11:26 +0000
Subject: [PATCH 208/432] [Test/Mistral] Fix mistral test case

---
 tests/Mixtral_8x7B/model.py          |  5 ++-
 tests/Mixtral_8x7B/test_attention.py | 63 ++++++----------------------
 2 files changed, 15 insertions(+), 53 deletions(-)

diff --git a/tests/Mixtral_8x7B/model.py b/tests/Mixtral_8x7B/model.py
index ce9cf5bb..9fc54bee 100644
--- a/tests/Mixtral_8x7B/model.py
+++ b/tests/Mixtral_8x7B/model.py
@@ -168,8 +168,9 @@ def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optiona
         v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
 
         # Todo.
-        q = apply_rotary_emb(q, freqs_cis)
-        k = apply_rotary_emb(k, freqs_cis)
+        if freqs_cis is not None:
+            q = apply_rotary_emb(q, freqs_cis)
+            k = apply_rotary_emb(k, freqs_cis)
 
         q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
 
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index b02e7597..1afd18f4 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -2,7 +2,7 @@
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-from model import Transformer, TransformerBlock, ModelArgs, FeedForward, KVCache, precompute_freqs_cis, sample
+from model import Transformer, TransformerBlock, ModelArgs, Attention, FeedForward, KVCache, precompute_freqs_cis, sample
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     message = f"|{name} Test Passed|"
@@ -15,50 +15,6 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-def test_prefill(device, prompt_length):
-    # Setup model & model args
-    args = ModelArgs()
-    args.n_head = 8
-    args.n_local_heads = -1
-    args.intermediate_size = None
-    args.dim = 512
-    args.n_layer = 1
-    args.__post_init__()
-    max_batch = 1
-    max_seq = 512
-    head_dim = args.dim // args.n_head
-    model = Transformer(args)
-    model.setup_caches(max_batch, max_seq)
-    model = model.to(device=device)
-
-    # Prepare inputs
-    T = prompt_length
-    prompt = torch.randint(0, 2000, [1, T, args.dim] , dtype=torch.int32)
-    input_pos = torch.arange(0, T)
-    mask = torch.tril(torch.ones(T, T, dtype=torch.bool))
-    freqs_cis = precompute_freqs_cis(args.block_size, args.dim // args.n_head, args.rope_base)[input_pos].to(dtype=torch.float32)
-
-    cpu_prompt = copy.deepcopy(prompt)
-    prompt = prompt.to(device=device)
-    cpu_input_pos = copy.deepcopy(input_pos)
-
-    input_pos = input_pos.to(device=device)
-    cpu_mask = copy.deepcopy(mask)
-    mask = mask.to(device=device)
-    cpu_freqs_cis = copy.deepcopy(freqs_cis)
-    freqs_cis = freqs_cis.to(device=device)
-    cpu_kv_caches = copy.deepcopy(kv_caches)
-    kv_caches = [kv.to(device=device) for kv in kv_caches]
-
-    cpu_model = copy.deepcopy(model).to("cpu")
-    opt_fn = torch.compile(dynamic=False)(model)
-
-    # Run models
-    res = opt_fn(prompt, mask, freqs_cis, input_pos)
-    cpu_res = cpu_model(cpu_prompt, cpu_mask, cpu_freqs_cis, cpu_input_pos)
-    #test_result("Transformer", res, cpu_res)
-
-
 def test_decode(device, prompt_length, nr_tokens):
     # Setup model & model args
     args = ModelArgs()
@@ -77,7 +33,7 @@ def test_decode(device, prompt_length, nr_tokens):
 
     # Prepare inputs
     T = prompt_length
-    prompt = torch.randint(0, 2000, [1, T, args.dim] , dtype=torch.float32)
+    prompt = torch.randn([1, T, args.dim] , dtype=torch.float32)
     cpu_prompt = copy.deepcopy(prompt)
     cpu_model = copy.deepcopy(model).to("cpu")
     opt_fn = torch.compile(dynamic=False)(model)
@@ -115,6 +71,9 @@ def test_decode(device, prompt_length, nr_tokens):
         prompt = cpu_prompt.clone()
         T = 1
 
+        # Check output token
+        test_result("Mistral", res, cpu_res)
+
 def test_attention(device):
     args = ModelArgs()
     args.n_head = 8
@@ -122,7 +81,7 @@ def test_attention(device):
     args.intermediate_size = None
     args.dim = 512
     args.__post_init__()
-    model = TransformerBlock(args)
+    model = Attention(args)
     model = model.to(device=device)
 
     T = 32
@@ -132,11 +91,14 @@ def test_attention(device):
     prompt = prompt.to(device=device)
     cpu_input_pos = copy.deepcopy(input_pos)
     input_pos = input_pos.to(device=device)
+    mask = torch.tril(torch.ones(T, T, dtype=torch.bool))
+    cpu_mask = copy.deepcopy(mask)
+    mask = mask.to(device=device)
 
     cpu_model = copy.deepcopy(model).to("cpu")
     opt_fn = torch.compile(dynamic=False)(model)
-    res = opt_fn(prompt, input_pos, None)
-    cpu_res = cpu_model(cpu_prompt, cpu_input_pos, None)
+    res = opt_fn(prompt, None, mask, input_pos)
+    cpu_res = cpu_model(cpu_prompt, None, cpu_mask, cpu_input_pos)
     test_result("Attention", res, cpu_res)
 
 def test_ffn(device):
@@ -168,7 +130,6 @@ def test_ffn(device):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    #test_prefill(device, prompt=32)
-    test_decode(device, 32, 4)
+    test_decode(device, 32, 1)
     #test_attention(device)
     #test_ffn(device)

From 76c2065570d7a81fc12aa5fb327ff52a001e7c54 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 14 Mar 2025 04:37:23 +0000
Subject: [PATCH 209/432] [Frontend] Use i8 instead of i1 for torch.bool

---
 PyTorchSimFrontend/mlir/mlir_caller_codegen.py  | 2 +-
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 9 +--------
 PyTorchSimFrontend/mlir/mlir_common.py          | 2 +-
 Simulator/simulator.py                          | 4 ----
 4 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
index 1b845338..9da276f6 100644
--- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
+++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
@@ -77,7 +77,7 @@ def generate_main(self):
             else:
                 self.generate_args_define()
 
-            func_arguments = [f"c_{arg_name}, c_{arg_name}, 0, {arg_shape}, 1" if arg_type != torch.bool else f"c_{arg_name}, c_{arg_name}, 0, {(arg_shape + 7) // 8}, 1" for arg_name, (_, arg_type, arg_shape, _, _) in self.arg_attributes]
+            func_arguments = [f"c_{arg_name}, c_{arg_name}, 0, {arg_shape}, 1" for arg_name, (_, arg_type, arg_shape, _, _) in self.arg_attributes]
             self.writeline(f"wrapper_{self.kernel_name}({', '.join(func_arguments)}){self.ending}{self.newline}")
 
             if self.validation:
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index fd47a9a3..5d827dd5 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -550,14 +550,11 @@ def logical_and(operand1, operand2, *args, var_info=None, **kwargs):
     @staticmethod
     def logical_not(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
-        # Type check & auto cast
-        if op_type[1] != "i1":
-            raise NotImplementedError("Logical operation with not bool data type")
 
         ret_type = op_type[1]
         tile_size = op_type[0]
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        const_one = ops.constant(1, "i1")
+        const_one = ops.constant(0, ret_type)
         const_one = ops.broadcast(const_one, operand, var_info=var_info)
         ret = ops.eq(operand,const_one)
         return ret, [tile_size, var_info[ret]]
@@ -1315,10 +1312,6 @@ def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape
         if buffer is None:
             buffer = self.loads
 
-        if dtype == torch.bool and not is_template:
-            mapping = self.map_cse.generate(self.global_vars, f"affine_map<({indices}) -> ({indices} floordiv 8)>")
-            indices = self.cse.generate(buffer, f"affine.apply #{mapping}(%{indices})") # FIXME. Only loads?
-
         if name not in self.global_vars_dict:
             self.global_vars_dict[name] = list()
 
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index b0a45b4f..d9f87d65 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -38,7 +38,7 @@
     torch.int16: "i16",
     torch.int8: "i8",
     torch.uint8: "i8",
-    torch.bool: "i1",
+    torch.bool: "i8",
     torch.bfloat16: "bf16",
 }
 
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index c38e3f9c..78990346 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -35,8 +35,6 @@ def load_tensor(self, arg, arg_name, arg_attribute, path):
         # path = os.path.join(dump_path, arg_name, f'{n_call}.raw')
         with open(path, 'rb') as f:
             np_array = np.fromfile(f, dtype=TORCH_TO_NUMPY[arg.dtype])
-            if (arg.dtype == torch.bool):
-                np_array = np.unpackbits(np_array)
             src_tensor = torch.as_strided(torch.from_numpy(np_array), arg.size(), arg.stride())
             arg.copy_(src_tensor.to(dtype=arg.dtype))
 
@@ -52,8 +50,6 @@ def write_arg(self, arg, path, name):
             data_path = os.path.join(dump_path, f'{index}.raw')
             tensor = arg.cpu()
             t_arr = tensor.numpy().flatten()
-            if (tensor.dtype == torch.bool):
-                t_arr = np.packbits(t_arr)
             t_arr.tofile(data_path)
         else:
             assert(0)

From 7a66eb6ac756d0086ae107104762d74baaf54116 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 14 Mar 2025 12:02:49 +0000
Subject: [PATCH 210/432] [Test/Mistral] Add concat test example

---
 tests/Mixtral_8x7B/test_attention.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index 1afd18f4..ebda0515 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -122,6 +122,23 @@ def test_ffn(device):
     cpu_res = cpu_model(cpu_prompt)
     test_result("FFN", res, cpu_res)
 
+def test_concat(device, size1=(1, 8, 32, 64), size2=(1, 8, 1, 64), dim=2):
+    def concat_tensors(a, b):
+        return torch.cat((a, b), dim=dim)
+
+    x = torch.randn(size1)
+    y = torch.randn(size2)
+    cpu_x = x.clone()
+    cpu_y = y.clone()
+    x = x.to(device=device)
+    y = y.to(device=device)
+
+    opt_fn = torch.compile(dynamic=False)(concat_tensors)
+    res = opt_fn(x, y)
+    out = concat_tensors(cpu_x, cpu_y)
+
+    test_result("ConcatTensors", res, out)
+
 if __name__ == "__main__":
     import os
     import sys
@@ -130,6 +147,7 @@ def test_ffn(device):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_decode(device, 32, 1)
+    test_decode(device, 32, 2)
+    #test_concat(device, size1=(1, 8, 32, 64), size2=(1,8,1,64), dim=2)
     #test_attention(device)
     #test_ffn(device)

From a05634f5c623baf687cc07c544bc08a6aa42c7e3 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 14 Mar 2025 12:03:35 +0000
Subject: [PATCH 211/432] [Frontend] Move index_set storage

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 5d827dd5..ded40566 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -105,7 +105,6 @@ def write_header(self):
         )
 
 class ExtensionOverrides(common.OpOverrides):
-    index_set = set()
     # Binary element wise operations
     @staticmethod
     def custom_cast(operand, target_type, *args, var_info=None, **kwargs):
@@ -698,7 +697,7 @@ def __init__(self, kernel_group):
         self.welford_reduce_out = None
         self.reduce_iterator = {}
         self.is_template_kernel = False
-
+        self.index_set = set()
     # padding type 0: zero-padding 1: negative-padding(-inf) ...
     def get_padding_type(self):
         ops = self.current_node.node.origins
@@ -1051,9 +1050,9 @@ def index_expr(self, index, dtype):
 
         renamed_symbols = {symbol: "d"+str(symbol)[5:] for symbol in index.free_symbols}
         renamed_expression = index.subs(renamed_symbols)
-        if index not in ExtensionOverrides.index_set:
+        if index not in self.index_set:
             # Register this operand
-            ExtensionOverrides.index_set.add(index)
+            self.index_set.add(index)
             ops._index_expr(tile_size, sram_var, renamed_expression, index)
 
         line = f"affine.vector_load %{sram_var}[0, 0, 0] : {tile_shape}, vector<{tile_numel_per_lane}x{mlir_dtype}> // {renamed_expression}"

From c98b214f1dbf45a508f0cbada158162427855ca7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 14 Mar 2025 12:14:50 +0000
Subject: [PATCH 212/432] [Frontend] Avoid hash collision for compiled folder

---
 PyTorchSimFrontend/extension_codecache.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index fd7c01df..a7d952d7 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -16,7 +16,7 @@
 LOCK_TIMEOUT = 600
 
 def hash_prefix(hash_value):
-    return hash_value[1:5]
+    return hash_value[1:12]
 
 def get_write_path(src_code):
     return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(get_hash(src_code.strip())))

From ff9602ac2bbc2e4aac3a215f0f4b189d9d8325aa Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 17 Mar 2025 03:14:08 +0000
Subject: [PATCH 213/432] [Test/mistral] Move kv_cache as an argument to be
 captured

---
 tests/Mixtral_8x7B/model.py          | 17 ++++++++---------
 tests/Mixtral_8x7B/test_attention.py | 10 +++-------
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/tests/Mixtral_8x7B/model.py b/tests/Mixtral_8x7B/model.py
index 9fc54bee..4c583a0b 100644
--- a/tests/Mixtral_8x7B/model.py
+++ b/tests/Mixtral_8x7B/model.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, List
 
 import torch
 import torch.nn as nn
@@ -116,9 +116,9 @@ def setup_caches(self, max_batch_size, max_seq_length):
         #for b in self.layers:
         #    b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim)
 
-    def forward(self, x: Tensor, mask, freqs_cis: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
+    def forward(self, x: Tensor, mask, freqs_cis: Tensor, input_pos: Optional[Tensor] = None, kv_cache: List[KVCache] = None) -> Tensor:
         for i, layer in enumerate(self.layers):
-            x = layer(x, input_pos, freqs_cis, mask)
+            x = layer(x, input_pos, freqs_cis, mask, kv_cache[i])
         x = self.norm(x)
         logits = self.output(x)
         return logits
@@ -136,8 +136,8 @@ def __init__(self, config: ModelArgs) -> None:
         self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
         self.attention_norm = RMSNorm(config.dim, config.norm_eps)
 
-    def forward(self, x: Tensor, input_pos: Tensor, freqs_cis: Tensor, mask: Tensor) -> Tensor:
-        h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
+    def forward(self, x: Tensor, input_pos: Tensor, freqs_cis: Tensor, mask: Tensor, kv_cache: KVCache = None) -> Tensor:
+        h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos, kv_cache)
         out = h + self.ffn(self.ffn_norm(h))
         return out
 
@@ -150,14 +150,13 @@ def __init__(self, config: ModelArgs):
         # key, query, value projections for all heads, but in a batch
         self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False)
         self.wo = nn.Linear(config.dim, config.dim, bias=False)
-        self.kv_cache = None
 
         self.n_head = config.n_head
         self.head_dim = config.head_dim
         self.n_local_heads = config.n_local_heads
         self.dim = config.dim
 
-    def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
+    def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optional[Tensor] = None, kv_cache: KVCache = None) -> Tensor:
         bsz, seqlen, _ = x.shape
 
         kv_size = self.n_local_heads * self.head_dim
@@ -174,8 +173,8 @@ def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optiona
 
         q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
 
-        if self.kv_cache is not None:
-            k, v = self.kv_cache.update(k, v)
+        if kv_cache is not None:
+            k, v = kv_cache.update(k, v)
 
         k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
         v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index ebda0515..0040b90e 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -42,10 +42,6 @@ def test_decode(device, prompt_length, nr_tokens):
     kv_caches = [KVCache(max_batch, max_seq, args.n_head, head_dim, torch.float32) for i in range(args.n_layer)]
     cpu_kv_caches = copy.deepcopy(kv_caches)
     kv_caches = [kv.to(device=device) for kv in kv_caches]
-    for idx, b in enumerate(model.layers):
-        b.attention.kv_cache = kv_caches[idx]
-    for idx, b in enumerate(cpu_model.layers):
-        b.attention.kv_cache = cpu_kv_caches[idx]
 
     for i in range(nr_tokens):
         input_pos = torch.arange(0, T)
@@ -62,8 +58,8 @@ def test_decode(device, prompt_length, nr_tokens):
         freqs_cis = freqs_cis.to(device=device)
 
         # Run models
-        res = opt_fn(prompt, mask, freqs_cis, input_pos)
-        cpu_res = cpu_model(cpu_prompt, cpu_mask, cpu_freqs_cis, cpu_input_pos)
+        res = opt_fn(prompt, mask, freqs_cis, input_pos, kv_caches)
+        cpu_res = cpu_model(cpu_prompt, cpu_mask, cpu_freqs_cis, cpu_input_pos, cpu_kv_caches)
         new_token = sample(cpu_res.cpu())[0]
         print(new_token)
         new_token = cpu_model.tok_embeddings(new_token).unsqueeze(1)
@@ -147,7 +143,7 @@ def concat_tensors(a, b):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_decode(device, 32, 2)
+    test_decode(device, 33, 3)
     #test_concat(device, size1=(1, 8, 32, 64), size2=(1,8,1,64), dim=2)
     #test_attention(device)
     #test_ffn(device)

From 52144b35937f53e8ebc74c4cc922db2efbd8971f Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Fri, 7 Mar 2025 12:04:03 +0000
Subject: [PATCH 214/432] [Fusion] Implement convolution residual connection
 fusion

---
 PyTorchSimFrontend/mlir/mlir_common.py        |   7 +
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 250 ++++++++++++------
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  19 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  34 +--
 PyTorchSimFrontend/mlir/mlir_template.py      | 170 +++++++++---
 tests/Fusion/test_conv_fusion.py              |  66 +++++
 6 files changed, 406 insertions(+), 140 deletions(-)
 create mode 100644 tests/Fusion/test_conv_fusion.py

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index d9f87d65..e07437ab 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -440,6 +440,13 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         tile_desc.implicit_dim_size = implicit_dim_size
         return tile_desc
 
+    def set_tile_size(self, template_store_info):
+        tile_desc = MLIRMultiDimTile(template_store_info['tile_size'],
+            self.vector_lane,
+            vlane_split_axis=template_store_info['vlane_split_axis'],
+            vlane_stride=template_store_info['vlane_stride'])
+        return tile_desc
+
     def codegen_nodes(self, nodes, kernel_name):
         _, (group, reduction_group) = max(
             nodes, key=lambda x: int(x.is_reduction())
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index acb242d1..255e6a3a 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -28,9 +28,17 @@
 // TILE_M = {{ TILE_M }}
 // TILE_N = {{ TILE_N }}
 // TILE_K = {{ TILE_K }}
-// TILE_M = {{ TILE_M }}
-// TILE_N = {{ TILE_N }}
-// TILE_K = {{ TILE_K }}
+// TILE_I_H={{ TILE_I_H }},
+// TILE_I_W={{ TILE_I_W }},
+// TILE_O_H={{ TILE_O_H }},
+// TILE_O_W={{ TILE_O_W }},
+// TILE_K_H={{ TILE_K_H }},
+// TILE_K_W={{ TILE_K_W }},
+// SUB_TILE_M={{ SUB_TILE_M }},
+// SUB_TILE_N={{ SUB_TILE_N }},
+// SUB_TILE_I_W={{ SUB_TILE_I_W }},
+// SUB_TILE_K_H={{ SUB_TILE_K_H }},
+// SUB_TILE_K_W={{ SUB_TILE_K_W }},
 // PADDING_H = {{ PADDING_H }}
 // PADDING_W = {{ PADDING_W }}
 // STRIDE_H = {{ STRIDE_H }}
@@ -54,7 +62,7 @@
 memref.global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 {{kernel.def_global_vars()}}
 
-func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
+func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
   %c_mvin = arith.constant 2 : index
   %c_mvin2 = arith.constant 1 : index
   %c_mvin3 = arith.constant 14 : index
@@ -74,7 +82,8 @@
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-{{ kernel.def_local_vars() }}
+  {{- kernel.def_local_vars() }}
+
   affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
     affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
       affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
@@ -122,8 +131,7 @@
             } { accumulation_loop=true }
           } { accumulation_loop=true }
           // Store output matrix
-          memref.dma_start %output_buffer[%c0, %c0, %c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
-              : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+          {{kernel.store_output(indent_size=10)}}
         } { outer_loop=true }
       } { outer_loop=true }
     } { outer_loop=true }
@@ -134,6 +142,38 @@
 
 MULTI_TILE_CONV_TEMPLATE = r"""
 // Multi Channel Tile Conv2D kernel
+// BATCH = {{ BATCH }}
+// I_C = {{ I_C }}
+// I_H = {{ I_H }}
+// I_W = {{ I_W }}
+// O_C = {{ O_C }}
+// K_H = {{ K_H }}
+// K_W = {{ K_W }}
+// O_H = {{ O_H }}
+// O_W = {{ O_W }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// TILE_I_H={{ TILE_I_H }},
+// TILE_I_W={{ TILE_I_W }},
+// TILE_O_H={{ TILE_O_H }},
+// TILE_O_W={{ TILE_O_W }},
+// TILE_K_H={{ TILE_K_H }},
+// TILE_K_W={{ TILE_K_W }},
+// SUB_TILE_M={{ SUB_TILE_M }},
+// SUB_TILE_N={{ SUB_TILE_N }},
+// SUB_TILE_I_W={{ SUB_TILE_I_W }},
+// SUB_TILE_K_H={{ SUB_TILE_K_H }},
+// SUB_TILE_K_W={{ SUB_TILE_K_W }},
+// PADDING_H = {{ PADDING_H }}
+// PADDING_W = {{ PADDING_W }}
+// STRIDE_H = {{ STRIDE_H }}
+// STRIDE_W = {{ STRIDE_W }}
+// DILATION_H = {{ DILATION_H }}
+// DILATION_W = {{ DILATION_W }}
+// DATA_STYPE = {{ DATA_STYPE }}
+// DATA_SIZE = {{ DATA_SIZE }}
+
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ I_C * STRIDE_W }} + d2 * {{ I_C * (I_W + 2 * PADDING_W) }} + d3)> // input (I_H, BATCH, I_W, I_C)
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
@@ -145,8 +185,9 @@
 memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+{{kernel.def_global_vars()}}
 
-func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
+func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
   %c_mvin = arith.constant 2 : index
   %c_mvin2 = arith.constant 1 : index
   %c_mvin3 = arith.constant 14 : index
@@ -166,6 +207,7 @@
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
+  {{- kernel.def_local_vars() }}
 
   affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
     affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
@@ -210,8 +252,7 @@
             } { accumulation_loop=true }
           } { accumulation_loop=true }
           // Store output matrix
-          memref.dma_start %output_buffer[%c0, %c0, %c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
-              : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+          {{kernel.store_output(indent_size=10)}}
         } { outer_loop=true }
       } { outer_loop=true }
     } { outer_loop=true }
@@ -234,9 +275,17 @@
 // TILE_M = {{ TILE_M }}
 // TILE_N = {{ TILE_N }}
 // TILE_K = {{ TILE_K }}
-// TILE_M = {{ TILE_M }}
-// TILE_N = {{ TILE_N }}
-// TILE_K = {{ TILE_K }}
+// TILE_I_H={{ TILE_I_H }},
+// TILE_I_W={{ TILE_I_W }},
+// TILE_O_H={{ TILE_O_H }},
+// TILE_O_W={{ TILE_O_W }},
+// TILE_K_H={{ TILE_K_H }},
+// TILE_K_W={{ TILE_K_W }},
+// SUB_TILE_M={{ SUB_TILE_M }},
+// SUB_TILE_N={{ SUB_TILE_N }},
+// SUB_TILE_I_W={{ SUB_TILE_I_W }},
+// SUB_TILE_K_H={{ SUB_TILE_K_H }},
+// SUB_TILE_K_W={{ SUB_TILE_K_W }},
 // PADDING_H = {{ PADDING_H }}
 // PADDING_W = {{ PADDING_W }}
 // STRIDE_H = {{ STRIDE_H }}
@@ -245,6 +294,7 @@
 // DILATION_W = {{ DILATION_W }}
 // DATA_STYPE = {{ DATA_STYPE }}
 // DATA_SIZE = {{ DATA_SIZE }}
+
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * O_H * O_C }} + d1 * {{ O_W * O_C }} + d2 * {{ O_C }} + d3)> // output (BATCH, O_H, O_W, O_C)
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * (I_H + 2 * PADDING_W) * I_C }} + d1 * {{ (I_W + 2 * PADDING_W) * I_C }} + d2 * {{ I_C }} + d3)> // input (BATCH, I_H, I_W, I_C) Stride should be changed if kernel stride > 1
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
@@ -255,7 +305,9 @@
 memref.global @X_spad : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
   %c_mvin = arith.constant 2 : index
   %c_mvin2 = arith.constant 1 : index
   %c_mvin3 = arith.constant 14 : index
@@ -275,6 +327,8 @@
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
+  {{- kernel.def_local_vars() }}
+
   affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
     affine.for %tile_m = 0 to {{ O_W }} step {{ TILE_M }} {
       affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
@@ -319,8 +373,7 @@
           } { accumulation_loop=true }
         } { accumulation_loop=true }
         // Store output matrix
-        memref.dma_start %output_buffer[%c0, %c0, %c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
-            : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+        {{kernel.store_output(indent_size=8)}}
       } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
@@ -342,9 +395,17 @@
 // TILE_M = {{ TILE_M }}
 // TILE_N = {{ TILE_N }}
 // TILE_K = {{ TILE_K }}
-// TILE_M = {{ TILE_M }}
-// TILE_N = {{ TILE_N }}
-// TILE_K = {{ TILE_K }}
+// TILE_I_H={{ TILE_I_H }},
+// TILE_I_W={{ TILE_I_W }},
+// TILE_O_H={{ TILE_O_H }},
+// TILE_O_W={{ TILE_O_W }},
+// TILE_K_H={{ TILE_K_H }},
+// TILE_K_W={{ TILE_K_W }},
+// SUB_TILE_M={{ SUB_TILE_M }},
+// SUB_TILE_N={{ SUB_TILE_N }},
+// SUB_TILE_I_W={{ SUB_TILE_I_W }},
+// SUB_TILE_K_H={{ SUB_TILE_K_H }},
+// SUB_TILE_K_W={{ SUB_TILE_K_W }},
 // PADDING_H = {{ PADDING_H }}
 // PADDING_W = {{ PADDING_W }}
 // STRIDE_H = {{ STRIDE_H }}
@@ -441,37 +502,46 @@
 """
 
 WRAPPER_TEMPLATE = r"""
-def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, {{ OUT }}):
+def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
     # Padding input
-    padded_shape = list({{ INPUT }}.shape)
+    padded_shape = list(X.shape)
     padded_shape[2] += 2 * {{ PADDING_H }}
     padded_shape[3] += 2 * {{ PADDING_W }}
-    {{ INPUT }}_padding = torch.zeros(padded_shape, device={{ INPUT }}.device)
-    {{ INPUT }}_padding[:, :, {{ PADDING_H }}:{{ INPUT }}.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:{{ INPUT }}.shape[3] + {{ PADDING_W }}] = {{ INPUT }}
+    X_padding = torch.zeros(padded_shape, device=X.device)
+    X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
+
+    # Holding original output tensor
+    {{kernel.def_wrapper(only_store_buffer=True)}}_t = {{kernel.def_wrapper(only_store_buffer=True)}}
 
     # Tanspose tensors
     {%- if MULTI_TILE %}
-    t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 0, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, BATCH, I_W, I_C)
+    X = X_padding.permute(2, 0, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, BATCH, I_W, I_C)
     {% elif SINGLE_BATCH %}
-    t_{{ INPUT }} = {{ INPUT }}_padding.permute(0, 2, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (BATCH, I_H, I_W, I_C)
+    X = X_padding.permute(0, 2, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (BATCH, I_H, I_W, I_C)
     {% else %}
-    t_{{ INPUT }} = {{ INPUT }}_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
+    X = X_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
     {% endif -%}
-    t_{{ WEIGHT }} = {{ WEIGHT }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
+    W = W.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
     {%- if SINGLE_BATCH %}
-    t_{{ OUT }} = {{ OUT }}.permute(0, 2, 3, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
+    {{kernel.def_wrapper(only_store_buffer=True)}} = {{kernel.def_wrapper(only_store_buffer=True)}}.permute(0, 2, 3, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
+    {%- for buf in EPILOGUE_READS %}
+    {{kernel.def_wrapper(epilogue_buffer=buf)}} = {{kernel.def_wrapper(epilogue_buffer=buf)}}.permute(0, 2, 3, 1).contiguous()
+    {%- endfor %}
     {% else %}
-    t_{{ OUT }} = {{ OUT }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
+    {{kernel.def_wrapper(only_store_buffer=True)}} = {{kernel.def_wrapper(only_store_buffer=True)}}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
+    {%- for buf in EPILOGUE_READS %}
+    {{kernel.def_wrapper(epilogue_buffer=buf)}} = {{kernel.def_wrapper(epilogue_buffer=buf)}}.permute(2, 3, 0, 1).contiguous()
+    {%- endfor %}
     {% endif -%}
-    {{ KERNEL_NAME }}(t_{{ INPUT }}, t_{{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, t_{{ OUT }})
+    {{ KERNEL_NAME }}<DEF_CONV_WRAPPER>
     {% if BACKENDSIM_EAGER_MODE %}
-    yield ({{KERNEL_NAME}}, (t_{{ INPUT }}, t_{{ WEIGHT }}{% if BIAS %}, {{ BIAS }} {% endif %}, t_{{ OUT }}))
+    yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
     {% endif %}
     # Transpose back
     {%- if SINGLE_BATCH %}
-    {{ OUT }}.copy_(t_{{ OUT }}.permute(0, 3, 1, 2).contiguous()) # (BATCH, O_H, O_W, O_C) -> (BATCH, O_C, O_H, O_W)
+    {{kernel.def_wrapper(only_store_buffer=True)}}_t.copy_({{kernel.def_wrapper(only_store_buffer=True)}}.permute(0, 3, 1, 2).contiguous()) # (BATCH, O_H, O_W, O_C) -> (BATCH, O_C, O_H, O_W)
     {% else %}
-    {{ OUT }}.copy_(t_{{ OUT }}.permute(2, 3, 0, 1).contiguous()) # (O_H, O_W, BATCH, O_C) -> (BATCH, O_C, O_H, O_W)
+    {{kernel.def_wrapper(only_store_buffer=True)}}_t.copy_({{kernel.def_wrapper(only_store_buffer=True)}}.permute(2, 3, 0, 1).contiguous()) # (O_H, O_W, BATCH, O_C) -> (BATCH, O_C, O_H, O_W)
     {% endif -%}
 """
 
@@ -504,28 +574,11 @@ def is_multi_tile(self, I_C):
     def is_single_batch(self, BATCH):
         return BATCH == 1
 
-    # Can use math.multi ?
-    def def_kernel(self) ->str:
-        X, W = self.input_nodes[0], self.input_nodes[1]
-        Y = self.output_node
-        if len(self.input_nodes) == 3:
-          Bias = self.input_nodes[2]
-        else:
-          Bias = None
-
+    def get_padded_input_size(self, X):  # element count of X after growing dims 2 and 3 by twice the matching padding
         input_padded = list(X.layout.size)
         input_padded[2] += 2 * self.padding[0]
         input_padded[3] += 2 * self.padding[1]
-        input_size = math.prod(input_padded)
-        weight_size = math.prod(W.layout.size)
-        if Bias is not None:
-          bias_size = math.prod(Bias.layout.size)
-        output_size = math.prod(Y.layout.size)
-
-        if Bias is None:
-          return f"%{self.kernel_args[0]}: memref<{input_size}xf32>, %{self.kernel_args[1]}: memref<{weight_size}xf32>, %{self.kernel_args[3]}: memref<{output_size}xf32>"
-        else:
-          return f"%{self.kernel_args[0]}: memref<{input_size}xf32>, %{self.kernel_args[1]}: memref<{weight_size}xf32>, %{self.kernel_args[2]}: memref<{bias_size}xf32>, %{self.kernel_args[3]}: memref<{output_size}xf32>"
+        return math.prod(input_padded)  # a single number (product of all padded dims), not a per-dimension shape
 
     def render(self,
                kernel: MLIRTemplateKernel,
@@ -534,9 +587,8 @@ def render(self,
                **kwargs):
         if template_buffer_node is not None:
             self.output_node = template_buffer_node
-        # if epilogue_nodes is not None and len(epilogue_nodes) > 0:
-        #     self.output_node = cast(Buffer, epilogue_nodes[-1])
-        #     self.function_name += f"_fused_{epilogue_nodes[0].node.origin_node.name}"
+        self.kernel = kernel
+        self.epilogue_nodes = epilogue_nodes
 
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
@@ -589,6 +641,9 @@ def render(self,
         elif self.is_single_batch(BATCH) and self.stride[0] == 1:
           conv_template = SINGLE_BATCH_CONV_TEMPLATE
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
+          TILE_N = 32
+          TILE_K = 32
+          TILE_O_H = 8
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
           SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
@@ -607,8 +662,12 @@ def render(self,
 
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
-            KERNEL_DEF=self.def_kernel(),
             kernel=kernel,
+            X=X,
+            W=W,
+            BIAS=Bias,
+            Y=Y,
+            PADDED_INPUT_SIZE=self.get_padded_input_size(X),
             BATCH=X.layout.size[0],
             I_C=X.layout.size[1],
             I_H=X.layout.size[2],
@@ -641,7 +700,25 @@ def render(self,
             DILATION_W=self.dilation[1],
             DATA_STYPE="f32",
             DATA_SIZE=4,
-            BIAS=Bias
+            input_reorder=self.input_reorder
+        )
+
+        kernel.store_info = dict(
+            output_node = self.output_node.name,
+            dependent_buf = [],
+            sram_var = "output_buffer",
+            dram_var = "Y",
+            index_var = "index0",
+            tag_var = "tag",
+            vlane_split_axis = 3,
+            vlane_stride = 1,
+            mlir_dtype = kernel.render_options['DATA_STYPE'],
+            tile_nr_dim = 4,
+            dram_shape = f"memref<{BATCH * O_C * O_H * O_W}x{kernel.render_options['DATA_STYPE']}>",
+            tile_shape = f"memref<{TILE_O_H}x{TILE_O_W}x{TILE_M}x{TILE_N}x{kernel.render_options['DATA_STYPE']}, 1>" if conv_template in (CONV_TEMPLATE, MULTI_TILE_CONV_TEMPLATE)
+                          else f"memref<1x{TILE_O_H}x{TILE_M}x{TILE_N}x{kernel.render_options['DATA_STYPE']}, 1>",
+            tile_size = (TILE_O_H, TILE_O_W, TILE_M, TILE_N) if conv_template in (CONV_TEMPLATE, MULTI_TILE_CONV_TEMPLATE) else (1, TILE_O_H, TILE_M, TILE_N),
+            tile_stride = [TILE_O_W * TILE_M * TILE_N, TILE_M * TILE_N, 1, TILE_M]
         )
         code = self._template_from_string(conv_template).render(**kernel.render_options)
 
@@ -653,26 +730,52 @@ def render(self,
         self.gem5_header += f"float Y_spad[{y_spad_size}] __attribute__ ((section(\".spad\")));\n"
 
         kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
-        kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=self.input_reorder)
 
         return code
 
     def outer_func_render(self, kernel_name, input_args):
+        X, W = self.input_nodes[0], self.input_nodes[1]
+        Y = self.output_node
+        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]  # a third input node, when present, is treated as the bias
+
+        # The wrapper function has to transpose the tensors used by epilogue nodes the same way as the convolution inputs/outputs.
+        # Currently only the tensors that epilogue nodes read are transposed; the collected writes are not used yet.
+        epilogue_reads = []
+        epilogue_writes = []
+        if self.epilogue_nodes is not None:  # self.epilogue_nodes (and self.kernel used below) are assigned in render()
+          main_node_rw = [X.get_name(), W.get_name(), Y.get_name()]  # names already handled as convolution inputs/outputs
+          if Bias is not None:
+            main_node_rw.append(Bias.get_name())
+
+          for epilogue_node in self.epilogue_nodes:
+            reads = epilogue_node.read_writes.reads
+            for read in list(reads):
+              if read[0] not in main_node_rw:  # read[0] is presumably the buffer name of the dependency - confirm
+                epilogue_reads.append(read[0])  # NOTE(review): no de-duplication; a buffer read by several epilogue nodes is appended once per read
+
+            writes = epilogue_node.read_writes.writes
+            for write in list(writes):
+              if write[0] not in main_node_rw:
+                epilogue_writes.append(write[0])
+
         eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False))
         options = dict(
+            kernel=self.kernel,
             KERNEL_NAME=kernel_name,
             FUNC_NAME=self.function_name,
-            INPUT=input_args[0],
-            WEIGHT=input_args[1],
-            BIAS=0 if len(input_args) == 3 else input_args[2],
-            OUT=input_args[3] if len(input_args) == 4 else input_args[2],
+            INPUT=X,
+            WEIGHT=W,
+            BIAS=Bias,
+            OUTPUT=Y,
+            EPILOGUE_READS=epilogue_reads,
+            EPILOGUE_WRITES=epilogue_writes,  # NOT USED YET
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
             MULTI_TILE=self.is_multi_tile(self.input_shape[1]),
             SINGLE_BATCH=self.is_single_batch(self.input_shape[0]),
             VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
             BACKENDSIM_EAGER_MODE=eager_mode,
-            HASH_VALUE=self.hash_value
+            input_reorder=self.input_reorder
         )
         code = self._template_from_string(WRAPPER_TEMPLATE).render(**options)
         return code, self.function_name
@@ -680,18 +783,10 @@ def outer_func_render(self, kernel_name, input_args):
     def get_arg_attributes(self):
         arg_attributes = []
 
-        X, W = self.input_nodes[0], self.input_nodes[1]
-        Y = self.output_node
-        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
-
+        X = self.input_nodes[0]
         X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)]
         X_shape[0] += 2 * self.padding[0]
         X_shape[1] += 2 * self.padding[1]
-        W_shape = [W.get_size()[i] for i in (2, 3, 1, 0)]
-        Y_shape = [Y.get_size()[i] for i in (2, 3, 0, 1)]
-
-        if Bias is not None:
-          Bias_shape = Bias.get_size()
 
         def compute_stride(shape):
             stride = [1] * len(shape)
@@ -700,16 +795,7 @@ def compute_stride(shape):
             return stride
 
         X_stride = compute_stride(X_shape)
-        W_stride = compute_stride(W_shape)
-        Y_stride = compute_stride(Y_shape)
-        if Bias is not None:
-          Bias_stride = compute_stride(Bias_shape)
-
-        arg_attributes.append([self.kernel_args[0], [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]])
-        arg_attributes.append([self.kernel_args[1], [MLIRKernelArgs.MLIR_ARGS_IN, W.layout.dtype, math.prod(W_shape), W_shape, W_stride]])
-        if Bias is not None:
-          arg_attributes.append([self.kernel_args[2], [MLIRKernelArgs.MLIR_ARGS_IN, Bias.layout.dtype, math.prod(Bias_shape), Bias_shape, Bias_stride]])
-        arg_attributes.append([self.kernel_args[3], [MLIRKernelArgs.MLIR_ARGS_OUT, Y.layout.dtype, math.prod(Y_shape), Y_shape, Y_stride]])
+        arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]])
 
         return arg_attributes
 
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index fd85e39f..565e8290 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -61,7 +61,7 @@
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
-      {{kernel.store_output(vlane_split_axis=1)}}
+      {{kernel.store_output(indent_size=6)}}
     } { outer_loop=true }
   } { outer_loop=true }
   return
@@ -159,6 +159,23 @@ def render(self,
             epilogue_nodes = epilogue_nodes,
             input_reorder = self.input_reorder
         )
+
+        kernel.store_info = dict(
+            output_node = self.output_node.name,
+            dependent_buf = [],
+            sram_var = "Y_buffer",
+            dram_var = "Y",
+            index_var = "index2",
+            tag_var = "tag",
+            vlane_split_axis = 1,
+            vlane_stride = 1,
+            mlir_dtype = kernel.render_options['DATA_STYPE'],
+            tile_nr_dim = 2,
+            dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
+            tile_shape = f"memref<{TILE_M}x{TILE_N}x{kernel.render_options['DATA_STYPE']}, 1>",
+            tile_size = (TILE_M, TILE_N),
+            tile_stride = [1, TILE_M]
+        )
         code = self._template_from_string(template).render(**kernel.render_options)
 
         self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 3c33726c..e3605b74 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -51,11 +51,11 @@ def can_fuse_horizontal(self, node1, node2):
                 return False
 
             # Convolution is currently not supported
-            if node1.is_template() and node1.get_nodes()[0].node.origin_node is not None and hasattr(node1.get_nodes()[0].node.origin_node.target, "_name") and node1.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
-                return False
+            # if node1.is_template() and node1.get_nodes()[0].node.origin_node is not None and hasattr(node1.get_nodes()[0].node.origin_node.target, "_name") and node1.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
+            #     return False
 
-            if node2.is_template() and node2.get_nodes()[0].node.origin_node is not None and hasattr(node2.get_nodes()[0].node.origin_node.target, "_name") and node2.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
-                return False
+            # if node2.is_template() and node2.get_nodes()[0].node.origin_node is not None and hasattr(node2.get_nodes()[0].node.origin_node.target, "_name") and node2.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
+            #     return False
 
             v1_total = math.prod(vars1) if len(vars1) else 0
             v2_total = math.prod(vars2) if len(vars2) else 0
@@ -111,11 +111,13 @@ def flush(self):
         self._set_flush_status(False)
 
     def define_function(self, kernel):
-        code, function_name = kernel.def_function()
-        if code is not None and function_name not in self.outer_function:
-            wrapper = V.graph.wrapper_code
-            wrapper.header.writeline(code)
-            self.outer_function.add(function_name)
+        partial_code, function_name = kernel.def_function()  # partial_code must expose finalize(); it may be None
+        if partial_code is not None and function_name not in self.outer_function:  # write each outer function to the header only once
+            with V.set_kernel_handler(kernel):  # finalize() runs with this kernel installed as the active kernel handler
+                code = partial_code.finalize()  # presumably fills in the <...> placeholders via the registered render hooks - confirm
+                wrapper = V.graph.wrapper_code
+                wrapper.header.writeline(code)
+                self.outer_function.add(function_name)
 
     def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size=None, origins={}):
         wrapper = V.graph.wrapper_code
@@ -134,7 +136,7 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size
             wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False)
         return kernel_name
 
-    def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
+    def codegen_template_code(self, kernel, render, template_node, epilogue_nodes):
         with kernel:
             for node in [template_node, *epilogue_nodes]:
                 node.mark_run()
@@ -143,13 +145,15 @@ def codegen_src_code(self, kernel, render, template_node, epilogue_nodes):
                 _, (group, reduction_group) = max(
                     epilogue_nodes, key=lambda x: int(x.is_reduction())
                 ).group
-                vars, reduction_vars = kernel.set_ranges(group, reduction_group)
-                tile_desc = kernel.compute_tile_size(epilogue_nodes, vars, reduction_vars)
+                vars, reduction_vars = kernel.set_ranges(group, reduction_group)    # Do we need this?
+                tile_desc = kernel.set_tile_size(kernel.store_info)
                 kernel.kernel_group.set_tile_info(tile_desc)
-                kernel.adjust_tile_size()
             # Flush created varaibles, since template fusion doen't share variable
             kernel.cse.cache.clear()
             for node in epilogue_nodes:
+                if template_node.node.name in [dep[0] for dep in list(node.read_writes.reads)]:
+                    kernel.store_info['dependent_buf'].append(node.node.name)
+                kernel.store_info
                 node.codegen((vars, reduction_vars))
         with V.set_kernel_handler(kernel):
             src_code = (
@@ -165,13 +169,13 @@ def codegen_template(self, template_node, epilogue_nodes):
         kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group)
         _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
 
-        src_code = self.codegen_src_code(kernel, render, template_node, epilogue_nodes)
+        src_code = self.codegen_template_code(kernel, render, template_node, epilogue_nodes)
         wrapper = V.graph.wrapper_code
 
         if src_code in wrapper.src_to_kernel: # [CONV] check inner function is already defined
             kernel_name = wrapper.src_to_kernel[src_code]
             kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes, kernel_name=kernel_name) # update kernel name
-            src_code = self.codegen_src_code(kernel, render, template_node, epilogue_nodes)
+            src_code = self.codegen_template_code(kernel, render, template_node, epilogue_nodes)
 
         with V.set_kernel_handler(kernel):
             spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\"), aligned({kernel.spad_info['spad_size']*kernel.vector_lane})));"
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index a8d117c0..c5e7bfed 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -247,9 +247,13 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio
 
     def meta_kernel(self):
         wrapper = V.graph.wrapper_code
-        arg_attributes = self.kernel_arg_attributes
-        if arg_attributes is None:
-            _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
+        kernel_arg_attributes = self.kernel_arg_attributes
+        _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
+        if kernel_arg_attributes is not None:
+            for name, attr in kernel_arg_attributes:
+                for idx in range(len(arg_attributes)):
+                    if arg_attributes[idx][0] == name:
+                        arg_attributes[idx][1] = attr
         wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')')
         wrapper.add_import_once(f'\nfrom PyTorchSimFrontend.extension_codecache import CustomAsyncCompile')
         wrapper.add_import_once(f'\ncustom_async_compile = CustomAsyncCompile()')
@@ -267,26 +271,27 @@ def call_kernel(self, kernel_name):
             kernel_name if self.outer_func_name is None else self.outer_func_name,
             call_args, cuda=False)
 
-    def codegen_body(self, vlane_split_axis):
-        def template_store(options):
-            sram_var = "Y_buffer"
-            dram_var = "Y"
-            index_var = "index2"
-            tag_var = "tag"
-            vlane_stride = 1
-            mlir_dtype = "f32"
-            dram_shape = f"memref<{options['Y_numel']}x{mlir_dtype}>"
-            tile_shape = f"memref<{options['TILE_M']}x{options['TILE_N']}x{mlir_dtype}, 1>"
+    def codegen_body(self):
+        def template_store():
+            sram_var = self.store_info["sram_var"]
+            dram_var = self.store_info["dram_var"]
+            index_var = self.store_info["index_var"]
+            tag_var = self.store_info["tag_var"]
+            vlane_split_axis = self.store_info["vlane_split_axis"]
+            vlane_stride = self.store_info["vlane_stride"]
+            mlir_dtype = self.store_info["mlir_dtype"]
+            dram_shape = self.store_info["dram_shape"]
+            tile_shape = self.store_info["tile_shape"]
             zero_cse = self.get_const_cse(0)
-            sram_index_var = ",".join([f"%{zero_cse}"] * 2)
-            tile_stride = [1, options['TILE_M']]
+            sram_index_var = ",".join([f"%{zero_cse}"] * self.store_info["tile_nr_dim"])
+            tile_stride = self.store_info['tile_stride']
             code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  tag_var, dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.stores, code, assignment = False)
         self.body.splice(self.loads)
         self.body.splice(self.compute)
         if len(self.stores._lines) == 0:
-            template_store(self.render_options)
+            template_store()
         self.body.splice(self.stores)
         self.loads.clear()
         self.compute.clear()
@@ -327,7 +332,7 @@ def def_kernel(
                     extra_node[node.get_name()] = node.node
                 else:
                     extra_node[node.get_name()] = node
-                self.buffer_names[node.get_name()] = 'Y_buffer'   #TODO: Buffer name fixed
+                self.buffer_names[node.get_name()] = self.store_info['sram_var']
 
         def hook():
             arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node)
@@ -337,6 +342,91 @@ def hook():
         self.render_hooks["<DEF_KERNEL>"] = hook
         return "<DEF_KERNEL>"
 
+    # Temporary helper for convolution templates: the convolution kernel currently does not handle padding itself.
+    # Padding is done by the Python wrapper, so the padded input size is patched into the first kernel argument here.
+    def def_conv_kernel(
+        self,
+        inputs: List[IRNode],
+        outputs: List[IRNode],
+        names_str: str = "",  # comma-separated argument names: inputs first, then outputs
+        padded_input_size: List[int] = [],  # NOTE(review): only used via str() in kernel_hook; the conv template appears to pass one int, so annotation/default look off - confirm
+        input_reorder: Optional[List[int]] = None,  # order in which inputs are registered; None means 0..len(inputs)-1
+    ) -> str:
+        names = [x.strip() for x in names_str.strip().split(",")]
+        if len(inputs) + len(outputs) != len(names):  # exactly one name per input/output node is required
+            raise RuntimeError(
+                f"{len(inputs) + len(outputs)=} != {len(names)=}, {inputs=}, {outputs=}, {names=}"
+            )
+
+        if input_reorder is not None:
+            assert len(inputs) == len(input_reorder)
+        else:
+            input_reorder = list(range(len(inputs)))
+
+        for idx in input_reorder:  # register each non-None input under its template name
+            name = names[idx]
+            node = inputs[idx]
+            if node is not None:  # None entries are skipped (presumably an absent optional input such as Bias)
+                self.named_nodes[name] = node
+                self.kernel_group.args.input_buffers[node.get_name()] = name
+
+        self.extra_node = {}  # stored on self; wrapper_hook in def_wrapper reads self.extra_node too
+        for name, node in zip(names[len(inputs) : len(inputs) + len(outputs)], outputs):
+            if node is not None:
+                self.named_nodes[name] = node
+                self.kernel_group.args.output_buffers[node.get_name()] = name
+                self.store_buffer_names.add(node.get_name())    #TODO: Is this enough not calling store() in mlir_common.py?
+                self.extra_node[node.get_name()] = node
+                self.buffer_names[node.get_name()] = self.store_info['sram_var']   #TODO: Buffer name fixed
+
+        def kernel_hook():  # deferred: builds the "(arg, ...)" signature text only when the hook is called
+            arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=self.extra_node)
+            arg_defs[0] = re.sub(r'(\d+)(?=xf32)', str(padded_input_size), arg_defs[0])  # replaces every digit run directly before "xf32" in the first arg def (presumably the padded input); f32 is hard-coded
+            return f"({', '.join(arg_defs)})"
+
+        assert "<DEF_CONV_KERNEL>" not in self.render_hooks  # the hook may be registered only once per render_hooks dict
+        self.render_hooks["<DEF_CONV_KERNEL>"] = kernel_hook
+        return "<DEF_CONV_KERNEL>"  # placeholder string returned to the template in place of the real signature
+
+    # Registers a render hook for the convolution wrapper function and returns the placeholder string that names it.
+    def def_wrapper(self, only_store_buffer: bool = False, epilogue_buffer: str = False):  # only_store_buffer is checked first, then epilogue_buffer; with neither set, the full argument-list hook is used
+        def wrapper_store_buf_hook():  # returns the output_buffers entry for store_info['output_node'], or for a dependent buffer
+            output_bufs = self.kernel_group.args.output_buffers
+            if self.store_info['output_node'] not in output_bufs:
+                assert False, f"Output buffer {self.store_info['output_node']} not found in {output_bufs}"
+            if output_bufs[self.store_info['output_node']] == 'REMOVED':  # output entry is marked 'REMOVED': fall back to a dependent buffer
+                if len(output_bufs) == 1 or len(self.store_info['dependent_buf']) == 0:
+                    assert False, "Output buffer is removed and no other output buffer is found"
+                return output_bufs[self.store_info['dependent_buf'][0]]  # FIXME: Only using the first dependent buffer
+            else:
+                return output_bufs[self.store_info['output_node']]
+
+        def wrapper_epilogue_buf_hook(name):  # returns the input_buffers entry for `name`; NOTE(review): the `assert False` checks in these hooks vanish under python -O
+            if name not in self.kernel_group.args.input_buffers:
+                assert False, f"Input buffer {name} not found in {self.kernel_group.args.input_buffers}"
+            return self.kernel_group.args.input_buffers[name]
+
+        def wrapper_hook():  # returns "(a, b, ...)" built from the kernel argument definitions; reads self.extra_node, which def_conv_kernel assigns
+            arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=self.extra_node)
+            wrapper_arg_defs = [arg.split('%')[1].split(':')[0] for arg in arg_defs]  # keep the text between the first '%' and the following ':' of each arg def
+            return f"({', '.join(wrapper_arg_defs)})"
+
+        if only_store_buffer:
+            if "<DEF_CONV_WRAPPER_STORE_BUFFER>" not in self.render_hooks:  # repeated calls reuse the hook that is already registered
+                self.render_hooks["<DEF_CONV_WRAPPER_STORE_BUFFER>"] = wrapper_store_buf_hook
+            return "<DEF_CONV_WRAPPER_STORE_BUFFER>"
+        if epilogue_buffer:  # truthy buffer name: one placeholder/hook pair per distinct name
+            if f"<DEF_CONV_WRAPPER_EPILOGUE_BUFFER_{epilogue_buffer}>" not in self.render_hooks:
+                self.render_hooks[f"<DEF_CONV_WRAPPER_EPILOGUE_BUFFER_{epilogue_buffer}>"] = functools.partial(
+                    wrapper_epilogue_buf_hook,
+                    name=epilogue_buffer  # bound now so the hook can later be called without arguments
+                )
+            return f"<DEF_CONV_WRAPPER_EPILOGUE_BUFFER_{epilogue_buffer}>"
+        else:
+            if "<DEF_CONV_WRAPPER>" not in self.render_hooks:
+                self.render_hooks["<DEF_CONV_WRAPPER>"] = wrapper_hook
+            return "<DEF_CONV_WRAPPER>"
+
     def output_name(self):
         # Cannot know the output name from the template, so we need to hook it
         def hook():
@@ -349,10 +439,10 @@ def hook():
         self.render_hooks["<OUPUT>"] = hook
         return "<OUPUT>"
 
-    def store_output(self, vlane_split_axis=1):
+    def store_output(self, indent_size: int = 0):
         def hook():
-            self.codegen_body(vlane_split_axis)
-            return textwrap.indent(self.body.getvalue(), "      ").strip()  #TODO: First line is not indented
+            self.codegen_body()
+            return textwrap.indent(self.body.getvalue(), " "*indent_size).strip()
 
         assert "<STORE_OUTPUT>" not in self.render_hooks
         self.render_hooks["<STORE_OUTPUT>"] = hook
@@ -362,7 +452,11 @@ def hook():
     def def_function(self):
         _, call_args, _ = self.kernel_group.args.python_argdefs()
         if self.outer_func_render is not None:
-            return self.outer_func_render(input_args=call_args)
+            partial_code, function_name = self.outer_func_render(input_args=call_args)
+            return PartialRender(
+                partial_code,
+                self.render_hooks,
+            ), function_name
         else:
             return None, None
 
@@ -390,10 +484,13 @@ def hook():
         self.render_hooks[key] = hook
         return key
 
-    def render(self, template, kwargs):
+    def render(self, template, kwargs, define_function=None):
         # self.render_hooks = {}
+        code = template.render(**kwargs)
+        if define_function is not None:
+            define_function(self)
         return PartialRender(
-            template.render(**kwargs),
+            code,
             self.render_hooks,
         )
 
@@ -401,16 +498,8 @@ def get_spad_size_per_lane(self, tile_m, tile_n):
         size = tile_m * ((tile_n + self.vector_lane - 1) // self.vector_lane)
         return max(size, 2) # vector load/store
 
-    def adjust_tile_size(self):
-        # Fixed tile size for template kernel
-        self.kernel_group.tile_desc.set_tile_size((self.render_options['TILE_M'], self.render_options['TILE_N']))
-        self.kernel_group.tile_desc.vlane_split_axis = 1 # FIXME: Fixed
-        self.kernel_group.tile_desc.vlane_stride = 1 # FIXME: Fixed
-        return
-
     def load_epilogue(self, name: str, index: sympy.Expr):
-        #index_var = self.parse_indices(index)
-        index_var = "index2"
+        index_var = self.store_info['index_var']
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
@@ -422,8 +511,7 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             # Allocate sram buffer
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
             tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-            # tile_stride = self.kernel_group.tile_desc.get_tile_stride()
-            tile_stride = [1, self.render_options['TILE_M']] # FIXME: Fixed
+            tile_stride = self.store_info['tile_stride']
             sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.loads, index_var, index)
             self.buffer_names[name] = sram_var
             code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
@@ -435,14 +523,14 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         operation = "affine.vector_load" if tile_numel_per_lane > 1 else "affine.load"
         shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
         zero_var = self.get_const_cse(0)
-        line = f"{operation} %{sram_var}[%{zero_var}, %{zero_var}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{mlir_dtype}, 1>{shape}"
+        tile_indices = ",".join([f"%{zero_var}"] * self.store_info["tile_nr_dim"])
+        line = f"{operation} %{sram_var}[{tile_indices}] : {self.store_info['tile_shape']}{shape}"
         out = self.cse.generate(self.loads, line)
         self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
-        #index_var = self.parse_indices(index)
-        index_var = "index2"
+        index_var = self.store_info['index_var']
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -452,8 +540,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-        # tile_stride = self.kernel_group.tile_desc.get_tile_stride()
-        tile_stride = [1, self.render_options['TILE_M']] # FIXME: Fixed
+        tile_stride = self.store_info['tile_stride']
 
         if name not in self.buffer_names:
             sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.stores, index_var, index)
@@ -467,10 +554,9 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         operation = "affine.vector_store" if tile_numel_per_lane > 1 else "affine.store"
         shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
         zero_var = self.get_const_cse(0)
-        line = f"{operation} %{value}, %{sram_var}[%{zero_var}, %{zero_var}] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{mlir_dtype}, 1>{shape}"
+        tile_indices = ",".join([f"%{zero_var}"] * self.store_info["tile_nr_dim"])
+        line = f"{operation} %{value}, %{sram_var}[{tile_indices}] : {tile_shape}{shape}"
         self.cse.generate(self.stores, line, assignment = False)
-
-        index_var = "index2"                # FIXME. Is it okay?
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.cse.generate(self.stores, code, assignment = False)
diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py
new file mode 100644
index 00000000..26d8abd9
--- /dev/null
+++ b/tests/Fusion/test_conv_fusion.py
@@ -0,0 +1,66 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        print("Failed")
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        # exit(1)
+
+def test_conv_residual(device, batch_size=1, in_channels=8, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=0):
+    def custom_conv2d(a, b, bias, c):
+        i_c = a.shape[1]
+        o_c = b.shape[0]
+        conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=True)
+        conv2d.weight = torch.nn.Parameter(b)
+        conv2d.bias = torch.nn.Parameter(bias)
+        return conv2d(a) + c
+    torch.manual_seed(0)
+    conv_input = torch.randn(batch_size, in_channels, input_size, input_size).to(memory_format=torch.channels_last, device=device)
+    conv_kernel = torch.randn(out_channels, in_channels, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device)
+    conv_bias = torch.randn(out_channels).to(device=device)
+    o_h = (input_size + 2 * padding - kernel_size) // stride + 1
+    o_w = (input_size + 2 * padding - kernel_size) // stride + 1
+    add_tensor = torch.randn(batch_size, out_channels, o_h, o_w).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(custom_conv2d)
+    res = opt_fn(conv_input, conv_kernel, conv_bias, add_tensor)
+    out = custom_conv2d(conv_input.cpu(), conv_kernel.cpu(), conv_bias.cpu(), add_tensor.cpu())
+    test_result("Conv2d Residual Fusion Forward", res, out, rtol=1e-3, atol=1e-3)
+    print("Max diff > ", torch.max(torch.abs(res.cpu() - out)))
+
+
+def test_conv_scalar(device, batch_size=1, in_channels=8, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=0):
+    def custom_conv2d(a, b, bias, c):
+        i_c = a.shape[1]
+        o_c = b.shape[0]
+        conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=False)
+        conv2d.weight = torch.nn.Parameter(b)
+        # conv2d.bias = torch.nn.Parameter(bias)
+        return conv2d(a) * c
+    torch.manual_seed(0)
+    conv_input = torch.randn(batch_size, in_channels, input_size, input_size).to(memory_format=torch.channels_last, device=device)
+    conv_kernel = torch.randn(out_channels, in_channels, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device)
+    conv_bias = torch.randn(out_channels).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(custom_conv2d)
+    res = opt_fn(conv_input, conv_kernel, conv_bias, 2)
+    out = custom_conv2d(conv_input.cpu(), conv_kernel.cpu(), conv_bias.cpu(), 2)
+    test_result("Conv2d + Scalar Fusion Forward", res, out, rtol=1e-3, atol=1e-3)
+    print("Max diff > ", torch.max(torch.abs(res.cpu() - out)))
+
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_conv_residual(device, batch_size=1, in_channels=8, out_channels=32, input_size=32, kernel_size=3, stride=1, padding=1)
\ No newline at end of file

From baabc5666e455ba3e6c1f51a7752d749f28b70d5 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Tue, 11 Mar 2025 11:54:56 +0000
Subject: [PATCH 215/432] [Fusion] Convolution fusion debug

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 14 +++++------
 PyTorchSimFrontend/mlir/mlir_template.py      | 24 +++++++++----------
 tests/Fusion/test_conv_fusion.py              | 10 +++++++-
 3 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 255e6a3a..0a6e259f 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -593,6 +593,7 @@ def render(self,
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
+        n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
 
         BATCH = X.layout.size[0]
         I_C = X.layout.size[1]
@@ -602,7 +603,7 @@ def render(self,
         O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2]
         O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
 
-        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation)
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
         TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
@@ -618,7 +619,7 @@ def render(self,
         TOG_latency = BATCH if TILE_M > BATCH else TILE_M
         if self.is_single_batch(BATCH) and self.stride[0] != 1:
           conv_template = SINGLE_BATCH_CONV_STRIDE_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
           x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_I_H * TILE_M, TILE_K)
           y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N)
@@ -629,7 +630,7 @@ def render(self,
           TOG_latency = O_W if TILE_M > O_W else TILE_M
         elif self.is_multi_tile(I_C):
           conv_template = MULTI_TILE_CONV_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation)
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
           x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
@@ -640,10 +641,9 @@ def render(self,
           y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
         elif self.is_single_batch(BATCH) and self.stride[0] == 1:
           conv_template = SINGLE_BATCH_CONV_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation) # TODO: implement K_W
-          TILE_N = 32
-          TILE_K = 32
-          TILE_O_H = 8
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
+          if TILE_O_W > O_W:  # FIXME: Temporary workaround for single batch fusion
+            TILE_O_W = O_W
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
           SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index c5e7bfed..44a73c88 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -154,14 +154,14 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0):
                         mapping = (tile_M, tile_N, tile_K)
         return mapping
 
-    def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation):
+    def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0):
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
         max_spad_per_lane = spad_size_per_lane // 2 # double buffer
 
         max_used_spad_size = 0
-        M, N, K = self.gemm_combination_mapping(M, N, K)
+        M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node)
         max_k_h_w = 1 # maximize kernel size
         for o_h in sympy.divisors(O_H):
             for o_w in sympy.divisors(O_W):
@@ -172,10 +172,10 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
                         weight_size = k_w * k_h * K * N
                         input_size = i_w * i_h * M * K
                         output_size = o_w * o_h * M * N
-                        used_spad_size = (weight_size + input_size + output_size) * self.precision
+                        used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
                         weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
                         input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
-                        output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M, N)
+                        output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M  * (1 + n_extra_node), N)
                         used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
                         if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w:
                             max_used_spad_size = used_spad_size
@@ -185,14 +185,14 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
             raise RuntimeError("Cannot find a valid mapping")
         return mapping
 
-    def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation):
+    def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0):
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2
         max_spad_per_lane = spad_size_per_lane // 2
 
         max_used_spad_size = 0
-        M, N, K = self.gemm_combination_mapping(M, N, K * K_W)
+        M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node)
         max_k_h_w = K_W
         for o_h in sympy.divisors(O_H):
             for o_w in sympy.divisors(O_W):
@@ -202,10 +202,10 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation)
                     weight_size = 1 * k_h * K * N
                     input_size = i_w * i_h * M * K
                     output_size = o_w * o_h * M * N
-                    used_spad_size = (weight_size + input_size + output_size) * self.precision
+                    used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
                     weight_size_per_lane = self.get_spad_size_per_lane(1 * k_h * K, N)
                     input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
-                    output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M, N)
+                    output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M  * (1 + n_extra_node), N)
                     used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
                     if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h:
                         max_used_spad_size = used_spad_size
@@ -215,14 +215,14 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation)
             raise RuntimeError("Cannot find a valid mapping")
         return mapping
 
-    def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation):
+    def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0):
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2
         max_spad_per_lane = spad_size_per_lane // 2
 
         max_used_spad_size = 0
-        M, N, K = self.gemm_combination_mapping(O_W, N, K)
+        M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node)
         max_k_h_w = 1
         for o_h in sympy.divisors(O_H):
             for k_h in sympy.divisors(K_H):
@@ -232,10 +232,10 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio
                     weight_size = k_w * k_h * K * N
                     input_size = i_w * i_h * k_w * K
                     output_size = M * o_h * N
-                    used_spad_size = (weight_size + input_size + output_size) * self.precision
+                    used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
                     weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
                     input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * k_w, K)
-                    output_size_per_lane = self.get_spad_size_per_lane(M * o_h, N)
+                    output_size_per_lane = self.get_spad_size_per_lane(M * o_h  * (1 + n_extra_node), N)
                     used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
                     if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w:
                         max_used_spad_size = used_spad_size
diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py
index 26d8abd9..84c2b09c 100644
--- a/tests/Fusion/test_conv_fusion.py
+++ b/tests/Fusion/test_conv_fusion.py
@@ -63,4 +63,12 @@ def custom_conv2d(a, b, bias, c):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_conv_residual(device, batch_size=1, in_channels=8, out_channels=32, input_size=32, kernel_size=3, stride=1, padding=1)
\ No newline at end of file
+
+    # Vanilla test
+    # test_conv_residual(device, batch_size=3, in_channels=64, out_channels=64, input_size=28, kernel_size=3, stride=1, padding=1)
+
+    # Multi-tile test
+    # test_conv_residual(device, batch_size=1, in_channels=3, out_channels=32, input_size=32, kernel_size=3, stride=1, padding=1)
+
+    # Single batch test
+    test_conv_residual(device, batch_size=1, in_channels=64, out_channels=64, input_size=28, kernel_size=3, stride=1, padding=1)
\ No newline at end of file

From 5b2759f432955dacda23b477ee36d6e7b1fc2db8 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 12 Mar 2025 02:08:16 +0000
Subject: [PATCH 216/432] [Fix] BMM template fix

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 85efec0a..81ebc81e 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -59,7 +59,7 @@
           linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
-       {{kernel.store_output(vlane_split_axis=2)}}
+        {{kernel.store_output(indent_size=8)}}
       } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
@@ -131,6 +131,23 @@ def render(self,
             Y_numel = B * M * N,
             input_reorder = self.input_reorder
         )
+
+        kernel.store_info = dict(
+            output_node = self.output_node.name,
+            dependent_buf = [],
+            sram_var = "Y_buffer",
+            dram_var = "Y",
+            index_var = "index2",
+            tag_var = "tag",
+            vlane_split_axis = 2,
+            vlane_stride = 1,
+            mlir_dtype = kernel.render_options['DATA_STYPE'],
+            tile_nr_dim = 2,
+            dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
+            tile_shape = f"memref<{TILE_M}x{TILE_N}x{kernel.render_options['DATA_STYPE']}, 1>",
+            tile_size = (TILE_M, TILE_N),
+            tile_stride = [1, TILE_M]
+        )
         code = self._template_from_string(BMM_TEMPLATE).render(**kernel.render_options)
         kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
 

From aa7f8937cc71b76e455c8e3b0dd8825efd8abc57 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 13 Mar 2025 08:50:05 +0000
Subject: [PATCH 217/432] [Fix] Convolution wrapper template fix

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 96 +++++++++----------
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  3 +
 PyTorchSimFrontend/mlir/mlir_template.py      | 52 ++++------
 3 files changed, 66 insertions(+), 85 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 0a6e259f..35106a4b 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -511,38 +511,56 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
     X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
 
     # Holding original output tensor
-    {{kernel.def_wrapper(only_store_buffer=True)}}_t = {{kernel.def_wrapper(only_store_buffer=True)}}
-
-    # Tanspose tensors
-    {%- if MULTI_TILE %}
-    X = X_padding.permute(2, 0, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, BATCH, I_W, I_C)
-    {% elif SINGLE_BATCH %}
-    X = X_padding.permute(0, 2, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (BATCH, I_H, I_W, I_C)
-    {% else %}
-    X = X_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
-    {% endif -%}
-    W = W.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
-    {%- if SINGLE_BATCH %}
-    {{kernel.def_wrapper(only_store_buffer=True)}} = {{kernel.def_wrapper(only_store_buffer=True)}}.permute(0, 2, 3, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
-    {%- for buf in EPILOGUE_READS %}
-    {{kernel.def_wrapper(epilogue_buffer=buf)}} = {{kernel.def_wrapper(epilogue_buffer=buf)}}.permute(0, 2, 3, 1).contiguous()
+    {%- for buf, name in kernel.get_conv_outputs().items() %}
+    {{ name }}_t = {{ name }}
     {%- endfor %}
-    {% else %}
-    {{kernel.def_wrapper(only_store_buffer=True)}} = {{kernel.def_wrapper(only_store_buffer=True)}}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
-    {%- for buf in EPILOGUE_READS %}
-    {{kernel.def_wrapper(epilogue_buffer=buf)}} = {{kernel.def_wrapper(epilogue_buffer=buf)}}.permute(2, 3, 0, 1).contiguous()
+
+    # Tanspose inputs
+    {%- for buf, name in kernel.get_conv_inputs().items() %}
+      {%- if name == "X" %}
+        {%- if MULTI_TILE %}
+    {{ name }} = {{ name }}_padding.permute(2, 0, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, BATCH, I_W, I_C)
+        {%- elif SINGLE_BATCH %}
+    {{ name }} = {{ name }}_padding.permute(0, 2, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (BATCH, I_H, I_W, I_C)
+        {%- else %}
+    {{ name }} = {{ name }}_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
+        {%- endif %}
+      {%- elif name == "W" %}
+    {{ name }} = {{ name }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
+      {%- elif name == "Bias" %}
+    {{ name }} = {{ name }}
+      {%- else %}
+        {%- if SINGLE_BATCH %}
+    {{ name }} = {{ name }}.permute(0, 2, 3, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
+        {%- else %}
+    {{ name }} = {{ name }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
+        {%- endif %}
+      {%- endif %}
     {%- endfor %}
-    {% endif -%}
+
+    # Transpose outputs
+    {%- for buf, name in kernel.get_conv_outputs().items() %}
+      {%- if SINGLE_BATCH %}
+    {{ name }} = {{ name }}.permute(0, 2, 3, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
+      {%- else %}
+    {{ name }} = {{ name }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) ->  (O_H, O_W, BATCH, O_C)
+      {%- endif %}
+    {%- endfor %}
+
+    # Launch kernel
     {{ KERNEL_NAME }}<DEF_CONV_WRAPPER>
-    {% if BACKENDSIM_EAGER_MODE %}
+    {%- if BACKENDSIM_EAGER_MODE %}
     yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
-    {% endif %}
-    # Transpose back
-    {%- if SINGLE_BATCH %}
-    {{kernel.def_wrapper(only_store_buffer=True)}}_t.copy_({{kernel.def_wrapper(only_store_buffer=True)}}.permute(0, 3, 1, 2).contiguous()) # (BATCH, O_H, O_W, O_C) -> (BATCH, O_C, O_H, O_W)
-    {% else %}
-    {{kernel.def_wrapper(only_store_buffer=True)}}_t.copy_({{kernel.def_wrapper(only_store_buffer=True)}}.permute(2, 3, 0, 1).contiguous()) # (O_H, O_W, BATCH, O_C) -> (BATCH, O_C, O_H, O_W)
-    {% endif -%}
+    {%- endif %}
+
+    # Transpose back outputs
+    {%- for buf, name in kernel.get_conv_outputs().items() %}
+      {%- if SINGLE_BATCH %}
+    {{ name }}_t.copy_({{ name }}.permute(0, 3, 1, 2).contiguous()) # (BATCH, O_H, O_W, O_C) -> (BATCH, O_C, O_H, O_W)
+      {%- else %}
+    {{ name }}_t.copy_({{ name }}.permute(2, 3, 0, 1).contiguous()) # (O_H, O_W, BATCH, O_C) -> (BATCH, O_C, O_H, O_W)
+      {%- endif %}
+    {%- endfor %}
 """
 
 class MLIRConvTemplate(MLIRTemplate):
@@ -738,26 +756,6 @@ def outer_func_render(self, kernel_name, input_args):
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
-        # Wrapper function needs to transpose the epilogue node tensors same as the convolution input/outputs
-        # Currently, only the read tensors are transposed
-        epilogue_reads = []
-        epilogue_writes = []
-        if self.epilogue_nodes is not None:
-          main_node_rw = [X.get_name(), W.get_name(), Y.get_name()]
-          if Bias is not None:
-            main_node_rw.append(Bias.get_name())
-
-          for epilogue_node in self.epilogue_nodes:
-            reads = epilogue_node.read_writes.reads
-            for read in list(reads):
-              if read[0] not in main_node_rw:
-                epilogue_reads.append(read[0])
-
-            writes = epilogue_node.read_writes.writes
-            for write in list(writes):
-              if write[0] not in main_node_rw:
-                epilogue_writes.append(write[0])
-
         eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False))
         options = dict(
             kernel=self.kernel,
@@ -767,8 +765,6 @@ def outer_func_render(self, kernel_name, input_args):
             WEIGHT=W,
             BIAS=Bias,
             OUTPUT=Y,
-            EPILOGUE_READS=epilogue_reads,
-            EPILOGUE_WRITES=epilogue_writes,  # NOT USED YET
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
             MULTI_TILE=self.is_multi_tile(self.input_shape[1]),
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index e3605b74..f66fa010 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -20,6 +20,7 @@ def __init__(self, scheduler):
         self._ready_to_flush = False
         self.outer_function = set()
         config.inplace_buffers = False # FIXME. inout kernel makes trouble.. So disabled it!
+        self.max_fusion_size = 5
 
     def _set_flush_status(self, status: bool):
         self._ready_to_flush = status
@@ -28,6 +29,8 @@ def can_fuse_vertical(self, node1, node2):
         return self.can_fuse_horizontal(node1, node2)
 
     def can_fuse_horizontal(self, node1, node2):
+        if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size:
+            return False
         _, (vars1, reduce1) = node1.group
         _, (vars2, reduce2) = node2.group
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 44a73c88..acabb1cc 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -14,7 +14,7 @@
 from torch._inductor.select_algorithm import PartialRender
 from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
 from torch._inductor.autotune_process import TensorMeta
-from torch._inductor.virtualized import V
+from torch._inductor.virtualized import V, NullHandler
 from torch._inductor.utils import IndentedBuffer
 
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
@@ -390,42 +390,20 @@ def kernel_hook():
 
     # This function is for convolution wrapper function finalizing.
     def def_wrapper(self, only_store_buffer: bool = False, epilogue_buffer: str = False):
-        def wrapper_store_buf_hook():
-            output_bufs = self.kernel_group.args.output_buffers
-            if self.store_info['output_node'] not in output_bufs:
-                assert False, f"Output buffer {self.store_info['output_node']} not found in {output_bufs}"
-            if output_bufs[self.store_info['output_node']] == 'REMOVED':
-                if len(output_bufs) == 1 or len(self.store_info['dependent_buf']) == 0:
-                    assert False, "Output buffer is removed and no other output buffer is found"
-                return output_bufs[self.store_info['dependent_buf'][0]]  # FIXME: Only using the first dependent buffer
-            else:
-                return output_bufs[self.store_info['output_node']]
-
-        def wrapper_epilogue_buf_hook(name):
-            if name not in self.kernel_group.args.input_buffers:
-                assert False, f"Input buffer {name} not found in {self.kernel_group.args.input_buffers}"
-            return self.kernel_group.args.input_buffers[name]
-
         def wrapper_hook():
             arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=self.extra_node)
             wrapper_arg_defs = [arg.split('%')[1].split(':')[0] for arg in arg_defs]
             return f"({', '.join(wrapper_arg_defs)})"
 
-        if only_store_buffer:
-            if "<DEF_CONV_WRAPPER_STORE_BUFFER>" not in self.render_hooks:
-                self.render_hooks["<DEF_CONV_WRAPPER_STORE_BUFFER>"] = wrapper_store_buf_hook
-            return "<DEF_CONV_WRAPPER_STORE_BUFFER>"
-        if epilogue_buffer:
-            if f"<DEF_CONV_WRAPPER_EPILOGUE_BUFFER_{epilogue_buffer}>" not in self.render_hooks:
-                self.render_hooks[f"<DEF_CONV_WRAPPER_EPILOGUE_BUFFER_{epilogue_buffer}>"] = functools.partial(
-                    wrapper_epilogue_buf_hook,
-                    name=epilogue_buffer
-                )
-            return f"<DEF_CONV_WRAPPER_EPILOGUE_BUFFER_{epilogue_buffer}>"
-        else:
-            if "<DEF_CONV_WRAPPER>" not in self.render_hooks:
-                self.render_hooks["<DEF_CONV_WRAPPER>"] = wrapper_hook
-            return "<DEF_CONV_WRAPPER>"
+        if "<DEF_CONV_WRAPPER>" not in self.render_hooks:
+            self.render_hooks["<DEF_CONV_WRAPPER>"] = wrapper_hook
+        return "<DEF_CONV_WRAPPER>"
+
+    def get_conv_inputs(self):
+        return self.kernel_group.args.input_buffers
+
+    def get_conv_outputs(self):
+        return {k: v for k, v in self.kernel_group.args.output_buffers.items() if v != 'REMOVED'}
 
     def output_name(self):
         # Cannot know the output name from the template, so we need to hook it
@@ -499,14 +477,18 @@ def get_spad_size_per_lane(self, tile_m, tile_n):
         return max(size, 2) # vector load/store
 
     def load_epilogue(self, name: str, index: sympy.Expr):
-        index_var = self.store_info['index_var']
+        load_dim = []
+        if not isinstance(V.graph, NullHandler) and name in V.graph.graph_inputs:
+            load_dim = V.graph.graph_inputs[name].layout.size
+        index_var = self.store_info['index_var'] if len(load_dim) != 1 else 'tile_n'
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
-        vlane_stride = self.kernel_group.tile_desc.vlane_stride
+        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if len(load_dim) != 1 else 0
+        vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) != 1 else 1
         tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
+        # layout = V.graph.graph_inputs[name].layout
         if name not in self.buffer_names:
             # Allocate sram buffer
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])

From 8943f1d652a214380af4ddd3e36be42d25785fe1 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Mon, 17 Mar 2025 10:57:14 +0000
Subject: [PATCH 218/432] [Fix] Gemm combination mapping k_pad_factor fix

---
 PyTorchSimFrontend/mlir/mlir_template.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index acabb1cc..5c4aa85f 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -14,7 +14,7 @@
 from torch._inductor.select_algorithm import PartialRender
 from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
 from torch._inductor.autotune_process import TensorMeta
-from torch._inductor.virtualized import V, NullHandler
+from torch._inductor.virtualized import V, NullHandler, _ops as ops
 from torch._inductor.utils import IndentedBuffer
 
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
@@ -124,7 +124,7 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0):
         max_spad_per_lane = spad_size_per_lane // 2 # double buffer
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
-        k_pad_factor = self.vector_lane if K > self.vector_lane else 8
+        k_pad_factor = self.vector_lane if K > self.vector_lane else 1
         K = max(K, 8)
         M_padded = ((M + m_pad_factor - 1) // m_pad_factor) * m_pad_factor
         N_padded = ((N + n_pad_factor - 1) // n_pad_factor) * n_pad_factor
@@ -225,6 +225,7 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio
         M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node)
         max_k_h_w = 1
         for o_h in sympy.divisors(O_H):
+            o_h = 32
             for k_h in sympy.divisors(K_H):
                 for k_w in sympy.divisors(K_W):
                     i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]
@@ -536,8 +537,14 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         operation = "affine.vector_store" if tile_numel_per_lane > 1 else "affine.store"
         shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
         zero_var = self.get_const_cse(0)
+
+        _, operand_type = self.var_info[value]
+        if mlir_dtype != operand_type:
+            value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
+
         tile_indices = ",".join([f"%{zero_var}"] * self.store_info["tile_nr_dim"])
         line = f"{operation} %{value}, %{sram_var}[{tile_indices}] : {tile_shape}{shape}"
+
         self.cse.generate(self.stores, line, assignment = False)
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)

From e37830f43f677b6df9ada59e81dc1a6f5ee07131 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Mon, 17 Mar 2025 13:39:35 +0000
Subject: [PATCH 219/432] [Fix] Convolution single batch debug

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 7 ++-----
 PyTorchSimFrontend/mlir/mlir_template.py      | 1 -
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 35106a4b..e4b7a98b 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -660,16 +660,14 @@ def render(self,
         elif self.is_single_batch(BATCH) and self.stride[0] == 1:
           conv_template = SINGLE_BATCH_CONV_TEMPLATE
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
-          if TILE_O_W > O_W:  # FIXME: Temporal solution for single batch fusion
-            TILE_O_W = O_W
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
           SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
           x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H, TILE_K)
-          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
+          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H  * TILE_M, TILE_N)
           x_spad_size = TILE_I_W * TILE_I_H * TILE_K
-          y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
+          y_spad_size = TILE_O_H * TILE_M * TILE_N
           TOG_latency = O_W if TILE_M > O_W else TILE_M
 
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
@@ -739,7 +737,6 @@ def render(self,
             tile_stride = [TILE_O_W * TILE_M * TILE_N, TILE_M * TILE_N, 1, TILE_M]
         )
         code = self._template_from_string(conv_template).render(**kernel.render_options)
-
         self.header = f"float X_spad[{x_spad_size_per_lane}] __attribute__ ((section(\".spad\")));\n"
         self.header += f"float W_spad[{w_spad_size_per_lane}] __attribute__ ((section(\".spad\")));\n"
         self.header += f"float Y_spad[{y_spad_size_per_lane}] __attribute__ ((section(\".spad\")));\n"
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 5c4aa85f..c0364535 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -225,7 +225,6 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio
         M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node)
         max_k_h_w = 1
         for o_h in sympy.divisors(O_H):
-            o_h = 32
             for k_h in sympy.divisors(K_H):
                 for k_w in sympy.divisors(K_W):
                     i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]

From c50ba322dc7019e09a9b045f46031fe5a10e8b03 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Mon, 17 Mar 2025 13:46:51 +0000
Subject: [PATCH 220/432] [CI] Add convolution fusion cases

---
 .github/workflows/pull-request.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 88e9cfda..69b9bbfe 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -340,6 +340,13 @@ jobs:
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py
+      - name: Run test_conv_fusion.py
+        run: |
+          echo "Running test_conv_fusion.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py
 
   test_moe:
     name: Run test_moe

From 7c784732fba58f9c0667ba5e073f70957eef5794 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 19 Mar 2025 11:35:04 +0000
Subject: [PATCH 221/432] [Fix] Resnet18 fusion timing fix

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 20 +++++--
 PyTorchSimFrontend/mlir/mlir_template.py      |  6 +-
 tests/Fusion/test_conv_fusion.py              | 55 ++++++++++++++++++-
 tests/test_resnet.py                          |  4 +-
 4 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index e4b7a98b..a58ca655 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -426,8 +426,9 @@
 memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+{{kernel.def_global_vars()}}
 
-func.func @{{ KERNEL_NAME }}({{ KERNEL_DEF }}) {
+func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
   %c_mvin = arith.constant 2 : index
   %c_mvin2 = arith.constant 1 : index
   %c_mvin3 = arith.constant 14 : index
@@ -447,6 +448,7 @@
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
+  {{- kernel.def_local_vars() }}
 
   affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
     affine.for %tile_m = 0 to {{ O_W }} step {{ TILE_M }} {
@@ -492,8 +494,7 @@
           } { accumulation_loop=true }
         } { accumulation_loop=true }
         // Store output matrix
-        memref.dma_start %output_buffer[%c0, %c0, %c0, %c0], %Y[%index0], %c_mvout, %tag3[%c0], %input_axis, %vstride
-            : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ BATCH * O_C * O_H * O_W }}xf32>, memref<1xi32> {padding=0, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+        {{kernel.store_output(indent_size=8)}}
       } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
@@ -531,9 +532,9 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
     {{ name }} = {{ name }}
       {%- else %}
         {%- if SINGLE_BATCH %}
-    {{ name }} = {{ name }}.permute(0, 2, 3, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
+    {{ name }} = {{ name }}.permute(0, 2, 3, 1).contiguous()  if {{ name }}.dim() == 4 else {{ name }} # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
         {%- else %}
-    {{ name }} = {{ name }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
+    {{ name }} = {{ name }}.permute(2, 3, 0, 1).contiguous()  if {{ name }}.dim() == 4 else {{ name }} # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
         {%- endif %}
       {%- endif %}
     {%- endfor %}
@@ -611,7 +612,14 @@ def render(self,
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
-        n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
+
+        if epilogue_nodes is not None:
+          extra_node_rw = {
+            item.name for epilogue_node in epilogue_nodes
+            for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes
+            if item.name != Y.name
+          }
+        n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0
 
         BATCH = X.layout.size[0]
         I_C = X.layout.size[1]
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index c0364535..595dee6e 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -10,7 +10,7 @@
 from unittest.mock import patch
 
 from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE
-from torch._inductor.ir import Buffer, IRNode, TemplateBuffer
+from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, Pointwise
 from torch._inductor.select_algorithm import PartialRender
 from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
 from torch._inductor.autotune_process import TensorMeta
@@ -485,8 +485,8 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if len(load_dim) != 1 else 0
-        vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) != 1 else 1
+        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if len(load_dim) != 1 else 0    # FIXME: Fixed split axis for 1d load dim
+        vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) != 1 else 1    # FIXME: Fixed stride for 1d load dim
         tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
         # layout = V.graph.graph_inputs[name].layout
         if name not in self.buffer_names:
diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py
index 84c2b09c..62cab8d3 100644
--- a/tests/Fusion/test_conv_fusion.py
+++ b/tests/Fusion/test_conv_fusion.py
@@ -54,6 +54,46 @@ def custom_conv2d(a, b, bias, c):
     test_result("Conv2d + Scalar Fusion Forward", res, out, rtol=1e-3, atol=1e-3)
     print("Max diff > ", torch.max(torch.abs(res.cpu() - out)))
 
+def test_conv_relu(device, batch_size=1, in_channels=8, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=0):
+    def custom_conv2d(a, b, bias):
+        i_c = a.shape[1]
+        o_c = b.shape[0]
+        conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=True)
+        conv2d.weight = torch.nn.Parameter(b)
+        conv2d.bias = torch.nn.Parameter(bias)
+        return torch.nn.functional.relu(conv2d(a))
+    torch.manual_seed(0)
+    conv_input = torch.randn(batch_size, in_channels, input_size, input_size).to(memory_format=torch.channels_last, device=device)
+    conv_kernel = torch.randn(out_channels, in_channels, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device)
+    conv_bias = torch.randn(out_channels).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(custom_conv2d)
+    res = opt_fn(conv_input, conv_kernel, conv_bias)
+    out = custom_conv2d(conv_input.cpu(), conv_kernel.cpu(), conv_bias.cpu())
+    test_result("Conv2d + ReLU Fusion Forward", res, out, rtol=1e-3, atol=1e-3)
+    print("Max diff > ", torch.max(torch.abs(res.cpu() - out)))
+
+def test_conv_bn_relu(device, batch_size=1, in_channels=8, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=0):
+    def custom_conv_bn_relu(a, b, bias, c, d, e, f):
+        i_c = a.shape[1]
+        o_c = b.shape[0]
+        conv2d = torch.nn.Conv2d(in_channels, out_channels, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=True)
+        conv2d.weight = torch.nn.Parameter(b)
+        conv2d.bias = torch.nn.Parameter(bias)
+        # return torch.nn.functional.batch_norm(conv2d(a), c, d, weight=e, bias=f)
+        return torch.nn.functional.relu(torch.nn.functional.batch_norm(conv2d(a), c, d, weight=e, bias=f))
+    torch.manual_seed(0)
+    conv_input = torch.randn(batch_size, in_channels, input_size, input_size).to(memory_format=torch.channels_last, device=device)
+    conv_kernel = torch.randn(out_channels, in_channels, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device)
+    conv_bias = torch.randn(out_channels).to(device=device)
+    bn_weight = torch.randn(out_channels).to(device=device)
+    bn_bias = torch.randn(out_channels).to(device=device)
+    bn_mean = torch.zeros(out_channels).to(device=device)
+    bn_var = torch.ones(out_channels).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(custom_conv_bn_relu)
+    res = opt_fn(conv_input, conv_kernel, conv_bias, bn_mean, bn_var, bn_weight, bn_bias)
+    out = custom_conv_bn_relu(conv_input.cpu(), conv_kernel.cpu(), conv_bias.cpu(), bn_mean.cpu(), bn_var.cpu(), bn_weight.cpu(), bn_bias.cpu())
+    test_result("Conv2d + BN + ReLU Fusion Forward", res, out, rtol=1e-3, atol=1e-3)
+    print("Max diff > ", torch.max(torch.abs(res.cpu() - out)))
 
 if __name__ == "__main__":
     import os
@@ -65,10 +105,19 @@ def custom_conv2d(a, b, bias, c):
     device = module.custom_device()
 
     # Vanila test
-    # test_conv_residual(device, batch_size=3, in_channels=64, out_channels=64, input_size=28, kernel_size=3, stride=1, padding=1)
+    test_conv_residual(device, batch_size=3, in_channels=64, out_channels=64, input_size=28, kernel_size=3, stride=1, padding=1)
 
     # Multi-tile test
-    # test_conv_residual(device, batch_size=1, in_channels=3, out_channels=32, input_size=32, kernel_size=3, stride=1, padding=1)
+    test_conv_residual(device, batch_size=1, in_channels=3, out_channels=32, input_size=32, kernel_size=3, stride=1, padding=1)
 
     # Single batch test
-    test_conv_residual(device, batch_size=1, in_channels=64, out_channels=64, input_size=28, kernel_size=3, stride=1, padding=1)
\ No newline at end of file
+    test_conv_residual(device, batch_size=1, in_channels=16, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=1)
+
+    # Scalar
+    test_conv_scalar(device, batch_size=1, in_channels=16, out_channels=48, input_size=48, kernel_size=3, stride=1, padding=1)
+
+    # Relu
+    test_conv_relu(device, batch_size=1, in_channels=16, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=1)
+
+    # Conv + BN + ReLU
+    test_conv_bn_relu(device, batch_size=1, in_channels=8, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=1)
\ No newline at end of file
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index 37f8a583..dc021174 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -1,6 +1,7 @@
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
+from torchvision.models import resnet18
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     message = f"|{name} Test Passed|"
@@ -15,7 +16,8 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
 
 def test_resnet(device):
     from torchvision.models import resnet
-    model = resnet._resnet(resnet.BasicBlock, [1, 1, 0, 0], weights=None, progress=False).eval()
+    # model = resnet._resnet(resnet.BasicBlock, [1, 1, 0, 0], weights=None, progress=False).eval()
+    model = resnet18().eval()
     model.to(device, memory_format=torch.channels_last)
     input = torch.randn(1, 3, 224, 224).to(device=device)
     x1 = input.to(device=device, memory_format=torch.channels_last)

From 3e6e670df7df6013342b6c889190078676b8473d Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 19 Mar 2025 12:03:42 +0000
Subject: [PATCH 222/432] [Fix] Allow k_pad_factor to be set to 1 when padding
 is not needed in multi-tile conv

---
 PyTorchSimFrontend/mlir/mlir_template.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 595dee6e..b513f6a1 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -117,14 +117,14 @@ def gemmini_gemm_mapping(self, M, N, K):
 
         return inner_I, inner_J, inner_K
 
-    def gemm_combination_mapping(self, M, N, K, n_extra_node=0):
+    def gemm_combination_mapping(self, M, N, K, n_extra_node=0, pad_k=True):
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
         max_spad_per_lane = spad_size_per_lane // 2 # double buffer
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
-        k_pad_factor = self.vector_lane if K > self.vector_lane else 1
+        k_pad_factor = self.vector_lane if K > self.vector_lane else (8 if pad_k else 1)
         K = max(K, 8)
         M_padded = ((M + m_pad_factor - 1) // m_pad_factor) * m_pad_factor
         N_padded = ((N + n_pad_factor - 1) // n_pad_factor) * n_pad_factor
@@ -161,7 +161,7 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
         max_spad_per_lane = spad_size_per_lane // 2 # double buffer
 
         max_used_spad_size = 0
-        M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node)
+        M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False)
         max_k_h_w = 1 # maximize kernel size
         for o_h in sympy.divisors(O_H):
             for o_w in sympy.divisors(O_W):
@@ -192,7 +192,7 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation,
         max_spad_per_lane = spad_size_per_lane // 2
 
         max_used_spad_size = 0
-        M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node)
+        M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False)
         max_k_h_w = K_W
         for o_h in sympy.divisors(O_H):
             for o_w in sympy.divisors(O_W):
@@ -222,7 +222,7 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio
         max_spad_per_lane = spad_size_per_lane // 2
 
         max_used_spad_size = 0
-        M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node)
+        M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False)
         max_k_h_w = 1
         for o_h in sympy.divisors(O_H):
             for k_h in sympy.divisors(K_H):
@@ -488,7 +488,6 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if len(load_dim) != 1 else 0    # FIXME: Fixed split axis for 1d load dim
         vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) != 1 else 1    # FIXME: Fixed stride for 1d load dim
         tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
-        # layout = V.graph.graph_inputs[name].layout
         if name not in self.buffer_names:
             # Allocate sram buffer
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])

From 7be6e20193a6aae0c07594e6e7bd979fa9c61435 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 18 Mar 2025 04:01:43 +0000
Subject: [PATCH 223/432] [Optimize] Maximize used vlane

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py |  5 ++---
 PyTorchSimFrontend/mlir/mlir_common.py          | 14 +++++++++-----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index ded40566..a7d660ec 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1031,12 +1031,11 @@ def _index_expr(self, tile_size, buffer, renamed_expression, index):
             cast_i64 = f"arith.index_cast %{broadcast_var} : vector<2xindex> to vector<2xi64>"
             cast_i64_var = self.cse.generate(self.loads, cast_i64)
             affine_store = f"affine.vector_store %{cast_i64_var}, %{buffer}[{','.join(dim)}] : memref<{shape}xi64, 1>, vector<2xi64>"
-            res = self.cse.generate(self.loads, affine_store, assignment=False)
+            self.cse.generate(self.loads, affine_store, assignment=False)
         self.loads.writeline("}")
-        return res
+        return buffer
 
     def index_expr(self, index, dtype):
-        # Todo. To support index_expr, we have to custom instructions
         tile_desc = self.kernel_group.tile_desc
         tile_size = tile_desc.get_tile_size_per_lane()
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index e07437ab..2cbfe395 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -397,8 +397,8 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         # Dummy tile size
         tile_size = [1] * (len(vars) + len(reduction_vars))
         if len(tile_size) == 2:
-            tile_size[-1] = 128
-            tile_size[-2] = 128
+            tile_size[-1] = 512
+            tile_size[-2] = 512
         elif len(tile_size) == 0: # Scalar
             tile_size = [1]
             self.ranges = [1]
@@ -412,6 +412,7 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
             raise NotImplementedError("dummy tile size fail!")
 
         vlane_stride = 8 # TODO: VCIX widening is not implemented
+        vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
         # Adjust tile size to avoid too much paddings
         for i in range(1, len(tile_size)+1):
             target_range = self.ranges[-i]
@@ -428,14 +429,17 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
             vlane_stride = 1
             tile_size[0] = 1
         # Adjust tile size
-        used_vlane = min((tile_size[len(vars) - 1] + vlane_stride - 1) // vlane_stride, self.vector_lane)
+        for i in range(len(vars)):
+            if tile_size[i] >= self.vector_lane: # maximize used vector lane
+                vlane_split_axis = i
+        used_vlane = min((tile_size[vlane_split_axis] + vlane_stride - 1) // vlane_stride, self.vector_lane)
         padded_size = used_vlane * vlane_stride
-        tile_size[len(vars) - 1] = ((tile_size[len(vars) - 1] + padded_size - 1) // padded_size) * padded_size
+        tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size
 
         # Select tile info.
         # Note: Kernel Group have to share same tile desc for fusion
         tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane)
-        tile_desc.vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
+        tile_desc.vlane_split_axis = vlane_split_axis
         tile_desc.vlane_stride = vlane_stride
         tile_desc.implicit_dim_size = implicit_dim_size
         return tile_desc

From 8e09d26d7ceefa87e1e9e2f0e698f2794c6a87b6 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 18 Mar 2025 04:05:34 +0000
Subject: [PATCH 224/432] [Experiments] dump path with config

---
 experiments/BERT.py     | 4 +++-
 experiments/conv.py     | 4 +++-
 experiments/gemm.py     | 4 +++-
 experiments/resnet18.py | 4 +++-
 experiments/resnet50.py | 4 +++-
 5 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/experiments/BERT.py b/experiments/BERT.py
index 35b3f3aa..921b0413 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -106,6 +106,8 @@ def run_BERT(device, size, input_seq, validation):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
+    config = config.split('/')[-1].split('.')[0] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', type=str, default='base')
@@ -115,7 +117,7 @@ def run_BERT(device, size, input_seq, validation):
     args = args.parse_args()
     size = args.size
     input_seq = args.input_size
-    result_path = os.path.join(base_dir, args.dump_path, f"BERT_{size}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config, f"BERT_{size}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
diff --git a/experiments/conv.py b/experiments/conv.py
index 74dd7f06..36db2729 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -40,6 +40,8 @@ def custom_conv2d(a, b, bias):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
+    config = config.split('/')[-1].split('.')[0] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[8, 28, 28, 128, 128, 3, 1, 1], help='B H W I_C O_C K S P')
@@ -48,7 +50,7 @@ def custom_conv2d(a, b, bias):
     args = args.parse_args()
     size = args.size
     size_str = "_".join([str(i) for i in size])
-    result_path = os.path.join(base_dir, args.dump_path, f"CONV_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config, f"CONV_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
diff --git a/experiments/gemm.py b/experiments/gemm.py
index 72363cf2..5ac4baf4 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -38,6 +38,8 @@ def custom_matmul(a, b):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
+    config = config.split('/')[-1].split('.')[0] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[128, 128, 128], help='M K N')
@@ -46,7 +48,7 @@ def custom_matmul(a, b):
     args = args.parse_args()
     size = args.size
     size_str = "x".join([str(i) for i in size])
-    result_path = os.path.join(base_dir, args.dump_path, f"GEMM_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config, f"GEMM_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index 98e2597f..eb63408f 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -20,13 +20,15 @@ def run_resnet(device, batch):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
+    config = config.split('/')[-1].split('.')[0] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--batch', type=int, default=1)
     args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
     batch = args.batch
-    result_path = os.path.join(base_dir, args.dump_path, f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config, f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index ff4f0215..897c07eb 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -20,13 +20,15 @@ def run_resnet(device, batch):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
+    config = config.split('/')[-1].split('.')[0] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--batch', type=int, default=1)
     args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
     batch = args.batch
-    result_path = os.path.join(base_dir, args.dump_path, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation

From 1d1ed40166a0fb59c861864a897b64eb41b63e83 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 20 Mar 2025 11:19:59 +0000
Subject: [PATCH 225/432] [Fix] Address Calculation with interleaving

---
 PyTorchSimBackend/src/Dram.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc
index 76b349c5..88475be0 100644
--- a/PyTorchSimBackend/src/Dram.cc
+++ b/PyTorchSimBackend/src/Dram.cc
@@ -3,9 +3,9 @@
 uint32_t Dram::get_channel_id(mem_fetch* access) {
   uint32_t channel_id;
   if (_n_ch_per_partition >= 16)
-    channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_config.dram_req_size, 0, _n_ch_per_partition);
+    channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_req_size, 0, _n_ch_per_partition);
   else
-    channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_config.dram_req_size, 0, 16) % _n_ch_per_partition;
+    channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_req_size, 0, 16) % _n_ch_per_partition;
 
   channel_id += ((access->get_numa_id() % _n_partitions)* _n_ch_per_partition);
   return channel_id;
@@ -54,7 +54,7 @@ DramRamulator2::DramRamulator2(SimulationConfig config, cycle_type* core_cycle)
   _mem.resize(_n_ch);
   for (int ch = 0; ch < _n_ch; ch++) {
     _mem[ch] = std::make_unique<Ramulator2>(
-      ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, config.dram_nbl);
+      ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, _n_bl);
   }
   _tx_log2 = log2(_req_size);
   _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2;
@@ -99,6 +99,8 @@ bool DramRamulator2::is_full(uint32_t cid, mem_fetch* request) {
 }
 
 void DramRamulator2::push(uint32_t cid, mem_fetch* request) {
+  addr_type target_addr = (request->get_addr() >> _tx_ch_log2) << _tx_log2;
+  request->set_addr(target_addr);
   m_from_crossbar_queue[cid].push(request);
 }
 

From 1f2b0b653592951f36a0503ef098a1fddb5e0d6a Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 20 Mar 2025 11:26:18 +0000
Subject: [PATCH 226/432] [cleanup] rm unused code

---
 experiments/BERT.py       | 12 ------------
 tests/test_transformer.py | 12 ------------
 2 files changed, 24 deletions(-)

diff --git a/experiments/BERT.py b/experiments/BERT.py
index 921b0413..c8397eae 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -33,15 +33,6 @@ def __init__(self, h, d_model, dropout=0.1):
         self.linears = clones(torch.nn.Linear(d_model, d_model), 4)
         self.attn = None
 
-    def attention(self, query, key, value):
-        d_k = query.size(-1)
-        print(torch.matmul(query, key.transpose(-2, -1)))
-
-        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
-        p_attn = scores.softmax(dim=-1)
-        print(p_attn)
-        return torch.matmul(p_attn, value), p_attn
-
     def forward(self, query, key, value):
         # 1) Do all the linear projections in batch from d_model => h x d_k
         query, key, value = [
@@ -50,9 +41,6 @@ def forward(self, query, key, value):
         ]
 
         # 2) Apply attention on all the projected vectors in batch.
-        # x, self.attn = self.attention(query, key, value)
-        # d_k = query.size(-1)
-
         scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
         p_attn = scores.softmax(dim=-1)
         x = torch.matmul(p_attn, value)
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index 4c542ddb..8716ba06 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -29,15 +29,6 @@ def __init__(self, h, d_model, dropout=0.1):
         self.linears = clones(torch.nn.Linear(d_model, d_model), 4)
         self.attn = None
 
-    def attention(self, query, key, value):
-        d_k = query.size(-1)
-        print(torch.matmul(query, key.transpose(-2, -1)))
-
-        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
-        p_attn = scores.softmax(dim=-1)
-        print(p_attn)
-        return torch.matmul(p_attn, value), p_attn
-
     def forward(self, query, key, value):
         # 1) Do all the linear projections in batch from d_model => h x d_k
         query, key, value = [
@@ -46,9 +37,6 @@ def forward(self, query, key, value):
         ]
 
         # 2) Apply attention on all the projected vectors in batch.
-        # x, self.attn = self.attention(query, key, value)
-        # d_k = query.size(-1)
-
         scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
         p_attn = scores.softmax(dim=-1)
         x = torch.matmul(p_attn, value)

From 3e6541630884788fb443ceffae71a280c98c7519 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 24 Mar 2025 02:22:07 +0000
Subject: [PATCH 227/432] [Fix] fix nBL config

---
 PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json    | 2 +-
 PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json       | 2 +-
 PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json    | 2 +-
 .../configs/systolic_ws_128x128_c1_booksim_tpuv2.json         | 2 +-
 .../configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json      | 4 ++--
 .../configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json | 2 +-
 .../configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json      | 2 +-
 .../configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json      | 2 +-
 .../configs/systolic_ws_128x128_c2_booksim_tpuv2.json         | 2 +-
 .../configs/systolic_ws_128x128_c2_chiplet_tpuv2.json         | 2 +-
 .../configs/systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json   | 2 +-
 .../configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json      | 2 +-
 .../configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json      | 2 +-
 .../configs/systolic_ws_128x128_c4_booksim_tpuv2.json         | 2 +-
 .../configs/systolic_ws_128x128_c4_chiplet_map_tpuv2.json     | 2 +-
 .../configs/systolic_ws_128x128_c4_chiplet_tpuv2.json         | 2 +-
 PyTorchSimBackend/include/SimulationConfig.h                  | 2 +-
 17 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json
index 6f3049ac..40a100ef 100644
--- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json
+++ b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json
@@ -13,7 +13,7 @@
   "dram_channels": 16,
   "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
 
diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
index 30f3e216..72f52318 100644
--- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
+++ b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
@@ -13,7 +13,7 @@
   "dram_channels": 8,
   "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
 
diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json
index 2bf376c5..2293e197 100644
--- a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json
+++ b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json
@@ -13,7 +13,7 @@
   "dram_channels": 8,
   "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
 
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json
index e623730a..5d7b0d35 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json
@@ -10,7 +10,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json
index 12d5ee39..31743430 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json
@@ -5,12 +5,12 @@
   "core_print_interval" : 10000,
 
   "dram_type" : "ramulator2",
-  "dram_freq" :700,
+  "dram_freq" : 700,
   "dram_channels": 32,
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json
index 2ecb7b64..ab5266c5 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json
@@ -9,7 +9,7 @@
   "dram_channels": 8,
   "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
index 64c19a1d..2fc13de4 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
@@ -11,7 +11,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 32,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
 
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
index e9df64b3..3e082fc5 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
@@ -10,7 +10,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv2.json
index 73eb77d1..f81e2472 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv2.json
@@ -10,7 +10,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2.json
index f22cf1a7..d9161d67 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2.json
@@ -10,7 +10,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_num_partitions" : 2,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json
index 9f8922b4..856e9c9c 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json
@@ -10,7 +10,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_num_partitions" : 1,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json
index 8c6c07dc..9f97c7db 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json
@@ -10,7 +10,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
index 21f75c0e..5701402c 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
@@ -11,7 +11,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 32,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
 
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_booksim_tpuv2.json
index e387650c..842d5ab0 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_booksim_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_booksim_tpuv2.json
@@ -10,7 +10,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_map_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_map_tpuv2.json
index ec9493de..080599f2 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_map_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_map_tpuv2.json
@@ -10,7 +10,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_num_partitions" : 4,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_tpuv2.json
index d06c505d..5d4fa211 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_tpuv2.json
@@ -10,7 +10,7 @@
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 16,
-  "dram_nbl" : 1,
+  "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
  
diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h
index a6e7adef..b647b3f9 100644
--- a/PyTorchSimBackend/include/SimulationConfig.h
+++ b/PyTorchSimBackend/include/SimulationConfig.h
@@ -65,6 +65,6 @@ struct SimulationConfig {
   }
 
   float max_dram_bandwidth() {
-    return dram_freq * dram_channels * dram_req_size / dram_nbl / 1000; // GB/s
+    return dram_freq * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s
   }
 };
\ No newline at end of file

From 41ef9b825ae2b77cbeec8d9def560efa361b15ca Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 24 Mar 2025 02:29:47 +0000
Subject: [PATCH 228/432] [Fix] makes experiments dump path shorter

The dump path affects the result of gem5.

We should find out the reason and fix it later.
---
 experiments/BERT.py     | 2 +-
 experiments/conv.py     | 2 +-
 experiments/resnet18.py | 2 +-
 experiments/resnet50.py | 2 +-
 tests/test_add.py       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/experiments/BERT.py b/experiments/BERT.py
index c8397eae..35f85631 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -95,7 +95,7 @@ def run_BERT(device, size, input_seq, validation):
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
     config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0] # extract config name from config path
+    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', type=str, default='base')
diff --git a/experiments/conv.py b/experiments/conv.py
index 36db2729..48bfea60 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -41,7 +41,7 @@ def custom_conv2d(a, b, bias):
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
     config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0] # extract config name from config path
+    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[8, 28, 28, 128, 128, 3, 1, 1], help='B H W I_C O_C K S P')
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index eb63408f..4d5c4c6e 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -21,7 +21,7 @@ def run_resnet(device, batch):
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
     config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0] # extract config name from config path
+    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--batch', type=int, default=1)
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index 897c07eb..da8aa710 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -21,7 +21,7 @@ def run_resnet(device, batch):
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
     config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0] # extract config name from config path
+    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--batch', type=int, default=1)
diff --git a/tests/test_add.py b/tests/test_add.py
index 708fa9f0..0d07a3fb 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -37,7 +37,7 @@ def test_vector_tensor_add(device, size=(128, 128)):
     def vectoradd(a, b):
         return a + b
     x = torch.randn(size).to(device=device)
-    y = torch.randn(size[0]).to(device=device)
+    y = torch.randn(size[-1]).to(device=device)
     opt_fn = torch.compile(dynamic=False)(vectoradd)
     res = opt_fn(x, y)
     out = vectoradd(x.cpu(), y.cpu())

From 89b40c3224e8454345d881cde758280cba1e48f8 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 24 Mar 2025 02:33:14 +0000
Subject: [PATCH 229/432] [Frontend] sub-tile size with K dim

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  |  6 ++++--
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 20 +++++++++++--------
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  6 ++++--
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 81ebc81e..20044237 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -53,9 +53,9 @@
           %index0 = affine.apply #map0(%b, %t_m, %t_k)
           %index1 = affine.apply #map1(%b, %t_k, %t_n)
           memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
-             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
           memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
-             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
           linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
@@ -103,6 +103,7 @@ def render(self,
         kernel.loop_size = [TILE_M, TILE_N, TILE_K]
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
 
         W_transposed = self.is_transposed(W)
         X_transposed = self.is_transposed(X)
@@ -119,6 +120,7 @@ def render(self,
             TILE_K=TILE_K,
             SUB_TILE_M=SUB_TILE_M,
             SUB_TILE_N=SUB_TILE_N,
+            SUB_TILE_K=SUB_TILE_K,
             DATA_STYPE="f32",
             DATA_SIZE=4,
             X = X,
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index a58ca655..20f48d88 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -105,10 +105,10 @@
                 %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
                 // Load input matrix
                 memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
+                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[{{ TILE_I_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
                 // Load kernel matrix
                 memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
                 affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                   affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
                     %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
@@ -228,10 +228,10 @@
               %index2 = affine.apply #map2(%k_h, %c0, %tile_k, %tile_n) // weight index
               // Load input matrix
               memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_O_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
+                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[{{ TILE_O_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
               // Load kernel matrix
               memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
               affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                 affine.for %tile_k_w = 0 to 1 {
                   %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
@@ -348,10 +348,10 @@
               %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
               // Load input matrix
               memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ SUB_TILE_I_H }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_I_H * TILE_I_W * TILE_K }}, {{ TILE_I_W * TILE_K }}, 1, {{ TILE_I_W }}]}
+                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ SUB_TILE_I_H }}, {{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[{{ TILE_I_H * TILE_I_W * TILE_K }}, {{ TILE_I_W * TILE_K }}, 1, {{ TILE_I_W }}]}
               // Load kernel matrix
               memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
               affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                 affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
                   %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
@@ -469,10 +469,10 @@
               %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
               // Load input matrix
               memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[{{ TILE_K_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
+                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[{{ TILE_K_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
               // Load kernel matrix
               memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
               affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                 affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
                   %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
@@ -632,6 +632,7 @@ def render(self,
         TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
         TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
         TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
         SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1
@@ -653,6 +654,7 @@ def render(self,
           y_spad_size = TILE_O_H * TILE_M * TILE_N
           SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+          SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
           TOG_latency = O_W if TILE_M > O_W else TILE_M
         elif self.is_multi_tile(I_C):
           conv_template = MULTI_TILE_CONV_TEMPLATE
@@ -672,6 +674,7 @@ def render(self,
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
           SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+          SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
           x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H, TILE_K)
           y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H  * TILE_M, TILE_N)
           x_spad_size = TILE_I_W * TILE_I_H * TILE_K
@@ -712,6 +715,7 @@ def render(self,
             TILE_K_W=TILE_K_W,
             SUB_TILE_M=SUB_TILE_M,
             SUB_TILE_N=SUB_TILE_N,
+            SUB_TILE_K=SUB_TILE_K,
             SUB_TILE_I_H=SUB_TILE_I_H,
             SUB_TILE_I_W=SUB_TILE_I_W,
             SUB_TILE_K_H=SUB_TILE_K_H,
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 565e8290..c99ea222 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -55,9 +55,9 @@
         %index0 = affine.apply #map0(%t_m, %t_k)
         %index1 = affine.apply #map1(%t_k, %t_n)
         memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
-           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
         memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
-           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
@@ -131,6 +131,7 @@ def render(self,
         kernel.loop_size =[TOG_latency, TILE_N, TILE_K]
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
 
         W_transposed = self.is_transposed(W)
         X_transposed = self.is_transposed(X)
@@ -146,6 +147,7 @@ def render(self,
             TILE_K=TILE_K,
             SUB_TILE_M=SUB_TILE_M,
             SUB_TILE_N=SUB_TILE_N,
+            SUB_TILE_K=SUB_TILE_K,
             DATA_STYPE="f32",
             DATA_SIZE=4,
             X = X,

From 3ef4339b85b9745e6a8c181b3c415e68f96c2dc4 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 24 Mar 2025 02:49:47 +0000
Subject: [PATCH 230/432] [Fix] Max-pool template

The max-pool template should support this functionality in the future.
---
 PyTorchSimFrontend/mlir/mlir_lowering.py      |  3 +-
 .../mlir/mlir_maxpool_template.py             | 45 +++++++++++++------
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index 8c9c35a7..fc96a255 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -158,4 +158,5 @@ def sparse_addmm(*args, **kwargs):
 lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()})
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
 lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
-lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
\ No newline at end of file
+lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
+# lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
index 79493fdd..6a5aafa0 100644
--- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
@@ -62,25 +62,42 @@ def render(self,
         W = Y.get_size()[3]
         BCH = B * C * H
         kernel.loop_size = None
-        options = {
-          "KERNEL_NAME" : self.name,
-          "kernel" : kernel,
-          "IN" : X.get_numel(),
-          "OUT" : Y.get_numel(),
-          "X" : X,
-          "Y" : Y,
-          "BCH" : BCH,
-          "W" : W,
-          "in_tile" : in_tile,
-          "out_tile" : out_tile,
-        }
-        code = self._template_from_string(TEMPLATE).render(**options)
+        kernel.render_options = dict(
+            KERNEL_NAME=self.name,
+            kernel=kernel,
+            IN=X.get_numel(),
+            OUT=Y.get_numel(),
+            X=X,
+            Y=Y,
+            BCH=BCH,
+            W=W,
+            in_tile=in_tile,
+            out_tile=out_tile,
+            DATA_STYPE="f32",
+        )
+        kernel.store_info = dict(
+            output_node = self.output_node.name,
+            dependent_buf = [],
+            sram_var = "Y_buffer",
+            dram_var = "Y",
+            index_var = "index0",
+            tag_var = "tag",
+            vlane_split_axis = 1,
+            vlane_stride = 1,
+            mlir_dtype = kernel.render_options['DATA_STYPE'],
+            tile_nr_dim = 2,
+            dram_shape = f"memref<{kernel.render_options['OUT']}x{kernel.render_options['DATA_STYPE']}>",
+            tile_shape = f"memref<{out_tile}x{out_tile}x{kernel.render_options['DATA_STYPE']}, 1>",
+            tile_size = (out_tile, out_tile),
+            tile_stride = [1, out_tile]
+        )
+        code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
         self.header = f"float X_spad[{in_tile * in_tile // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
         self.header += f"float Y_spad[{out_tile * out_tile // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header = f"float X_spad[{in_tile * in_tile // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
         self.gem5_header += f"float Y_spad[{out_tile * out_tile // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
 
-        kernel.add_loop_info([options["IN"]], [kernel.vector_lane, kernel.vector_lane])
+        kernel.add_loop_info([kernel.render_options["IN"]], [kernel.vector_lane, kernel.vector_lane])
         return code
 
     def codegen_header(self, code, extra_headers):

From 750572b0d9fd82bb810fd66298a3ea5a55685c07 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 24 Mar 2025 08:07:34 +0000
Subject: [PATCH 231/432] [Fix] conv template optimize order

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 20f48d88..0c10160d 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -644,7 +644,19 @@ def render(self,
         y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
         conv_template = CONV_TEMPLATE
         TOG_latency = BATCH if TILE_M > BATCH else TILE_M
-        if self.is_single_batch(BATCH) and self.stride[0] != 1:
+        if self.is_multi_tile(I_C):
+          conv_template = MULTI_TILE_CONV_TEMPLATE
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
+          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
+          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
+          SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
+          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
+          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_H * TILE_K, TILE_N)
+          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
+          x_spad_size = TILE_I_W * TILE_I_H * TILE_M * TILE_K
+          w_spad_size = TILE_K_H * TILE_K * TILE_N
+          y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
+        elif self.is_single_batch(BATCH) and self.stride[0] != 1:
           conv_template = SINGLE_BATCH_CONV_STRIDE_TEMPLATE
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
@@ -656,17 +668,6 @@ def render(self,
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
           TOG_latency = O_W if TILE_M > O_W else TILE_M
-        elif self.is_multi_tile(I_C):
-          conv_template = MULTI_TILE_CONV_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
-          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
-          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
-          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_H * TILE_K, TILE_N)
-          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
-          x_spad_size = TILE_I_W * TILE_I_H * TILE_M * TILE_K
-          w_spad_size = TILE_K_H * TILE_K * TILE_N
-          y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
         elif self.is_single_batch(BATCH) and self.stride[0] == 1:
           conv_template = SINGLE_BATCH_CONV_TEMPLATE
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W

From 5f88abff40019d21e57addb3e056d60f929406d7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 24 Mar 2025 09:08:10 +0000
Subject: [PATCH 232/432] [CI] Fix authorization

---
 .github/workflows/docker-base-image.yml |   2 +-
 .github/workflows/docker-image.yml      | 196 +++++++++++++++++++++-
 .github/workflows/pull-request.yml      | 211 +++++++++++++++++++++++-
 3 files changed, 404 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/docker-base-image.yml b/.github/workflows/docker-base-image.yml
index 82614c8c..5f2005b7 100644
--- a/.github/workflows/docker-base-image.yml
+++ b/.github/workflows/docker-base-image.yml
@@ -27,7 +27,7 @@ jobs:
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
 
       # Step 4: Build and Push Docker Image
       - name: Build and Push Docker Image
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index b13eda61..4b733d2a 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   build:
-    runs-on: self-hosted
+    runs-on: [self-hosted, Linux]
 
     permissions:
       contents: read
@@ -25,7 +25,7 @@ jobs:
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
 
       # Step 3: Pull the Cached Image
       - name: Pull Cached Image & Set environment
@@ -72,7 +72,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_add.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_add.py"
           docker run --rm \
@@ -85,7 +93,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_activation.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_activation.py"
           docker run --rm \
@@ -98,7 +114,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_batchnorm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_batchnorm.py"
           docker run --rm \
@@ -111,7 +135,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_bmm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_bmm.py"
           docker run --rm \
@@ -124,7 +156,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_cnn.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_cnn.py"
           docker run --rm \
@@ -137,7 +177,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_conv2d.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_conv2d.py"
           docker run --rm \
@@ -150,7 +198,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_matmul.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_matmul.py"
           docker run --rm \
@@ -163,7 +219,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_reduce.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_reduce.py"
           docker run --rm \
@@ -176,7 +240,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_softmax.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_softmax.py"
           docker run --rm \
@@ -189,7 +261,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_transpose2D.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_transpose2D.py"
           docker run --rm \
@@ -202,7 +282,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_view3D_2D.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_view3D_2D.py"
           docker run --rm \
@@ -215,7 +303,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_layernorm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_layernorm.py"
           docker run --rm \
@@ -228,7 +324,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_mlp.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_mlp.py"
           docker run --rm \
@@ -241,7 +345,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_resnet.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_resnet.py"
           docker run --rm \
@@ -254,7 +366,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_transformer.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_transformer.py"
           docker run --rm \
@@ -267,7 +387,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_transpose3D.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_transpose3D.py"
           docker run --rm \
@@ -280,7 +408,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_sparsity.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_sparsity.py"
           docker run --rm \
@@ -293,7 +429,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_pool.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_pool.py"
           docker run --rm \
@@ -306,7 +450,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_single_perceptron.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_single_perceptron.py"
           docker run --rm \
@@ -319,21 +471,45 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_addmm_residual.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_addmm_residual.py"
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_matmul_activation.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_matmul_activation.py"
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_matmul_scalar.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_matmul_scalar.py"
           docker run --rm \
@@ -346,7 +522,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_moe.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_moe.py"
           docker run --rm \
@@ -359,7 +543,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_mistral.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_mistral.py"
           docker run --rm \
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 69b9bbfe..4cb24617 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   build:
-    runs-on: self-hosted
+    runs-on: [self-hosted, Linux]
 
     permissions:
       contents: read
@@ -25,7 +25,7 @@ jobs:
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
 
       # Step 3: Pull the Cached Image
       - name: Pull Cached Image & Set environment
@@ -70,9 +70,24 @@ jobs:
   test_add:
     name: Run test_add.py
     runs-on: self-hosted
+
+    permissions:
+      contents: read
+      packages: write
+      attestations: write
+      id-token: write
     needs: build
+
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_add.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_add.py"
           docker run --rm \
@@ -85,7 +100,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_activation.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_activation.py"
           docker run --rm \
@@ -98,7 +121,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_batchnorm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_batchnorm.py"
           docker run --rm \
@@ -111,7 +142,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_bmm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_bmm.py"
           docker run --rm \
@@ -124,7 +163,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_cnn.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_cnn.py"
           docker run --rm \
@@ -137,7 +184,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_conv2d.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_conv2d.py"
           docker run --rm \
@@ -150,7 +205,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_matmul.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_matmul.py"
           docker run --rm \
@@ -163,7 +226,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_reduce.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_reduce.py"
           docker run --rm \
@@ -176,7 +247,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_softmax.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_softmax.py"
           docker run --rm \
@@ -189,7 +268,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_transpose2D.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_transpose2D.py"
           docker run --rm \
@@ -202,7 +289,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_view3D_2D.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_view3D_2D.py"
           docker run --rm \
@@ -215,7 +310,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_layernorm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_layernorm.py"
           docker run --rm \
@@ -228,7 +331,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_mlp.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_mlp.py"
           docker run --rm \
@@ -241,7 +352,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_resnet.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_resnet.py"
           docker run --rm \
@@ -254,7 +373,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_transformer.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_transformer.py"
           docker run --rm \
@@ -267,7 +394,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_transpose3D.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_transpose3D.py"
           docker run --rm \
@@ -280,7 +415,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_sparsity.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_sparsity.py"
           docker run --rm \
@@ -293,7 +436,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_pool.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_pool.py"
           docker run --rm \
@@ -306,7 +457,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_single_perceptron.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_single_perceptron.py"
           docker run --rm \
@@ -319,28 +478,60 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_addmm_residual.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_addmm_residual.py"
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_matmul_activation.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_matmul_activation.py"
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_matmul_scalar.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_matmul_scalar.py"
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_conv_fusion.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_conv_fusion.py"
           docker run --rm \
@@ -353,7 +544,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_moe.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_moe.py"
           docker run --rm \
@@ -366,7 +565,15 @@ jobs:
     runs-on: self-hosted
     needs: build
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
       - name: Run test_mistral.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           echo "Running test_mistral.py"
           docker run --rm \

From 9652bb012bef71144aa629a04373443a2ee48679 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 24 Mar 2025 12:58:54 +0000
Subject: [PATCH 233/432] [CI] Remove intermediate file during compile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 8e149883..92bd2d47 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,7 +53,7 @@ ENV LLVM_DIR /riscv-llvm
 
 # Install Spike simulator
 RUN git clone https://${GIT_ACCESS_TOKEN}@github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \
-    ../configure --prefix=$RISCV && make -j && make install
+    ../configure --prefix=$RISCV && make -j && make install && make clean
 
 # Install Proxy kernel
 RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \

From 26ef52fb729fbf9d0f41cc5c807f710347251680 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Mar 2025 10:48:50 +0000
Subject: [PATCH 234/432] [Frontend/common] Add mlir_vshape api to get the mlir
 tile type

---
 .../mlir/mlir_codegen_backend.py              | 31 ++++++++++++-------
 PyTorchSimFrontend/mlir/mlir_common.py        | 17 +++++++++-
 PyTorchSimFrontend/mlir/mlir_template.py      | 10 +++---
 3 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index a7d660ec..8a556382 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -779,6 +779,7 @@ def load(self, name: str, index: sympy.Expr):
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
+        vshape = local_tile_desc.get_mlir_vshape(mlir_dtype)
 
         # Define scratch pad buffer
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.loads, index_var, index)
@@ -789,9 +790,12 @@ def load(self, name: str, index: sympy.Expr):
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         # Generate vector load instruction
-        operation = "affine.vector_load" if tile_numel_per_lane > 1 else "affine.load"
-        shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
-        line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
+        if tile_numel_per_lane > 1:
+            operation = "affine.vector_load"
+            line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
+        else:
+            operation = "affine.load"
+            line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}"
         out = self.cse.generate(self.compute, line)
         self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
         return out
@@ -812,18 +816,22 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
+        vshape = local_tile_desc.get_mlir_vshape(mlir_dtype)
 
         # Define scratch pad buffer
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.stores, index_var, index)
 
         # Generate vector store instruction
         store_size, operand_type = self.var_info[value]
-        operation = "affine.vector_store" if tile_numel_per_lane > 1 and store_size > 1 else "affine.store"
-        shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 and store_size > 1 else ""
         if mlir_dtype != operand_type:
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
 
-        line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
+        if tile_numel_per_lane > 1 and store_size > 1:
+            operation = "affine.vector_store"
+            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
+        else:
+            operation = "affine.store"
+            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}"
         self.stores.writeline(common.DeferredLine(name, line)) # TODO: Should be changed to self.compute?
 
         # Generate DMA instruction
@@ -957,7 +965,7 @@ def store_reduction(self, name, index, value):
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
-            shape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>" if self.buffer_types[name][1] > 1 else mlir_dtype
+            shape = local_tile_desc.get_mlir_vshape(mlir_dtype) if self.buffer_types[name][1] > 1 else mlir_dtype
             # mean
             divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.ranges[self.reduction_depth])} : f32")
             if self.buffer_types[name][1] > 1:
@@ -987,8 +995,7 @@ def store_reduction(self, name, index, value):
         if tile_numel_per_lane == 1:
             shape = ""
         else:
-            shape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>"
-            shape = f", {shape}" if self.buffer_types[name][1] > 1 else ""
+            shape = f", {local_tile_desc.get_mlir_vshape(mlir_dtype)}" if self.buffer_types[name][1] > 1 else ""
 
         line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
         self.reductions_suffix.writeline(common.DeferredLine(name, line))
@@ -1041,8 +1048,8 @@ def index_expr(self, index, dtype):
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         tile_numel_per_lane = tile_desc.get_numel_per_lane()
         str_tile_size = [str(dim) for dim in tile_size]
-        shape = "x".join(str_tile_size)
-        tile_shape = f"memref<{shape}xi64, 1>"
+        tile_shape = f"memref<{'x'.join(str_tile_size)}xi64, 1>"
+        vshape = tile_desc.get_mlir_vshape(mlir_dtype)
 
         # Define scratch pad buffer
         sram_var, _, _ = self.get_scratchpad_buffer(dtype, "index_buffer", tile_numel_per_lane, tile_shape, self.loads, None, index)
@@ -1054,7 +1061,7 @@ def index_expr(self, index, dtype):
             self.index_set.add(index)
             ops._index_expr(tile_size, sram_var, renamed_expression, index)
 
-        line = f"affine.vector_load %{sram_var}[0, 0, 0] : {tile_shape}, vector<{tile_numel_per_lane}x{mlir_dtype}> // {renamed_expression}"
+        line = f"affine.vector_load %{sram_var}[0, 0, 0] : {tile_shape}, {vshape} // {renamed_expression}"
         out = self.cse.generate(self.compute, line)
         self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
         return out
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 2cbfe395..b01b5831 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -244,6 +244,18 @@ def get_mlir_shape(self, dtype):
         shape = "x".join(str_tile_size)
         return f"memref<{shape}x{dtype}, 1>"
 
+    @staticmethod
+    def extract_tile_size(memref_str):
+        assert memref_str.startswith("memref<") and memref_str.endswith(">"), "Invalid memref format"
+        # Extract the inner content of memref<>
+        inner_part = memref_str[len("memref<"):-1]
+        shapes = inner_part.split("x")[:-1]
+        return [int(dim) for dim in shapes]
+
+    def get_mlir_vshape(self, mlir_dtype):
+        tile_numel_per_lane = self.get_numel_per_lane()
+        return f"vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
+
     def get_used_vlane(self):
         """
         Return number of used vector lane
@@ -326,6 +338,9 @@ def store(self, name, index, value, mode=None):
     def reduction(self, dtype, src_dtype, reduction_type, value):
         raise NotImplementedError()
 
+    def indirect_indexing(self, index_var, size, check):
+        raise NotImplementedError()
+
     def codegen_global_init(self):
         raise NotImplementedError()
 
@@ -578,7 +593,7 @@ def inner(*args, **kwargs):
             @staticmethod
             def indirect_indexing(index_var, size, check=True):
                 # Skip CSE since this doesn't return an expression
-                return sympy_symbol(str(index_var))  # type: ignore[attr-defined]
+                return self.indirect_indexing(index_var, size, check)
 
             @staticmethod
             def load(name: str, index: sympy.Expr):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index b513f6a1..1feac99e 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -488,6 +488,8 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if len(load_dim) != 1 else 0    # FIXME: Fixed split axis for 1d load dim
         vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) != 1 else 1    # FIXME: Fixed stride for 1d load dim
         tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
+        vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
+        vshape = f", {vshape}" if tile_numel_per_lane > 1 else ""
         if name not in self.buffer_names:
             # Allocate sram buffer
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
@@ -502,10 +504,9 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         # Load vector from sram
         sram_var = self.buffer_names[name]
         operation = "affine.vector_load" if tile_numel_per_lane > 1 else "affine.load"
-        shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
         zero_var = self.get_const_cse(0)
         tile_indices = ",".join([f"%{zero_var}"] * self.store_info["tile_nr_dim"])
-        line = f"{operation} %{sram_var}[{tile_indices}] : {self.store_info['tile_shape']}{shape}"
+        line = f"{operation} %{sram_var}[{tile_indices}] : {self.store_info['tile_shape']}{vshape}"
         out = self.cse.generate(self.loads, line)
         self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
         return out
@@ -522,6 +523,8 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = self.store_info['tile_stride']
+        vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
+        vshape = f", {vshape}" if tile_numel_per_lane > 1 else ""
 
         if name not in self.buffer_names:
             sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.stores, index_var, index)
@@ -533,7 +536,6 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         sram_var = self.buffer_names[name]
 
         operation = "affine.vector_store" if tile_numel_per_lane > 1 else "affine.store"
-        shape = f", vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
         zero_var = self.get_const_cse(0)
 
         _, operand_type = self.var_info[value]
@@ -541,7 +543,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
 
         tile_indices = ",".join([f"%{zero_var}"] * self.store_info["tile_nr_dim"])
-        line = f"{operation} %{value}, %{sram_var}[{tile_indices}] : {tile_shape}{shape}"
+        line = f"{operation} %{value}, %{sram_var}[{tile_indices}] : {tile_shape}{vshape}"
 
         self.cse.generate(self.stores, line, assignment = False)
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,

From 10c357156b15a447d30daa6fd813803afafdad86 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Mar 2025 13:00:02 +0000
Subject: [PATCH 235/432] [Frontend] Separate dma/load buffer

---
 .../mlir/mlir_codegen_backend.py              | 74 +++++++++++--------
 PyTorchSimFrontend/mlir/mlir_template.py      | 18 +++--
 2 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 8a556382..ef51e117 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -668,29 +668,37 @@ def __init__(self, kernel_group):
         super().__init__(kernel_group)
         self.const_buffer = IndentedBuffer()
         self.alloc_buffer = IndentedBuffer()
+        self.spad_buffer = IndentedBuffer()
         self.reduction_prefix = IndentedBuffer()
         self.reduction_suffix = IndentedBuffer()
+        self.applys = IndentedBuffer()
         self.body = IndentedBuffer()
+        self.dma_loads = IndentedBuffer()
+        self.dma_stores = IndentedBuffer()
+        self.indexed_buffer = IndentedBuffer()
         self.global_vars = IndentedBuffer()
-        self.global_vars_dict = dict()
         self.header = IndentedBuffer()
+        self.gem5_header = IndentedBuffer()
         self.header.writeline("#include <unistd.h>")
         self.header.writeline("#include <stdlib.h>")
         self.header.writeline("void* __wrap_malloc(size_t size) { return sbrk(size); }")
         self.header.writeline("void __wrap_free(void *ptr) { return; }")
-        self.gem5_header = IndentedBuffer()
-        self.reduction_vars = {}
         self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc")
+        self.spad_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="spad")
+        self.apply_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="apply")
         self.iterator_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="iter")
         self.init_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="init")
         self.init_vec_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="init_vec")
         self.const_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="const")
         self.alloc_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="alloc")
+        self.indexed_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="indexed_op")
         self.map_cse = common.CSE("#", self.suffix, name_prefix="map")
+        self.global_vars_dict = dict()
+        self.reduction_vars = dict()
         self.consts = dict()
         self.tags = dict()
-        self.dma_read_cache = {}
-        self.dma_write_cache = {}
+        self.dma_read_cache = dict()
+        self.dma_write_cache = dict()
         self.dma_read_counter = 1
         self.dma_write_counter = 1
         self.affine_yield = {}
@@ -698,6 +706,7 @@ def __init__(self, kernel_group):
         self.reduce_iterator = {}
         self.is_template_kernel = False
         self.index_set = set()
+
     # padding type 0: zero-padding 1: negative-padding(-inf) ...
     def get_padding_type(self):
         ops = self.current_node.node.origins
@@ -730,12 +739,12 @@ def convert_index(self, expr, buffer):
         args = ", ".join(map(str, indices))
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
-        index = self.cse.generate(buffer, f"affine.apply #{map_var}({args})")
+        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})")
         return index
 
     def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
         if buffer is None:
-            buffer = self.loads
+            buffer = self.applys
         # Constant case
         if expr.is_number:
             return self.get_const_cse(int(expr))
@@ -761,7 +770,7 @@ def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
         args = ", ".join(map(str, indices))
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
-        index = self.cse.generate(buffer, f"affine.apply #{map_var}({args})")
+        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})")
         return index
 
     def load(self, name: str, index: sympy.Expr):
@@ -782,12 +791,12 @@ def load(self, name: str, index: sympy.Expr):
         vshape = local_tile_desc.get_mlir_vshape(mlir_dtype)
 
         # Define scratch pad buffer
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.loads, index_var, index)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
 
         # MVIN Encoding
         code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride, padding)
-        self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
+        self.cse.generate(self.dma_loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         # Generate vector load instruction
         if tile_numel_per_lane > 1:
@@ -796,7 +805,7 @@ def load(self, name: str, index: sympy.Expr):
         else:
             operation = "affine.load"
             line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}"
-        out = self.cse.generate(self.compute, line)
+        out = self.cse.generate(self.loads, line)
         self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
         return out
 
@@ -819,7 +828,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         vshape = local_tile_desc.get_mlir_vshape(mlir_dtype)
 
         # Define scratch pad buffer
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.stores, index_var, index)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
 
         # Generate vector store instruction
         store_size, operand_type = self.var_info[value]
@@ -837,7 +846,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
-        self.stores.writeline(common.DeferredLine(name, code))
+        self.dma_stores.writeline(common.DeferredLine(name, code))
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
         argmax_or_argmin = reduction_type in {"argmax", "argmin"}
@@ -960,8 +969,8 @@ def store_reduction(self, name, index, value):
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
 
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.reductions_suffix,
-                                                                         index_var, index, buffer=self.reduction_suffix)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var,
+                                                                         index, buffer=self.reduction_suffix)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
@@ -1010,6 +1019,9 @@ def store_reduction(self, name, index, value):
         # Restore origin cse
         self.cse = tmp_cse
 
+    def indirect_indexing(self, index_var, size, check=True):
+        raise NotImplementedError("Not support indirect access")
+
     def _index_expr(self, tile_size, buffer, renamed_expression, index):
         str_tile_size = [str(dim) for dim in tile_size]
         shape = "x".join(str_tile_size)
@@ -1025,21 +1037,21 @@ def _index_expr(self, tile_size, buffer, renamed_expression, index):
         affine_offset_map = "(d0, d1) -> (d0 + d1)"
         offset_vars = dim.copy()
         parallel_map = f"affine.parallel ({','.join(dim)}) = ({','.join(start_dim)}) to ({','.join(end_dim)}) {{"
-        self.loads.writeline(parallel_map)
-        with self.loads.indent():
+        self.indexed_buffer.writeline(parallel_map)
+        with self.indexed_buffer.indent():
             for idx in indices:
                 i = int(idx[5:])
-                self.loads.writeline(f"%offset{i} = affine.apply affine_map<{affine_offset_map}>(%{idx}, {dim[i]})")
+                self.indexed_buffer.writeline(f"%offset{i} = affine.apply affine_map<{affine_offset_map}>(%{idx}, {dim[i]})")
                 offset_vars[i] = f"%offset{i}"
             apply_map = f"affine.apply affine_map<{affine_map_str}>({', '.join(offset_vars)}) {{global_idx=1}}"
-            apply_map_var = self.cse.generate(self.loads, apply_map)
+            apply_map_var = self.indexed_cse.generate(self.indexed_buffer, apply_map)
             broadcast = f"vector.broadcast %{apply_map_var} : index to vector<2xindex>"
-            broadcast_var = self.cse.generate(self.loads, broadcast)
+            broadcast_var = self.indexed_cse.generate(self.indexed_buffer, broadcast)
             cast_i64 = f"arith.index_cast %{broadcast_var} : vector<2xindex> to vector<2xi64>"
-            cast_i64_var = self.cse.generate(self.loads, cast_i64)
+            cast_i64_var = self.indexed_cse.generate(self.indexed_buffer, cast_i64)
             affine_store = f"affine.vector_store %{cast_i64_var}, %{buffer}[{','.join(dim)}] : memref<{shape}xi64, 1>, vector<2xi64>"
-            self.cse.generate(self.loads, affine_store, assignment=False)
-        self.loads.writeline("}")
+            self.cse.generate(self.indexed_buffer, affine_store, assignment=False)
+        self.indexed_buffer.writeline("}")
         return buffer
 
     def index_expr(self, index, dtype):
@@ -1052,7 +1064,7 @@ def index_expr(self, index, dtype):
         vshape = tile_desc.get_mlir_vshape(mlir_dtype)
 
         # Define scratch pad buffer
-        sram_var, _, _ = self.get_scratchpad_buffer(dtype, "index_buffer", tile_numel_per_lane, tile_shape, self.loads, None, index)
+        sram_var, _, _ = self.get_scratchpad_buffer(dtype, "index_buffer", tile_numel_per_lane, tile_shape, None, index)
 
         renamed_symbols = {symbol: "d"+str(symbol)[5:] for symbol in index.free_symbols}
         renamed_expression = index.subs(renamed_symbols)
@@ -1084,9 +1096,9 @@ def codegen_loops(self):
             vars = ', '.join([f"%{name}" for name, _ in self.affine_yield.items()])
             reduced_shapes = ', '.join([f"{shape}" for _, shape in self.affine_yield.items()])
             self.stores.writeline(f"affine.yield {vars} : {reduced_shapes}")
-
         code.splice(self.const_buffer)
         code.splice(self.alloc_buffer)
+        code.splice(self.spad_buffer)
         with contextlib.ExitStack() as stack:
             for loop in loops.loops:
                 loop_lines = loop.lines()
@@ -1103,9 +1115,13 @@ def codegen_loops(self):
                             return
                         code.writelines(reduction_lines)
                         stack.enter_context(code.indent(outer_loop=False))
+                    code.splice(self.applys)
+                    code.splice(self.indexed_buffer)
+                    code.splice(self.dma_loads)
                     code.splice(self.loads)
                     code.splice(self.compute)
                     code.splice(self.stores)
+                    code.splice(self.dma_stores)
                 code.splice(self.reductions_suffix)
         code.writeline(f"return")
         return code
@@ -1135,7 +1151,7 @@ def get_dma_info(self, name, index, index_var, broadcast=True, store_reduction=F
         """
         # Use loads as default
         if buffer is None:
-            buffer = self.loads
+            buffer = self.applys
 
         # TODO.
         kg_tile_desc = self.kernel_group.tile_desc
@@ -1157,7 +1173,7 @@ def get_dma_info(self, name, index, index_var, broadcast=True, store_reduction=F
             output_expr = str(index).replace('index', 'd')
             input_argument = ",".join(["%index" + str(i) if i in local_dims else f"%{fake_dim}" for i in total_dims])
             map_var = self.map_cse.generate(self.global_vars, f"affine_map<({input_expr}) -> ({output_expr})>")
-            index_var = self.cse.generate(buffer, f"affine.apply #{map_var}({input_argument})")
+            index_var = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({input_argument})")
             local_dims = total_dims # Brodatcast tile shape
 
         if kg_tile_desc.vlane_split_axis in local_dims:
@@ -1309,7 +1325,7 @@ def adjust_tile_size(self):
         if len(self.itervars) >= 3 and self.reduction_depth < len(self.itervars):
             raise NotImplementedError()
 
-    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, code_buffer, indices, raw_index, is_template=False, buffer=None):
+    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, indices, raw_index, is_template=False, buffer=None):
         c_type = mlir_common.DTYPE_TO_C[dtype]
         # Make sure each lane's buffer has at least two element
         tile_size = max(tile_size_per_lane, 2) * self.vector_lane
@@ -1329,7 +1345,7 @@ def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape
             self.global_vars_dict[name].append(str(raw_index))
         else:
             new_name = f"{name}_{self.global_vars_dict[name].index(str(raw_index))}"
-        sram_var = self.cse.generate(code_buffer, f"memref.get_global @{new_name}_spad : {dram_tile_shape}")
+        sram_var = self.spad_cse.generate(self.spad_buffer, f"memref.get_global @{new_name}_spad : {dram_tile_shape}")
 
         zero_cse = self.get_const_cse(0)
         sram_dims = len(dram_tile_shape.split("x")) - 1
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 1feac99e..692dce00 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -287,12 +287,16 @@ def template_store():
             tile_stride = self.store_info['tile_stride']
             code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  tag_var, dram_shape, tile_shape, tile_stride)
-            self.cse.generate(self.stores, code, assignment = False)
+            self.cse.generate(self.dma_stores, code, assignment = False)
+        self.body.splice(self.spad_buffer)
+        self.body.splice(self.applys)
+        self.body.splice(self.dma_loads)
         self.body.splice(self.loads)
         self.body.splice(self.compute)
         if len(self.stores._lines) == 0:
             template_store()
         self.body.splice(self.stores)
+        self.body.splice(self.dma_stores)
         self.loads.clear()
         self.compute.clear()
         self.stores.clear()
@@ -495,11 +499,11 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
             tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
             tile_stride = self.store_info['tile_stride']
-            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.loads, index_var, index)
+            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
             self.buffer_names[name] = sram_var
             code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                      f"{name}_tag", dram_shape, tile_shape, tile_stride)
-            self.cse.generate(self.loads, code, assignment = False)
+            self.cse.generate(self.dma_loads, code, assignment = False)
 
         # Load vector from sram
         sram_var = self.buffer_names[name]
@@ -527,7 +531,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         vshape = f", {vshape}" if tile_numel_per_lane > 1 else ""
 
         if name not in self.buffer_names:
-            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, self.stores, index_var, index)
+            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
             self.buffer_names[name] = sram_var
         else:
             zero_cse = self.get_const_cse(0)
@@ -548,10 +552,10 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.cse.generate(self.stores, line, assignment = False)
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
-        self.cse.generate(self.stores, code, assignment = False)
+        self.cse.generate(self.dma_stores, code, assignment = False)
 
-    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, code_buffer, index_var, raw_index):
-        return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, code_buffer, index_var, raw_index, True)
+    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index):
+        return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, True)
 
 class MLIRTemplateCaller(CUDATemplateCaller):
     def __str__(self):

From cfd57c866997359c7c89e29383e5030a8d3c5dbd Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 19 Mar 2025 13:56:41 +0000
Subject: [PATCH 236/432] [Frontend] Fix index generator ordering

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index ef51e117..22e8e43a 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -753,8 +753,12 @@ def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
         if len(expr.args) == 0:
             return expr
 
+        args = list(expr.args)
+        # Sort index variable.. ex) (%index1, %index0)
+        args_dict = {term: list(term.free_symbols)[0] for term in args if term.free_symbols}
+        sorted_args = sorted(args_dict.keys(), key=lambda term: str(args_dict[term]))
         indices = []
-        for arg in expr.args:
+        for arg in sorted_args:
             if arg.is_Mul and arg.args[0].is_number:
                 new_arg = sympy.Symbol(str(self.convert_index(arg.args[1], buffer)))
                 expr = expr.replace(arg.args[1], new_arg)
@@ -763,7 +767,6 @@ def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
                 new_arg = sympy.Symbol(str(self.convert_index(arg, buffer)))
                 expr = expr.replace(arg, new_arg)
                 indices.append(str(new_arg))
-        indices.sort()
 
         # Extract index var
         expr_str = str(expr)

From a23d525ad1cfc5aac8f2b71c2fc0455bbe3abfe7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 20 Mar 2025 13:57:27 +0000
Subject: [PATCH 237/432] [Frontend] indexed access codegen implementation

---
 .../mlir/mlir_codegen_backend.py              | 39 ++++++++++++-------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 22e8e43a..ea03e772 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -706,6 +706,7 @@ def __init__(self, kernel_group):
         self.reduce_iterator = {}
         self.is_template_kernel = False
         self.index_set = set()
+        self.spad_buffer_dict = dict()
 
     # padding type 0: zero-padding 1: negative-padding(-inf) ...
     def get_padding_type(self):
@@ -745,6 +746,7 @@ def convert_index(self, expr, buffer):
     def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
         if buffer is None:
             buffer = self.applys
+
         # Constant case
         if expr.is_number:
             return self.get_const_cse(int(expr))
@@ -771,19 +773,18 @@ def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
         # Extract index var
         expr_str = str(expr)
         args = ", ".join(map(str, indices))
-        map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
+        map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[] -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
-        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})")
+        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[]")
         return index
 
     def load(self, name: str, index: sympy.Expr):
         index = self.rename_indexing(index)
         padding = self.get_padding_type()
-        index_var = self.parse_indices(index)
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        local_tile_desc, index_var = self.get_dma_info(name, index, index_var)
+        local_tile_desc, index_var = self.get_dma_info(name, index)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -810,17 +811,17 @@ def load(self, name: str, index: sympy.Expr):
             line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}"
         out = self.cse.generate(self.loads, line)
         self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
+        self.spad_buffer_dict[out] = [sram_var, local_tile_desc.get_tile_size(), tile_shape]
         return out
 
     def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         index = self.rename_indexing(index)
-        index_var = self.parse_indices(index)
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
         # Prepare dma instruction
-        local_tile_desc, index_var = self.get_dma_info(name, index, index_var)
+        local_tile_desc, index_var = self.get_dma_info(name, index)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -960,10 +961,9 @@ def store_reduction(self, name, index, value):
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         index = self.rename_indexing(index)
-        index_var = self.parse_indices(index, buffer=self.reductions_suffix)
 
         # Tile is always reuduced in inner loop
-        local_tile_desc, index_var = self.get_dma_info(name, index, index_var, broadcast=False, store_reduction=True, buffer=self.reductions_suffix)
+        local_tile_desc, index_var = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -1023,7 +1023,16 @@ def store_reduction(self, name, index, value):
         self.cse = tmp_cse
 
     def indirect_indexing(self, index_var, size, check=True):
-        raise NotImplementedError("Not support indirect access")
+        spad_buffer_var, tile_size, mlir_shape = self.spad_buffer_dict[index_var]
+        nr_rank = len(tile_size)
+        mlir_dtype = self.var_info[index_var][1]
+        line = f"affine.load %{spad_buffer_var}[{', '.join(['%const0']*nr_rank)}] : {mlir_shape}"
+        out = self.cse.generate(self.dma_loads, line)
+        self.register_var_info(out, [1, "index", [1]])
+        if mlir_dtype != "index":
+            line = f"arith.index_cast %{out} : {mlir_dtype} to {'index'}"
+            out = self.cse.generate(self.dma_loads, line)
+        return str(out)
 
     def _index_expr(self, tile_size, buffer, renamed_expression, index):
         str_tile_size = [str(dim) for dim in tile_size]
@@ -1145,7 +1154,7 @@ def codegen_nodes(self, nodes, kernel_name):
             write_atomic(gem5_write_path, self.gem5_header.getvalue())
         return src_code
 
-    def get_dma_info(self, name, index, index_var, broadcast=True, store_reduction=False, buffer=None): # Need more argument?
+    def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffer=None): # Need more argument?
         """
         A tile descriptor exists that is configured on a kernel group
         DMA desc should be adjusted according to buffer.
@@ -1154,7 +1163,7 @@ def get_dma_info(self, name, index, index_var, broadcast=True, store_reduction=F
         """
         # Use loads as default
         if buffer is None:
-            buffer = self.applys
+            buffer = self.applys if "tmp" not in str(index) else self.dma_loads
 
         # TODO.
         kg_tile_desc = self.kernel_group.tile_desc
@@ -1166,6 +1175,8 @@ def get_dma_info(self, name, index, index_var, broadcast=True, store_reduction=F
         total_dims =  [int(str(i)[5:]) for i in self.itervars]
         local_tile_desc = mlir_common.MLIRMultiDimTile([1], self.vector_lane)
         local_dims.sort() # Assume that smaller index is placed in the outer loop
+        indirect_dims = [f"{i}" for i in index.free_symbols if "tmp" in str(i)]
+        indirect_arg_dims = [f"%{i}" for i in index.free_symbols if "tmp" in str(i)]
 
         # Reduction can have two type of tile size
         if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)):
@@ -1175,9 +1186,11 @@ def get_dma_info(self, name, index, index_var, broadcast=True, store_reduction=F
             input_expr = ",".join(["d"+str(i) for i in total_dims])
             output_expr = str(index).replace('index', 'd')
             input_argument = ",".join(["%index" + str(i) if i in local_dims else f"%{fake_dim}" for i in total_dims])
-            map_var = self.map_cse.generate(self.global_vars, f"affine_map<({input_expr}) -> ({output_expr})>")
-            index_var = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({input_argument})")
+            map_var = self.map_cse.generate(self.global_vars, f"affine_map<({input_expr})[{','.join(indirect_dims)}] -> ({output_expr})>")
+            index_var = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({input_argument})[{','.join(indirect_arg_dims)}]")
             local_dims = total_dims # Brodatcast tile shape
+        else:
+            index_var = self.parse_indices(index, buffer=buffer)
 
         if kg_tile_desc.vlane_split_axis in local_dims:
             local_vlane_split_axis = local_dims.index(kg_tile_desc.vlane_split_axis)

From bdbdf24ade07b3166f4e2e6741704515956515ac Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 24 Mar 2025 05:27:28 +0000
Subject: [PATCH 238/432] [Frontend/indirect] Support indirect access

---
 .github/workflows/docker-image.yml            | 18 +++-
 .github/workflows/pull-request.yml            | 18 +++-
 .../mlir/mlir_codegen_backend.py              | 96 ++++++++++++++++---
 tests/test_indirect_access.py                 | 51 ++++++++++
 4 files changed, 170 insertions(+), 13 deletions(-)
 create mode 100644 tests/test_indirect_access.py

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 4b733d2a..c30a09c7 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -559,6 +559,22 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py
 
+  test_indirect:
+    name: Run test_indirect
+    runs-on: self-hosted
+    needs: build
+    env:
+      GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+    steps:
+      - name: Run test_indirect.py
+        run: |
+          echo "Running test_indirect.py"
+          echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_indirect_access.py
+
   test_cleanup:
     name: Clean test cases
     runs-on: self-hosted
@@ -567,7 +583,7 @@ jobs:
             test_transpose2D, test_view3D_2D, test_layernorm,
             test_mlp, test_resnet, test_transformer, test_transpose3D,
             test_sparsity, test_activation, test_pool, test_perceptron,
-            test_fusion, test_mistral, test_moe]
+            test_fusion, test_mistral, test_moe, test_indirect]
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 4cb24617..35d45e6c 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -581,6 +581,22 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py
 
+  test_indirect:
+    name: Run test_indirect
+    runs-on: self-hosted
+    needs: build
+    env:
+      GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+    steps:
+      - name: Run test_indirect.py
+        run: |
+          echo "Running test_indirect.py"
+          echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_indirect_access.py
+
   test_cleanup:
     name: Clean test cases
     runs-on: self-hosted
@@ -589,7 +605,7 @@ jobs:
             test_transpose2D, test_view3D_2D, test_layernorm,
             test_mlp, test_resnet, test_transformer, test_transpose3D,
             test_sparsity, test_activation, test_pool, test_perceptron,
-            test_fusion, test_mistral, test_moe]
+            test_fusion, test_mistral, test_moe, test_indirect]
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index ea03e772..ebecd006 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -148,6 +148,13 @@ def binary_elementwise_common(operand1, operand2, var_info):
             elif op_type1[1][0] == "f" and op_type2[1][0] == "i":
                 operand2 = ops.to_dtype(operand2, op_type1[1], var_info)
                 op_type2 = var_info[operand2]
+            elif op_type1[1][0] == op_type2[1][0]:
+                if int(op_type1[1][1:]) > int(op_type2[1][1:]):
+                   operand2 = ops.ext(operand2, op_type1[1])
+                   op_type2 = var_info[operand2]
+                elif int(op_type1[1][1:]) < int(op_type2[1][1:]):
+                   operand1 = ops.ext(operand1, op_type2[1])
+                   op_type1 = var_info[operand1]
             else:
                 raise NotImplementedError("Unsupported type converting")
 
@@ -369,6 +376,17 @@ def reciprocal(operand, *args, var_info=None, **kwargs):
 
         return ops.div(ops.constant(1.0, dtype), operand), [tile_size, dtype]
 
+    @staticmethod
+    def ext(operand, dtype, *args, var_info=None, **kwargs):  # Emit a widening cast of `operand` to element type `dtype`.
+        op_type = var_info[operand]  # [element count, MLIR dtype string] recorded for the operand
+        shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else f"{op_type[1]}"
+        target_type = f"vector<{op_type[0]}x{dtype}>" if op_type[0] > 1 else f"{dtype}"
+        if op_type[1][0] == "f":  # the element type is op_type[1]; op_type[0] is the numeric size and never equals "f"
+            opcode = 'arith.extf'
+        else:  # NOTE(review): integers are always zero-extended; confirm arith.extsi is not needed for signed data
+            opcode = 'arith.extui'
+        return f'{opcode} %{operand} : {shape} to {target_type}', [op_type[0], dtype]  # (MLIR line, new var_info entry)
+
     # Logical operations
     @staticmethod
     def neg(operand, *args, var_info=None, **kwargs):
@@ -780,6 +798,7 @@ def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
 
     def load(self, name: str, index: sympy.Expr):
         index = self.rename_indexing(index)
+        index = self.convert_indirect_indexing(index)
         padding = self.get_padding_type()
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
@@ -811,7 +830,7 @@ def load(self, name: str, index: sympy.Expr):
             line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}"
         out = self.cse.generate(self.loads, line)
         self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
-        self.spad_buffer_dict[out] = [sram_var, local_tile_desc.get_tile_size(), tile_shape]
+        self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape]
         return out
 
     def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
@@ -1023,16 +1042,7 @@ def store_reduction(self, name, index, value):
         self.cse = tmp_cse
 
     def indirect_indexing(self, index_var, size, check=True):
-        spad_buffer_var, tile_size, mlir_shape = self.spad_buffer_dict[index_var]
-        nr_rank = len(tile_size)
-        mlir_dtype = self.var_info[index_var][1]
-        line = f"affine.load %{spad_buffer_var}[{', '.join(['%const0']*nr_rank)}] : {mlir_shape}"
-        out = self.cse.generate(self.dma_loads, line)
-        self.register_var_info(out, [1, "index", [1]])
-        if mlir_dtype != "index":
-            line = f"arith.index_cast %{out} : {mlir_dtype} to {'index'}"
-            out = self.cse.generate(self.dma_loads, line)
-        return str(out)
+        return str(index_var)
 
     def _index_expr(self, tile_size, buffer, renamed_expression, index):
         str_tile_size = [str(dim) for dim in tile_size]
@@ -1177,6 +1187,8 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         local_dims.sort() # Assume that smaller index is placed in the outer loop
         indirect_dims = [f"{i}" for i in index.free_symbols if "tmp" in str(i)]
         indirect_arg_dims = [f"%{i}" for i in index.free_symbols if "tmp" in str(i)]
+        for indirect_dim in indirect_dims:
+            index = index.replace(sympy.Symbol(indirect_dim), 0)
 
         # Reduction can have two type of tile size
         if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)):
@@ -1385,6 +1397,68 @@ def get_tag_cse(self, value, shape="memref<1xi32>"):
             self.tags[value] = self.alloc_cse.generate(self.alloc_buffer, f"memref.alloc() : {shape}")
         return self.tags[value]
 
+    def convert_indirect_indexing(self, index :sympy.Expr):
+        if "tmp" not in str(index):
+            return index
+
+        # Process start
+        indirect_dims = [str(dim) for dim in index.free_symbols if "tmp" in str(dim)]
+        indirect_dims.sort()
+        first_dim = indirect_dims[0]
+        spad_vars = dict()
+        tmp_comp, self.compute = self.compute, self.dma_loads
+
+        # Load indirect operands
+        for target_dim in indirect_dims:
+            sram_var, _, tile_numel_per_lane, sram_index_var, tile_shape, vshape = self.spad_buffer_dict[target_dim]
+            mlir_dtype = vshape.split("x")[1][:-1]
+            if tile_numel_per_lane > 1:
+                operation = "affine.vector_load"
+                line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape} // For indirect access"
+            else:
+                operation = "affine.load"
+                line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape} // For indirect access"
+            out = self.cse.generate(self.dma_loads, line)
+            self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
+            spad_vars[target_dim] = out
+
+        # Apply stride
+        for arg in index.args:
+            if "tmp" not in str(arg):
+                continue
+            if arg.is_Mul and arg.args[0].is_number:
+                coeff_dtype = self.var_info[spad_vars[str(arg.args[1])]][1]
+                coeff = ops.constant(int(arg.args[0]), coeff_dtype)
+                spad_vars[str(arg.args[1])] = ops.mul(spad_vars[str(arg.args[1])], coeff)
+            index = index.replace(arg, 0)
+
+        # Sum
+        for dim, var in spad_vars.items():
+            if dim == first_dim:
+                continue
+            spad_vars[first_dim] = ops.add(spad_vars[first_dim], var)
+
+        # Store index var
+        sram_var, _, tile_numel_per_lane, sram_index_var, tile_shape, vshape = self.spad_buffer_dict[first_dim]
+        if tile_numel_per_lane > 1:
+            operation = "affine.vector_store"
+            line = f"{operation} %{spad_vars[first_dim]}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
+        else:
+            operation = "affine.store"
+            line = f"{operation} %{spad_vars[first_dim]}, %{sram_var}[{sram_index_var}] : {tile_shape}"
+        out = self.cse.generate(self.dma_loads, line, assignment=False)
+
+        # Conversion
+        mlir_dtype = self.var_info[spad_vars[first_dim]][1]
+        line = f"affine.load %{sram_var}[{sram_index_var}] : {tile_shape}"
+        out = self.cse.generate(self.dma_loads, line)
+        if mlir_dtype != "index":
+            line = f"arith.index_cast %{out} : {mlir_dtype} to {'index'}"
+            out = self.cse.generate(self.dma_loads, line)
+        self.register_var_info(out, [1, "index", [1]])
+        self.compute = tmp_comp
+        return index + sympy.Symbol(str(out))
+
 @dataclasses.dataclass
 class LoopLevel:
     var: sympy.Expr
diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py
new file mode 100644
index 00000000..16d8afd3
--- /dev/null
+++ b/tests/test_indirect_access.py
@@ -0,0 +1,51 @@
+import torch
+import copy
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_indirect_vectoradd(device, size=(128, 128)):
+    def vectoradd(a, idx, b):
+        return a[idx] + b
+    x = torch.randn(size, dtype=torch.float32).to(device=device)
+    idx = torch.randint(0,128, [128]).to(device=device)
+    y = torch.randn(128, dtype=torch.float32).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(vectoradd)
+    res = opt_fn(x, idx, y)
+    out = vectoradd(x.cpu(), idx.cpu(), y.cpu())
+    test_result("VectorAdd", res, out)
+
+def test_embedding(device, vocab_size, dim):
+    emb = torch.nn.Embedding(vocab_size, dim)
+    cpu_emb = copy.deepcopy(emb)
+
+    prompt = torch.randint(0, 1023, [511], dtype=torch.int)
+    cpu_prompt = copy.deepcopy(prompt)
+    prompt = prompt.to(device=device)
+
+    emb.to(device=device)
+    opt_emb = torch.compile(dynamic=False)(emb)
+    res = opt_emb(prompt)
+    cpu_res = cpu_emb(cpu_prompt)
+    test_result("Embedding", res, cpu_res)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_indirect_vectoradd(device)
+    #test_embedding(device, 1024, 2048)
\ No newline at end of file

From 39dfb839d032358214f9e9f8490b765b0004459b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 24 Mar 2025 14:45:38 +0000
Subject: [PATCH 239/432] [Frontend+config] Add IR dump config

---
 PyTorchSimFrontend/extension_codecache.py | 2 ++
 PyTorchSimFrontend/extension_config.py    | 1 +
 2 files changed, 3 insertions(+)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index a7d952d7..29da29e5 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -65,6 +65,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
             -convert-func-to-llvm \
             -convert-index-to-llvm \
             -reconcile-unrealized-casts \
+            {'--mlir-print-ir-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_MLIR_IR else ''} \
             {filename}.mlir -o {filename}_llvm.mlir
         """,
     ).strip(),
@@ -101,6 +102,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             -convert-func-to-llvm \
             -convert-index-to-llvm \
             -reconcile-unrealized-casts \
+            {'--mlir-print-ir-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_MLIR_IR else ''} \
             {filename}.mlir -o {sample_filename}_llvm.mlir
         """,
     ).strip(),
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index f290e8e7..222fe667 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -29,6 +29,7 @@
 CONFIG_TORCHSIM_LLVM_PATH = os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin")
 CONFIG_TORCHSIM_CUSTOM_PASS_PATH = os.environ.get('TORCHSIM_CUSTOM_PASS_PATH',
                                            default=f"{CONFIG_TORCHSIM_DIR}/GemminiLowerPass/build")
+CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False))
 
 # Backendsim config
 CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG',

From 1d00fa1a6dbf9d87fd741fedc303d7352c67a0b9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 25 Mar 2025 10:35:06 +0000
Subject: [PATCH 240/432] [Frontend+config] Add LLVM IR dump config

---
 PyTorchSimFrontend/extension_codecache.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 29da29e5..bd209329 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -76,7 +76,11 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
     ).strip(),
             re.sub(r"[ \n]+", " ",
         f"""
-            {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc -relocation-model=pic -march=riscv64 -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b -O2 {filename}.ll -o {filename}.s
+            {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
+                -relocation-model=pic -march=riscv64 \
+                -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
+                {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_MLIR_IR else ''} \
+                -O2 {filename}.ll -o {filename}.s
         """,
     ).strip()]
 
@@ -113,7 +117,11 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
     ).strip(),
             re.sub(r"[ \n]+", " ",
         f"""
-            {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc -relocation-model=pic -march=riscv64 -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b -O2 {sample_filename}.ll -o {sample_filename}.s
+            {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
+                -relocation-model=pic -march=riscv64 \
+                -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
+                {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_MLIR_IR else ''} \
+                -O2 {sample_filename}.ll -o {sample_filename}.s
         """,
     ).strip()]
 

From 5e07728146e118e8fc1ea3ac8cfc852ed57289e8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 25 Mar 2025 10:35:56 +0000
Subject: [PATCH 241/432] [Frontend] Peephole optimization for load/store
 operation

---
 .../mlir/mlir_codegen_backend.py              | 40 ++++++++++++-------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index ebecd006..69a5e5a4 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -848,23 +848,33 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
+        tile_size = local_tile_desc.get_tile_size()
         vshape = local_tile_desc.get_mlir_vshape(mlir_dtype)
-
-        # Define scratch pad buffer
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
-
-        # Generate vector store instruction
-        store_size, operand_type = self.var_info[value]
-        if mlir_dtype != operand_type:
-            value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
-
-        if tile_numel_per_lane > 1 and store_size > 1:
-            operation = "affine.vector_store"
-            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
+        origin_tile_size = self.spad_buffer_dict[str(value)][1] if str(value) in self.spad_buffer_dict else tile_size
+        require_store = True
+        if str(value) in self.spad_buffer_dict:
+            # Todo. If tile_size is not same (i.e., view operation), we can't apply peephole optimization easily
+            require_store = self.spad_buffer_dict[str(value)][1] != tile_size
+
+        if require_store:
+            # Define scratch pad buffer
+            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
+
+            # Generate vector store instruction
+            store_size, operand_type = self.var_info[value]
+            if mlir_dtype != operand_type:
+                value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
+
+            if tile_numel_per_lane > 1 and store_size > 1:
+                operation = "affine.vector_store"
+                line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
+            else:
+                operation = "affine.store"
+                line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}"
+            self.stores.writeline(common.DeferredLine(name, line)) # TODO: Should be changed to self.compute?
         else:
-            operation = "affine.store"
-            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}"
-        self.stores.writeline(common.DeferredLine(name, line)) # TODO: Should be changed to self.compute?
+            sram_var = self.spad_buffer_dict[str(value)][0]
+            sram_index_var = self.spad_buffer_dict[str(value)][3]
 
         # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,

From 55c8cb318ceaa4db98ac6397e927fd6887dd0b2c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 26 Mar 2025 03:18:28 +0000
Subject: [PATCH 242/432] [Funcsim] Isolate dump storage for funcsim

---
 PyTorchSimFrontend/extension_codecache.py | 20 ++++++++--
 Scheduler/scheduler.py                    |  5 +--
 Simulator/simulator.py                    | 45 +++++++++++++----------
 3 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index bd209329..a51b0ff2 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -326,18 +326,18 @@ def dummy_simulator(*args, **kwargs):
             result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key))
             # Dump arguments and meta data
             dump_metadata(args, arg_attributes, result_path)
+            runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path)
             if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE:
                 funcsim = FunctionalSimulator(result_path, key)
                 funcsim.run_spike(args, arg_attributes,
-                                  result_path, self.validation_binary_name,
-                                  kwargs['intermediate_op'] if 'intermediate_op' in kwargs else None,
+                                  runtime_path, self.validation_binary_name,
                                   vectorlane_size=vectorlane_size, spad_info=spad_info,
                                   cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS)
             if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY:
                 return
 
             onnx_path = os.path.join(result_path, "tile_graph.onnx")
-            attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")
+            attribute_path = os.path.join(runtime_path, "attribute")
             backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
             backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG)
             backsim.vectorlane_size = vectorlane_size
@@ -348,6 +348,20 @@ def dummy_simulator(*args, **kwargs):
 
         def dryrun_simulator(*args, **kwargs):
             key = future.result()
+             # Run simulator pass
+            result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key))
+            # Dump arguments and meta data
+            dump_metadata(args, arg_attributes, result_path)
+            runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path)
+
+            # Todo. Support valude dependent mode for graph mode
+            if False: # extension_config.CONFIG_TORCHSIM_VALIDATION_MODE:
+                funcsim = FunctionalSimulator(result_path, key)
+                funcsim.run_spike(args, arg_attributes,
+                                  runtime_path, self.validation_binary_name,
+                                  vectorlane_size=vectorlane_size, spad_info=spad_info,
+                                  cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS)
+            return result_path, runtime_path
 
         is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
         target_simulator = dryrun_simulator if is_dryrun else dummy_simulator
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index feb7e973..89f9ded9 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -239,11 +239,10 @@ def finish_model(self, model : SchedulerDNNModel, output : torch.Tensor):
             self.finish_req_dict[req] = RequestReturn(RequestReturn.FINISHED)
 
     def prepare_launch_kernel(self, kernel, inputs):
-        key = kernel.future.result() if hasattr(kernel.future, "result") else kernel.future
-        result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key))
+        result_path, runtime_path = kernel(*inputs)
         onnx_path = os.path.join(result_path, "tile_graph.onnx")
 
-        attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute")
+        attribute_path = os.path.join(runtime_path, "attribute")
         attribute_path = self.backend_simulator.create_attribute_file(attribute_path, inputs)
         return onnx_path, attribute_path
 
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 78990346..35d0d9bd 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -48,7 +48,7 @@ def write_arg(self, arg, path, name):
 
         if (isinstance(arg, torch.Tensor)):
             data_path = os.path.join(dump_path, f'{index}.raw')
-            tensor = arg.cpu()
+            tensor = arg.cpu().detach()
             t_arr = tensor.numpy().flatten()
             t_arr.tofile(data_path)
         else:
@@ -71,29 +71,19 @@ def dump_args(self, args, arg_attributes, load_path, dump_path):
 
         return array_size, file_path
 
-    def run_spike(self, args, arg_attributes, path, binary, intermediate_op=None, vectorlane_size=4, spad_info=None, cleanup=False):
-        load_path = self.path
-        dump_path = self.path
+    def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=4, spad_info=None, cleanup=False):
+        load_path = runtime_path
+        dump_path = runtime_path
 
-        target_binary = os.path.join(path, binary)
-        objdump = f"riscv64-unknown-elf-objdump -d {target_binary} > {os.path.join(path, 'binary.dump')}"
+        target_binary = os.path.join(self.path, binary)
+        objdump = f"riscv64-unknown-elf-objdump -d {target_binary} > {os.path.join(self.path, 'binary.dump')}"
         kernel_start = f"nm {target_binary} | grep 'kernel' | awk 'NR==1 {{print $1}}'"
-        kernel_end = f"nm {target_binary} | grep 'kernel' | awk 'NR==1 {{print $1}}' | xargs -I {{}} awk '/{{}}/,0' {os.path.join(path, 'binary.dump')} | grep ret | awk 'NR==1 {{print $1}}' | awk '{{gsub(/:$/, \"\"); print}}'"
+        kernel_end = f"nm {target_binary} | grep 'kernel' | awk 'NR==1 {{print $1}}' | xargs -I {{}} awk '/{{}}/,0' {os.path.join(self.path, 'binary.dump')} | grep ret | awk 'NR==1 {{print $1}}' | awk '{{gsub(/:$/, \"\"); print}}'"
 
         subprocess.run(objdump, shell=True)
         kernel_start_addr = subprocess.run(kernel_start, shell=True, stdout=subprocess.PIPE).stdout.strip().decode('utf-8')
         kernel_end_addr = subprocess.run(kernel_end, shell=True, stdout=subprocess.PIPE).stdout.strip().decode('utf-8')
 
-        if intermediate_op is not None:
-            os.makedirs(os.path.join(self.path, "intermediate"), exist_ok=True)
-            if intermediate_op & 0b10: # input comes from intermediate
-                load_path = os.path.join(self.path, "intermediate")
-            if intermediate_op & 0b01: # output dumps to intermediate
-                dump_path = os.path.join(self.path, "intermediate")
-                for name, attr in arg_attributes:
-                    if attr[0] == 2:
-                        os.makedirs(os.path.join(dump_path, name), exist_ok=True)
-
         _, file_path = self.dump_args(args, arg_attributes, load_path, dump_path)
         file_path_str = ' '.join(file_path)
 
@@ -104,8 +94,9 @@ def run_spike(self, args, arg_attributes, path, binary, intermediate_op=None, ve
             f"--scratchpad-size={spad_info['spad_size']} "
         vectorlane_option = f"--vectorlane-size={vectorlane_size}"
         kernel_address = f"--kernel-addr={kernel_start_addr}:{kernel_end_addr}"
-        base_addr = f"--base-path={path}"
-        run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_addr} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
+        base_path= f"--base-path={runtime_path}"
+        os.makedirs(os.path.join(base_path, "indirect_access"), exist_ok=True)
+        run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
 
         print("[SpikeSimulator] cmd> ", run)
         run_cmd = shlex.split(run)
@@ -125,6 +116,22 @@ def run_spike(self, args, arg_attributes, path, binary, intermediate_op=None, ve
                 if os.path.exists(path):
                     os.remove(path)
 
+    @staticmethod
+    def get_runtime_dump_path(base_path, prefix="runtime", zfill=4):
+        indices = [
+            int(match.group(1))
+            for d in os.listdir(base_path)
+            if (match := re.fullmatch(rf"{prefix}_(\d{{{zfill}}})", d))
+        ]
+
+        max_index = max(indices, default=-1)
+        next_index = max_index + 1
+        folder_name = f"{prefix}_{str(next_index).zfill(zfill)}"
+        full_path = os.path.join(base_path, folder_name)
+
+        os.makedirs(full_path)
+        return full_path
+
 class CycleSimulator():
     def __init__(self) -> None:
         pass

From c4251f807fc5768ba2ce9a01c88eba50d059a6be Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 26 Mar 2025 04:30:41 +0000
Subject: [PATCH 243/432] [TOGGen] Fix multiple store node case

---
 AsmParser/tog_generator.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 88f63f72..29ae3460 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -153,8 +153,6 @@ def create_node(self, dump_data, prev_node):
                 connect_nodes(prev_node[-1].get_parent()[-1], new_node)
             elif isinstance(prev_node[-1], memory_wait_node) and isinstance(new_node, memory_wait_node):
                 connect_nodes(prev_node[-1].get_parent()[-1], new_node)
-            elif isinstance(prev_node[-1], store_node) and isinstance(new_node, store_node):
-                connect_nodes(prev_node[-1].get_parent()[-1], new_node)
             elif isinstance(prev_node[-1], load_node) and isinstance(new_node, compute_node) or \
                  isinstance(prev_node[-1], memory_wait_node) and isinstance(new_node, compute_node):
                 for pn in prev_node:
@@ -233,5 +231,5 @@ def generate_tile_graph(self, name="tile_graph", cycle_list=list, x_offset=int,
 
 if __name__ == "__main__":
     t = tog_generator()
-    t.load_file("/workspace/llvm-project/build/tile_operation_graph.py")
-    t.generate_tile_graph("./tile_graph.onnx", cycle_list=[1,1,1,1,1], offset=0, vector_lane=128)
\ No newline at end of file
+    t.load_file("/tmp/torchinductor/tmp/sz6qi7bqkxn/csz6qi7bqkxnam5sxok4l4sppddjkijq5rd55s4qvdutd5ni73fc_tog.py")
+    t.generate_tile_graph("./tile_graph.onnx", cycle_list=[1,1,1,1,1], x_offset=0, w_offset=0, vector_lane=128)
\ No newline at end of file

From 682d8e3192464f8b974a6ff3f1fdcef789ea3ff3 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 26 Mar 2025 05:08:59 +0000
Subject: [PATCH 244/432] [TOG] Add indirect mode information

---
 AsmParser/onnx_utility.py  | 1 +
 AsmParser/tog_generator.py | 1 +
 Simulator/simulator.py     | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/AsmParser/onnx_utility.py b/AsmParser/onnx_utility.py
index c0a7af40..d46e8347 100644
--- a/AsmParser/onnx_utility.py
+++ b/AsmParser/onnx_utility.py
@@ -73,6 +73,7 @@ def __init__(self, tile_info, inst_list=list(), node_id=0):
         self.torchsim_tag_stride_list = tile_info["tag_stride_list"]
         self.torchsim_loop_idx_list = tile_info["loop_idx_list"]
         self.torchsim_is_async = tile_info["is_async"]
+        self.torchsim_indirect_mode = tile_info["indirect_mode"]
 
 class load_node(memory_node):
     pass
diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 29ae3460..0616a21b 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -98,6 +98,7 @@ def _create_node(self, dump_data):
             tile_info["tag_stride_list"] = dump_data["tag_stride_list"]
             tile_info["loop_idx_list"] = dump_data["loop_idx_list"]
             tile_info["is_async"] = dump_data["is_async"]
+            tile_info["indirect_mode"] = dump_data["indirect_mode"]
             is_write = dump_data["is_write"]
             if is_write:
                 new_node = store_node(tile_info, node_id=node_id)
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 35d0d9bd..40351b14 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -95,7 +95,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
         vectorlane_option = f"--vectorlane-size={vectorlane_size}"
         kernel_address = f"--kernel-addr={kernel_start_addr}:{kernel_end_addr}"
         base_path= f"--base-path={runtime_path}"
-        os.makedirs(os.path.join(base_path, "indirect_access"), exist_ok=True)
+        os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True)
         run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
 
         print("[SpikeSimulator] cmd> ", run)

From 933d6fef6d77b5be034630222c1524bde9411747 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 26 Mar 2025 10:04:41 +0000
Subject: [PATCH 245/432] [Backend] Support indirect access modeling

---
 PyTorchSimBackend/include/Instruction.h     | 36 +++--------
 PyTorchSimBackend/include/TileGraphParser.h | 18 +++++-
 PyTorchSimBackend/src/Instruction.cc        | 67 +++++++++++++++++++++
 PyTorchSimBackend/src/TileGraphParser.cc    | 34 ++++++++++-
 PyTorchSimBackend/src/main.cc               | 18 +-----
 5 files changed, 124 insertions(+), 49 deletions(-)

diff --git a/PyTorchSimBackend/include/Instruction.h b/PyTorchSimBackend/include/Instruction.h
index 2a6d66c0..86979530 100644
--- a/PyTorchSimBackend/include/Instruction.h
+++ b/PyTorchSimBackend/include/Instruction.h
@@ -1,5 +1,5 @@
 #pragma once
-
+#include <fstream>
 #include <robin_hood.h>
 #include <spdlog/fmt/ranges.h>
 #include <spdlog/spdlog.h>
@@ -31,6 +31,8 @@ class Instruction {
   bool is_dma_read() { return opcode == Opcode::MOVIN; }
   bool is_dma_write() { return opcode == Opcode::MOVOUT; }
   bool is_async_dma() { return _is_async_dma; }
+  bool is_indirect_mode() { return _is_indirect_mode; }
+  std::string get_indirect_index_path() { return _indirect_index_path; }
   bool is_ready() { return ready_counter == 0; }
   void inc_ready_counter() { ready_counter++; }
   void dec_ready_counter() {
@@ -47,35 +49,11 @@ class Instruction {
   cycle_type get_overlapping_cycle() { return overlapping_cycle; }
   cycle_type get_compute_cycle() { return compute_cycle; }
   void set_compute_cycle(cycle_type cycle) { compute_cycle = cycle; }
+  void set_indirect_index_path(std::string indirect_path) { _is_indirect_mode=true; _indirect_index_path=indirect_path; }
   void print();
-  std::set<addr_type> get_dram_address(addr_type dram_req_size) {
-    std::set<addr_type> address_set;
-
-    /* Set 4D shape*/
-    while (tile_size.size() < 4)
-      tile_size.insert(tile_size.begin(), 1);
-
-    while (_stride_list.size() < 4)
-      _stride_list.insert(_stride_list.begin(), 1);
-
-    /* Iterate tile_size */
-    for (int dim0=0; dim0<tile_size.at(0); dim0++) {
-      for (int dim1=0; dim1<tile_size.at(1); dim1++) {
-        for (int dim2=0; dim2<tile_size.at(2); dim2++) {
-          for (int dim3=0; dim3<tile_size.at(3); dim3++) {
-            addr_type address = dim0*_stride_list.at(_stride_list.size() - 4) + \
-                                dim1*_stride_list.at(_stride_list.size() - 3) + \
-                                dim2*_stride_list.at(_stride_list.size() - 2) + \
-                                dim3*_stride_list.at(_stride_list.size() - 1);
-            address = dram_addr + address * _precision;
-            address_set.insert(address - (address & dram_req_size-1));
-          }
-        }
-      }
-    }
-    return address_set;
-  }
+  std::set<addr_type> get_dram_address(addr_type dram_req_size);
   std::vector<addr_type> get_trace_address() { return _trace_address; }
+  bool load_indirect_index(const std::string& path, uint64_t*& indirect_index, const std::vector<uint64_t>& tile_size);
   void set_trace_address(std::vector<addr_type>& trace_address) { _trace_address = trace_address; }
   size_t get_free_sram_size() { return _free_sram_size; }
   void adjust_dram_address() {
@@ -131,4 +109,6 @@ class Instruction {
   std::string _addr_name;
   int _nr_inner_loop = 0;
   bool _is_async_dma=false;
+  bool _is_indirect_mode=false;
+  std::string _indirect_index_path="";
 };
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index 6eaa83d2..a8a97bb4 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <fstream>
 #include <algorithm>
+#include <filesystem>
 #include <nlohmann/json.hpp>
 #include <fmt/ranges.h>
 #include <google/protobuf/io/zero_copy_stream_impl.h>
@@ -33,6 +34,8 @@ enum class LoopType {
   INNER_LOOP
 };
 
+bool loadConfig(const std::string& config_path, json& config_json);
+
 class TileNode {
  public:
   TileNode(onnx::NodeProto& node);
@@ -64,7 +67,7 @@ class TileNode {
 
 class TileGraphParser {
  public:
-  TileGraphParser(std::string onnx_path, json& attribute_json);
+  TileGraphParser(std::string onnx_path, std::string attribute_path);
   std::shared_ptr<TileNode> get_top_loop();
   std::unique_ptr<TileGraph>& get_tile_graph() { return _tile_graph; }
   addr_type lookup(std::string key);
@@ -83,6 +86,15 @@ class TileGraphParser {
   void register_memory_tag(std::string name, std::vector<int>& tag_key);
   bool check_memory_tag(std::string name, std::vector<int>& tag_key);
   void clear_tag_table() { _tag_table.clear(); }
+  std::string get_indirect_path() {
+    namespace fs = std::filesystem;
+    fs::path original(_attribute_path);
+    fs::path base_folder = original.parent_path().parent_path();
+    fs::path new_path = base_folder / "indirect_access" / (std::string("indirect_index") + std::to_string(indirect_counter) + ".raw");
+    return new_path.string();
+  }
+  void inc_indirect_counter() { indirect_counter++; }
+
  private:
   void register_tile(std::shared_ptr<TileNode> tile_node);
   void _tile_generate() {}
@@ -92,6 +104,8 @@ class TileGraphParser {
 
   json _attribute_json;
   std::string _tog_path;
+  std::string _attribute_path;
+  uint64_t indirect_counter = 0;
   std::map<std::string, std::shared_ptr<TileNode>> _output_map;
   std::vector<std::vector<std::shared_ptr<TileNode>>> _loop_nodes;
   std::vector<std::shared_ptr<TileNode>> _tile_vec;
@@ -129,6 +143,7 @@ class TileMemoryNode : public TileNode {
   std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
   std::vector<std::string>& get_loop_idx_list() { return _loop_idx_list; }
   bool is_async_node() { return _is_async; }
+  bool is_indirect() { return _is_indirect; }
   void print_node() override;
 
  private:
@@ -136,6 +151,7 @@ class TileMemoryNode : public TileNode {
   std::vector<int> _stride_list;
   size_t _element_size;
   bool _is_async;
+  bool _is_indirect;
   std::string _base_addr_name;
   std::vector<std::string> _tag_idx_list;
   std::vector<int> _tag_stride_list;
diff --git a/PyTorchSimBackend/src/Instruction.cc b/PyTorchSimBackend/src/Instruction.cc
index ac79775b..5f21f5c6 100644
--- a/PyTorchSimBackend/src/Instruction.cc
+++ b/PyTorchSimBackend/src/Instruction.cc
@@ -59,4 +59,71 @@ void Instruction::dec_waiting_request() {
 
 void Instruction::print() {
   spdlog::info("{}", opcode_to_string(opcode));
+}
+
+/* Collect the dram_req_size-aligned DRAM addresses touched by this tile (the mask assumes dram_req_size is a power of two -- TODO confirm). */
+std::set<addr_type> Instruction::get_dram_address(addr_type dram_req_size) {
+  std::set<addr_type> address_set;
+  uint64_t* indirect_index = NULL;
+  size_t index_count = 0;
+  /* Set 4D shape: left-pad tile_size and _stride_list with 1 up to four entries */
+  while (tile_size.size() < 4)
+    tile_size.insert(tile_size.begin(), 1);
+  while (_stride_list.size() < 4)
+    _stride_list.insert(_stride_list.begin(), 1);
+  if (_is_indirect_mode) {
+    spdlog::trace("[Indirect Access] Indirect mode, dump_path: {}", _indirect_index_path);
+    load_indirect_index(_indirect_index_path, indirect_index, tile_size);  // leaves indirect_index NULL on failure
+  }
+
+  /* Iterate tile_size; when an index buffer was loaded, one index per element is consumed in iteration order */
+  for (int dim0=0; dim0<tile_size.at(0); dim0++) {
+    for (int dim1=0; dim1<tile_size.at(1); dim1++) {
+      for (int dim2=0; dim2<tile_size.at(2); dim2++) {
+        for (int dim3=0; dim3<tile_size.at(3); dim3++) {
+          addr_type address = dim0*_stride_list.at(_stride_list.size() - 4) + \
+                              dim1*_stride_list.at(_stride_list.size() - 3) + \
+                              dim2*_stride_list.at(_stride_list.size() - 2) + \
+                              dim3*_stride_list.at(_stride_list.size() - 1);
+          address = dram_addr + address * _precision;
+          if (indirect_index != NULL) {
+            address += indirect_index[index_count++] * _precision;
+          }
+          address_set.insert(address - (address & (dram_req_size - 1)));
+        }
+      }
+    }
+  }
+  delete[] indirect_index;  // new[]-allocated by load_indirect_index and owned here; delete[] on NULL is a no-op
+  return address_set;
+}
+
+/* Load a raw uint64_t index file into a new[]-allocated buffer returned through indirect_index; returns false on any failure, in which case no buffer is handed out. */
+bool Instruction::load_indirect_index(const std::string& path, uint64_t*& indirect_index, const std::vector<uint64_t>& tile_size) {
+  size_t count;
+  std::ifstream ifs(path, std::ios::binary | std::ios::ate);
+  if (!ifs) {
+    spdlog::warn("[Indirect Access] Failed to open index file(\'{}\')", path);
+    return false;
+  }
+  /* The stream was opened at the end (std::ios::ate), so tellg() is the file size in bytes */
+  std::streamsize size = ifs.tellg();
+  ifs.seekg(0, std::ios::beg);
+  count = size / sizeof(uint64_t);
+  /* The file must hold exactly one index per element of the tile (tile_size[0..3] are read, so four entries are required) */
+  uint64_t expected_count = tile_size[0] * tile_size[1] * tile_size[2] * tile_size[3];
+  if (size % sizeof(uint64_t) != 0 || count != expected_count) {
+    spdlog::warn("[Indirect Access] Invalid file size ({} Bytes) at \'{}\'", size, path);
+    return false;
+  }
+  /* On success the buffer is not freed here: ownership passes to the caller */
+  indirect_index = new uint64_t[count];
+  if (!ifs.read(reinterpret_cast<char*>(indirect_index), size)) {
+    spdlog::warn("[Indirect Access] Failed to read data from file (\'{}\')", path);
+    delete[] indirect_index;
+    indirect_index = NULL;
+    count = 0;
+    return false;
+  }
+  return true;
 }
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index fc2a9392..4985d2f9 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -1,5 +1,18 @@
 #include "TileGraphParser.h"
 
+bool loadConfig(const std::string& config_path, json& config_json) {
+  std::ifstream config_file(config_path);
+  if (config_file.is_open()) {
+      config_file >> config_json;
+      config_file.close();
+      spdlog::info("[LoadConfig] Success to open \"{}\"", config_path);
+      return true;
+  } else {
+    spdlog::error("[LoadConfig] Failed to open \"{}\"", config_path);
+    return false;
+  }
+}
+
 void printIndexMap(std::string prefix, const std::map<std::string, int>& indexMap) {
     std::ostringstream oss;
     for (const auto& [key, value] : indexMap) {
@@ -201,6 +214,10 @@ TileMemoryNode::TileMemoryNode(onnx::NodeProto& node) : TileNode(node) {
         _loop_idx_list.push_back(attribute.strings(i));
     } else if (attribute.name() == "torchsim_is_async") {
       _is_async = attribute.i();
+    } else if (attribute.name() == "torchsim_indirect_mode") {
+      _is_indirect = attribute.i();
+    } else {
+      spdlog::info("Unknown attribute: {}", attribute.name());
     }
   }
 }
@@ -215,6 +232,7 @@ void TileMemoryNode::print_node() {
   spdlog::debug("{} tag_list: {}", spaces, fmt::join(_tag_idx_list, ", "));
   spdlog::debug("{} tag_stride_list: {}", spaces, fmt::join(_tag_stride_list, ", "));
   spdlog::debug("{} index_list: {}", spaces, fmt::join(_loop_idx_list, ", "));
+  spdlog::debug("{} indirect mode: {}", spaces, _is_indirect);
 }
 
 TileMemoryWaitNode::TileMemoryWaitNode(onnx::NodeProto& node) : TileNode(node) {
@@ -402,6 +420,10 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       inst->adjust_dram_address();
       inst->set_is_async(mem_node->is_async_node());
       inst->set_numa_id(numa_id);
+      if (mem_node->is_indirect()) {
+        inst->set_indirect_index_path(tog_parser->get_indirect_path());
+        tog_parser->inc_indirect_counter();
+      }
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);
     } else if (tile_node->get_type() == TileType::STORE_NODE) {
@@ -452,6 +474,10 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       inst->adjust_dram_address();
       inst->set_is_async(mem_node->is_async_node());
       inst->set_numa_id(numa_id);
+      if (mem_node->is_indirect()) {
+        inst->set_indirect_index_path(tog_parser->get_indirect_path());
+        tog_parser->inc_indirect_counter();
+      }
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);
     } else if (tile_node->get_type() == TileType::MEMORY_WAIT_NODE) {
@@ -696,14 +722,16 @@ void TileLoopNode::print_node() {
   spdlog::debug("{} stride: {} ", spaces, _stride);
 }
 
-TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
+TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path) {
+  loadConfig(attribute_path, _attribute_json);
+  _attribute_path = attribute_path;
+
   /* Note: this parsing algorithm assume that all node are sorted in topological-order */
   std::ifstream model_istream(onnx_path);
   google::protobuf::io::IstreamInputStream zero_copy_input(&model_istream);
   onnx::ModelProto model_proto;
 
   /* Attribute parsing */
-  _attribute_json = attribute_json;
   if (_attribute_json.contains("address_info")) {
     auto address_info = _attribute_json["address_info"];
     for (auto it = address_info.begin(); it != address_info.end(); ++it) {
@@ -823,7 +851,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
   /* Iterate outer loop and initialize inner loop */
   for (auto iter=_tile_graph->begin(); iter!=_tile_graph->end(); ++iter) {
     std::shared_ptr<TileSubGraph> subgraph = std::make_shared<TileSubGraph>();
-    subgraph->set_core_id(getCoreIdFromJson(attribute_json, subgraph->get_id()));
+    subgraph->set_core_id(getCoreIdFromJson(_attribute_json, subgraph->get_id()));
     auto indices = iter.get_indices();
     for (auto loop : _loop_nodes.at(last_outer_idx)) {
       std::shared_ptr<TileLoopNode> outer_loop = std::static_pointer_cast<TileLoopNode>(loop);
diff --git a/PyTorchSimBackend/src/main.cc b/PyTorchSimBackend/src/main.cc
index 84afe2ce..ecdd85aa 100644
--- a/PyTorchSimBackend/src/main.cc
+++ b/PyTorchSimBackend/src/main.cc
@@ -12,24 +12,8 @@ namespace po = boost::program_options;
 const char* env_value = std::getenv("BACKENDSIM_DRYRUN");
 bool isDryRun = (env_value != nullptr && std::string(env_value) == "1");
 
-bool loadConfig(const std::string& config_path, json& config_json) {
-  std::ifstream config_file(config_path);
-  if (config_file.is_open()) {
-      config_file >> config_json;
-      config_file.close();
-      spdlog::info("[LoadConfig] Success to open \"{}\"", config_path);
-      return true;
-  } else {
-    spdlog::error("[LoadConfig] Failed to open \"{}\"", config_path);
-    return false;
-  }
-}
-
 void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, cycle_type request_time=0, int partiton_id=0) {
-  json attribute_json;
-  loadConfig(attribute_path, attribute_json);
-
-  auto graph_praser = TileGraphParser(onnx_path, attribute_json);
+  auto graph_praser = TileGraphParser(onnx_path, attribute_path);
   std::unique_ptr<TileGraph>& tile_graph = graph_praser.get_tile_graph();
   tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle());
   spdlog::info("[Scheduler {}] Register graph path: {} operation: {} at {}", partiton_id, onnx_path, tile_graph->get_name(), simulator->get_core_cycle());

From 39a1e75cfd654ee5b060800e97ef0cdbbea93af5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 26 Mar 2025 10:45:16 +0000
Subject: [PATCH 246/432] [CI] Fix Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 92bd2d47..8e149883 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,7 +53,7 @@ ENV LLVM_DIR /riscv-llvm
 
 # Install Spike simulator
 RUN git clone https://${GIT_ACCESS_TOKEN}@github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \
-    ../configure --prefix=$RISCV && make -j && make install && make clean
+    ../configure --prefix=$RISCV && make -j && make install
 
 # Install Proxy kernel
 RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \

From 3d5796917a332ef307a254275b387b8cd442b7ab Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 27 Mar 2025 06:26:39 +0000
Subject: [PATCH 247/432] [Ramulator2] Update stat logic

---
 PyTorchSimBackend/extern/ramulator2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimBackend/extern/ramulator2 b/PyTorchSimBackend/extern/ramulator2
index 5f86be78..e97b356a 160000
--- a/PyTorchSimBackend/extern/ramulator2
+++ b/PyTorchSimBackend/extern/ramulator2
@@ -1 +1 @@
-Subproject commit 5f86be78ad4cb8d56e44cdcf43285e1a68b73450
+Subproject commit e97b356a023dd7ae45582de37736d79498801f72

From a871f6637aaa41051d4db2c0372122fd62c842dd Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 27 Mar 2025 11:07:23 +0000
Subject: [PATCH 248/432] [Config] Separate llvm ir dump option

---
 PyTorchSimFrontend/extension_codecache.py | 4 ++--
 PyTorchSimFrontend/extension_config.py    | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index a51b0ff2..177e1893 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -79,7 +79,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
                 -relocation-model=pic -march=riscv64 \
                 -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
-                {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_MLIR_IR else ''} \
+                {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \
                 -O2 {filename}.ll -o {filename}.s
         """,
     ).strip()]
@@ -120,7 +120,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
                 -relocation-model=pic -march=riscv64 \
                 -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
-                {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_MLIR_IR else ''} \
+                {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \
                 -O2 {sample_filename}.ll -o {sample_filename}.s
         """,
     ).strip()]
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 222fe667..c6fd515f 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -30,6 +30,7 @@
 CONFIG_TORCHSIM_CUSTOM_PASS_PATH = os.environ.get('TORCHSIM_CUSTOM_PASS_PATH',
                                            default=f"{CONFIG_TORCHSIM_DIR}/GemminiLowerPass/build")
 CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False))
+CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False))
 
 # Backendsim config
 CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG',

From bcf6791c615de995d267b44355b368c067c0ddfc Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 28 Mar 2025 04:49:13 +0000
Subject: [PATCH 249/432] [Debug] Gem5 debug script

---
 debug/gem5.sh     |  27 +++++
 debug/pipeline.py | 257 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 284 insertions(+)
 create mode 100755 debug/gem5.sh
 create mode 100644 debug/pipeline.py

diff --git a/debug/gem5.sh b/debug/gem5.sh
new file mode 100755
index 00000000..b4791775
--- /dev/null
+++ b/debug/gem5.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+path="$1"
+
+# if $TORCHSIM_DIR/debug does not exist, create it
+if [ ! -d "$TORCHSIM_DIR/debug" ]; then
+  mkdir $TORCHSIM_DIR/debug
+fi
+if [ ! -d "$TORCHSIM_DIR/debug/out" ]; then
+  mkdir $TORCHSIM_DIR/debug/out
+fi
+
+/workspace/gem5/build/RISCV/gem5.debug \
+--debug-flags=Fetch,Decode,MinorExecute \
+-d m5out $TORCHSIM_DIR/gem5_script/script_systolic.py \
+-c $path/cycle_bin --vlane 128 > $TORCHSIM_DIR/debug/out/gem5_log.txt
+
+# grep ticks of M5Op
+ticks=($(grep "Changing stream on" $TORCHSIM_DIR/debug/out/gem5_log.txt | grep "M5Op" | awk '{print $1}'))
+
+# trim only cycle number
+for i in "${!ticks[@]}"; do
+  ticks[$i]=${ticks[$i]::-4}
+done
+
+# extract instruction
+python $TORCHSIM_DIR/debug/pipeline.py --input $TORCHSIM_DIR/debug/out/gem5_log.txt --min ${ticks[-2]} --max ${ticks[-1]} > $TORCHSIM_DIR/debug/out/gem5_inst.txt
\ No newline at end of file
diff --git a/debug/pipeline.py b/debug/pipeline.py
new file mode 100644
index 00000000..88f6a544
--- /dev/null
+++ b/debug/pipeline.py
@@ -0,0 +1,257 @@
+import argparse
+import os
+args = argparse.ArgumentParser()
+args.add_argument('--input', type=str, default='input.txt')
+args.add_argument('--min', type=int, default=0)
+args.add_argument('--max', type=int, default=0)
+# Arguments are parsed at module level, i.e. as soon as this file is executed
+parsed_args = args.parse_args()
+# --max is exclusive: max_value is the last value accepted by the inclusive min_value <= num <= max_value checks
+filename = parsed_args.input
+min_value = parsed_args.min
+max_value = parsed_args.max - 1
+#max_addr = '0x10378' # address of the last M5Op
+
+def filter_exec_in_file(filename, match_string):
+    result = []
+    prev_line = "prev line"
+    
+    with open(filename, 'r') as file:
+        lines = file.readlines()
+
+        cur_stream = 0
+        index_dict = dict()
+        
+        for line in lines:
+            parts = line.strip().split(' ')
+
+            if len(parts) >= 3:
+                try:
+                    num = int(int(parts[0][:-1])/1000)
+                    first_string = parts[1][:-1]
+
+                    addr = "temp addr"
+                    
+                    if min_value <= num <= max_value and first_string == match_string:
+                        idx = 0
+
+#                        if parts[2] == "Pushing": # Pushing mem inst: %s pc: addr (inst)
+#                            result.append(f"{num}|execute I: {parts[-2]} {parts[-1]} this is a memory ref. instruction.")
+                        if parts[2] == "Issuing":
+                            if parts[3] == "inst:": # Issuing inst: %s pc: addr (inst) into FU %d
+                                stream = int(parts[-7].split('.')[0].split('/')[-1])
+                                index = int(parts[-7].split('.')[-1])
+                                addr = parts[-5]
+                                inst = parts[-4]
+
+#                                if addr > max_addr:
+#                                    continue
+
+                                if (cur_stream != stream):
+                                    index_dict.clear()
+                            
+                                if addr in index_dict:
+                                    idx = index - index_dict[addr]
+                                else:
+                                    index_dict[addr] = index
+
+                                result.append(f"{num}|2_execute I: {addr} {inst} {idx}")
+
+                                cur_stream = stream
+                            elif parts[3] == "mem": # Issuing mem ref early inst: %s pc: addr (inst) instToWaitFor: %d
+                                stream = int(parts[-6].split('.')[0].split('/')[-1])
+                                index = int(parts[-6].split('.')[-1])
+                                addr = parts[-4]
+                                inst = parts[-3]
+
+#                                if addr > max_addr:
+#                                    continue
+
+                                if (cur_stream != stream):
+                                    index_dict.clear()
+
+                                if addr in index_dict:
+                                    idx = index - index_dict[addr]
+                                else:
+                                    index_dict[addr] = index
+
+                                result.append(f"{num}|1_execute M: {addr} {inst} {idx}") 
+
+                                cur_stream = stream
+                            else : # Issuing %s to %d  # Prev : Trying to issue inst: %s pc: addr (inst) to FU %d
+                                prev_parts = prev_line.strip().split(' ')
+
+                                stream = int(prev_parts[-7].split('.')[0].split('/')[-1])
+                                index = int(prev_parts[-7].split('.')[-1])
+                                addr = prev_parts[-5]
+                                inst = prev_parts[-4]
+
+#                                if addr > max_addr:
+#                                    continue
+
+                                if (cur_stream != stream):
+                                    index_dict.clear()
+
+                                if addr in index_dict:
+                                    idx = index - index_dict[addr]
+                                else:
+                                    index_dict[addr] = index
+
+                                result.append(f"{num}|2_execute I: {addr} {inst} {idx}")
+
+                                cur_stream = stream                                
+#                        elif parts[2] == "Discarding": # Discarding inst: %s pc: addr (inst) as its stream state was unexpected, expected: %d
+#                            result.append(f"")
+                        elif parts[2] == "Completed": # Completed inst: %s pc: addr (inst)
+                            stream = int(parts[-4].split('.')[0].split('/')[-1])
+                            index = int(parts[-4].split('.')[-1])
+                            addr = parts[-2]
+                            inst = parts[-1]
+
+#                            if addr > max_addr:
+#                                continue
+
+                            if (cur_stream != stream):
+                                continue
+
+                            if addr in index_dict:
+                                idx = index - index_dict[addr]
+                            else:
+                                index_dict[addr] = index
+
+                            result.append(f"{num}|0_execute C: {addr} {inst} {idx}")
+
+                            cur_stream = stream
+                        else:
+                            prev_line = line
+                            continue
+                except ValueError:
+                    continue
+            prev_line = line
+                    
+    return result
+
+def filter_decode_in_file(filename, match_string):
+    result = []
+    
+    with open(filename, 'r') as file:
+        lines = file.readlines()
+        
+        for line in lines:
+            parts = line.strip().split(' ')
+            
+            if len(parts) >= 3:
+                try:
+                    num = int(int(parts[0][:-1])/1000)
+                    first_string = parts[1][:-1]
+                    
+                    if min_value <= num <= max_value and first_string == match_string:
+                        if parts[2] == "Microop":
+                            inst = parts[-1]
+                            addr = parts[-2]
+
+                            if inst == '(vnop)':
+                                continue
+
+#                            if addr > max_addr:
+#                                continue
+
+                            result.append(f"{num}|3_decode: {parts[-2]} {parts[-1]} {parts[-6][-5]}")                            
+                        elif parts[2] == "Passing":
+                            addr = parts[7]
+
+#                            if addr > max_addr:
+#                                continue
+                            
+                            result.append(f"{num}|3_decode: {parts[7]} {parts[8]}")
+                        else:
+                            continue
+                except ValueError:
+                    continue
+                    
+    return result
+
+def filter_fetch2_in_file(filename, match_string):
+    result = []
+    
+    with open(filename, 'r') as file:
+        lines = file.readlines()
+        
+        for line in lines:
+            parts = line.strip().split(' ')
+            
+            if len(parts) >= 3:
+                try:
+                    num = int(int(parts[0][:-1])/1000)
+                    first_string = parts[1][:-1]
+                    
+                    if min_value <= num <= max_value and first_string == match_string:
+                        if parts[2] == "Instruction": # Instruction extracted from line ~
+                            addr = parts[-2]
+
+#                            if addr > max_addr:
+#                                continue                            
+
+                            result.append(f"{num}|4_fetch2: {parts[-2]} {parts[-1]}")
+                        else:
+                            continue
+                except ValueError:
+                    continue
+                    
+    return result
+
+def filter_fetch1_in_file(filename, match_string):
+    result = []
+    
+    with open(filename, 'r') as file:
+        lines = file.readlines()
+        
+        temp = "start_addr"
+        
+        for line in lines:
+            parts = line.strip().split(' ')
+            
+            if len(parts) >= 3:
+                try:
+                    num = int(int(parts[0][:-1])/1000)
+                    first_string = parts[1][:-1]
+                    
+                    if min_value <= num <= max_value and first_string == match_string:
+                        if parts[2] == "Inserting":
+                            addr = parts[-7]
+                            temp = addr
+
+#                            if addr > max_addr:
+#                                continue                            
+
+                            result.append(f"{num}|5_fetch1: {addr} ~ ")
+                        elif parts[2] == "Processing":
+#                            if temp > max_addr:
+#                                continue
+
+                            result.append(f"{num}|5_fetch1: {temp} ~ ")
+                        else:
+                            continue
+                except ValueError:
+                    continue
+                    
+    return result
+
+
+
+filtered_fetch1 = filter_fetch1_in_file(filename, 'system.cpu.fetch1')
+filtered_fetch2 = filter_fetch2_in_file(filename, 'system.cpu.fetch2')
+filtered_decode = filter_decode_in_file(filename, 'system.cpu.decode')
+filtered_exec = filter_exec_in_file(filename, 'system.cpu.execute')
+# Print the collected events grouped by stage, in the order: execute, decode, fetch2, fetch1
+for line in filtered_exec:
+    print(line)
+
+for line in filtered_decode:
+    print(line)
+
+for line in filtered_fetch2:
+    print(line)
+
+for line in filtered_fetch1:
+    print(line)

From 98827e8aec014ed0ff796116168056ccc0ca8498 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 31 Mar 2025 05:01:21 +0000
Subject: [PATCH 250/432] [Gem5/Spad] Fix spad modeling

---
 .../mlir/mlir_codegen_backend.py              |   2 +-
 gem5_script/script_systolic.py                | 254 ++++++++++--------
 2 files changed, 136 insertions(+), 120 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 69a5e5a4..1d77a9b2 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1378,7 +1378,7 @@ def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape
             new_name = f"{name}_{len(self.global_vars_dict[name])}"
             # Add definition to header
             self.header.writeline(f"{c_type} {new_name}_spad[{tile_size // self.vector_lane}] __attribute__ ((section(\".spad\")));")
-            self.gem5_header.writeline(f"{c_type} {new_name}_spad[{tile_size}];")
+            self.gem5_header.writeline(f"{c_type} {new_name}_spad[{tile_size}] __attribute__((aligned(64)));")
             self.global_vars.writeline(f"memref.global @{new_name}_spad : {dram_tile_shape}")
             self.global_vars_dict[name].append(str(raw_index))
         else:
diff --git a/gem5_script/script_systolic.py b/gem5_script/script_systolic.py
index 53553517..9b49aa87 100644
--- a/gem5_script/script_systolic.py
+++ b/gem5_script/script_systolic.py
@@ -1,56 +1,111 @@
 import argparse
 import os
 import sys
-
+import math
 import m5
 from m5.objects import *
-
 from ctypes import cdll
 
 bin_path = sys.argv[1]
 parser = argparse.ArgumentParser()
-parser.add_argument(
-    "-c",
-    "--cmd",
-    default="",
-    help="The binary to run in syscall emulation mode.",
-)
-parser.add_argument(
-    "-o",
-    "--options",
-    default="",
-    help="""The options to pass to the binary, use
-                            around the entire string""",
-)
-
-class MySimpleMemory(SimpleMemory):
+parser.add_argument("-c", "--cmd", default="", help="The binary to run in syscall emulation mode.")
+parser.add_argument("-o", "--options", default="", help="""The options to pass to the binary, use around the entire string""")
+parser.add_argument("--cpu", choices=["RiscvAtomicSimpleCPU", "RiscvTimingSimpleCPU", "RiscvMinorCPU", "RiscvDerivO3CPU",
+                                      "RiscvMinorCPU", "RiscvCustomCPU", "RiscvMinorV2CPU", "RiscvMinorV4CPU", "RiscvVPU",
+                                      "RiscvSparseVPU"], default="RiscvVPU")
+parser.add_argument("--mem", choices=["SimpleMemory", "ScratchpadMemory", "DDR3_1600_8x8"], default="ScratchpadMemory")
+parser.add_argument("--sparse", type=bool, default=False)
+parser.add_argument("--vlane", type=int, default=128)
+args = parser.parse_args()
+
+
+class InstMemory(SimpleMemory):
     latency = "1ns"
+    bandwidth = "64GB/s"
 
 class SpadMemory(SimpleMemory):
     latency = "1ns" # latency unit is "tick" 1ns = 1000 ticks
-    bandwidth = "64GB/s"
-    # TODO: bandwidth = "XXGB/s" what is a proper value? (ref. simple_mem.cc:154)
+
+    def __init__(self, bandwidth="4GB/s"):
+        super().__init__()
+        self.bandwidth = bandwidth  # Set the bandwidth for this memory bank
+
+class MultiBankMemorySystem():
+    def __init__(self, bus_port, mem_range, num_banks=8, granule_size=4, total_bandwidth="32GB/s"):
+        self.num_banks = num_banks
+        self.granule_size = granule_size
+
+        # Calculate interleaving properties
+        self.intlvBits = int(math.log2(self.num_banks)) # Interleaving bits
+        self.intlvLowBit = int(math.log2(self.granule_size))  # Granule size low bit
+        self.intlvHighBit = self.intlvLowBit + self.intlvBits - 1  # High bit for interleaving
+        self.mem_ctrls = []
+        self.bandwidth_per_bank = self.divide_bandwidth(total_bandwidth[:-2], self.num_banks)
+
+        # Create memory controllers for each bank
+        self.create_memory_banks(bus_port, mem_range)
+
+    def create_memory_banks(self, bus_port, mem_range):
+        """Create memory banks and interleave them"""
+        for i in range(self.num_banks):
+            #print(f"[Spad Bank {i}] Bandwidth {self.bandwidth_per_bank}")
+            mem = SpadMemory(self.bandwidth_per_bank)  # Create a new memory bank
+            # Define the memory range for each bank (interleaving range)
+            if self.num_banks!=1:
+                print("intlvBits:",self.intlvBits, " intlvHighBits: ", self.intlvHighBit)
+                mem.range = AddrRange(
+                    start=mem_range.start, size=mem_range.size(),
+                    intlvBits=self.intlvBits, intlvMatch=i, intlvHighBit=self.intlvHighBit
+                )
+            else:
+                mem.range = AddrRange(start=mem_range.start, size=mem_range.size())
+            mem.port = bus_port
+            self.mem_ctrls.append(mem)
+
+    def divide_bandwidth(self, total_bandwidth, num_banks):
+        total_bandwidth_bytes = self.bandwidth_to_bytes(total_bandwidth)
+        per_bank_bandwidth_bytes = total_bandwidth_bytes / num_banks
+        return self.bytes_to_bandwidth(per_bank_bandwidth_bytes)
+
+    def bandwidth_to_bytes(self, bandwidth):
+        # Extract the value and unit
+        value, unit = bandwidth[:-2], bandwidth[-2:]
+        value = float(value)
+        # Convert based on the unit
+        if unit == "GB":
+            return value * 1e9
+        elif unit == "MB":
+            return value * 1e6
+        elif unit == "KB":
+            return value * 1e3
+        elif unit == "B":
+            return value
+        else:
+            raise ValueError(f"Unknown bandwidth unit: {unit}")
+
+    def bytes_to_bandwidth(self, bandwidth_bytes):
+        if bandwidth_bytes >= 1e9:
+            return f"{bandwidth_bytes / 1e9}GB/s"
+        elif bandwidth_bytes >= 1e6:
+            return f"{bandwidth_bytes / 1e6}MB/s"
+        elif bandwidth_bytes >= 1e3:
+            return f"{bandwidth_bytes / 1e3}KB/s"
+        else:
+            return f"{bandwidth_bytes}B/s"
+
+    def get_ctrls(self):
+        return self.mem_ctrls
 
 class SystolicArray(MinorFU):
     unitType = "SystolicArray"
-    opClasses = minorMakeOpClassSet([
-        "CustomMatMul",
-        "CustomMatMuliVpush",
-        "CustomMatMulwVpush",
-        "CustomMatMulvpop",
-        ])
+    opClasses = minorMakeOpClassSet(["CustomMatMul", "CustomMatMuliVpush", "CustomMatMulwVpush", "CustomMatMulvpop"])
     opLat = 1
     systolicArrayWidth = 128
     systolicArrayHeight = 128
 
 class SparseAccelerator(MinorFU):
     unitType = "SparseAccelerator"
-    opClasses = minorMakeOpClassSet([
-        "CustomMatMul",
-        "CustomMatMuliVpush",
-        "CustomMatMulwVpush",
-        "CustomMatMulvpop",
-        ])
+    opClasses = minorMakeOpClassSet(["CustomMatMul", "CustomMatMuliVpush", "CustomMatMulwVpush", "CustomMatMulvpop"])
     opLat = 1
 
 class SpecialFunctionUnit(MinorFU):
@@ -234,7 +289,11 @@ class MinorCustomFUPool(MinorFUPool):
         MinorVecConfig(), # 1 for vector config
 
         MinorFPUnit(),
-        MinorVecMisc(), # 2~5
+        MinorVecMisc(),
+        MinorVecMisc(),
+        MinorVecMisc(),
+        MinorVecMisc(),
+        MinorVecMisc(),
         MinorVecMisc(),
         MinorVecMisc(),
         MinorVecMisc(),
@@ -353,20 +412,16 @@ class RiscvVPU(RiscvMinorCPU):
     decodeInputWidth = 1
     executeInputWidth = 8
     executeIssueLimit = 8
-    executeMemoryIssueLimit = 8
     executeCommitLimit = 8
-    executeMemoryCommitLimit = 8
     executeFuncUnits = MinorCustomFUPool()
-
-class RiscvSparseVPU(RiscvMinorCPU):
-    fetch2InputBufferSize = 2
-    decodeInputBufferSize = 1
-    decodeInputWidth = 1
-    executeInputWidth = 8
-    executeIssueLimit = 8
     executeMemoryIssueLimit = 8
-    executeCommitLimit = 8
     executeMemoryCommitLimit = 8
+    executeMaxAccessesInMemory = 8
+    executeLSQMaxStoreBufferStoresPerCycle = 8
+    executeLSQTransfersQueueSize = 8
+    executeLSQStoreBufferSize = 8
+
+class RiscvSparseVPU(RiscvVPU):
     executeFuncUnits = MinorCustomSparseFUPool()
 
 class MinorV2FUPool(MinorFUPool):
@@ -379,9 +434,7 @@ class MinorV2FUPool(MinorFUPool):
         MinorDefaultPredFU(),
         MinorDefaultMemFU(),
         MinorDefaultMiscFU(),
-        # MinorDefaultVecFU(),
-        # MinorDefaultVecFU(),
-        ]
+    ]
 
 class RiscvMinorV2CPU(RiscvMinorCPU):
     executeFuncUnits = MinorV2FUPool()
@@ -396,54 +449,37 @@ class MinorV4FUPool(MinorFUPool):
         MinorDefaultPredFU(),
         MinorDefaultMemFU(),
         MinorDefaultMiscFU(),
-        # MinorDefaultVecFU(),
-        # MinorDefaultVecFU(),
-        # MinorDefaultVecFU(),
-        # MinorDefaultVecFU(),
-        ]
+    ]
 
 class RiscvMinorV4CPU(RiscvMinorCPU):
     executeFuncUnits = MinorV4FUPool()
     executeCommitLimit = 4
     executeMemoryCommitLimit = 1
 
-class L1Cache(Cache):
+class L1Cache(NoncoherentCache):
     """Simple L1 Cache with default values"""
-
     assoc = 8
     tag_latency = 1
     data_latency = 1
     response_latency = 1
     mshrs = 16
     tgts_per_mshr = 20
-
     def connectBus(self, bus):
-        """Connect this cache to a memory-side bus"""
         self.mem_side = bus.cpu_side_ports
 
     def connectCPU(self, cpu):
-        """Connect this cache's port to a CPU-side port
-        This must be defined in a subclass"""
         raise NotImplementedError
 
 class L1ICache(L1Cache):
-    """Simple L1 instruction cache with default values"""
-
-    # Set the default size
-    size = "8192kB" # is it enough for infinite ICache?
+    size = "8192kB"
+    tag_latency = 0
+    data_latency = 0
+    response_latency = 0
 
     def connectCPU(self, cpu):
-        """Connect this cache's port to a CPU icache port"""
         self.cpu_side = cpu.icache_port
 
 valid_cpu = {
-#    "X86AtomicSimpleCPU": X86AtomicSimpleCPU,
-#    "X86TimingSimpleCPU": X86TimingSimpleCPU,
-#    "X86DerivO3CPU": X86O3CPU,
-#    "ArmAtomicSimpleCPU": ArmAtomicSimpleCPU,
-#    "ArmTimingSimpleCPU": ArmTimingSimpleCPU,
-#    "ArmMinorCPU": ArmMinorCPU,
-#    "ArmDerivO3CPU": ArmO3CPU,
     "RiscvAtomicSimpleCPU": RiscvAtomicSimpleCPU,
     "RiscvTimingSimpleCPU": RiscvTimingSimpleCPU,
     "RiscvMinorCPU": RiscvMinorCPU,
@@ -456,82 +492,63 @@ def connectCPU(self, cpu):
     "RiscvSparseVPU": RiscvSparseVPU,
 }
 
-valid_mem = {"SimpleMemory": MySimpleMemory, "ScratchpadMemory": SpadMemory, "DDR3_1600_8x8": DDR3_1600_8x8}
-
-#parser = argparse.ArgumentParser()
-#parser.add_argument("binary", type=str)
-#parser.add_argument("--cpu", choices=valid_cpu.keys(), default="RiscvTimingSimpleCPU")
-parser.add_argument("--cpu", choices=valid_cpu.keys(), default="RiscvVPU")
-parser.add_argument("--mem", choices=valid_mem.keys(), default="ScratchpadMemory")
-parser.add_argument("--sparse", type=bool, default=False)
-parser.add_argument("--vlane", type=int, default=128)
-
-args = parser.parse_args()
-
 # change systolicArrayWidth and systolicArrayHeight into args.vlane
 SystolicArray.systolicArrayWidth = args.vlane
 SystolicArray.systolicArrayHeight = args.vlane
-
-system = System()
-
-thispath = os.path.dirname(os.path.realpath(__file__))
 binary = args.cmd
-#binary = os.path.join(
-#        thispath,
-#        "../../../",
-#        args.binary,
-#)
 
-#system.workload = SEWorkload.init_compatible(args.binary)
+# Main System Setup
+system = System()
 system.workload = SEWorkload.init_compatible(binary)
 
+# Clock setting
 system.clk_domain = SrcClockDomain()
 system.clk_domain.clock = "1GHz"
 system.clk_domain.voltage_domain = VoltageDomain()
 
-if args.cpu not in (
-    "X86AtomicSimpleCPU",
-    "ArmAtomicSimpleCPU",
-    "RiscvAtomicSimpleCPU",
-):
-    system.mem_mode = "timing"
-
-system.mem_ranges = [AddrRange("8192MB")]
+fast_clk = SrcClockDomain()
+fast_clk.clock = '8GHz'
+fast_clk.voltage_domain = VoltageDomain()
 
+system.mem_mode = "timing"
+system.cache_line_size = 64
 system.cpu = valid_cpu[args.cpu]()
 
+# Memory range
+granule_sz = 64
+spad_num_bank = 1
+system.mem_ranges = [AddrRange(start=0, size="16GB")]
+
 system.membus = SpmXBar(
-    width = 64,
-    frontend_latency = 0,
-    forward_latency = 0,
-    response_latency = 0)
-# system.cpu.icache_port = system.membus.cpu_side_ports
+        width = granule_sz,
+        header_latency = 0,
+        frontend_latency = 0,
+        forward_latency = 0,
+        response_latency = 0)
+system.membus.clk_domain = fast_clk
+
+# Instruction cache connection
+system.cpu.icache= L1ICache()
+system.cpu.icache.connectCPU(system.cpu)
+system.cpu.icache.connectBus(system.membus)
+#system.cpu.icache.mem_side = inst_mem.port
 system.cpu.dcache_port = system.membus.cpu_side_ports
+system.cpu.createInterruptController()
 
-system.cpu.l1i = L1ICache()
-system.cpu.l1i.connectCPU(system.cpu)
-system.cpu.l1i.connectBus(system.membus)
+# Create and connect memory nodes
+multi_banked_spm = MultiBankMemorySystem(system.membus.mem_side_ports, system.mem_ranges[0], num_banks=spad_num_bank, granule_size=granule_sz)
+system.mem_ctrls = multi_banked_spm.get_ctrls()
 
-system.cpu.createInterruptController()
-if args.cpu in ("X86AtomicSimpleCPU", "X86TimingSimpleCPU", "X86DerivO3CPU"):
-    system.cpu.interrupts[0].pio = system.membus.mem_side_ports
-    system.cpu.interrupts[0].int_master = system.membus.cpu_side_ports
-    system.cpu.interrupts[0].int_slave = system.membus.mem_side_ports
-
-system.mem_ctrl = valid_mem[args.mem]()
-system.mem_ctrl.range = system.mem_ranges[0]
-system.mem_ctrl.port = system.membus.mem_side_ports
 system.system_port = system.membus.cpu_side_ports
 
 process = Process()
-#process.cmd = [args.binary]
 process.cmd = [binary] + args.options.split()
 system.cpu.workload = process
 system.cpu.createThreads()
 
+# Simulation
 root = Root(full_system=False, system=system)
 m5.instantiate()
-
 exit_event = m5.simulate()
 
 if exit_event.getCause() != "exiting with last active thread context":
@@ -539,5 +556,4 @@ def connectCPU(self, cpu):
 
 # print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")
 print(f"{m5.curTick() / 1000}")
-print(f"{m5.curTick()}")
-
+print(f"{m5.curTick()}")
\ No newline at end of file

From de5b2b78e2ca7b17b74e41f4cc742e9a8050c4ee Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 31 Mar 2025 14:03:09 +0000
Subject: [PATCH 251/432] [Script] Add tog result parsing script

---
 scripts/get_tog_result.sh | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100755 scripts/get_tog_result.sh

diff --git a/scripts/get_tog_result.sh b/scripts/get_tog_result.sh
new file mode 100755
index 00000000..86401c72
--- /dev/null
+++ b/scripts/get_tog_result.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+total_cycles=0
+
+# Read stdin line by line; `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline
+while IFS= read -r line || [[ -n "$line" ]]; do
+    # Check if the line contains both "[BackendSimulator]" and "stored"
+    if [[ "$line" == *"[BackendSimulator]"* && "$line" == *"stored"* ]]; then
+        # Extract the file path from the line (printf avoids echo's option/escape handling)
+        file_path=$(printf '%s\n' "$line" | sed -n 's/.*stored to "\(.*\)"$/\1/p')
+
+        # If the file exists, grep for "Total cycle" and output the last matching line
+        if [[ -f "$file_path" ]]; then
+            last_line=$(grep "Total cycle" "$file_path" | tail -n 1)
+            echo "$last_line ($file_path)"
+            # Accumulate the cycle value (an empty match evaluates to 0 in arithmetic context)
+            cycle_value=$(printf '%s\n' "$last_line" | sed -n 's/.*Total cycle \([0-9]\+\)$/\1/p')
+            total_cycles=$((total_cycles + cycle_value))
+        else
+            echo "File not found: $file_path"
+        fi
+    fi
+done
+echo "Accumulated Total Cycle: $total_cycles"
\ No newline at end of file

From 27c2eda1d454dc80d3a2edf3a12c6256532882f0 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 1 Apr 2025 09:32:48 +0000
Subject: [PATCH 252/432] [Fix] Interconnect Bottleneck

---
 .../configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json      | 2 +-
 .../configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json      | 4 ++--
 .../configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json      | 2 +-
 .../configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json      | 2 +-
 .../configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json      | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json
index 31743430..38acafc0 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json
@@ -16,7 +16,7 @@
  
   "icnt_type" : "simple",
   "icnt_latency" : 7,
-  "icnt_freq" : 7000,
+  "icnt_freq" : 10000,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
  
   "precision" : 4,
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
index 2fc13de4..98943fae 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
@@ -7,7 +7,7 @@
 
   "dram_type" : "ramulator2",
   "dram_freq" : 940,
-  "dram_channels": 32,
+  "dram_channels": 16,
   "dram_req_size": 32,
   "dram_latency" : 10,
   "dram_size" : 32,
@@ -17,7 +17,7 @@
 
   "icnt_type" : "simple",
   "icnt_latency" : 7,
-  "icnt_freq" : 7000,
+  "icnt_freq" : 14000,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
 
   "precision" : 4,
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
index 3e082fc5..ec975d3d 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
@@ -16,7 +16,7 @@
  
   "icnt_type" : "simple",
   "icnt_latency" : 1,
-  "icnt_freq" : 8000,
+  "icnt_freq" : 19200,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
  
   "precision" : 4,
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json
index 9f97c7db..034542fe 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json
@@ -16,7 +16,7 @@
  
   "icnt_type" : "simple",
   "icnt_latency" : 7,
-  "icnt_freq" : 7000,
+  "icnt_freq" : 20000,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
  
   "precision" : 4,
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
index 5701402c..82f42c00 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
@@ -17,7 +17,7 @@
 
   "icnt_type" : "simple",
   "icnt_latency" : 7,
-  "icnt_freq" : 7000,
+  "icnt_freq" : 28000,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
 
   "precision" : 4,

From 7113895100d7f448786aa06eccdb62ecd90a7d05 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 2 Apr 2025 02:47:38 +0000
Subject: [PATCH 253/432] [Config] TPUv4 config

---
 .../systolic_ws_128x128_c1_simple_noc_tpuv4.json   | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
index ec975d3d..2594f734 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
@@ -1,8 +1,9 @@
 {
   "num_cores" : 1,
-  "core_freq" : 700,
-  "sram_size" : 65536,
+  "core_freq" : 1050,
+  "sram_size" : 16777216,
   "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 4,
 
   "dram_type" : "ramulator2",
   "dram_freq" :1200,
@@ -13,17 +14,18 @@
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+  "l2d_type" : "readonly",
+  "l2d_config" : "S:128:128:512,32,L:R:m:L:L,A:192:4,32:0,32",
  
   "icnt_type" : "simple",
-  "icnt_latency" : 1,
+  "icnt_latency" : 7,
   "icnt_freq" : 19200,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
  
   "precision" : 4,
   "scheduler" : "simple",
-  "num_partition" : 2,
+  "num_partition" : 1,
   "partition": {
-    "core_0":0,
-    "core_1":0
+    "core_0":0
   }
 }
\ No newline at end of file

From 620dc7834525080f18fe75040529a0df7f875f4d Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 2 Apr 2025 02:58:15 +0000
Subject: [PATCH 254/432] [Fix] SA utils & DRAM BW

---
 PyTorchSimBackend/extern/ramulator2 | 2 +-
 PyTorchSimBackend/src/Core.cc       | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimBackend/extern/ramulator2 b/PyTorchSimBackend/extern/ramulator2
index e97b356a..748cd709 160000
--- a/PyTorchSimBackend/extern/ramulator2
+++ b/PyTorchSimBackend/extern/ramulator2
@@ -1 +1 @@
-Subproject commit e97b356a023dd7ae45582de37736d79498801f72
+Subproject commit 748cd7099778d7196326aeb6384da92efb0c34c9
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 7596c787..e4957661 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -255,16 +255,17 @@ void Core::cycle() {
         case Opcode::COMP:
           {
             auto& target_pipeline = get_compute_pipeline(inst->get_compute_type());
-            if (target_pipeline.empty())
+            if (target_pipeline.empty()) {
               inst->finish_cycle = _core_cycle + inst->get_compute_cycle();
-            else {
+              inst->bubble_cycle = inst->get_overlapping_cycle();
+            } else {
               int overlapped_cycle = std::min(target_pipeline.back()->finish_cycle - _core_cycle, inst->get_overlapping_cycle());
               int bubble_cycle = inst->get_overlapping_cycle() - overlapped_cycle;
               inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle;
               inst->bubble_cycle = bubble_cycle;
             }
             if (inst->get_compute_cycle() == 0) {
-              spdlog::trace("[Core {}][{}] {} SKIPPED", _id, _core_cycle,
+              spdlog::trace("[Core {}][SA {}][{}] {} SKIPPED", _id, _systolic_array_rr, _core_cycle,
                             opcode_to_string(inst->get_opcode()));
               inst->finish_instruction();
               static_cast<Tile*>(inst->get_owner())->inc_finished_inst();
@@ -272,7 +273,7 @@ void Core::cycle() {
               auto it = instructions.begin() + j; // Position 2 is the third element
               instructions.erase(it);
             } else {
-              spdlog::trace("[Core {}][{}] {}-{} ISSUED, finsh at {}", _id, _core_cycle,
+              spdlog::trace("[Core {}][SA {}][{}] {}-{} ISSUED, finsh at {}", _id, _systolic_array_rr, _core_cycle,
                             opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle);
               target_pipeline.push(inst);
               issued = true;

From a792e123524845f76041ac2c66121d2a2e093d0f Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 2 Apr 2025 15:50:48 +0000
Subject: [PATCH 255/432] [Config] TPUv4 2 Core & CMEM

---
 ...stolic_ws_128x128_c2_simple_noc_tpuv4.json | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
new file mode 100644
index 00000000..f4a5172d
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
@@ -0,0 +1,32 @@
+{
+  "num_cores" : 2,
+  "core_freq" : 1050,
+  "sram_size" : 16777216,
+  "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 4,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" :1200,
+  "dram_channels": 32,
+  "dram_req_size": 32,
+  "dram_latency" : 10,
+  "dram_size" : 16,
+  "dram_nbl" : 2,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+  "l2d_type" : "readonly",
+  "l2d_config" : "S:64:128:512,32,L:R:m:L:L,A:192:4,32:0,32",
+ 
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 38400,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
+ 
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 2,
+  "partition": {
+    "core_0":0,
+    "core_1":0
+  }
+}
\ No newline at end of file

From e214fff52f92039903b744752dab54a31d9095a2 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 31 Mar 2025 14:22:31 +0000
Subject: [PATCH 256/432] [Gem5/Script] Set vlen configurable

---
 gem5_script/script_systolic.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gem5_script/script_systolic.py b/gem5_script/script_systolic.py
index 9b49aa87..979bf589 100644
--- a/gem5_script/script_systolic.py
+++ b/gem5_script/script_systolic.py
@@ -16,6 +16,7 @@
 parser.add_argument("--mem", choices=["SimpleMemory", "ScratchpadMemory", "DDR3_1600_8x8"], default="ScratchpadMemory")
 parser.add_argument("--sparse", type=bool, default=False)
 parser.add_argument("--vlane", type=int, default=128)
+parser.add_argument("--vlen", type=int, default=256)
 args = parser.parse_args()
 
 
@@ -513,6 +514,7 @@ def connectCPU(self, cpu):
 system.mem_mode = "timing"
 system.cache_line_size = 64
 system.cpu = valid_cpu[args.cpu]()
+system.cpu.ArchISA.vlen = args.vlen
 
 # Memory range
 granule_sz = 64

From 4ada7f0e32cf2bd2cc783c427e373a1e19edf001 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 31 Mar 2025 14:42:22 +0000
Subject: [PATCH 257/432] [Frontend] Fix reduction scalar store

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 1d77a9b2..e402c324 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1012,7 +1012,7 @@ def store_reduction(self, name, index, value):
             if self.buffer_types[name][1] > 1:
                 divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>")
             else:
-                divider_vec = f"f{self.buffer_types[name][1]}"
+                divider_vec = divider
             mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sum}, %{divider_vec} : {shape}")
 
             # m2 = (E(X^2) - E(X)^2) * N

From e1cc52c384b32acf382760c15acfdffb222bf418 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 31 Mar 2025 14:42:41 +0000
Subject: [PATCH 258/432] [Tests] Make shape optionable

---
 tests/test_activation.py |  6 ++++++
 tests/test_add.py        |  6 ++++++
 tests/test_layernorm.py  | 10 ++++++++--
 tests/test_reduce.py     | 15 +++++++++++++++
 tests/test_softmax.py    |  6 ++++++
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/tests/test_activation.py b/tests/test_activation.py
index 86052828..066074df 100644
--- a/tests/test_activation.py
+++ b/tests/test_activation.py
@@ -76,8 +76,14 @@ def test_SwiGLU(device, size=(128, 128)):
 if __name__ == "__main__":
     import os
     import sys
+    import argparse
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
+    parser = argparse.ArgumentParser(description="Run activation tests with dynamic shape")
+    parser.add_argument('--shape', type=str, default="(512,768)")
+    args = parser.parse_args()
+    shape = tuple(int(dim) for dim in args.shape.strip('()').split(',') if dim.strip())  # skip empty fields so "(512,)" parses
+
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
diff --git a/tests/test_add.py b/tests/test_add.py
index 0d07a3fb..d258a3ee 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -46,8 +46,14 @@ def vectoradd(a, b):
 if __name__ == "__main__":
     import os
     import sys
+    import argparse
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
+    parser = argparse.ArgumentParser(description="Run add tests with dynamic shape")
+    parser.add_argument('--shape', type=str, default="(512,768)")
+    args = parser.parse_args()
+    shape = tuple(int(dim) for dim in args.shape.strip('()').split(',') if dim.strip())  # skip empty fields so "(512,)" parses
+
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py
index 26f5ca17..64d4cbe5 100644
--- a/tests/test_layernorm.py
+++ b/tests/test_layernorm.py
@@ -29,10 +29,16 @@ def test_LayerNorm(device, size=(64, 64)):
 if __name__ == "__main__":
     import os
     import sys
+    import argparse
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
+    parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
+    parser.add_argument('--shape', type=str, help="Shape of the tensor in the format (batch_size, features)", default="(512,768)")
+    args = parser.parse_args()
+    shape = tuple(int(dim) for dim in args.shape.strip('()').split(',') if dim.strip())  # skip empty fields so "(512,)" parses
+
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_LayerNorm(device)
-    test_LayerNorm(device, (64, 128))
+    #test_LayerNorm(device)
+    test_LayerNorm(device, shape)
diff --git a/tests/test_reduce.py b/tests/test_reduce.py
index 90a7487e..512d9e36 100644
--- a/tests/test_reduce.py
+++ b/tests/test_reduce.py
@@ -23,11 +23,26 @@ def reduce_sum(a, b, dim, keepdim):
     out = reduce_sum(x.cpu(), y.cpu(), dim, keepdim)
     test_result("ReduceSum", res, out)
 
+def test_reduce_sum2(device, size, dim, keepdim=False):  # single-input torch.sum: compiled on `device` vs. eager on CPU
+    def reduce_sum(a, dim, keepdim):
+        return torch.sum(a, axis=dim, keepdim=keepdim)
+    x = torch.randn(size).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(reduce_sum)
+    res = opt_fn(x, dim, keepdim)
+    out = reduce_sum(x.cpu(), dim, keepdim)
+    test_result("ReduceSum2", res, out)  # label names the op under test (was mislabelled "ReduceMax")
+
 if __name__ == "__main__":
     import os
     import sys
+    import argparse
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
+    parser = argparse.ArgumentParser(description="Run reduce tests with dynamic shape")
+    parser.add_argument('--shape', type=str, default="(512,768)")
+    args = parser.parse_args()
+    shape = tuple(int(dim) for dim in args.shape.strip('()').split(',') if dim.strip())  # skip empty fields so "(512,)" parses
+
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
diff --git a/tests/test_softmax.py b/tests/test_softmax.py
index 7f5d75ac..9fba41dd 100644
--- a/tests/test_softmax.py
+++ b/tests/test_softmax.py
@@ -50,8 +50,14 @@ def test_softmax(device, size=(128, 128), dim=1):
 if __name__ == "__main__":
     import os
     import sys
+    import argparse
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
+    parser = argparse.ArgumentParser(description="Run softmax test with dynamic shape")
+    parser.add_argument('--shape', type=str, help="Shape of the tensor in the format (batch_size, features)", default="(512,768)")
+    args = parser.parse_args()
+    shape = tuple(int(dim) for dim in args.shape.strip('()').split(',') if dim.strip())  # skip empty fields so "(512,)" parses
+
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()

From 8eacf279f3b245c61e0ae66bf6dd484e414c76c5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 2 Apr 2025 15:46:22 +0000
Subject: [PATCH 259/432] [Frontend] Optimize reduce/elementwise code

---
 .../mlir/mlir_codegen_backend.py              | 318 ++++++---------
 PyTorchSimFrontend/mlir/mlir_common.py        | 100 ++++-
 gem5_script/script_systolic.py                | 372 +-----------------
 gem5_script/vpu_config.py                     | 201 ++++++++++
 4 files changed, 401 insertions(+), 590 deletions(-)
 create mode 100644 gem5_script/vpu_config.py

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index e402c324..a58909da 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1,18 +1,10 @@
-import dataclasses
 import contextlib
 import sympy
-import itertools
 import re
 import os
 import math
-from functools import reduce
-from operator import mul
-from typing import List
-from typing import Dict
-from collections import OrderedDict
 import torch
 from torch._inductor.codegen import cpp, wrapper, common
-from torch._inductor.scheduler import BaseScheduling
 from torch._inductor.virtualized import V, _ops as ops
 from torch._inductor.codecache import write_atomic, write
 from torch._inductor.utils import (
@@ -23,6 +15,7 @@
 import PyTorchSimFrontend.extension_codecache as extension_codecache
 
 from . import mlir_common
+from .mlir_common import LoopLevel, LoopNest
 
 def reduction_init(reduction_type, dtype):
     if dtype in cpp.DTYPE_LOWP_FP:
@@ -41,34 +34,17 @@ def reduction_init(reduction_type, dtype):
         return f"0.0"
     raise AssertionError(reduction_type)
 
-def reduction_combine(reduction_type, start_value, vector_value, tile_size=64):
+def reduction_combine_vec(reduction_type, vector_value, init_value):
     if reduction_type == "sum":
-        return f"arith.addf %{start_value}, %{vector_value} : vector<{tile_size}xf32>"
+        return ops.add(vector_value, init_value)
     if reduction_type == "prod":
-        return f"arith.mulf %{start_value}, %{vector_value} : vector<{tile_size}xf32>"
-    if reduction_type == "xor_sum":
-        raise NotImplementedError() # TODO: implement
-    if reduction_type == "any":
-        raise NotImplementedError()
-    if reduction_type in ("min", "max"):
-        raise NotImplementedError()
-    if reduction_type == "welford_reduce":
-        raise NotImplementedError()
-    if reduction_type == "welford_combine":
-        raise NotImplementedError()
-    raise AssertionError(reduction_type)
-
-def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, reduced_shape):
-    if reduction_type == "sum":
-        return f"vector.multi_reduction <add>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
-    if reduction_type == "prod":
-        return f"vector.multi_reduction <mul>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+        return ops.mul(vector_value, init_value)
     if reduction_type == "max":
-        return f"vector.multi_reduction <maximumf>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+        return ops.maximum(vector_value, init_value)
     if reduction_type == "min":
-        return f"vector.multi_reduction <minimumf>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+        return ops.minimum(vector_value, init_value)
     if reduction_type == "any":
-        return f"vector.multi_reduction <and>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+        return ops.logical_and(vector_value, init_value)
     raise AssertionError(reduction_type)
 
 class ExtensionWrapperCodegen(wrapper.WrapperCodeGen):
@@ -801,17 +777,20 @@ def load(self, name: str, index: sympy.Expr):
         index = self.convert_indirect_indexing(index)
         padding = self.get_padding_type()
         dram_var = self.kernel_group.args.input(name)
+
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         local_tile_desc, index_var = self.get_dma_info(name, index)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
-
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
-        vshape = local_tile_desc.get_mlir_vshape(mlir_dtype)
+
+        # Compute vector unit size
+        vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
+        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
 
         # Define scratch pad buffer
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
@@ -820,16 +799,16 @@ def load(self, name: str, index: sympy.Expr):
         code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride, padding)
         self.cse.generate(self.dma_loads, code, assignment = False) # FIXME: assignment = False does not support caching
-
+        compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
         # Generate vector load instruction
-        if tile_numel_per_lane > 1:
+        if compute_vec_size > 1:
             operation = "affine.vector_load"
-            line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
+            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
         else:
             operation = "affine.load"
-            line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}"
+            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
         out = self.cse.generate(self.loads, line)
-        self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
+        self.register_var_info(out, [compute_vec_size, mlir_dtype])
         self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape]
         return out
 
@@ -849,9 +828,12 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
         tile_size = local_tile_desc.get_tile_size()
-        vshape = local_tile_desc.get_mlir_vshape(mlir_dtype)
-        origin_tile_size = self.spad_buffer_dict[str(value)][1] if str(value) in self.spad_buffer_dict else tile_size
+
+        # Compute vector unit size
+        vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
+        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
         require_store = True
+
         if str(value) in self.spad_buffer_dict:
             # Todo. If tile_size is not same (i.e., view operation), we can't apply peephole optimization easily
             require_store = self.spad_buffer_dict[str(value)][1] != tile_size
@@ -859,18 +841,18 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         if require_store:
             # Define scratch pad buffer
             sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
-
+            compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
             # Generate vector store instruction
             store_size, operand_type = self.var_info[value]
             if mlir_dtype != operand_type:
                 value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
 
-            if tile_numel_per_lane > 1 and store_size > 1:
+            if compute_vec_size > 1 and store_size > 1:
                 operation = "affine.vector_store"
-                line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
+                line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
             else:
                 operation = "affine.store"
-                line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}"
+                line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}"
             self.stores.writeline(common.DeferredLine(name, line)) # TODO: Should be changed to self.compute?
         else:
             sram_var = self.spad_buffer_dict[str(value)][0]
@@ -896,88 +878,57 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
                 sqr_sum = self.reduction(dtype, src_dtype, "sum", ops.mul(value, value))
                 self.welford_reduce_out = (sum, sqr_sum, None)
                 return sum, sqr_sum, None
+
+        # Prepare reduction loop
+        reduction_key = src_dtype, reduction_type, value
+        acc = self.reduction_cse.generate(
+            self.loads, f"reduction {reduction_key}", write=False
+        )
+        iterator = self.iterator_cse.generate(
+            self.loads, f"reduction {reduction_key}", write=False
+        )
+        init = self.init_cse.generate(
+            self.loads, f"reduction {reduction_key}", write=False
+        )
+        init_vec = self.init_vec_cse.generate(
+            self.loads, f"reduction {reduction_key}", write=False
+        )
+        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
+        init = self.cse.generate(self.reduction_prefix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
+        vec_len = self.kernel_group.tile_desc.get_compute_vec_size()
+        reduced_shape = self.kernel_group.tile_desc.get_mlir_vshape(type_name)
+
+        # Set accumulation var
+        if len(self.ranges) == 1 or (len(self.ranges) == 2 and vec_len == 1): # 1-D vector to scalar
+            # Edge case for scalar
+            init_vec = init
         else:
-            reduction_key = src_dtype, reduction_type, value
-            acc = self.reduction_cse.generate(
-                self.loads, f"reduction {reduction_key}", write=False
-            )
-            iterator = self.iterator_cse.generate(
-                self.loads, f"reduction {reduction_key}", write=False
-            )
-            init = self.init_cse.generate(
-                self.loads, f"reduction {reduction_key}", write=False
-            )
-            init_vec = self.init_vec_cse.generate(
-                self.loads, f"reduction {reduction_key}", write=False
-            )
-            type_name = mlir_common.DTYPE_TO_MLIR[dtype]
-            acc_var = init
-            ret_var = acc
-            reduced_shape = type_name
-            init = self.cse.generate(self.reduction_prefix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
-            if len(self.ranges) == 1: # 1-D vector to scalar
-                axis = "0"
-                acc_var = init
-                vec_len = self.kernel_group.tile_desc.get_vlane_stride()
-                shape = f"vector<{self.var_info[value][0]}x{type_name}>"
-                var_info = [vec_len, mlir_common.DTYPE_TO_MLIR[dtype]]
-                self.register_var_info(acc, var_info)
-            elif len(self.ranges) == 2:
-                vec_len = self.kernel_group.tile_desc.get_vlane_stride()
-                flattened_size = f"vector<{self.var_info[value][0]}x{type_name}>"
-
-                # It is column majored per lane tile
-                expaned_size = f"vector<{self.var_info[value][0]//vec_len}x{vec_len}x{type_name}>"
-                value = self.cse.generate(self.compute, f"vector.shape_cast %{value} : {flattened_size} to {expaned_size}")
-                shape = expaned_size
-
-                # Edge case for scalar
-                if vec_len == 1:
-                    reduced_shape = f"{type_name}"
-                    init_vec = init
-                    axis = "0, 1"
-                    acc_var = init
-                    var_info = [1, mlir_common.DTYPE_TO_MLIR[dtype]]
-                else:
-                    reduced_shape = f"vector<{vec_len}x{type_name}>"
-                    init_vec = self.cse.generate(self.reduction_prefix, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
-                    axis = "0"
-                    acc_var = init_vec
-                    var_info = [vec_len, mlir_common.DTYPE_TO_MLIR[dtype]]
-                self.register_var_info(acc, var_info)
-            elif len(self.ranges) == 3:
-                vec_len = self.kernel_group.tile_desc.get_vlane_stride()
-                tile_size = list(self.kernel_group.tile_desc.get_tile_size_per_lane())
-                tile_size.pop(self.kernel_group.tile_desc.vlane_split_axis)
-                flattened_size = f"vector<{self.var_info[value][0]}x{type_name}>"
-
-                reduce_axis_size = tile_size[-1]
-                reduced_size = self.var_info[value][0]//reduce_axis_size
-                # It is column majored per lane tile
-                expaned_size = f"vector<{reduce_axis_size}x{reduced_size}x{type_name}>"
-                value = self.cse.generate(self.compute, f"vector.shape_cast %{value} : {flattened_size} to {expaned_size}")
-                shape = expaned_size
-
-                # Edge case for scalar
-                if vec_len == 1:
-                    raise NotImplementedError()
-                reduced_shape = f"vector<{reduced_size}x{type_name}>"
-                init_vec = self.cse.generate(self.reduction_prefix, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
-                axis = "0"
-                acc_var = init_vec
-                var_info = [reduced_size, mlir_common.DTYPE_TO_MLIR[dtype]]
-                self.register_var_info(acc, var_info)
-                #ret_var = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{acc} : {reduced_shape} to {reduced_shape2}")
-            else:
-                raise NotImplementedError()
+            # Adjust shape and inital value
+            init_vec = self.cse.generate(self.reduction_prefix, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
+        acc_var = init_vec
+        var_info = [vec_len, mlir_common.DTYPE_TO_MLIR[dtype]]
+        self.register_var_info(acc, var_info)
+
+        # Reduction body prepare
+        body_acc = self.reduction_cse.generate(
+            self.compute, f"reduction {reduction_key}body_acc", write=False
+        )
+        body_iter_arg = self.iterator_cse.generate(
+            self.compute, f"reduction {reduction_key}body_iter_arg", write=False
+        )
+        self.register_var_info(body_iter_arg, [vec_len, type_name])
+
+        self.reduction_vars[acc] = (reduction_type, iterator, acc_var, reduced_shape)
+        self.affine_yield[body_acc] = reduced_shape
+        self.reduction_cse.reduction_cache[reduction_key] = acc
+        self.iterator_cse.reduction_cache[reduction_key] = iterator
+        self.init_cse.reduction_cache[reduction_key] = init_vec
 
-            self.reduction_vars[acc] = (reduction_type, iterator, acc_var, reduced_shape)
-            out = self.cse.generate(self.compute, reduction_combine_vec(reduction_type, value, iterator, axis=axis, shape=shape, reduced_shape=reduced_shape))
-            self.affine_yield[out] = reduced_shape
-            self.reduction_cse.reduction_cache[reduction_key] = acc
-            self.iterator_cse.reduction_cache[reduction_key] = iterator
-            self.init_cse.reduction_cache[reduction_key] = init_vec
-        return ret_var
+        # Reduction body codegen
+        result = reduction_combine_vec(reduction_type, value, body_iter_arg)
+        self.compute_body_loop.reduction_vars[body_acc] = (reduction_type, body_iter_arg, iterator, reduced_shape)
+        self.compute_body_loop.affine_yield[result] = reduced_shape
+        return acc
 
     def store_reduction(self, name, index, value):
         # Note: Change cse temporaily
@@ -1000,49 +951,41 @@ def store_reduction(self, name, index, value):
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
+        vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
 
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var,
                                                                          index, buffer=self.reduction_suffix)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
-            shape = local_tile_desc.get_mlir_vshape(mlir_dtype) if self.buffer_types[name][1] > 1 else mlir_dtype
             # mean
             divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.ranges[self.reduction_depth])} : f32")
             if self.buffer_types[name][1] > 1:
                 divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>")
             else:
                 divider_vec = divider
-            mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sum}, %{divider_vec} : {shape}")
+            mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sum}, %{divider_vec} : {vshape}")
 
             # m2 = (E(X^2) - E(X)^2) * N
-            sqr_mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sqr_sum}, %{divider_vec} : {shape}")
-            mean_sqr = self.cse.generate(self.reductions_suffix, f"arith.mulf %{mean}, %{mean} : {shape}")
-            variance = self.cse.generate(self.reductions_suffix, f"arith.subf %{sqr_mean}, %{mean_sqr} : {shape}")
-            m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec} : {shape}")
+            sqr_mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sqr_sum}, %{divider_vec} : {vshape}")
+            mean_sqr = self.cse.generate(self.reductions_suffix, f"arith.mulf %{mean}, %{mean} : {vshape}")
+            variance = self.cse.generate(self.reductions_suffix, f"arith.subf %{sqr_mean}, %{mean_sqr} : {vshape}")
+            m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec} : {vshape}")
             if self.current_node.node.origin_node: # FIXME: This is a temporary solution
                 value = mean
             else:
                 value = m2
 
-        # Select mlir store operaiton
-        if self.buffer_types[name][1] == 1 or tile_numel_per_lane == 1:
-            operation = "affine.store"
-            # raise NotImplementedError("Scalar store!")
-        else:
-            operation =  "affine.vector_store"
-
         # Select src type
         if tile_numel_per_lane == 1:
-            shape = ""
+            operation = "affine.store"
+            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}"
         else:
-            shape = f", {local_tile_desc.get_mlir_vshape(mlir_dtype)}" if self.buffer_types[name][1] > 1 else ""
-
-        line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}{shape}"
+            operation =  "affine.vector_store"
+            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
         self.reductions_suffix.writeline(common.DeferredLine(name, line))
 
         # MVOUT Encoding
-
         # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
@@ -1091,6 +1034,7 @@ def index_expr(self, index, dtype):
         tile_size = tile_desc.get_tile_size_per_lane()
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         tile_numel_per_lane = tile_desc.get_numel_per_lane()
+        vlane_stride = tile_desc.vlane_stride
         str_tile_size = [str(dim) for dim in tile_size]
         tile_shape = f"memref<{'x'.join(str_tile_size)}xi64, 1>"
         vshape = tile_desc.get_mlir_vshape(mlir_dtype)
@@ -1107,7 +1051,7 @@ def index_expr(self, index, dtype):
 
         line = f"affine.vector_load %{sram_var}[0, 0, 0] : {tile_shape}, {vshape} // {renamed_expression}"
         out = self.cse.generate(self.compute, line)
-        self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
+        self.register_var_info(out, [vlane_stride, mlir_dtype])
         return out
 
     def codegen_global_init(self):
@@ -1121,40 +1065,45 @@ def codegen_loops(self):
         loops = [LoopLevel(var, size, step=step) for idx, (var, size, step) in enumerate(zip(self.itervars, self.ranges, tile_size))]
         loops, reductions = [LoopNest(loops[: self.reduction_depth]),
                              LoopNest(loops[self.reduction_depth :])]
+        reductions.mark_reduction(self.reduction_vars, self.affine_yield)
+        # For non-loop code
         if (self.reduction_depth==0):
             loops = LoopNest([LoopLevel("dummy", 1)])
-        reductions.mark_reduction(self.reduction_vars)
-        if len(self.affine_yield) > 0:
-            vars = ', '.join([f"%{name}" for name, _ in self.affine_yield.items()])
-            reduced_shapes = ', '.join([f"{shape}" for _, shape in self.affine_yield.items()])
-            self.stores.writeline(f"affine.yield {vars} : {reduced_shapes}")
+
+        if len(reductions.loops) > 1:
+            NotImplementedError("Not support multiple reduction axis..")
+
         code.splice(self.const_buffer)
         code.splice(self.alloc_buffer)
         code.splice(self.spad_buffer)
+        # Outerloop
         with contextlib.ExitStack() as stack:
             for loop in loops.loops:
                 loop_lines = loop.lines()
-                if loop_lines is None:
-                    return
                 code.writelines(loop_lines)
-                stack.enter_context(code.indent(outer_loop=True))
-            with contextlib.ExitStack() as stack_outer:
-                code.splice(self.reduction_prefix)
+                stack.enter_context(code.indent(attribute="{outer_loop=true}"))
+            # Non-outerloop start
+            code.splice(self.reduction_prefix)
+            with contextlib.ExitStack() as stack:
+                # Add reduction loops
+                if len(reductions.loops):
+                    reduction_lines = reductions.loops[0].lines()
+                    epilogue = reductions.loops[0].epilogue_line()
+                    code.writelines(reduction_lines)
+                    stack.enter_context(code.indent(attribute="{accumulation_loop=true}", suffix=epilogue))
+                code.splice(self.applys)
+                code.splice(self.indexed_buffer)
+                code.splice(self.dma_loads)
+                # Compute body
+                code.writelines(self.compute_body_loop.lines())
                 with contextlib.ExitStack() as stack:
-                    for reduction in reductions.loops:
-                        reduction_lines = reduction.lines()
-                        if reduction_lines is None:
-                            return
-                        code.writelines(reduction_lines)
-                        stack.enter_context(code.indent(outer_loop=False))
-                    code.splice(self.applys)
-                    code.splice(self.indexed_buffer)
-                    code.splice(self.dma_loads)
+                    stack.enter_context(code.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
                     code.splice(self.loads)
                     code.splice(self.compute)
                     code.splice(self.stores)
-                    code.splice(self.dma_stores)
-                code.splice(self.reductions_suffix)
+                code.splice(self.dma_stores)
+            code.splice(self.reductions_suffix)
+            # Non-outerloop end
         code.writeline(f"return")
         return code
 
@@ -1231,9 +1180,9 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
             local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 2. Tile is 1-D vector type with reduction
         elif len(local_dims) == 1 and len(local_dims) == self.reduction_depth + 1:
-            local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(local_dims[0])])
+            local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(local_dims[0])])
             local_tile_desc.vlane_split_axis = 0
-            local_tile_desc.vlane_stride = kg_tile_desc.get_dim_size(local_dims[0])
+            local_tile_desc.vlane_stride = 1
         # Case 3. Tile is 2-D tile
         elif len(local_dims) == 2:
             is_reduction = self.reduction_depth == 1 and not store_reduction
@@ -1468,40 +1417,3 @@ def convert_indirect_indexing(self, index :sympy.Expr):
         self.register_var_info(out, [1, "index", [1]])
         self.compute = tmp_comp
         return index + sympy.Symbol(str(out))
-
-@dataclasses.dataclass
-class LoopLevel:
-    var: sympy.Expr
-    size: sympy.Expr
-    start: int = 0
-    step: int = 1
-    reduction_vars: Dict[str, str] = None
-
-    def lines(self):
-        if self.reduction_vars:
-            acc = ', '.join([f"%{acc.name}" for acc in self.reduction_vars.keys()])
-            args = ', '.join([f"%{iter.name} = %{init.name}" for (_, iter, init, _) in self.reduction_vars.values()])
-            dtype = ', '.join([f"{dtype}" for (_, _, _, dtype) in self.reduction_vars.values()])
-            line = f"{acc} = affine.for %{self.var} = {self.start} to {self.size} step {self.step} iter_args({args}) -> ({dtype})"
-        else:
-            line = f"affine.for %{self.var} = {self.start} to {self.size} step {self.step}"
-
-        return [line]
-
-@dataclasses.dataclass
-class LoopNest:
-    loops: List[LoopLevel]
-
-    def __bool__(self):
-        return bool(self.loops)
-
-    def mark_reduction(self, reduction_vars):
-        for loop in self.loops:
-            loop.reduction_vars = reduction_vars
-
-    def mark_parallel(self, par_depth):
-        loops = self.loops
-        loops[0].parallel = par_depth
-        for i in range(1, par_depth):
-            loops[i].collapsed = True
-        loops[0].simd = loops[par_depth - 1].simd
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index b01b5831..0603b97d 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -1,4 +1,6 @@
-import os
+import dataclasses
+from typing import Dict
+from typing import List
 from collections import defaultdict
 from functools import reduce
 from operator import mul
@@ -76,14 +78,15 @@
 }
 
 class ParallelLoopBuffer(IndentedBuffer):
-    def indent(self, offset=1, outer_loop=True):
+    def indent(self, offset=1, attribute="", suffix=""):
         @contextlib.contextmanager
         def ctx():
-            attribute = "{outer_loop=true}" if outer_loop else "{accumulation_loop=true}"
             for _ in range(offset):
                 self.writeline("{")
                 self._indent += 1
             for _ in range(-offset):
+                if suffix:
+                    self.writeline(suffix)
                 self._indent -= 1
                 self.writeline("} " + attribute)
             yield
@@ -91,6 +94,8 @@ def ctx():
                 self.writeline("{")
                 self._indent += 1
             for _ in range(offset):
+                if suffix:
+                    self.writeline(suffix)
                 self._indent -= 1
                 self.writeline("} " + attribute)
 
@@ -178,6 +183,7 @@ def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=N
         self.vlane_split_axis = vlane_split_axis
         self.vlane_stride = vlane_stride
         self.implicit_dim_size = None
+        self.nr_rdim = 0
 
     def set_tile_size(self, tile_size, tile_axis_order=None):
         self._tile_size = tile_size
@@ -221,6 +227,8 @@ def get_tile_stride(self):
 
     def get_tile_size_per_lane(self):
         tile_size_per_lane = list(self._tile_size)
+        if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(tile_size_per_lane):
+            raise AssertionError("Not allowed split_axis")
         used_vlane = self.get_used_vlane()
         tile_size_per_lane[self.vlane_split_axis] = \
             self.div_round_up(tile_size_per_lane[self.vlane_split_axis], used_vlane)
@@ -244,27 +252,27 @@ def get_mlir_shape(self, dtype):
         shape = "x".join(str_tile_size)
         return f"memref<{shape}x{dtype}, 1>"
 
-    @staticmethod
-    def extract_tile_size(memref_str):
-        assert memref_str.startswith("memref<") and memref_str.endswith(">"), "Invalid memref format"
-        # Extract the inner content of memref<>
-        inner_part = memref_str[len("memref<"):-1]
-        shapes = inner_part.split("x")[:-1]
-        return [int(dim) for dim in shapes]
-
     def get_mlir_vshape(self, mlir_dtype):
-        tile_numel_per_lane = self.get_numel_per_lane()
-        return f"vector<{tile_numel_per_lane}x{mlir_dtype}>" if tile_numel_per_lane > 1 else ""
+        return f"vector<{self.get_compute_vec_size()}x{mlir_dtype}>" if self.vlane_stride > 1 else f"{mlir_dtype}"
 
     def get_used_vlane(self):
         """
         Return number of used vector lane
         """
+        if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(self._tile_size):
+            raise AssertionError("Not allowed split_axis")
         return min(self.div_round_up(self._tile_size[self.vlane_split_axis], self.vlane_stride), self.vector_lane)
 
     def get_vlane_stride(self):
         return self.vlane_stride
 
+    def get_compute_vec_size(self):
+        # Granule size used in compute loop
+        if self.nr_rdim:
+            assert self.nr_rdim==1
+            return self.get_numel_per_lane() // self._tile_size[-1]
+        return self.vlane_stride
+
     @staticmethod
     def div_round_up(size, round_val):
         return (size + round_val - 1) // round_val
@@ -309,6 +317,8 @@ def __init__(self, kernel_group):
         # MLIR SSA tracker
         self.var_info = {} # MLIR variable info
         self.buffer_types : dict = None # format: dtype, numel, size, stride
+        self.compute_idx = "compute_idx"
+        self.compute_body_loop = LoopLevel(self.compute_idx, 1)
 
     def set_ranges(self, lengths, reduction_lengths):
         if self.call_ranges:
@@ -426,8 +436,8 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         else:
             raise NotImplementedError("dummy tile size fail!")
 
-        vlane_stride = 8 # TODO: VCIX widening is not implemented
         vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
+        vlane_stride = 8 # TODO: VCIX widening is not implemented
         # Adjust tile size to avoid too much paddings
         for i in range(1, len(tile_size)+1):
             target_range = self.ranges[-i]
@@ -439,10 +449,7 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
                 tile_size[-i] = target_range
                 if remains:
                     tile_size[-i] += vlane_stride - remains
-        # Handle scalar case
-        if len(tile_size)==1 and tile_size[0] == 1:
-            vlane_stride = 1
-            tile_size[0] = 1
+
         # Adjust tile size
         for i in range(len(vars)):
             if tile_size[i] >= self.vector_lane: # maximize used vector lane
@@ -451,12 +458,22 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         padded_size = used_vlane * vlane_stride
         tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size
 
+        # Handle scalar case
+        if len(tile_size)==1 and tile_size[0] == 1:
+            vlane_stride = 1
+            tile_size[0] = 1
+        elif vlane_split_axis == -1:
+            tile_size = [1] + tile_size
+            vlane_split_axis = 0
+            vlane_stride = 1
+
         # Select tile info.
         # Note: Kernel Group have to share same tile desc for fusion
         tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane)
         tile_desc.vlane_split_axis = vlane_split_axis
         tile_desc.vlane_stride = vlane_stride
         tile_desc.implicit_dim_size = implicit_dim_size
+        tile_desc.nr_rdim = len(reduction_vars)
         return tile_desc
 
     def set_tile_size(self, template_store_info):
@@ -474,6 +491,8 @@ def codegen_nodes(self, nodes, kernel_name):
         # Set node range info
         vars, reduction_vars = self.set_ranges(group, reduction_group)
         tile_desc = self.compute_tile_size(nodes, vars, reduction_vars)
+        self.compute_body_loop.size = tile_desc.get_numel_per_lane()
+        self.compute_body_loop.step = tile_desc.get_compute_vec_size()
         self.kernel_group.set_tile_info(tile_desc)
 
         _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()
@@ -680,3 +699,48 @@ def bucketize(
         return self
 
 
+@dataclasses.dataclass
+class LoopLevel:
+    var: sympy.Expr
+    size: sympy.Expr
+    start: int = 0
+    step: int = 1
+    reduction_vars: Dict[str, str] = dataclasses.field(default_factory=dict)
+    affine_yield: Dict[str, str] = dataclasses.field(default_factory=dict)
+
+    def lines(self):
+        if len(self.reduction_vars):
+            acc = ', '.join([f"%{acc.name}" for acc in self.reduction_vars.keys()])
+            args = ', '.join([f"%{iter.name} = %{init.name}" for (_, iter, init, _) in self.reduction_vars.values()])
+            dtype = ', '.join([f"{dtype}" for (_, _, _, dtype) in self.reduction_vars.values()])
+            line = f"{acc} = affine.for %{self.var} = {self.start} to {self.size} step {self.step} iter_args({args}) -> ({dtype})"
+        else:
+            line = f"affine.for %{self.var} = {self.start} to {self.size} step {self.step}"
+
+        return [line]
+
+    def epilogue_line(self):
+        if len(self.affine_yield):
+            vars = ', '.join([f"%{name}" for name, _ in self.affine_yield.items()])
+            reduced_shapes = ', '.join([f"{shape}" for _, shape in self.affine_yield.items()])
+            return f"affine.yield {vars} : {reduced_shapes}"
+        return ""
+
+@dataclasses.dataclass
+class LoopNest:
+    loops: List[LoopLevel]
+
+    def __bool__(self):
+        return bool(self.loops)
+
+    def mark_reduction(self, reduction_vars, affine_yield=dict()):
+        for loop in self.loops:
+            loop.reduction_vars = reduction_vars
+            loop.affine_yield = affine_yield
+
+    def mark_parallel(self, par_depth):
+        loops = self.loops
+        loops[0].parallel = par_depth
+        for i in range(1, par_depth):
+            loops[i].collapsed = True
+        loops[0].simd = loops[par_depth - 1].simd
\ No newline at end of file
diff --git a/gem5_script/script_systolic.py b/gem5_script/script_systolic.py
index 979bf589..4dad11ac 100644
--- a/gem5_script/script_systolic.py
+++ b/gem5_script/script_systolic.py
@@ -1,10 +1,11 @@
 import argparse
-import os
 import sys
 import math
 import m5
 from m5.objects import *
-from ctypes import cdll
+
+sys.path.append(os.environ.get('TORCHSIM_DIR'))
+from gem5_script.vpu_config import *
 
 bin_path = sys.argv[1]
 parser = argparse.ArgumentParser()
@@ -19,7 +20,6 @@
 parser.add_argument("--vlen", type=int, default=256)
 args = parser.parse_args()
 
-
 class InstMemory(SimpleMemory):
     latency = "1ns"
     bandwidth = "64GB/s"
@@ -97,366 +97,6 @@ def bytes_to_bandwidth(self, bandwidth_bytes):
     def get_ctrls(self):
         return self.mem_ctrls
 
-class SystolicArray(MinorFU):
-    unitType = "SystolicArray"
-    opClasses = minorMakeOpClassSet(["CustomMatMul", "CustomMatMuliVpush", "CustomMatMulwVpush", "CustomMatMulvpop"])
-    opLat = 1
-    systolicArrayWidth = 128
-    systolicArrayHeight = 128
-
-class SparseAccelerator(MinorFU):
-    unitType = "SparseAccelerator"
-    opClasses = minorMakeOpClassSet(["CustomMatMul", "CustomMatMuliVpush", "CustomMatMulwVpush", "CustomMatMulvpop"])
-    opLat = 1
-
-class SpecialFunctionUnit(MinorFU):
-    opClasses = minorMakeOpClassSet([
-        "CustomMatMulvexp",
-        "CustomMatMulverf",
-        "CustomMatMulvtanh",
-        ])
-    opLat = 10
-
-class MinorFPUnit(MinorFU):
-    opClasses = minorMakeOpClassSet(
-        [
-            "FloatAdd",
-            "FloatCmp",
-            "FloatCvt",
-            "FloatMult",
-            "FloatMultAcc",
-            "FloatDiv",
-            "FloatMisc",
-            "FloatSqrt"
-        ]
-    )
-
-class MinorVecAdder(MinorFU):
-    opClasses = minorMakeOpClassSet(
-        [
-            "SimdAdd",
-            "SimdFloatAdd",
-            "SimdFloatAlu",
-            "SimdFloatCmp",
-        ]
-    )
-    opLat = 1
-
-class MinorVecMultiplier(MinorFU):
-    opClasses = minorMakeOpClassSet(
-        [
-            "SimdMult",
-            "SimdFloatMult",
-        ]
-    )
-    opLat = 3
-
-class MinorVecDivider(MinorFU):
-    opClasses = minorMakeOpClassSet(
-        [
-            "SimdDiv",
-            "SimdFloatDiv",
-        ]
-    )
-    opLat = 5
-
-class MinorVecMisc(MinorFU):
-    opClasses = minorMakeOpClassSet(
-        [
-            "SimdUnitStrideLoad",
-            "SimdUnitStrideStore",
-            "SimdUnitStrideMaskLoad",
-            "SimdUnitStrideMaskStore",
-            "SimdStridedLoad",
-            "SimdStridedStore",
-            "SimdIndexedLoad",
-            "SimdIndexedStore",
-            "SimdUnitStrideFaultOnlyFirstLoad",
-            "SimdWholeRegisterLoad",
-            "SimdWholeRegisterStore",
-            "SimdAddAcc",
-            "SimdAlu",
-            "SimdCmp",
-            "SimdCvt",
-            "SimdMultAcc",
-            "SimdMatMultAcc",
-            "SimdShift",
-            "SimdShiftAcc",
-            "SimdSqrt",
-            "SimdFloatCvt",
-            "SimdFloatMisc",
-            "SimdFloatMultAcc",
-            "SimdFloatMatMultAcc",
-            "SimdFloatSqrt",
-            "SimdReduceAdd",
-            "SimdReduceAlu",
-            "SimdReduceCmp",
-            "SimdFloatReduceAdd",
-            "SimdFloatReduceCmp",
-            "SimdAes",
-            "SimdAesMix",
-            "SimdSha1Hash",
-            "SimdSha1Hash2",
-            "SimdSha256Hash",
-            "SimdSha256Hash2",
-            "SimdShaSigma2",
-            "SimdShaSigma3",
-            "SimdPredAlu",
-            "SimdMisc",
-
-            "SimdUnitStrideSegmentedLoad",
-            "SimdUnitStrideSegmentedStore",
-            "SimdExt",
-            "SimdFloatExt",
-        ]
-    )
-    opLat = 1
-
-class MinorVecConfig(MinorFU):
-    opClasses = minorMakeOpClassSet(
-        [
-            "SimdConfig",
-            "CustomVlaneIdx",
-        ]
-    )
-    opLat = 1
-
-class MinorCustomVecFU(MinorFU):
-    opClasses = minorMakeOpClassSet(
-        [
-            "SimdUnitStrideLoad",
-            "SimdUnitStrideStore",
-            "SimdUnitStrideMaskLoad",
-            "SimdUnitStrideMaskStore",
-            "SimdStridedLoad",
-            "SimdStridedStore",
-            "SimdIndexedLoad",
-            "SimdIndexedStore",
-            "SimdUnitStrideFaultOnlyFirstLoad",
-            "SimdWholeRegisterLoad",
-            "SimdWholeRegisterStore",
-            "SimdAdd",
-            "SimdAddAcc",
-            "SimdAlu",
-            "SimdCmp",
-            "SimdCvt",
-            "SimdMisc",
-            "SimdMult",
-            "SimdMultAcc",
-            "SimdMatMultAcc",
-            "SimdShift",
-            "SimdShiftAcc",
-            "SimdDiv",
-            "SimdSqrt",
-            "SimdFloatAdd",
-            "SimdFloatAlu",
-            "SimdFloatCmp",
-            "SimdFloatCvt",
-            "SimdFloatDiv",
-            "SimdFloatMisc",
-            "SimdFloatMult",
-            "SimdFloatMultAcc",
-            "SimdFloatMatMultAcc",
-            "SimdFloatSqrt",
-            "SimdReduceAdd",
-            "SimdReduceAlu",
-            "SimdReduceCmp",
-            "SimdFloatReduceAdd",
-            "SimdFloatReduceCmp",
-            "SimdAes",
-            "SimdAesMix",
-            "SimdSha1Hash",
-            "SimdSha1Hash2",
-            "SimdSha256Hash",
-            "SimdSha256Hash2",
-            "SimdShaSigma2",
-            "SimdShaSigma3",
-            "SimdPredAlu",
-            "SimdMisc",
-            "SimdConfig",
-        ]
-    )
-    opLat = 1
-
-class MinorCustomIntFU(MinorFU):
-    opClasses = minorMakeOpClassSet(["IntAlu"])
-    timings = [MinorFUTiming(description="Int", srcRegsRelativeLats=[2])]
-    opLat = 1
-
-class MinorCustomFUPool(MinorFUPool):
-    funcUnits = [
-        SystolicArray(), # 0
-
-        MinorVecConfig(), # 1 for vector config
-
-        MinorFPUnit(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-
-        # ALU0
-        MinorVecAdder(), # 6
-        MinorVecMultiplier(), # 7
-        MinorVecDivider(), # 8
-        MinorVecAdder(), # 9
-        MinorVecMultiplier(), # 10
-        MinorVecDivider(), # 11
-        MinorVecAdder(), # 12
-        MinorVecMultiplier(), # 13
-        MinorVecDivider(), # 14
-        MinorVecAdder(), # 15
-        MinorVecMultiplier(), # 16
-        MinorVecDivider(), # 17
-
-        # ALU1
-        MinorVecAdder(), # 18 ~ 29
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-
-        MinorCustomIntFU(), # 30
-        MinorCustomIntFU(),
-
-        MinorDefaultIntMulFU(),
-        MinorDefaultIntDivFU(),
-        MinorDefaultPredFU(),
-        MinorDefaultMemFU(),
-        MinorDefaultMiscFU(),
-
-        SpecialFunctionUnit(),
-
-        # SparseAccelerator(),
-        # Serializer0(),
-        # Serializer1(),
-        # DeSerializer(),
-    ]
-
-class MinorCustomSparseFUPool(MinorFUPool):
-    funcUnits = [
-        MinorVecConfig(), # for vector config
-
-        MinorFPUnit(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-        MinorVecMisc(),
-
-        # ALU0
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-
-        # ALU1
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-        MinorVecAdder(),
-        MinorVecMultiplier(),
-        MinorVecDivider(),
-
-        MinorCustomIntFU(),
-        MinorCustomIntFU(),
-
-        MinorDefaultIntMulFU(),
-        MinorDefaultIntDivFU(),
-        MinorDefaultPredFU(),
-        MinorDefaultMemFU(),
-        MinorDefaultMiscFU(),
-
-        SparseAccelerator(),
-        # Serializer0(),
-        # Serializer1(),
-        # DeSerializer(),
-    ]
-
-class RiscvCustomCPU(RiscvMinorCPU):
-    fetch2InputBufferSize = 4
-    decodeInputWidth = 4
-    executeInputWidth = 4
-    executeIssueLimit = 8
-    executeMemoryIssueLimit = 2
-    executeCommitLimit = 8
-    executeMemoryCommitLimit = 2
-    executeFuncUnits = MinorCustomFUPool()
-
-class RiscvVPU(RiscvMinorCPU):
-    fetch2InputBufferSize = 2
-    decodeInputBufferSize = 1
-    decodeInputWidth = 1
-    executeInputWidth = 8
-    executeIssueLimit = 8
-    executeCommitLimit = 8
-    executeFuncUnits = MinorCustomFUPool()
-    executeMemoryIssueLimit = 8
-    executeMemoryCommitLimit = 8
-    executeMaxAccessesInMemory = 8
-    executeLSQMaxStoreBufferStoresPerCycle = 8
-    executeLSQTransfersQueueSize = 8
-    executeLSQStoreBufferSize = 8
-
-class RiscvSparseVPU(RiscvVPU):
-    executeFuncUnits = MinorCustomSparseFUPool()
-
-class MinorV2FUPool(MinorFUPool):
-    funcUnits = [
-        MinorDefaultIntFU(),
-        MinorDefaultIntFU(),
-        MinorDefaultIntMulFU(),
-        MinorDefaultIntDivFU(),
-        MinorDefaultFloatSimdFU(),
-        MinorDefaultPredFU(),
-        MinorDefaultMemFU(),
-        MinorDefaultMiscFU(),
-    ]
-
-class RiscvMinorV2CPU(RiscvMinorCPU):
-    executeFuncUnits = MinorV2FUPool()
-
-class MinorV4FUPool(MinorFUPool):
-    funcUnits = [
-        MinorDefaultIntFU(),
-        MinorDefaultIntFU(),
-        MinorDefaultIntMulFU(),
-        MinorDefaultIntDivFU(),
-        MinorDefaultFloatSimdFU(),
-        MinorDefaultPredFU(),
-        MinorDefaultMemFU(),
-        MinorDefaultMiscFU(),
-    ]
-
-class RiscvMinorV4CPU(RiscvMinorCPU):
-    executeFuncUnits = MinorV4FUPool()
-    executeCommitLimit = 4
-    executeMemoryCommitLimit = 1
-
 class L1Cache(NoncoherentCache):
     """Simple L1 Cache with default values"""
     assoc = 8
@@ -481,16 +121,10 @@ def connectCPU(self, cpu):
         self.cpu_side = cpu.icache_port
 
 valid_cpu = {
-    "RiscvAtomicSimpleCPU": RiscvAtomicSimpleCPU,
-    "RiscvTimingSimpleCPU": RiscvTimingSimpleCPU,
     "RiscvMinorCPU": RiscvMinorCPU,
     "RiscvDerivO3CPU": RiscvO3CPU,
     "RiscvMinorCPU": RiscvMinorCPU,
-    "RiscvCustomCPU": RiscvCustomCPU,
-    "RiscvMinorV2CPU": RiscvMinorV2CPU,
-    "RiscvMinorV4CPU": RiscvMinorV4CPU,
     "RiscvVPU": RiscvVPU,
-    "RiscvSparseVPU": RiscvSparseVPU,
 }
 
 # change systolicArrayWidth and systolicArrayHeight into args.vlane
diff --git a/gem5_script/vpu_config.py b/gem5_script/vpu_config.py
new file mode 100644
index 00000000..0a450125
--- /dev/null
+++ b/gem5_script/vpu_config.py
@@ -0,0 +1,201 @@
+import m5
+from m5.objects import *
+
+class SystolicArray(MinorFU):
+    unitType = "SystolicArray"
+    opClasses = minorMakeOpClassSet(["CustomMatMul", "CustomMatMuliVpush", "CustomMatMulwVpush", "CustomMatMulvpop"])
+    opLat = 1
+    systolicArrayWidth = 128
+    systolicArrayHeight = 128
+
+class SparseAccelerator(MinorFU):
+    unitType = "SparseAccelerator"
+    opClasses = minorMakeOpClassSet(["CustomMatMul", "CustomMatMuliVpush", "CustomMatMulwVpush", "CustomMatMulvpop"])
+    opLat = 1
+
+class SpecialFunctionUnit(MinorFU):
+    opClasses = minorMakeOpClassSet([
+        "CustomMatMulvexp",
+        "CustomMatMulverf",
+        "CustomMatMulvtanh",
+        ])
+    opLat = 10
+
+class MinorFPUnit(MinorFU):
+    opClasses = minorMakeOpClassSet(
+        [
+            "FloatAdd",
+            "FloatCmp",
+            "FloatCvt",
+            "FloatMult",
+            "FloatMultAcc",
+            "FloatDiv",
+            "FloatMisc",
+            "FloatSqrt"
+        ]
+    )
+
+class MinorVecAdder(MinorFU):
+    opClasses = minorMakeOpClassSet(
+        [
+            "SimdAdd",
+            "SimdFloatAdd",
+            "SimdFloatAlu",
+            "SimdFloatCmp",
+            "SimdShift",
+            "SimdShiftAcc",
+            "SimdAddAcc",
+            "SimdAlu",
+            "SimdCmp",
+        ]
+    )
+    opLat = 1
+
+class MinorVecMultiplier(MinorFU):
+    opClasses = minorMakeOpClassSet(
+        [
+            "SimdMult",
+            "SimdFloatMult",
+            "SimdMultAcc",
+            "SimdMatMultAcc",
+            "SimdSqrt",
+            "SimdFloatMultAcc",
+            "SimdFloatMatMultAcc",
+            "SimdFloatSqrt",
+        ]
+    )
+    opLat = 1
+
+class MinorVecDivider(MinorFU):
+    opClasses = minorMakeOpClassSet(
+        [
+            "SimdDiv",
+            "SimdFloatDiv",
+        ]
+    )
+    opLat = 1
+
+class MinorVecReduce(MinorFU):
+    opClasses = minorMakeOpClassSet(
+        [
+            "SimdReduceAdd",
+            "SimdReduceAlu",
+            "SimdReduceCmp",
+            "SimdFloatReduceAdd",
+            "SimdFloatReduceCmp",
+        ]
+    )
+    opLat = 1
+
+class MinorVecLdStore(MinorFU):
+    opClasses = minorMakeOpClassSet(
+        [
+            "SimdUnitStrideLoad",
+            "SimdUnitStrideStore",
+            "SimdUnitStrideMaskLoad",
+            "SimdUnitStrideMaskStore",
+            "SimdStridedLoad",
+            "SimdStridedStore",
+            "SimdIndexedLoad",
+            "SimdIndexedStore",
+            "SimdUnitStrideFaultOnlyFirstLoad",
+            "SimdWholeRegisterLoad",
+            "SimdWholeRegisterStore",
+            "SimdUnitStrideSegmentedLoad",
+            "SimdUnitStrideSegmentedStore",
+        ]
+    )
+    opLat = 2
+
+class MinorVecMisc(MinorFU):
+    opClasses = minorMakeOpClassSet(
+        [
+            "SimdCvt",
+            "SimdFloatCvt",
+            "SimdFloatMisc",
+            "SimdPredAlu",
+            "SimdMisc",
+            "SimdExt",
+            "SimdFloatExt",
+        ]
+    )
+    opLat = 1
+
+class MinorVecConfig(MinorFU):
+    opClasses = minorMakeOpClassSet(
+        [
+            "SimdConfig",
+            "CustomVlaneIdx",
+        ]
+    )
+    opLat = 1
+
+class MinorCustomIntFU(MinorDefaultIntFU):
+    opLat = 1
+
+class MinorCustomIntDivFU(MinorDefaultIntDivFU):
+    opLat = 1
+
+class MinorCustomIntMulFU(MinorDefaultIntMulFU):
+    opLat = 1
+
+class MinorCustomPredFU(MinorDefaultPredFU):
+    opLat = 1
+
+class MinorCustomMemFU(MinorDefaultMemFU):
+    opLat = 1
+
+class MinorCustomMiscFU(MinorDefaultMiscFU):
+    opLat = 1
+
+class MinorCustomFUPool(MinorFUPool):
+    funcUnits = [
+        # Scalar unit
+        MinorFPUnit(),
+        MinorCustomIntFU(),
+        MinorCustomIntMulFU(),
+        MinorCustomIntDivFU(),
+        MinorCustomPredFU(),
+        MinorCustomMemFU(),
+        MinorCustomMiscFU(),
+
+        # Matmul unit
+        SystolicArray(), # 0
+ 
+        # Vector
+        MinorVecConfig(), # 1 for vector config
+        MinorVecMisc(),
+        MinorVecLdStore(),
+
+        # Vector ALU0
+        MinorVecAdder(), # 6
+        MinorVecMultiplier(), # 7
+        MinorVecDivider(), # 8
+        MinorVecReduce(),
+
+        # Vector ALU1
+        MinorVecAdder(), # 18 ~ 29
+        MinorVecMultiplier(),
+        MinorVecDivider(),
+        MinorVecReduce(),
+
+        # SFU
+        SpecialFunctionUnit(),
+    ]
+
+class RiscvVPU(RiscvMinorCPU):
+    fetch2InputBufferSize = 2
+    decodeInputBufferSize = 1
+    decodeInputWidth = 1
+    executeInputWidth = 8
+    executeIssueLimit = 8
+    executeCommitLimit = 8
+    # Memory
+    executeMemoryIssueLimit = 2
+    executeMemoryCommitLimit = 2
+    executeMaxAccessesInMemory = 2
+    executeLSQMaxStoreBufferStoresPerCycle = 2
+    executeLSQTransfersQueueSize = 8
+    executeLSQStoreBufferSize = 8
+
+    executeFuncUnits = MinorCustomFUPool()

From 177e2df277405d408f272e0a2c6a373e7a039ae4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 2 Apr 2025 15:46:57 +0000
Subject: [PATCH 260/432] [Frontend] llc Apply O3 optimization

---
 PyTorchSimFrontend/extension_codecache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 177e1893..f58605cc 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -77,7 +77,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
             re.sub(r"[ \n]+", " ",
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
-                -relocation-model=pic -march=riscv64 \
+                -relocation-model=pic -march=riscv64 -O3 \
                 -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
                 {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \
                 -O2 {filename}.ll -o {filename}.s
@@ -118,7 +118,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             re.sub(r"[ \n]+", " ",
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
-                -relocation-model=pic -march=riscv64 \
+                -relocation-model=pic -march=riscv64 -O3 \
                 -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
                 {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \
                 -O2 {sample_filename}.ll -o {sample_filename}.s

From 413b278bb4b6fbc63ab29eed347f3339c738dbc5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 2 Apr 2025 15:49:40 +0000
Subject: [PATCH 261/432] [Frontend] Use tpu v3 config as default config

---
 PyTorchSimFrontend/extension_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index c6fd515f..4e035db4 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -34,7 +34,7 @@
 
 # Backendsim config
 CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG',
-                                        default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
+                                        default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json')
 CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False))
 CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False))
 CONFIG_BACKENDSIM_DRYRUN = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))

From c0938d66d8183e521b61bda1e8f9174adc1b2596 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 2 Apr 2025 15:50:03 +0000
Subject: [PATCH 262/432] [Gem5script] Minor update

---
 gem5_script/vpu_config.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gem5_script/vpu_config.py b/gem5_script/vpu_config.py
index 0a450125..01437381 100644
--- a/gem5_script/vpu_config.py
+++ b/gem5_script/vpu_config.py
@@ -105,7 +105,7 @@ class MinorVecLdStore(MinorFU):
             "SimdUnitStrideSegmentedStore",
         ]
     )
-    opLat = 2
+    opLat = 1
 
 class MinorVecMisc(MinorFU):
     opClasses = minorMakeOpClassSet(
@@ -117,6 +117,7 @@ class MinorVecMisc(MinorFU):
             "SimdMisc",
             "SimdExt",
             "SimdFloatExt",
+            "CustomVlaneIdx",
         ]
     )
     opLat = 1
@@ -125,7 +126,6 @@ class MinorVecConfig(MinorFU):
     opClasses = minorMakeOpClassSet(
         [
             "SimdConfig",
-            "CustomVlaneIdx",
         ]
     )
     opLat = 1
@@ -153,6 +153,7 @@ class MinorCustomFUPool(MinorFUPool):
         # Scalar unit
         MinorFPUnit(),
         MinorCustomIntFU(),
+        MinorCustomIntFU(),
         MinorCustomIntMulFU(),
         MinorCustomIntDivFU(),
         MinorCustomPredFU(),
@@ -164,8 +165,11 @@ class MinorCustomFUPool(MinorFUPool):
  
         # Vector
         MinorVecConfig(), # 1 for vector config
+        MinorVecConfig(),
+        MinorVecMisc(),
         MinorVecMisc(),
         MinorVecLdStore(),
+        MinorVecLdStore(),
 
         # Vector ALU0
         MinorVecAdder(), # 6

From 0d345e4a63a08578074c041f0f2df263478fcd7c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 2 Apr 2025 16:05:58 +0000
Subject: [PATCH 263/432] [Backend] Unlimit number of node

---
 PyTorchSimBackend/src/Core.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index e4957661..af71c290 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -20,7 +20,7 @@ Core::Core(uint32_t id, SimulationConfig config)
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
   /* Check SRAM is enough to run tile */
-  return op->get_required_sram_size() + _used_sram_size <= _sram_size && _tiles.size() < 2 && !op->is_stonne_tile();
+  return op->get_required_sram_size() + _used_sram_size <= _sram_size && !op->is_stonne_tile();
 }
 
 void Core::issue(std::shared_ptr<Tile> op) {

From f2b23acd03f7fb4f52eeca742ce02ee926a8907d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 3 Apr 2025 07:06:45 +0000
Subject: [PATCH 264/432] [Frontend] Support fine-grained computation fusion

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 75 ++++++++++------
 .../mlir/mlir_codegen_backend.py              | 61 ++++++++++---
 PyTorchSimFrontend/mlir/mlir_common.py        | 90 ++++++++++++-------
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  7 --
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 16 +++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  7 +-
 PyTorchSimFrontend/mlir/mlir_template.py      | 78 +++++++++++-----
 7 files changed, 229 insertions(+), 105 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 20044237..588c6f42 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -10,12 +10,22 @@
 import PyTorchSimFrontend.extension_codecache as extension_codecache
 
 BMM_TEMPLATE = r"""
+// BMM kernel
+// BATCH = {{ B }}
+// M = {{ M }}
+// N = {{ N }}
+// K = {{ K }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// SUB_TILE_M = {{ SUB_TILE_M }}
+// SUB_TILE_N = {{ SUB_TILE_N }}
 {% if X_transposed %}#map0 = affine_map<(d0, d1, d2) -> (d0 * {{ K * M }} + d2 * {{ M }} + d1)>{% else %}#map0 = affine_map<(d0, d1, d2) -> (d0 * {{ M * K }} + d1 * {{ K }} + d2)>{% endif %}
 {% if W_transposed %}#map1 = affine_map<(d0, d1, d2) -> (d0 * {{ N * K }} + d2 * {{ K }} + d1)>{% else %}#map1 = affine_map<(d0, d1, d2) -> (d0 * {{ K * N }} + d1 * {{ N }} + d2)>{% endif %}
 #map2 = affine_map<(d0, d1, d2) -> (d0 * {{ M * N }} + d1 * {{ N }} + d2)>
-memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+memref.global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 {{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
@@ -25,9 +35,9 @@
   %c_mvout = arith.constant 3 : index
   %vstride = arith.constant 1 : index
   %axis = arith.constant 2 : index
-  %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+  %X_buffer = memref.get_global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+  %W_buffer = memref.get_global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+  %Y_buffer = memref.get_global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
   %tag0 = memref.alloc() : memref<1xi32>
   %tag1 = memref.alloc() : memref<1xi32>
@@ -38,26 +48,31 @@
   affine.for %b=0 to {{ B }} {
     affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
       affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
+        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+
         %index2 = affine.apply #map2(%b, %t_m, %t_n)
         {% if Bias -%}
         memref.dma_start %Bias[
         {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-          ], %Y_buffer[0, 0], %c_mvin3, %tag0[%c0], %
+          ], %Y_buffer2D[0, 0], %c_mvin3, %tag0[%c0], %
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
           , %vstride : memref<
         {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
           xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1 , {{ TILE_M }}] }
         {%- else -%}
-        affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+        affine.vector_store %v0, %Y_buffer2D[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
         affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
           %index0 = affine.apply #map0(%b, %t_m, %t_k)
           %index1 = affine.apply #map1(%b, %t_k, %t_n)
-          memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
+          memref.dma_start %X[%index0], %X_buffer2D[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
              : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
-          memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
+          memref.dma_start %W[%index1], %W_buffer2D[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
              : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
-          linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-                  outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
+
+          linalg.matmul ins(%X_buffer2D, %W_buffer2D : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
+                  outs(%Y_buffer2D : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
         {{kernel.store_output(indent_size=8)}}
       } { outer_loop=true }
@@ -73,15 +88,26 @@ def __init__(self, input_nodes, layout, input_reorder=None):
 
     def is_transposed(self, node):
         if isinstance(node, ReinterpretView):
-            # if node.layout.stride != node.data.layout.stride:
-            if node.layout.stride[-1] != node.data.layout.stride[-1] or node.layout.stride[-2] != node.data.layout.stride[-2]:
-                squeezed_layout = [s for s in node.layout.stride if s]
+            unsqueezed_layout_stride = [s for s, size in zip(node.layout.stride, node.layout.size) if size > 1]
+            unsqueezed_data_stride = [s for s, size in zip(node.data.layout.stride, node.data.layout.size) if size > 1]
+
+            if 0 in node.layout.stride: # [MoE] Temporary solution
+                if node.layout.stride[1] == 0:
+                    return True
+            if len(node.layout.stride) == len(node.data.layout.stride):
                 if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
                     return True
-                elif squeezed_layout == node.data.layout.stride[len(node.data.layout.stride)-len(squeezed_layout):]:
-                    return False
                 else:
                     raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
+            elif len(node.layout.stride) < len(node.data.layout.stride):
+                # Squeezed case
+                if node.layout.stride == node.data.layout.stride[-len(node.layout.stride):]:
+                    return False
+                if len(unsqueezed_layout_stride) < len(unsqueezed_data_stride):
+                    if unsqueezed_layout_stride == unsqueezed_data_stride[-len(unsqueezed_layout_stride):]:
+                        return False
+                raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
+
         return False
 
     def render(self,
@@ -91,8 +117,8 @@ def render(self,
                **kwargs):
         if template_buffer_node is not None:
             self.output_node = template_buffer_node
-        if epilogue_nodes is not None and len(epilogue_nodes) > 0:
-            self.output_node = cast(Buffer, epilogue_nodes[-1])
+        #if epilogue_nodes is not None and len(epilogue_nodes) > 0:
+        #    self.output_node = cast(Buffer, epilogue_nodes[-1])
 
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
@@ -100,7 +126,8 @@ def render(self,
 
         B, M, N, K = X.get_size()[0], X.get_size()[1], W.get_size()[2], X.get_size()[2]
         TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
-        kernel.loop_size = [TILE_M, TILE_N, TILE_K]
+        TOG_latency = M if TILE_M > M else TILE_M
+        kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
@@ -128,8 +155,8 @@ def render(self,
             Y = Y,
             Bias = Bias,
             Bias_rank = len(Bias.data.get_size()) if Bias is not None else 0,
-            W_transposed = W_transposed,
             X_transposed = X_transposed,
+            W_transposed = W_transposed,
             Y_numel = B * M * N,
             input_reorder = self.input_reorder
         )
@@ -144,11 +171,9 @@ def render(self,
             vlane_split_axis = 2,
             vlane_stride = 1,
             mlir_dtype = kernel.render_options['DATA_STYPE'],
-            tile_nr_dim = 2,
             dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
-            tile_shape = f"memref<{TILE_M}x{TILE_N}x{kernel.render_options['DATA_STYPE']}, 1>",
-            tile_size = (TILE_M, TILE_N),
-            tile_stride = [1, TILE_M]
+            tile_size = (1, TILE_M, TILE_N),
+            tile_stride = [1, 1, TILE_M]
         )
         code = self._template_from_string(BMM_TEMPLATE).render(**kernel.render_options)
         kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index a58909da..601e7e39 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -249,27 +249,63 @@ def constant(value, src_type, *args, var_info=None, **kwargs):
     def alloc(size, src_type, *args, var_info=None, **kwargs):
         return f"memref.alloc() : memref<{size}x{src_type}>", [size, src_type]
 
+    @staticmethod
+    def extractelement(operand, idx, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f"vector.extract %{operand}[{idx}]: {dtype} from {shape}", [1, dtype]
+
     # transcendental functions
     @staticmethod
     def exp(operand, *args, var_info=None, **kwargs):
+        # Check scalar
+        op_type = var_info[operand]
+        if op_type[0] == 1:
+            val = ops.constant(0, op_type[1])
+            var_info[val][0] = 4
+            operand = ops.broadcast(operand, val)
+            val = ops.exp(operand)
+            result = ops.extractelement(val, 0)
+            return result, var_info[result]
         op_type = var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
-
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
         return f'math.exp %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def erf(x, *args, var_info=None, **kwargs):
-        op_type = var_info[x]
+    def erf(operand, *args, var_info=None, **kwargs):
+        # Check scalar
+        op_type = var_info[operand]
+        if op_type[0] == 1:
+            val = ops.constant(0, op_type[1])
+            var_info[val][0] = 4
+            operand = ops.broadcast(operand, val)
+            val = ops.exp(operand)
+            result = ops.extractelement(val, 0)
+            return result, var_info[result]
+        op_type = var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.erf %{x} : {shape}', [tile_size, dtype]
+        return f'math.erf %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
     def tanh(operand, *args, var_info=None, **kwargs):
         op_type = var_info[operand]
+
+        # Check scalar
+        op_type = var_info[operand]
+        if op_type[0] == 1:
+            val = ops.constant(0, op_type[1])
+            var_info[val][0] = 4
+            operand = ops.broadcast(operand, val)
+            val = ops.exp(operand)
+            result = ops.extractelement(val, 0)
+            return result, var_info[result]
+        op_type = var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
 
@@ -277,7 +313,6 @@ def tanh(operand, *args, var_info=None, **kwargs):
         if dtype[0] != "f":
             operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
             var_info[operand] = dtype
-
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
         return f'math.tanh %{operand} : {shape}', [tile_size, dtype]
 
@@ -952,6 +987,7 @@ def store_reduction(self, name, index, value):
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
+        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
 
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var,
                                                                          index, buffer=self.reduction_suffix)
@@ -977,7 +1013,7 @@ def store_reduction(self, name, index, value):
                 value = m2
 
         # Select src type
-        if tile_numel_per_lane == 1:
+        if compute_vec_size == 1:
             operation = "affine.store"
             line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}"
         else:
@@ -1034,10 +1070,10 @@ def index_expr(self, index, dtype):
         tile_size = tile_desc.get_tile_size_per_lane()
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         tile_numel_per_lane = tile_desc.get_numel_per_lane()
-        vlane_stride = tile_desc.vlane_stride
         str_tile_size = [str(dim) for dim in tile_size]
         tile_shape = f"memref<{'x'.join(str_tile_size)}xi64, 1>"
         vshape = tile_desc.get_mlir_vshape(mlir_dtype)
+        compute_vec_size = tile_desc.get_compute_vec_size()
 
         # Define scratch pad buffer
         sram_var, _, _ = self.get_scratchpad_buffer(dtype, "index_buffer", tile_numel_per_lane, tile_shape, None, index)
@@ -1049,9 +1085,9 @@ def index_expr(self, index, dtype):
             self.index_set.add(index)
             ops._index_expr(tile_size, sram_var, renamed_expression, index)
 
-        line = f"affine.vector_load %{sram_var}[0, 0, 0] : {tile_shape}, {vshape} // {renamed_expression}"
+        line = f"affine.vector_load %{sram_var}[0, 0, %{self.compute_idx}] : {tile_shape}, {vshape} // {renamed_expression}"
         out = self.cse.generate(self.compute, line)
-        self.register_var_info(out, [vlane_stride, mlir_dtype])
+        self.register_var_info(out, [compute_vec_size, mlir_dtype])
         return out
 
     def codegen_global_init(self):
@@ -1181,8 +1217,8 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         # Case 2. Tile is 1-D vector type with reduction
         elif len(local_dims) == 1 and len(local_dims) == self.reduction_depth + 1:
             local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(local_dims[0])])
-            local_tile_desc.vlane_split_axis = 0
-            local_tile_desc.vlane_stride = 1
+            local_tile_desc.vlane_split_axis = local_vlane_split_axis
+            local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 3. Tile is 2-D tile
         elif len(local_dims) == 2:
             is_reduction = self.reduction_depth == 1 and not store_reduction
@@ -1371,6 +1407,7 @@ def convert_indirect_indexing(self, index :sympy.Expr):
         for target_dim in indirect_dims:
             sram_var, _, tile_numel_per_lane, sram_index_var, tile_shape, vshape = self.spad_buffer_dict[target_dim]
             mlir_dtype = vshape.split("x")[1][:-1]
+            vshape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>" # FIXME. Maybe require fine grain compute...
             if tile_numel_per_lane > 1:
                 operation = "affine.vector_load"
                 line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape} // For indirect access"
@@ -1399,6 +1436,8 @@ def convert_indirect_indexing(self, index :sympy.Expr):
 
         # Store index var
         sram_var, _, tile_numel_per_lane, sram_index_var, tile_shape, vshape = self.spad_buffer_dict[first_dim]
+        mlir_dtype = vshape.split("x")[1][:-1]
+        vshape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>" # FIXME. Maybe require fine grain compute...
         if tile_numel_per_lane > 1:
             operation = "affine.vector_store"
             line = f"{operation} %{spad_vars[first_dim]}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 0603b97d..08792009 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -1,4 +1,5 @@
 import dataclasses
+import math
 from typing import Dict
 from typing import List
 from collections import defaultdict
@@ -253,7 +254,7 @@ def get_mlir_shape(self, dtype):
         return f"memref<{shape}x{dtype}, 1>"
 
     def get_mlir_vshape(self, mlir_dtype):
-        return f"vector<{self.get_compute_vec_size()}x{mlir_dtype}>" if self.vlane_stride > 1 else f"{mlir_dtype}"
+        return f"vector<{self.get_compute_vec_size()}x{mlir_dtype}>" if self.get_compute_vec_size() > 1 else f"{mlir_dtype}"
 
     def get_used_vlane(self):
         """
@@ -271,6 +272,8 @@ def get_compute_vec_size(self):
         if self.nr_rdim:
             assert self.nr_rdim==1
             return self.get_numel_per_lane() // self._tile_size[-1]
+        if self.vlane_stride < 16 and (self.get_numel_per_lane() // 16 >= 1):
+            return 16
         return self.vlane_stride
 
     @staticmethod
@@ -438,34 +441,68 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
 
         vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
         vlane_stride = 8 # TODO: VCIX widening is not implemented
-        # Adjust tile size to avoid too much paddings
-        for i in range(1, len(tile_size)+1):
-            target_range = self.ranges[-i]
-            if implicit_ranges:
-                target_range = implicit_dim_size[len(tile_size)-i][-1]
-
-            if tile_size[-i] > target_range:
-                remains = (target_range % vlane_stride)
-                tile_size[-i] = target_range
-                if remains:
-                    tile_size[-i] += vlane_stride - remains
-
-        # Adjust tile size
-        for i in range(len(vars)):
-            if tile_size[i] >= self.vector_lane: # maximize used vector lane
-                vlane_split_axis = i
-        used_vlane = min((tile_size[vlane_split_axis] + vlane_stride - 1) // vlane_stride, self.vector_lane)
-        padded_size = used_vlane * vlane_stride
-        tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size
+
+        # FIXME: Naive tile size decrement
+        def decrease_tile_size(tile_size):
+            for i in range(len(tile_size)):
+                if tile_size[i] > 1:
+                    tile_size[i] = int(tile_size[i] // 2)
+                    break
+            return tile_size
+
+        # FIXME: Not considering removed buffers
+        n_buffer = sum(
+            len(node.read_writes.reads) + len(node.read_writes.writes)
+            for node in nodes
+        )
+
+        spad_overflow = True
+        # Find proper tile size
+        while spad_overflow:
+            # Adjust tile size to avoid too much paddings
+            for i in range(1, len(tile_size)+1):
+                target_range = self.ranges[-i]
+                if implicit_ranges:
+                    target_range = implicit_dim_size[len(tile_size)-i][-1]
+
+                if tile_size[-i] > target_range:
+                    remains = (target_range % vlane_stride)
+                    tile_size[-i] = target_range
+                    if remains:
+                        tile_size[-i] += vlane_stride - remains
+
+            # Adjust tile size
+            for i in range(len(vars)):
+                if tile_size[i] >= self.vector_lane: # maximize used vector lane
+                    vlane_split_axis = i
+            used_vlane = min((tile_size[vlane_split_axis] + vlane_stride - 1) // vlane_stride, self.vector_lane)
+            padded_size = used_vlane * vlane_stride
+            tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size
+
+            used_vlane = min((tile_size[vlane_split_axis] + vlane_stride - 1) // vlane_stride, self.vector_lane)
+            padded_size = used_vlane * vlane_stride
+            tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size
+
+            # Check spad overflow
+            spad_usage_per_vlane = n_buffer * math.prod(tile_size) * self.precision // used_vlane
+            if spad_usage_per_vlane >= self.spad_info["spad_size"]:
+                new_tile_size = decrease_tile_size(tile_size.copy())
+                if new_tile_size == tile_size:
+                    raise NotImplementedError("Error: Cannot find proper tile size")
+                tile_size = new_tile_size
+                spad_overflow = True
+                continue
+            else:
+                spad_overflow = False
 
         # Handle scalar case
-        if len(tile_size)==1 and tile_size[0] == 1:
+        if len(self.ranges)==1 and self.ranges[0] == 1:
             vlane_stride = 1
+            vlane_split_axis = 0
             tile_size[0] = 1
         elif vlane_split_axis == -1:
-            tile_size = [1] + tile_size
             vlane_split_axis = 0
-            vlane_stride = 1
+            vlane_stride = tile_size[0]
 
         # Select tile info.
         # Note: Kernel Group have to share same tile desc for fusion
@@ -476,13 +513,6 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         tile_desc.nr_rdim = len(reduction_vars)
         return tile_desc
 
-    def set_tile_size(self, template_store_info):
-        tile_desc = MLIRMultiDimTile(template_store_info['tile_size'],
-            self.vector_lane,
-            vlane_split_axis=template_store_info['vlane_split_axis'],
-            vlane_stride=template_store_info['vlane_stride'])
-        return tile_desc
-
     def codegen_nodes(self, nodes, kernel_name):
         _, (group, reduction_group) = max(
             nodes, key=lambda x: int(x.is_reduction())
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 0c10160d..6eba7a39 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -684,10 +684,6 @@ def render(self,
 
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
 
-        # FIXME: transposed inputs not supported
-        # W_transposed = self.is_transposed(W)
-        # X_transposed = self.is_transposed(X)
-
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -742,10 +738,7 @@ def render(self,
             vlane_split_axis = 3,
             vlane_stride = 1,
             mlir_dtype = kernel.render_options['DATA_STYPE'],
-            tile_nr_dim = 4,
             dram_shape = f"memref<{BATCH * O_C * O_H * O_W}x{kernel.render_options['DATA_STYPE']}>",
-            tile_shape = f"memref<{TILE_O_H}x{TILE_O_W}x{TILE_M}x{TILE_N}x{kernel.render_options['DATA_STYPE']}, 1>" if conv_template in (CONV_TEMPLATE, MULTI_TILE_CONV_TEMPLATE)
-                          else f"memref<1x{TILE_O_H}x{TILE_M}x{TILE_N}x{kernel.render_options['DATA_STYPE']}, 1>",
             tile_size = (TILE_O_H, TILE_O_W, TILE_M, TILE_N) if conv_template in (CONV_TEMPLATE, MULTI_TILE_CONV_TEMPLATE) else (1, TILE_O_H, TILE_M, TILE_N),
             tile_stride = [TILE_O_W * TILE_M * TILE_N, TILE_M * TILE_N, 1, TILE_M]
         )
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index c99ea222..84b8bdbb 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -11,6 +11,15 @@
 from PyTorchSimFrontend import extension_config
 
 GEMM_TEMPLATE = r"""
+// GEMM kernel
+// M = {{ M }}
+// N = {{ N }}
+// K = {{ K }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// SUB_TILE_M = {{ SUB_TILE_M }}
+// SUB_TILE_N = {{ SUB_TILE_N }}
 {% if X_transposed %}#map0 = affine_map<(d0, d1) -> (d1 * {{ M }} + d0)>{% else %}#map0 = affine_map<(d0, d1) -> (d0 * {{ K }} + d1)>{% endif %}
 {% if W_transposed %}#map1 = affine_map<(d0, d1) -> (d1 * {{ K }} + d0)>{% else %}#map1 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>{% endif %}
 #map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
@@ -119,7 +128,7 @@ def render(self,
         M, N, K = X.get_size()[0], W.get_size()[1], X.get_size()[1]
         n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
         if (M == 0) or (N == 0) or (K == 0):
-            TILE_M, TILE_N, TILE_K = 0, 0, 0
+            TILE_M, TILE_N, TILE_K = 1, 1, 1
             template = EMPTY_TEMPLATE
         else:
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node)
@@ -155,8 +164,8 @@ def render(self,
             Y = Y,
             Bias = Bias,
             Bias_rank = len(Bias.data.get_size()) if Bias is not None else 0,
-            W_transposed = W_transposed,
             X_transposed = X_transposed,
+            W_transposed = W_transposed,
             Y_numel = M * N,
             epilogue_nodes = epilogue_nodes,
             input_reorder = self.input_reorder
@@ -172,13 +181,12 @@ def render(self,
             vlane_split_axis = 1,
             vlane_stride = 1,
             mlir_dtype = kernel.render_options['DATA_STYPE'],
-            tile_nr_dim = 2,
             dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
-            tile_shape = f"memref<{TILE_M}x{TILE_N}x{kernel.render_options['DATA_STYPE']}, 1>",
             tile_size = (TILE_M, TILE_N),
             tile_stride = [1, TILE_M]
         )
         code = self._template_from_string(template).render(**kernel.render_options)
+        kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
 
         self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
         self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index f66fa010..88313eff 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -144,19 +144,18 @@ def codegen_template_code(self, kernel, render, template_node, epilogue_nodes):
             for node in [template_node, *epilogue_nodes]:
                 node.mark_run()
             partial_code = render()
+            tile_desc = kernel.set_tile_size(kernel.store_info)
+            kernel.kernel_group.set_tile_info(tile_desc)
             if epilogue_nodes:
                 _, (group, reduction_group) = max(
                     epilogue_nodes, key=lambda x: int(x.is_reduction())
                 ).group
-                vars, reduction_vars = kernel.set_ranges(group, reduction_group)    # Do we need this?
-                tile_desc = kernel.set_tile_size(kernel.store_info)
-                kernel.kernel_group.set_tile_info(tile_desc)
+                vars, reduction_vars = kernel.set_ranges(group, reduction_group)
             # Flush created varaibles, since template fusion doen't share variable
             kernel.cse.cache.clear()
             for node in epilogue_nodes:
                 if template_node.node.name in [dep[0] for dep in list(node.read_writes.reads)]:
                     kernel.store_info['dependent_buf'].append(node.node.name)
-                kernel.store_info
                 node.codegen((vars, reduction_vars))
         with V.set_kernel_handler(kernel):
             src_code = (
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 692dce00..e26d9c0d 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -2,6 +2,7 @@
 import itertools
 import textwrap
 import re
+import contextlib
 import math
 import sympy
 from collections import OrderedDict
@@ -273,29 +274,34 @@ def call_kernel(self, kernel_name):
 
     def codegen_body(self):
         def template_store():
+            zero_cse = self.get_const_cse(0)
             sram_var = self.store_info["sram_var"]
             dram_var = self.store_info["dram_var"]
             index_var = self.store_info["index_var"]
             tag_var = self.store_info["tag_var"]
-            vlane_split_axis = self.store_info["vlane_split_axis"]
-            vlane_stride = self.store_info["vlane_stride"]
             mlir_dtype = self.store_info["mlir_dtype"]
             dram_shape = self.store_info["dram_shape"]
-            tile_shape = self.store_info["tile_shape"]
-            zero_cse = self.get_const_cse(0)
-            sram_index_var = ",".join([f"%{zero_cse}"] * self.store_info["tile_nr_dim"])
-            tile_stride = self.store_info['tile_stride']
+            vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
+            vlane_stride = self.kernel_group.tile_desc.get_vlane_stride()
+            tile_stride = self.store_info["tile_stride"]
+            tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
+            sram_index_var = ",".join([f"%{zero_cse}"] *  self.kernel_group.tile_desc.get_nr_dim())
             code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  tag_var, dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.dma_stores, code, assignment = False)
         self.body.splice(self.spad_buffer)
         self.body.splice(self.applys)
         self.body.splice(self.dma_loads)
-        self.body.splice(self.loads)
-        self.body.splice(self.compute)
-        if len(self.stores._lines) == 0:
-            template_store()
-        self.body.splice(self.stores)
+        self.body.writelines(self.compute_body_loop.lines())
+        compute_body = mlir_common.ParallelLoopBuffer()
+        with contextlib.ExitStack() as stack:
+            stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
+            compute_body.splice(self.loads)
+            compute_body.splice(self.compute)
+            if len(self.stores._lines) == 0:
+                template_store()
+            compute_body.splice(self.stores)
+        self.body.splice(compute_body)
         self.body.splice(self.dma_stores)
         self.loads.clear()
         self.compute.clear()
@@ -492,13 +498,16 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if len(load_dim) != 1 else 0    # FIXME: Fixed split axis for 1d load dim
         vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) != 1 else 1    # FIXME: Fixed stride for 1d load dim
         tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
+        tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
+        tile_stride = self.store_info['tile_stride']
+
+        # Compute vector unit size
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
-        vshape = f", {vshape}" if tile_numel_per_lane > 1 else ""
+        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
+
         if name not in self.buffer_names:
             # Allocate sram buffer
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-            tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-            tile_stride = self.store_info['tile_stride']
             sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
             self.buffer_names[name] = sram_var
             code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
@@ -507,12 +516,17 @@ def load_epilogue(self, name: str, index: sympy.Expr):
 
         # Load vector from sram
         sram_var = self.buffer_names[name]
-        operation = "affine.vector_load" if tile_numel_per_lane > 1 else "affine.load"
         zero_var = self.get_const_cse(0)
-        tile_indices = ",".join([f"%{zero_var}"] * self.store_info["tile_nr_dim"])
-        line = f"{operation} %{sram_var}[{tile_indices}] : {self.store_info['tile_shape']}{vshape}"
+        compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
+        if compute_vec_size > 1:
+            operation = "affine.vector_load"
+            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+        else:
+            operation = "affine.load"
+            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
+
         out = self.cse.generate(self.loads, line)
-        self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
+        self.register_var_info(out, [compute_vec_size, mlir_dtype])
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
@@ -527,8 +541,10 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = self.store_info['tile_stride']
+
+        # Compute vector unit size
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
-        vshape = f", {vshape}" if tile_numel_per_lane > 1 else ""
+        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
 
         if name not in self.buffer_names:
             sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
@@ -538,18 +554,23 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             sram_dims = len(tile_shape.split("x")) - 1
             sram_index_var = ",".join([f"%{zero_cse}"] * sram_dims)
         sram_var = self.buffer_names[name]
-
-        operation = "affine.vector_store" if tile_numel_per_lane > 1 else "affine.store"
         zero_var = self.get_const_cse(0)
 
         _, operand_type = self.var_info[value]
         if mlir_dtype != operand_type:
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
 
-        tile_indices = ",".join([f"%{zero_var}"] * self.store_info["tile_nr_dim"])
-        line = f"{operation} %{value}, %{sram_var}[{tile_indices}] : {tile_shape}{vshape}"
-
+        compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
+        # Generate vector load instruction
+        if compute_vec_size > 1:
+            operation = "affine.vector_store"
+            line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+        else:
+            operation = "affine.store"
+            line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}"
         self.cse.generate(self.stores, line, assignment = False)
+
+        # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.cse.generate(self.dma_stores, code, assignment = False)
@@ -557,6 +578,15 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
     def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index):
         return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, True)
 
+    def set_tile_size(self, template_store_info):
+        tile_desc = mlir_common.MLIRMultiDimTile(template_store_info['tile_size'],
+            self.vector_lane,
+            vlane_split_axis=template_store_info['vlane_split_axis'],
+            vlane_stride=template_store_info['vlane_stride'])
+        self.compute_body_loop.size = tile_desc.get_numel_per_lane()
+        self.compute_body_loop.step = tile_desc.get_compute_vec_size()
+        return tile_desc
+
 class MLIRTemplateCaller(CUDATemplateCaller):
     def __str__(self):
         return f"MLIRTemplateCaller(source_file={self.bmreq.source_file})"

From 48ecef329fce53726c3b8a5b9ad8274f579ea18c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 4 Apr 2025 11:14:02 +0000
Subject: [PATCH 265/432] [Frontend] Optimize reduce operation by using partial
 reduce

---
 .../mlir/mlir_codegen_backend.py              | 50 ++++++++++++++++---
 PyTorchSimFrontend/mlir/mlir_common.py        | 23 ++++++---
 2 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 601e7e39..772e2338 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -34,7 +34,7 @@ def reduction_init(reduction_type, dtype):
         return f"0.0"
     raise AssertionError(reduction_type)
 
-def reduction_combine_vec(reduction_type, vector_value, init_value):
+def reduction_partial_combine_vec(reduction_type, vector_value, init_value):
     if reduction_type == "sum":
         return ops.add(vector_value, init_value)
     if reduction_type == "prod":
@@ -47,6 +47,19 @@ def reduction_combine_vec(reduction_type, vector_value, init_value):
         return ops.logical_and(vector_value, init_value)
     raise AssertionError(reduction_type)
 
+def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, reduced_shape):
+    if reduction_type == "sum":
+        return f"vector.multi_reduction <add>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    if reduction_type == "prod":
+        return f"vector.multi_reduction <mul>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    if reduction_type == "max":
+        return f"vector.multi_reduction <maximumf>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    if reduction_type == "min":
+        return f"vector.multi_reduction <minimumf>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    if reduction_type == "any":
+        return f"vector.multi_reduction <and>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    raise AssertionError(reduction_type)
+
 class ExtensionWrapperCodegen(wrapper.WrapperCodeGen):
     def __init__(self):
         super().__init__()
@@ -934,15 +947,14 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         reduced_shape = self.kernel_group.tile_desc.get_mlir_vshape(type_name)
 
         # Set accumulation var
-        if len(self.ranges) == 1 or (len(self.ranges) == 2 and vec_len == 1): # 1-D vector to scalar
+        if vec_len == 1: # 1-D vector to scalar
             # Edge case for scalar
             init_vec = init
         else:
             # Adjust shape and inital value
             init_vec = self.cse.generate(self.reduction_prefix, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
         acc_var = init_vec
-        var_info = [vec_len, mlir_common.DTYPE_TO_MLIR[dtype]]
-        self.register_var_info(acc, var_info)
+
 
         # Reduction body prepare
         body_acc = self.reduction_cse.generate(
@@ -960,9 +972,29 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         self.init_cse.reduction_cache[reduction_key] = init_vec
 
         # Reduction body codegen
-        result = reduction_combine_vec(reduction_type, value, body_iter_arg)
+        result = reduction_partial_combine_vec(reduction_type, value, body_iter_arg)
         self.compute_body_loop.reduction_vars[body_acc] = (reduction_type, body_iter_arg, iterator, reduced_shape)
         self.compute_body_loop.affine_yield[result] = reduced_shape
+
+        # Final reduction
+        reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-1]
+        assert(vec_len % reduction_size==0)
+        if vec_len > reduction_size:
+            init = self.cse.generate(self.reductions_suffix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
+            if reduction_size == 1:
+                final_reduced_shape = f"{type_name}"
+                out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, acc, init, axis=0, shape=reduced_shape, reduced_shape=final_reduced_shape))
+            else:
+                final_reduced_shape = f"vector<{reduction_size}x{type_name}>"
+                init_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{init} : {type_name} to {final_reduced_shape}")
+                new_vshape= f"vector<{vec_len//reduction_size}x{reduction_size}x{type_name}>"
+                value = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{acc} : {reduced_shape} to {new_vshape}")
+                out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, value, init_vec, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape))
+            acc = out
+
+        # reigster reduction output
+        var_info = [reduction_size, mlir_common.DTYPE_TO_MLIR[dtype]]
+        self.register_var_info(acc, var_info)
         return acc
 
     def store_reduction(self, name, index, value):
@@ -986,9 +1018,11 @@ def store_reduction(self, name, index, value):
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
-        vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
-        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
-
+        compute_vec_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-1]
+        if compute_vec_size == 1:
+            vshape = f"{mlir_dtype}"
+        else:
+            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var,
                                                                          index, buffer=self.reduction_suffix)
         if self.welford_reduce_out is not None:
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 08792009..f6dd38b3 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -271,9 +271,20 @@ def get_compute_vec_size(self):
         # Granule size used in compute loop
         if self.nr_rdim:
             assert self.nr_rdim==1
-            return self.get_numel_per_lane() // self._tile_size[-1]
-        if self.vlane_stride < 16 and (self.get_numel_per_lane() // 16 >= 1):
-            return 16
+            val = self.get_numel_per_lane() // self._tile_size[-1]
+            if self.get_numel_per_lane() >= val * 8:
+                return val*8
+            elif self.get_numel_per_lane() >= val * 4:
+                return val*4
+            elif self.get_numel_per_lane() >= val * 2:
+                return val*2
+            return val
+        if (self.get_numel_per_lane() // self.vlane_stride) >= 8:
+            return self.vlane_stride * 8
+        if (self.get_numel_per_lane() // self.vlane_stride) >= 4:
+            return self.vlane_stride * 4
+        if (self.get_numel_per_lane() // self.vlane_stride) >= 2:
+            return self.vlane_stride * 2
         return self.vlane_stride
 
     @staticmethod
@@ -425,8 +436,8 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         # Dummy tile size
         tile_size = [1] * (len(vars) + len(reduction_vars))
         if len(tile_size) == 2:
-            tile_size[-1] = 512
-            tile_size[-2] = 512
+            tile_size[-1] = 1024
+            tile_size[-2] = 2048
         elif len(tile_size) == 0: # Scalar
             tile_size = [1]
             self.ranges = [1]
@@ -440,7 +451,7 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
             raise NotImplementedError("dummy tile size fail!")
 
         vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
-        vlane_stride = 8 # TODO: VCIX widening is not implemented
+        vlane_stride = 8
 
         # FIXME: Naive tile size decrement
         def decrease_tile_size(tile_size):

From 7903df93914d562b305cb19d8db74be173d40b74 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 4 Apr 2025 13:02:05 +0000
Subject: [PATCH 266/432] [Script] Print functionality result

---
 experiments/BERT.py                    |  6 +++-
 experiments/conv.py                    |  6 +++-
 experiments/gemm.py                    |  6 +++-
 scripts/get_tog_result.sh              | 14 ++++++--
 tests/Fusion/test_addmm_residual.py    |  6 +++-
 tests/Fusion/test_matmul_activation.py |  6 +++-
 tests/Fusion/test_matmul_scalar.py     |  6 +++-
 tests/Mixtral_8x7B/test_attention.py   |  8 +++--
 tests/test_activation.py               |  6 +++-
 tests/test_add.py                      |  7 +++-
 tests/test_batchnorm.py                |  6 +++-
 tests/test_bmm.py                      |  6 +++-
 tests/test_cnn.py                      |  6 +++-
 tests/test_compile_overhead.py         | 45 ++++++++++++++++++++++++++
 tests/test_conv2d.py                   |  6 +++-
 tests/test_exponent.py                 |  6 +++-
 tests/test_indirect_access.py          |  6 +++-
 tests/test_layernorm.py                |  6 +++-
 tests/test_matmul.py                   |  6 +++-
 tests/test_mlp.py                      |  6 +++-
 tests/test_pool.py                     |  6 +++-
 tests/test_reduce.py                   | 22 +++++++++++--
 tests/test_resnet.py                   |  6 +++-
 tests/test_single_perceptron.py        |  6 +++-
 tests/test_sparse_core.py              |  6 +++-
 tests/test_stonne.py                   |  6 +++-
 tests/test_transformer.py              |  6 +++-
 tests/test_transpose2D.py              |  6 +++-
 tests/test_transpose3D.py              |  6 +++-
 tests/test_view3D_2D.py                |  6 +++-
 30 files changed, 214 insertions(+), 32 deletions(-)
 create mode 100644 tests/test_compile_overhead.py

diff --git a/experiments/BERT.py b/experiments/BERT.py
index 35f85631..e954131b 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -9,12 +9,16 @@
 import datetime
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/experiments/conv.py b/experiments/conv.py
index 48bfea60..1a8bc0b5 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -7,12 +7,16 @@
 import datetime
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/experiments/gemm.py b/experiments/gemm.py
index 5ac4baf4..648a7221 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -7,12 +7,16 @@
 import datetime
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/scripts/get_tog_result.sh b/scripts/get_tog_result.sh
index 86401c72..f796df26 100755
--- a/scripts/get_tog_result.sh
+++ b/scripts/get_tog_result.sh
@@ -19,5 +19,15 @@ while IFS= read -r line; do
             echo "File not found: $file_path"
         fi
     fi
-done
-echo "Accumulated Total Cycle: $total_cycles"
\ No newline at end of file
+    # Check if the line ends with "Test Passed|"
+    if [[ "$line" == *"Test Passed|" ]]; then
+        echo "$line"
+        echo "Accumulated Total Cycle: $total_cycles"
+        total_cycles=0
+    fi
+    if [[ "$line" == *"Test Failed|" ]]; then
+        echo "$line"
+        echo "Accumulated Total Cycle: $total_cycles"
+        total_cycles=0
+    fi
+done
\ No newline at end of file
diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py
index 10f387e8..a5e05182 100644
--- a/tests/Fusion/test_addmm_residual.py
+++ b/tests/Fusion/test_addmm_residual.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/Fusion/test_matmul_activation.py b/tests/Fusion/test_matmul_activation.py
index fc7960c5..2381bd8c 100644
--- a/tests/Fusion/test_matmul_activation.py
+++ b/tests/Fusion/test_matmul_activation.py
@@ -4,12 +4,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/Fusion/test_matmul_scalar.py
index b29f37f8..0dcb54f9 100644
--- a/tests/Fusion/test_matmul_scalar.py
+++ b/tests/Fusion/test_matmul_scalar.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index 0040b90e..cc2adc96 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -5,12 +5,16 @@
 from model import Transformer, TransformerBlock, ModelArgs, Attention, FeedForward, KVCache, precompute_freqs_cis, sample
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
@@ -143,7 +147,7 @@ def concat_tensors(a, b):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_decode(device, 33, 3)
+    test_decode(device, 32, 3)
     #test_concat(device, size1=(1, 8, 32, 64), size2=(1,8,1,64), dim=2)
     #test_attention(device)
     #test_ffn(device)
diff --git a/tests/test_activation.py b/tests/test_activation.py
index 066074df..de3542c3 100644
--- a/tests/test_activation.py
+++ b/tests/test_activation.py
@@ -4,12 +4,16 @@
 import torch.nn.functional as F
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_add.py b/tests/test_add.py
index d258a3ee..5e1ab15e 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
@@ -57,6 +61,7 @@ def vectoradd(a, b):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
+    test_vectoradd(device, (1, 1))
     test_vectoradd(device, (47, 10))
     test_vectoradd(device, (128, 128))
     test_vectoradd(device, (4071, 429))
diff --git a/tests/test_batchnorm.py b/tests/test_batchnorm.py
index bb8d529f..8c78fb97 100644
--- a/tests/test_batchnorm.py
+++ b/tests/test_batchnorm.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_bmm.py b/tests/test_bmm.py
index 1114c750..6d9279aa 100644
--- a/tests/test_bmm.py
+++ b/tests/test_bmm.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_cnn.py b/tests/test_cnn.py
index 2d96fe7a..978243d8 100644
--- a/tests/test_cnn.py
+++ b/tests/test_cnn.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py
new file mode 100644
index 00000000..cf0dc1bb
--- /dev/null
+++ b/tests/test_compile_overhead.py
@@ -0,0 +1,45 @@
+import os
+import time
+import sys
+import torch
+from torchvision.models import resnet18 as model1
+import argparse
+import shutil
+
+sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator
+CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+
+if __name__ == "__main__":
+    target_model1 = model1().eval()
+
+    # Init scheduler
+    for i in range(1):
+        timestamp = time.time()  # current timestamp (in seconds)
+        print(f"[{i}] Time Stamp: {timestamp:.6f}")  # print with six decimal places
+        #try:
+        #    shutil.rmtree("/tmp/torchinductor")
+        #except FileNotFoundError:
+        #    print("no cache")
+        scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json")
+        # Register compiled model
+        opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
+        SchedulerDNNModel.register_model("resnet18", opt_model1)
+
+        # Generate time stamp
+        for request_time in [0]*12:
+            # Init input data
+            model_input1 = torch.randn(1, 3, 224, 224)
+
+            # Init request
+            new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0)
+
+            # Add request to scheduler
+            print("[Reqest] Resnet18 request time: ", request_time, flush=True)
+            scheduler.add_request(new_request1, request_time=request_time)
+
+        # Run scheduler
+        while not scheduler.is_finished():
+            scheduler.schedule()
+
+    print("Done", file=sys.stderr)
\ No newline at end of file
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 92ac959c..55c4821e 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_exponent.py b/tests/test_exponent.py
index 536bef13..c95823cb 100644
--- a/tests/test_exponent.py
+++ b/tests/test_exponent.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py
index 16d8afd3..6d16c9d0 100644
--- a/tests/test_indirect_access.py
+++ b/tests/test_indirect_access.py
@@ -4,12 +4,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py
index 64d4cbe5..1cea9d9f 100644
--- a/tests/test_layernorm.py
+++ b/tests/test_layernorm.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_matmul.py b/tests/test_matmul.py
index 232eb5b4..44f70b69 100644
--- a/tests/test_matmul.py
+++ b/tests/test_matmul.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_mlp.py b/tests/test_mlp.py
index 2787499c..b8118aa3 100644
--- a/tests/test_mlp.py
+++ b/tests/test_mlp.py
@@ -4,12 +4,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_pool.py b/tests/test_pool.py
index e50c700e..e8d99a57 100644
--- a/tests/test_pool.py
+++ b/tests/test_pool.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_reduce.py b/tests/test_reduce.py
index 512d9e36..0289da61 100644
--- a/tests/test_reduce.py
+++ b/tests/test_reduce.py
@@ -3,12 +3,30 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
@@ -39,7 +57,7 @@ def reduce_sum(a, dim, keepdim):
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
-    parser.add_argument('--shape', type=str, default="(512,768)")
+    parser.add_argument('--shape', type=str, default="(128,768)")
     args = parser.parse_args()
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index dc021174..d76ec4a8 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -4,12 +4,16 @@
 from torchvision.models import resnet18
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py
index 78a6b117..c7fdca06 100644
--- a/tests/test_single_perceptron.py
+++ b/tests/test_single_perceptron.py
@@ -4,12 +4,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py
index 3d368175..b2b16818 100644
--- a/tests/test_sparse_core.py
+++ b/tests/test_sparse_core.py
@@ -5,12 +5,16 @@
 import torch.nn.utils.prune as prune
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_stonne.py b/tests/test_stonne.py
index f82f833b..5e4fe5fb 100644
--- a/tests/test_stonne.py
+++ b/tests/test_stonne.py
@@ -14,12 +14,16 @@ def apply_pruning(tensor, sparsity):
     tensor *= mask
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index 8716ba06..c760008b 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -5,12 +5,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_transpose2D.py b/tests/test_transpose2D.py
index afc17a23..14f16fbb 100644
--- a/tests/test_transpose2D.py
+++ b/tests/test_transpose2D.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_transpose3D.py b/tests/test_transpose3D.py
index d19ea242..937948c4 100644
--- a/tests/test_transpose3D.py
+++ b/tests/test_transpose3D.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)
diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py
index f943e20e..a5a31a85 100644
--- a/tests/test_view3D_2D.py
+++ b/tests/test_view3D_2D.py
@@ -3,12 +3,16 @@
 import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
         print("-" * len(message))
         print(message)
         print("-" * len(message))
     else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
         exit(1)

From b26f1dcbf3c399d0a82c95f012686204439bb6b9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 4 Apr 2025 13:21:12 +0000
Subject: [PATCH 267/432] [Tests] Add vector ops experiment script

---
 scripts/get_tog_result.sh |  3 +++
 tests/test_reduce.py      | 25 ++++++-------------------
 tests/test_vectorops.py   | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 19 deletions(-)
 create mode 100644 tests/test_vectorops.py

diff --git a/scripts/get_tog_result.sh b/scripts/get_tog_result.sh
index f796df26..9359e1e5 100755
--- a/scripts/get_tog_result.sh
+++ b/scripts/get_tog_result.sh
@@ -30,4 +30,7 @@ while IFS= read -r line; do
         echo "Accumulated Total Cycle: $total_cycles"
         total_cycles=0
     fi
+    if [[ "$line" == *"[log]"* ]]; then
+        echo "$line"
+    fi
 done
\ No newline at end of file
diff --git a/tests/test_reduce.py b/tests/test_reduce.py
index 0289da61..c1556787 100644
--- a/tests/test_reduce.py
+++ b/tests/test_reduce.py
@@ -17,20 +17,6 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
-
 def test_reduce_sum(device, size, dim, keepdim=False):
     def reduce_sum(a, b, dim, keepdim):
         return torch.sum(a + b, axis=dim, keepdim=keepdim)
@@ -41,7 +27,7 @@ def reduce_sum(a, b, dim, keepdim):
     out = reduce_sum(x.cpu(), y.cpu(), dim, keepdim)
     test_result("ReduceSum", res, out)
 
-def test_reduce_sum2(device, size, dim, keepdim=False):
+def test_reduce_sum2(device, size, dim=-1, keepdim=False):
     def reduce_sum(a, dim, keepdim):
         return torch.sum(a, axis=dim, keepdim=keepdim)
     x = torch.randn(size).to(device=device)
@@ -64,8 +50,9 @@ def reduce_sum(a, dim, keepdim):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_reduce_sum(device, (29, 47), 1, keepdim=True)
-    test_reduce_sum(device, (17, 68), 0, keepdim=True)
-    test_reduce_sum(device, (327, 447), 1, keepdim=True)
-    test_reduce_sum(device, (327, 447), 0, keepdim=True)
+    #test_reduce_sum(device, (29, 47), 1, keepdim=True)
+    #test_reduce_sum(device, (17, 68), 0, keepdim=True)
+    #test_reduce_sum(device, (327, 447), 1, keepdim=True)
+    #test_reduce_sum(device, (327, 447), 0, keepdim=True)
+    test_reduce_sum2(device, shape)
 
diff --git a/tests/test_vectorops.py b/tests/test_vectorops.py
new file mode 100644
index 00000000..0677b7ae
--- /dev/null
+++ b/tests/test_vectorops.py
@@ -0,0 +1,32 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    
+    # Target shape
+    seq_list = [1,128,512,2048,8192]
+    d_model = 768
+    from tests.test_add import test_vectoradd
+    from tests.test_activation import test_GeLU
+    from tests.test_reduce import test_reduce_sum2
+    from tests.test_layernorm import test_LayerNorm
+    from tests.test_softmax import test_softmax
+    func_list = [test_vectoradd, test_GeLU, test_reduce_sum2, test_LayerNorm, test_softmax]
+    for test_func in func_list:
+        for seq in seq_list:
+            if test_func == test_GeLU:
+                print(f"[log] {test_func.__name__}, seq: {seq}")
+                test_func(device, size=[seq, d_model*4])
+            elif test_func == test_softmax:
+                print(f"[log] {test_func.__name__}, seq: {seq}")
+                test_func(device, size=[seq, seq])
+            else:
+                print(f"[log] {test_func.__name__}, seq: {seq}")
+                test_func(device, size=[seq, d_model])

From 289bd811324272462b54cbe63cf117effdfc16ac Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 4 Apr 2025 14:32:04 +0000
Subject: [PATCH 268/432] [Frontend] Use more vectorlane for
 small-shape-reductions

---
 PyTorchSimFrontend/mlir/mlir_common.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index f6dd38b3..d743b187 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -506,6 +506,11 @@ def decrease_tile_size(tile_size):
             else:
                 spad_overflow = False
 
+        # Maximize the utilization of vectorlane
+        if len(reduction_vars):
+            minimum_stride = max(self.roundup_vectorlane(tile_size[vlane_split_axis]) // self.vector_lane, 2)
+            vlane_stride = min(minimum_stride, 8)
+
         # Handle scalar case
         if len(self.ranges)==1 and self.ranges[0] == 1:
             vlane_stride = 1

From b618d2985342e22fdcf6e99fa00dd8c0761e1878 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 8 Apr 2025 08:34:33 +0000
Subject: [PATCH 269/432] [Issue] TOG parser double free

---
 PyTorchSimBackend/src/TileGraphParser.cc | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 4985d2f9..d350ae87 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -541,16 +541,17 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       inst->set_overlapping_cycle(compute_node->get_overlapping_cycle());
       inst->set_compute_type(compute_node->get_compute_type());
 
+      // FIXME: double free error
       /* Check should we have to skip */
-      auto output_idx_list = calc_output_idx(tog_parser, iter); // (M,N,K) order
-      if (compute_node->get_compute_type() == 1 && output_idx_list.size() == 3) { // FIXME. hardcoded type
-        bool skip = find_output_idx(tog_parser, output_idx_list);
-        if (skip) {
-          inst->set_compute_cycle(0);
-          inst->set_overlapping_cycle(0);
-          spdlog::trace("[TOGParser/Sparse] Skip output tile index: {}", fmt::join(output_idx_list, ","));
-        }
-      }
+      // auto output_idx_list = calc_output_idx(tog_parser, iter); // (M,N,K) order
+      // if (compute_node->get_compute_type() == 1 && output_idx_list.size() == 3) { // FIXME. hardcoded type
+      //   bool skip = find_output_idx(tog_parser, output_idx_list);
+      //   if (skip) {
+      //     inst->set_compute_cycle(0);
+      //     inst->set_overlapping_cycle(0);
+      //     spdlog::trace("[TOGParser/Sparse] Skip output tile index: {}", fmt::join(output_idx_list, ","));
+      //   }
+      // }
 
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);

From d8abb99cebe2d2caf9f5660062b3731e66bfb6d5 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 8 Apr 2025 08:39:14 +0000
Subject: [PATCH 270/432] [TOGSim] distinguish preload & compute

---
 AsmParser/tog_generator.py                | 5 ++---
 PyTorchSimBackend/include/Core.h          | 3 ++-
 PyTorchSimBackend/src/Core.cc             | 3 ++-
 PyTorchSimFrontend/extension_codecache.py | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 0616a21b..1dea2f8d 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -210,7 +210,6 @@ def parse_graph(self):
 
     def generate_tile_graph(self, name="tile_graph", cycle_list=list, x_offset=int, w_offset=int, vector_lane=int, stonneGraph=False):
         node_list = list(self.node_dict.values())[1:]
-        is_preload = True # FIXME: first systolic array node is preload
         if len(node_list):
             node_list[0].set_parent([])
             for iter_node in self.node_dict.values():
@@ -221,9 +220,9 @@ def generate_tile_graph(self, name="tile_graph", cycle_list=list, x_offset=int,
                         print("[TOGGen] Error compute cycle timing is missing...!")
                         iter_node.torchsim_cycle = 10
                     # FIXME.
-                    if iter_node.torchsim_compute_type == 1:
+                    if iter_node.torchsim_compute_type > 0:
+                        is_preload = iter_node.torchsim_compute_type == 2
                         offset = w_offset if is_preload else x_offset
-                        is_preload = False
                         iter_node.torchsim_overlapping_cycle = max(iter_node.torchsim_cycle - offset, 0)
 
         origin_info = "_".join(map(str, self.origins))
diff --git a/PyTorchSimBackend/include/Core.h b/PyTorchSimBackend/include/Core.h
index dfc81686..9a656d57 100644
--- a/PyTorchSimBackend/include/Core.h
+++ b/PyTorchSimBackend/include/Core.h
@@ -32,7 +32,8 @@ class Core {
   std::queue<std::shared_ptr<Instruction>>& get_compute_pipeline(int compute_type);
   enum {
     VECTOR_UNIT,
-    SYSTOLIC_ARRAY,
+    MATMUL,
+    PRELOAD,
     NR_COMPUTE_UNIT
   };
 
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index af71c290..45bba879 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -20,6 +20,7 @@ Core::Core(uint32_t id, SimulationConfig config)
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
   /* Check SRAM is enough to run tile */
+  assert(op->get_required_sram_size() <= _sram_size);
   return op->get_required_sram_size() + _used_sram_size <= _sram_size && !op->is_stonne_tile();
 }
 
@@ -48,7 +49,7 @@ std::shared_ptr<Tile> Core::pop_finished_tile() {
 std::queue<std::shared_ptr<Instruction>>& Core::get_compute_pipeline(int compute_type) {
   if (compute_type == VECTOR_UNIT)
     return _vu_compute_pipeline;
-  else if (compute_type == SYSTOLIC_ARRAY) {
+  else if (compute_type == MATMUL || compute_type == PRELOAD) {
     uint32_t sa_idx = _systolic_array_rr;
     _systolic_array_rr = (_systolic_array_rr + 1) % _num_systolic_array_per_core;
     return _sa_compute_pipeline.at(sa_idx);
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index f58605cc..afe569e6 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -217,7 +217,7 @@ def load(cls, source_code,
             x_offset = kwargs['loop_size'][-3]
         if kwargs['loop_size'] is not None and kwargs['loop_size'][-1] < vectorlane_size:
             w_offset = kwargs['loop_size'][-1]
-        w_offset = max(w_offset - x_offset, 0)
+        w_offset = 0 # max(w_offset - x_offset, 0)
         tile_graph_generator = tog_generator(origins)
         tile_graph_generator.load_file(raw_tog_path)
         tile_graph_generator.generate_tile_graph(

From f266b3b9771500dc9b39860036ff7fe8b6f6f413 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 8 Apr 2025 08:40:30 +0000
Subject: [PATCH 271/432] [Frontend] Avoid Row Conflict Tile Size

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 10 +++++-----
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 6eba7a39..3e69f979 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -632,7 +632,7 @@ def render(self,
         TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_K = TILE_K
         TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
         TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
         SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1
@@ -649,7 +649,7 @@ def render(self,
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-          SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
+          SUB_TILE_K = TILE_K
           x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
           w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_H * TILE_K, TILE_N)
           y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
@@ -666,7 +666,7 @@ def render(self,
           y_spad_size = TILE_O_H * TILE_M * TILE_N
           SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-          SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
+          SUB_TILE_K = TILE_K
           TOG_latency = O_W if TILE_M > O_W else TILE_M
         elif self.is_single_batch(BATCH) and self.stride[0] == 1:
           conv_template = SINGLE_BATCH_CONV_TEMPLATE
@@ -675,13 +675,13 @@ def render(self,
           TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
           SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-          SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
+          SUB_TILE_K = TILE_K
           x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H, TILE_K)
           y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H  * TILE_M, TILE_N)
           x_spad_size = TILE_I_W * TILE_I_H * TILE_K
           y_spad_size = TILE_O_H * TILE_M * TILE_N
           TOG_latency = O_W if TILE_M > O_W else TILE_M
-
+        TOG_latency = 8 if TOG_latency < 8 else TOG_latency
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
 
         kernel.render_options = dict(
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 84b8bdbb..00d23e76 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -136,11 +136,11 @@ def render(self,
         TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)
         TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
         TILE_K = min(extension_config.CONFIG_FORCE_TILE_K, TILE_K)
-        TOG_latency = M if TILE_M > M else TILE_M
-        kernel.loop_size =[TOG_latency, TILE_N, TILE_K]
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_N = TILE_N
+        SUB_TILE_K = TILE_K
+        TOG_latency = M if SUB_TILE_M > M else SUB_TILE_M
+        kernel.loop_size =[TOG_latency, SUB_TILE_N, SUB_TILE_K]
 
         W_transposed = self.is_transposed(W)
         X_transposed = self.is_transposed(X)

From 9149048e4d2a1e928224bcc84ad1b1b55d76eb3e Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 8 Apr 2025 08:40:54 +0000
Subject: [PATCH 272/432] [Frontend] pseudo-auto-tune

---
 PyTorchSimFrontend/mlir/mlir_template.py | 48 ++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index e26d9c0d..f870c443 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -140,10 +140,10 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, pad_k=True):
         maximize_i_j = 1 # reuse weight
         for k in tile_K_range:
             tile_K = k * self.vector_lane if K > self.vector_lane else K_padded
-            for j in tile_N_range:
-                tile_N = j * self.vector_lane if N > self.vector_lane else N_padded
-                for i in tile_M_range:
-                    tile_M = i * self.vector_lane if M > self.vector_lane else M_padded
+            for i in tile_M_range:
+                tile_M = i * self.vector_lane if M > self.vector_lane else M_padded
+                for j in tile_N_range:
+                    tile_N = j * self.vector_lane if N > self.vector_lane else N_padded
                     used_spad_size = (tile_M * tile_K + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * self.precision
                     weight_size_per_lane = self.get_spad_size_per_lane(tile_K, tile_N)
                     input_size_per_lane = self.get_spad_size_per_lane(tile_M, tile_K)
@@ -155,6 +155,42 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, pad_k=True):
                         mapping = (tile_M, tile_N, tile_K)
         return mapping
 
+    def search_mapping_space(self, mapping, idx, increment, stride, dilation, n_extra_node=0):
+        if idx == 0 or idx == 1 or idx == 4 or idx == 5 or idx == 6:
+            raise NotImplementedError("Only O_H and O_W are supported for search_mapping_space")
+        spad_size_per_lane = self.spad_info["spad_size"]
+        spad_size = spad_size_per_lane * self.vector_lane
+        max_spad_size = spad_size // 2 # double buffer
+        max_spad_per_lane = spad_size_per_lane // 2 # double buffer
+
+        mapping = list(mapping)
+        mapping[idx] += increment
+        k_h, k_w, o_h, o_w, M, N, K = mapping
+        i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]
+        i_w = 1 + (o_w - 1) * stride[1] + (k_w - 1) * dilation[1]
+        weight_size = k_w * k_h * K * N
+        input_size = i_w * i_h * M * K
+        output_size = o_w * o_h * M * N
+        used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
+        weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
+        input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
+        output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M  * (1 + n_extra_node), N)
+        used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+        if used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane:
+            mapping = (k_h, k_w, o_h, o_w, M, N, K)
+        else:
+            mapping[idx] -= increment
+
+        return mapping
+
+    def pseudo_auto_tune(self, mapping, stride, dilation, n_extra_node=0):
+        # pseudo auto-tune
+        if mapping[2] == 1:
+            mapping = self.search_mapping_space(mapping, 2, 1, stride, dilation, n_extra_node=n_extra_node)
+        if mapping[3] == 1:
+            mapping = self.search_mapping_space(mapping, 3, 1, stride, dilation, n_extra_node=n_extra_node)
+        return mapping
+
     def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0):
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
@@ -182,6 +218,10 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
                             max_used_spad_size = used_spad_size
                             max_k_h_w = k_h * k_w
                             mapping = (k_h, k_w, o_h, o_w, M, N, K)
+
+        # FIXME: this should be implemented with auto-tuning
+        mapping = self.pseudo_auto_tune(mapping, stride, dilation, n_extra_node=n_extra_node)
+
         if max_used_spad_size == 0:
             raise RuntimeError("Cannot find a valid mapping")
         return mapping

From 448aa6b7361c86f428d9de7cebae6396c765304b Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 8 Apr 2025 08:42:04 +0000
Subject: [PATCH 273/432] [Fix] Experiments setting

---
 .../configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json        | 2 +-
 experiments/conv.py                                             | 2 +-
 experiments/gemm.py                                             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
index 98943fae..cd8396ff 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
@@ -17,7 +17,7 @@
 
   "icnt_type" : "simple",
   "icnt_latency" : 7,
-  "icnt_freq" : 14000,
+  "icnt_freq" : 15000,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
 
   "precision" : 4,
diff --git a/experiments/conv.py b/experiments/conv.py
index 1a8bc0b5..bd587edc 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -27,7 +27,7 @@ def custom_conv2d(a, b, bias):
         o_c = b.shape[0]
         conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=False)
         conv2d.weight = torch.nn.Parameter(b)
-        conv2d.bias = torch.nn.Parameter(bias)
+        # conv2d.bias = torch.nn.Parameter(bias)
         return conv2d(a)
     torch.manual_seed(0)
     conv_input = torch.randn(batch_size, i_c, i_h, i_w).to(memory_format=torch.channels_last, device=device)
diff --git a/experiments/gemm.py b/experiments/gemm.py
index 648a7221..b9c24fed 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -43,7 +43,7 @@ def custom_matmul(a, b):
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
     config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0] # extract config name from config path
+    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[128, 128, 128], help='M K N')

From e9df4ba1820de7e827310f2cc0a08ad70176b5ba Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 9 Apr 2025 15:06:14 +0000
Subject: [PATCH 274/432] [Backendsim] Fix performance degradation

---
 .../systolic_ws_8x8_c1_12G_simple_noc.json    |  2 +-
 PyTorchSimBackend/include/Core.h              |  5 +-
 PyTorchSimBackend/include/Instruction.h       |  7 ++-
 PyTorchSimBackend/include/TMA.h               | 37 +++++++--------
 PyTorchSimBackend/include/TileGraphParser.h   |  7 +++
 PyTorchSimBackend/src/Core.cc                 | 46 ++++++++++---------
 PyTorchSimBackend/src/Instruction.cc          | 28 ++++++-----
 PyTorchSimBackend/src/SparseCore.cc           | 14 +++---
 PyTorchSimBackend/src/TMA.cc                  | 10 ++--
 PyTorchSimBackend/src/TileGraphParser.cc      | 14 ++++--
 10 files changed, 95 insertions(+), 75 deletions(-)

diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json
index 8aee751b..e9a64f2e 100644
--- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json
+++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json
@@ -16,7 +16,7 @@
  
   "icnt_type" : "simple",
   "icnt_latency" : 1,
-  "icnt_freq" : 8000,
+  "icnt_freq" : 1000,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt",
  
   "precision" : 4,
diff --git a/PyTorchSimBackend/include/Core.h b/PyTorchSimBackend/include/Core.h
index 9a656d57..c151be4f 100644
--- a/PyTorchSimBackend/include/Core.h
+++ b/PyTorchSimBackend/include/Core.h
@@ -1,6 +1,6 @@
 #pragma once
 #include <robin_hood.h>
-
+#include <unordered_set>
 #include <memory>
 #include <vector>
 #include <fmt/core.h>
@@ -83,7 +83,8 @@ class Core {
   std::queue<std::shared_ptr<Instruction>> _ld_inst_queue;
   std::queue<std::shared_ptr<Instruction>> _st_inst_queue;
 
-  std::vector<std::shared_ptr<Instruction>> _dma_waiting_queue;
+  std::unordered_map<Instruction*, std::shared_ptr<Instruction>> _dma_waiting_queue;
+  std::vector<std::shared_ptr<Instruction>> _dma_finished_queue;
   /* Interconnect queue */
   std::queue<mem_fetch*> _request_queue;
   std::queue<mem_fetch*> _response_queue;
diff --git a/PyTorchSimBackend/include/Instruction.h b/PyTorchSimBackend/include/Instruction.h
index 86979530..45fe983e 100644
--- a/PyTorchSimBackend/include/Instruction.h
+++ b/PyTorchSimBackend/include/Instruction.h
@@ -51,7 +51,7 @@ class Instruction {
   void set_compute_cycle(cycle_type cycle) { compute_cycle = cycle; }
   void set_indirect_index_path(std::string indirect_path) { _is_indirect_mode=true; _indirect_index_path=indirect_path; }
   void print();
-  std::set<addr_type> get_dram_address(addr_type dram_req_size);
+  std::shared_ptr<std::set<addr_type>> get_dram_address(addr_type dram_req_size);
   std::vector<addr_type> get_trace_address() { return _trace_address; }
   bool load_indirect_index(const std::string& path, uint64_t*& indirect_index, const std::vector<uint64_t>& tile_size);
   void set_trace_address(std::vector<addr_type>& trace_address) { _trace_address = trace_address; }
@@ -71,11 +71,13 @@ class Instruction {
   std::vector<int>& get_tag_idx_list() { return _tag_idx_list; }
   std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
   std::vector<int>& get_tag_id() { return _tag_key; }
-  void set_addr_name(std::string name) { _addr_name = name; }
+  void set_addr_name(std::string name, int id) { _addr_name = name; _addr_id = id; }
   std::string get_addr_name() { return _addr_name; }
+  int get_addr_id() { return _addr_id; }
   void set_nr_inner_loop(int nr) { _nr_inner_loop = nr; }
   int get_nr_inner_loop() { return _nr_inner_loop; }
   void set_is_async(bool is_async) { _is_async_dma = is_async; }
+  void prepare_tag_key();
 
   cycle_type start_cycle;
   cycle_type finish_cycle;
@@ -107,6 +109,7 @@ class Instruction {
   std::vector<int> _loop_size_list;
   std::vector<addr_type> _trace_address;
   std::string _addr_name;
+  int _addr_id;
   int _nr_inner_loop = 0;
   bool _is_async_dma=false;
   bool _is_indirect_mode=false;
diff --git a/PyTorchSimBackend/include/TMA.h b/PyTorchSimBackend/include/TMA.h
index 5d08a882..964969a8 100644
--- a/PyTorchSimBackend/include/TMA.h
+++ b/PyTorchSimBackend/include/TMA.h
@@ -23,22 +23,22 @@ class TMA {
   void issue_tile(std::shared_ptr<Instruction> inst);
   bool is_finished() { return _finished; }
   bool empty() { return _current_inst==nullptr; }
-  void register_tag(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  void register_tag(int subgraph_id, std::vector<int>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
-      tag_table[subgraph_id] = std::map<std::pair<std::string, std::vector<int>>, uint32_t>();
-      waiters[subgraph_id] = std::map<std::pair<std::string, std::vector<int>>, std::vector<std::shared_ptr<Instruction>>>();
+      tag_table[subgraph_id] = std::map<std::vector<int>, uint32_t>();
+      waiters[subgraph_id] = std::map<std::vector<int>, std::vector<std::shared_ptr<Instruction>>>();
     }
     tag_table[subgraph_id][key] = 0;
     waiters[subgraph_id][key] = std::vector<std::shared_ptr<Instruction>>();
   }
-  void set_tag_finish(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  void set_tag_finish(int subgraph_id, std::vector<int>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
       throw std::runtime_error("Subgraph does not exist in tag_table");
     }
     tag_table[subgraph_id][key] = 1;
   }
 
-  void mark_tag_used(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  void mark_tag_used(int subgraph_id, std::vector<int>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
       throw std::runtime_error("Subgraph does not exist in tag_table");
     } else if (!tag_table[subgraph_id][key]) {
@@ -50,21 +50,18 @@ class TMA {
   void check_table() {
     for (const auto& entry: tag_table) {
       auto subgraph_id = entry.first;
-      for (const auto& tag_key: tag_table[subgraph_id]) {
-        const auto& tag_pair = tag_key.first;
-        const std::string& tag_name = tag_pair.first;
-        const std::vector<int>& tag_values = tag_pair.second;
-        uint32_t value = tag_key.second;
-
+      for (const auto& tag_entry: tag_table[subgraph_id]) {
+        const std::vector<int>& tag_key = tag_entry.first;
+        uint32_t value = tag_entry.second;
         if (value == 1) {
-          spdlog::warn("[Tag Table][{}] Unused tag found: (name={}, key={}, val={})",
-            subgraph_id, tag_name, fmt::format("[{}]", fmt::join(tag_values, ", ")), value);
+          spdlog::warn("[Tag Table][{}] Unused tag found: (key={}, val={})",
+            subgraph_id, fmt::format("[{}]", fmt::join(tag_key, ", ")), value);
         }
       }
     }
   }
 
-  bool tag_key_exist(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  bool tag_key_exist(int subgraph_id, std::vector<int>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     if (subgraph_it == tag_table.end())
       return false;
@@ -73,7 +70,7 @@ class TMA {
     auto key_it = key_map.find(key);
     return key_it != key_map.end();
   }
-  bool get_tag_finish(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  bool get_tag_finish(int subgraph_id, std::vector<int>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -90,7 +87,7 @@ class TMA {
     tag_table.erase(subgraph_id);
     waiters.erase(subgraph_id);
   }
-  void register_tag_waiter(int subgraph_id, const std::pair<std::string, std::vector<int>>& key, std::shared_ptr<Instruction> inst) {
+  void register_tag_waiter(int subgraph_id, std::vector<int>& key, std::shared_ptr<Instruction> inst) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -99,7 +96,7 @@ class TMA {
     }
     waiters[subgraph_id][key].push_back(inst);
   }
-  std::vector<std::shared_ptr<Instruction>>& get_tag_waiter(int subgraph_id, const std::pair<std::string, std::vector<int>>& key) {
+  std::vector<std::shared_ptr<Instruction>>& get_tag_waiter(int subgraph_id, std::vector<int>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -110,7 +107,7 @@ class TMA {
   }
 
   std::shared_ptr<Instruction>& get_current_inst() { return _current_inst; }
-  std::vector<mem_fetch*> get_memory_access();
+  std::shared_ptr<std::vector<mem_fetch*>> get_memory_access();
   uint32_t generate_mem_access_id();
   const uint32_t get_max_dim() { return _max_dim; }
 
@@ -124,7 +121,7 @@ class TMA {
   size_t _tile_idx_stride=1;
   uint32_t _tile_idx;
   bool _finished=true;
-  std::map<int, std::map<std::pair<std::string, std::vector<int>>, uint32_t>> tag_table;
-  std::map<int, std::map<std::pair<std::string, std::vector<int>>, std::vector<std::shared_ptr<Instruction>>>> waiters;
+  std::map<int, std::map<std::vector<int>, uint32_t>> tag_table;
+  std::map<int, std::map<std::vector<int>, std::vector<std::shared_ptr<Instruction>>>> waiters;
 };
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index a8a97bb4..97e808c6 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -94,6 +94,12 @@ class TileGraphParser {
     return new_path.string();
   }
   void inc_indirect_counter() { indirect_counter++; }
+  int register_addr_name(const std::string& addr_name) {
+    if (_addr_name_map.find(addr_name) == _addr_name_map.end())
+      _addr_name_map[addr_name] = _addr_name_map.size();
+    return _addr_name_map[addr_name];
+  }
+  int get_addr_name_id(const std::string& addr_name) { return _addr_name_map[addr_name]; }
 
  private:
   void register_tile(std::shared_ptr<TileNode> tile_node);
@@ -115,6 +121,7 @@ class TileGraphParser {
   std::map<std::string, std::tuple<int, int, LoopType>> _loop_size_map;
   std::map<std::string, std::string> _tog_meta;
   std::map<std::pair<std::string, std::vector<int>>, uint32_t> _tag_table;
+  std::unordered_map<std::string, int> _addr_name_map;
 };
 
 class TileComputeNode : public TileNode {
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 45bba879..ec84a368 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -21,7 +21,7 @@ Core::Core(uint32_t id, SimulationConfig config)
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
   /* Check SRAM is enough to run tile */
   assert(op->get_required_sram_size() <= _sram_size);
-  return op->get_required_sram_size() + _used_sram_size <= _sram_size && !op->is_stonne_tile();
+  return op->get_required_sram_size() + _used_sram_size <= _sram_size &&  _tiles.size() < 2  && !op->is_stonne_tile(); /* also refuse once _tiles already holds two entries */
 }
 
 void Core::issue(std::shared_ptr<Tile> op) {
@@ -111,11 +111,9 @@ void Core::compute_cycle() {
 
 void Core::dma_cycle() {
   /* Check finished dma operation */
-  for (int i=0; i<_dma_waiting_queue.size(); i++){
-    std::shared_ptr<Instruction>& instruction = _dma_waiting_queue.at(i);
-    /* Pass not finished instruction */
-    if (instruction->get_waiting_request())
-      continue;
+  while(_dma_finished_queue.size()) {
+    std::shared_ptr<Instruction>& instruction = _dma_finished_queue.at(0);
+    assert(instruction->get_waiting_request()==0);
 
     /* Finish DMA read instruction */
     if (instruction->is_dma_read() && !instruction->is_async_dma())
@@ -123,7 +121,7 @@ void Core::dma_cycle() {
 
     /* Set tag table of async dma load */
     if (instruction->is_dma_read() && instruction->is_async_dma()) {
-      auto key = std::make_pair(instruction->get_addr_name(), instruction->get_tag_id());
+      auto& key = instruction->get_tag_id();
       assert(!_tma.get_tag_finish(instruction->subgraph_id, key));
       _tma.set_tag_finish(instruction->subgraph_id, key);
       spdlog::trace("[Core {}][{}] {} ASYNC FINISHED, Used sram: {}, Release sram: {}, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}",
@@ -138,10 +136,7 @@ void Core::dma_cycle() {
         finish_instruction(wait_inst);
       }
     }
-
-    /* Erase the instruction in DMA waiting queue */
-    _dma_waiting_queue.erase(_dma_waiting_queue.begin() + i);
-    i--;
+    _dma_finished_queue.erase(_dma_finished_queue.begin());
   }
 
   if (_tma.is_finished()) {
@@ -153,8 +148,7 @@ void Core::dma_cycle() {
         finish_instruction(finished_inst);
       } else if (finished_inst->is_dma_read() && finished_inst->is_async_dma()) {
         /* Register tag table for async dma load */
-        _tma.register_tag(finished_inst->subgraph_id,
-                          std::make_pair(finished_inst->get_addr_name(), finished_inst->get_tag_id()));
+        _tma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id());
         finish_instruction(finished_inst);
       } else if(!finished_inst->is_dma_read()) {
         spdlog::error("[Core {}][{}] TMA instruction in not valid", _id, _core_cycle);
@@ -167,7 +161,7 @@ void Core::dma_cycle() {
                       fmt::format("[{}]", fmt::join(finished_inst->get_tag_stride_list(), ", ")));
       }
       /*Pass to waiting queue */
-      _dma_waiting_queue.push_back(std::move(finished_inst));
+      _dma_waiting_queue[finished_inst.get()] = std::move(finished_inst);
     }
 
     /* Issue new DMA operation */
@@ -186,8 +180,8 @@ void Core::dma_cycle() {
     }
   }
   /* Generate memfetch */
-  std::vector<mem_fetch*> access_vec = _tma.get_memory_access();
-  for (auto access : access_vec) {
+  auto access_vec = _tma.get_memory_access();
+  for (auto access : *access_vec) {
     access->set_start_cycle(_core_cycle);
     _request_queue.push(access);
   }
@@ -219,7 +213,7 @@ void Core::cycle() {
         case Opcode::MOVIN:
           {
             /* Check another MOVIN with same tag is issued */
-            auto key = std::make_pair(inst->get_addr_name(), inst->get_tag_id());
+            auto& key = inst->get_tag_id();
             if (inst->is_async_dma() && _tma.tag_key_exist(inst->subgraph_id, key)) {
               bool finished = _tma.get_tag_finish(inst->subgraph_id, key);
               if (finished)
@@ -286,7 +280,7 @@ void Core::cycle() {
           break;
         case Opcode::BAR:
           {
-            auto key = std::make_pair(inst->get_addr_name(), inst->get_tag_id());
+            auto& key = inst->get_tag_id();
             bool finished = _tma.get_tag_finish(inst->subgraph_id, key);
             if (finished) {
               _tma.mark_tag_used(inst->subgraph_id, key);
@@ -369,7 +363,7 @@ bool Core::running() {
   running = running || !_vu_compute_pipeline.empty();
   for (int i=0; i<_num_systolic_array_per_core;i++)
     running = running || !_sa_compute_pipeline.at(i).empty();
-  running = running || !_dma_waiting_queue.empty();
+  running = running || !_dma_waiting_queue.empty() || !_dma_finished_queue.empty();
   running = running || !_tma.empty();
   running = running || !_ld_inst_queue.empty();
   running = running || !_st_inst_queue.empty();
@@ -385,12 +379,20 @@ void Core::pop_memory_request() {
 }
 
 void Core::push_memory_response(mem_fetch* response) {
-  Instruction * owner_inst = static_cast<Instruction*>(response->get_custom_data());
-
-  assert(owner_inst);
+  Instruction* owner_inst = static_cast<Instruction*>(response->get_custom_data()); /* instruction that issued this request */
   assert(owner_inst->get_waiting_request());
 
   owner_inst->dec_waiting_request();
+  if (!owner_inst->get_waiting_request()) { /* last outstanding response: hand the instruction over to the finished queue */
+    auto it = _dma_waiting_queue.find(owner_inst);
+    if (it != _dma_waiting_queue.end()) {
+      std::shared_ptr<Instruction> moved_inst = std::move(it->second);
+      _dma_finished_queue.push_back(std::move(moved_inst));
+      _dma_waiting_queue.erase(it);
+    } else {
+      assert(false && "Can't happend...!"); /* `true || msg` could never fire; the owner must be in the waiting queue here */
+    }
+  }
   delete response;
 }
 
diff --git a/PyTorchSimBackend/src/Instruction.cc b/PyTorchSimBackend/src/Instruction.cc
index 5f21f5c6..c4892284 100644
--- a/PyTorchSimBackend/src/Instruction.cc
+++ b/PyTorchSimBackend/src/Instruction.cc
@@ -13,7 +13,8 @@ std::string opcode_to_string(Opcode opcode) {
 Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents,
             addr_type dram_addr, std::vector<size_t> tile_size, size_t precision,
             std::vector<int>& idx_list, std::vector<int>& stride_list,
-            std::vector<int> tag_idx_list, std::vector<int> tag_stride_list, std::vector<int> accum_tag_idx_list, std::vector<int> loop_size_list)
+            std::vector<int> tag_idx_list, std::vector<int> tag_stride_list,
+            std::vector<int> accum_tag_idx_list, std::vector<int> loop_size_list)
   : opcode(opcode), compute_cycle(compute_cycle), ready_counter(num_parents), dram_addr(dram_addr),
     tile_size(tile_size), _precision(precision), _idx_list(idx_list),
     _stride_list(stride_list), _tag_idx_list(tag_idx_list), _tag_stride_list(tag_stride_list),
@@ -27,14 +28,6 @@ Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_par
   if (_stride_list.size() == 1) {
     _stride_list.push_back(1);
   }
-
-  /* Calculate tag key */
-  int key_offset = 0;
-  for (int i=0; i<_tag_idx_list.size(); i++)
-    key_offset += _tag_idx_list.at(i) * _tag_stride_list.at(i);
-  for (auto accum_dim : accum_tag_idx_list)
-    _tag_key.push_back(accum_dim);
-  _tag_key.push_back(key_offset);
 }
 
 void Instruction::finish_instruction() {
@@ -57,12 +50,23 @@ void Instruction::dec_waiting_request() {
   _nr_waiting_request--;
 }
 
+void Instruction::prepare_tag_key() {
+  /* Appends to _tag_key: [address id, accumulation indices..., flattened tag offset] */
+  _tag_key.push_back(_addr_id);
+  /* Flatten the tag indices into a single offset using the per-dimension strides */
+  int flat_offset = 0;
+  for (size_t dim = 0; dim < _tag_idx_list.size(); ++dim)
+    flat_offset += _tag_idx_list.at(dim) * _tag_stride_list.at(dim);
+  _tag_key.insert(_tag_key.end(), _accum_tag_idx_list.begin(), _accum_tag_idx_list.end());
+  _tag_key.push_back(flat_offset);
+}
+
 void Instruction::print() {
   spdlog::info("{}", opcode_to_string(opcode));
 }
 
-std::set<addr_type> Instruction::get_dram_address(addr_type dram_req_size) {
-  std::set<addr_type> address_set;
+std::shared_ptr<std::set<addr_type>> Instruction::get_dram_address(addr_type dram_req_size) {
+  auto address_set = std::make_shared<std::set<addr_type>>();
   uint64_t* indirect_index = NULL;
   size_t index_count = 0;
   /* Set 4D shape*/
@@ -90,7 +94,7 @@ std::set<addr_type> Instruction::get_dram_address(addr_type dram_req_size) {
             uint64_t index_val = indirect_index[index_count++];
             address += index_val * _precision;
           }
-          address_set.insert(address - (address & dram_req_size-1));
+          address_set->insert(address - (address & dram_req_size-1));
         }
       }
     }
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 5584cad0..39841185 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -201,8 +201,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
     }
 
     /* Check finished dma operation */
-    for (int i=0; i<_dma_waiting_queue.size(); i++){
-      std::shared_ptr<Instruction>& instruction = _dma_waiting_queue.at(i);
+    while(_dma_finished_queue.size()) {
+      std::shared_ptr<Instruction>& instruction = _dma_finished_queue.at(0);
       /* Pass not finished instruction */
       if (instruction->get_waiting_request())
         continue;
@@ -210,10 +210,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
       /* Finish DMA read instruction */
       if (instruction->is_dma_read())
         finish_instruction(instruction);
-
-      /* Erase the instruction in DMA waiting queue */
-      _dma_waiting_queue.erase(_dma_waiting_queue.begin() + i);
-      i--;
+      /* Erase the instruction in DMA finished queue */
+      _dma_finished_queue.erase(_dma_finished_queue.begin());
     }
 
     /* Peek instruction*/
@@ -240,7 +238,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
             });
           }
           issued = true;
-          _dma_waiting_queue.push_back(std::move(inst));
+          _dma_waiting_queue[inst.get()] = std::move(inst);
         }
         break;
       case Opcode::MOVOUT:
@@ -261,7 +259,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
           }
           issued = true;
           finish_instruction(inst);
-          _dma_waiting_queue.push_back(std::move(inst));
+          _dma_waiting_queue[inst.get()] = std::move(inst);
         }
         break;
       case Opcode::COMP:
diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc
index cdff9869..5b028123 100644
--- a/PyTorchSimBackend/src/TMA.cc
+++ b/PyTorchSimBackend/src/TMA.cc
@@ -18,19 +18,19 @@ void TMA::issue_tile(std::shared_ptr<Instruction> inst) {
   _finished = false;
 }
 
-std::vector<mem_fetch*> TMA::get_memory_access() {
-  std::set<addr_type> addr_set = _current_inst->get_dram_address(_dram_req_size);
-  std::vector<mem_fetch *> access_vec;
+std::shared_ptr<std::vector<mem_fetch*>> TMA::get_memory_access() {
+  auto addr_set = _current_inst->get_dram_address(_dram_req_size);
+  auto access_vec = std::make_shared<std::vector<mem_fetch *>>();
   Tile* owner = (Tile*)_current_inst->get_owner();
   std::shared_ptr<TileSubGraph> owner_subgraph = owner->get_owner();
   spdlog::trace("[NUMA Trace] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
     owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write());
-  for (auto addr: addr_set) {
+  for (auto addr: *addr_set) {
     mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R;
     mf_type type = _current_inst->is_dma_write() ? mf_type::WRITE_REQUEST : mf_type::READ_REQUEST;
     mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, _current_inst->get_numa_id(), static_cast<void*>(_current_inst.get()));
     _current_inst->inc_waiting_request();
-    access_vec.push_back(access);
+    access_vec->push_back(access);
   }
   _finished = true;
   return access_vec;
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index d350ae87..1dc2c632 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -71,6 +71,8 @@ std::vector<uint32_t> calc_output_idx(TileGraphParser* tog_parser, std::map<std:
   }
 
   offset = outer_loop.size() - inner_loop.size();
+  if (offset < 0)
+    return outer_loop;
   for (int i=0; i<inner_loop.size(); i++)
     outer_loop[offset+i] += inner_loop[i] * step;
   return outer_loop;
@@ -340,6 +342,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
     if (tile_node->get_type() == TileType::LOAD_NODE) {
       std::shared_ptr<TileMemoryNode> mem_node = std::static_pointer_cast<TileMemoryNode>(tile_node);
       auto base_addr_name = mem_node->get_base_addr_name();
+      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
       std::vector<std::string>& tag_idx_list = mem_node->get_tag_idx_list();
       std::vector<int>& tag_stride_list = mem_node->get_tag_stride_list();
       std::vector<int> skip_idx_list;
@@ -415,7 +418,8 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         mem_node->get_tile_size(), mem_node->get_precision(), iter_list,
         mem_node->get_stride_list(), tag_list, tag_stride_list, accum_tag_list, loop_size_list
       );
-      inst->set_addr_name(base_addr_name);
+      inst->set_addr_name(base_addr_name, base_addr_id);
+      inst->prepare_tag_key();
       inst->set_nr_inner_loop(nr_inner_loop);
       inst->adjust_dram_address();
       inst->set_is_async(mem_node->is_async_node());
@@ -429,6 +433,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
     } else if (tile_node->get_type() == TileType::STORE_NODE) {
       std::shared_ptr<TileMemoryNode> mem_node = std::static_pointer_cast<TileMemoryNode>(tile_node);
       auto base_addr_name = mem_node->get_base_addr_name();
+      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
       /* Lookup given name's address */
       addr_type base_addr = tog_parser->lookup(base_addr_name);
       std::vector<int>& tag_stride_list = mem_node->get_tag_stride_list();
@@ -469,7 +474,8 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         mem_node->get_tile_size(), mem_node->get_precision(), iter_list,
         mem_node->get_stride_list(), std::vector<int>(1), tag_stride_list, accum_tag_list, loop_size_list
       );
-      inst->set_addr_name(base_addr_name);
+      inst->set_addr_name(base_addr_name, base_addr_id);
+      inst->prepare_tag_key();
       inst->set_nr_inner_loop(nr_inner_loop);
       inst->adjust_dram_address();
       inst->set_is_async(mem_node->is_async_node());
@@ -484,6 +490,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       printIndexMap("[TOGParser] DMA Wait Node ", iter);
       std::shared_ptr<TileMemoryWaitNode> wait_node = std::static_pointer_cast<TileMemoryWaitNode>(tile_node);
       auto base_addr_name = wait_node->get_base_addr_name();
+      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
       addr_type base_addr = tog_parser->lookup(base_addr_name);
       /* Lookup given name's address */
       std::vector<int> iter_list;
@@ -522,7 +529,8 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         std::vector<size_t>(), 0, iter_list,
         iter_list, tag_list, new_tag_stride_list, accum_tag_list, std::vector<int>()
       );
-      inst->set_addr_name(base_addr_name);
+      inst->set_addr_name(base_addr_name, base_addr_id);
+      inst->prepare_tag_key();
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);
     } else if (tile_node->get_type() == TileType::COMPUTE_NODE) {

From 013d05f9bac4ba6837a381a9439e161f81036c4c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Apr 2025 10:35:01 +0000
Subject: [PATCH 275/432] [Script] Fix hetero experiment

---
 .../configs/heterogeneous_c2_simple_noc.json           | 10 ++++++----
 .../configs/stonne_big_c1_simple_noc.json              |  9 +++++----
 ... systolic_ws_128x128_c1_simple_noc_tpuv3_half.json} | 10 ++++++----
 PyTorchSimBackend/src/SparseCore.cc                    |  1 +
 PyTorchSimFrontend/extension_op.py                     |  2 +-
 scripts/stonne_experiment/run.sh                       |  2 +-
 tests/test_hetro.py                                    |  6 +++---
 7 files changed, 23 insertions(+), 17 deletions(-)
 rename PyTorchSimBackend/configs/{systolic_ws_128x128_c1_simple_noc_tpuv2_half.json => systolic_ws_128x128_c1_simple_noc_tpuv3_half.json} (70%)

diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json
index 40a100ef..8f196e81 100644
--- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json
+++ b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json
@@ -2,24 +2,26 @@
   "core_type" : ["stonne", "ws_mesh"],
   "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
   "num_cores" : 2,
-  "core_freq" : 700,
+  "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
   "num_stonne_per_core" : 8,
   "num_stonne_port" : 64,
+  "num_systolic_array_per_core" : 2,
 
   "dram_type" : "ramulator2",
-  "dram_freq" : 700,
+  "dram_freq" : 940,
   "dram_channels": 16,
   "dram_req_size": 32,
   "dram_latency" : 10,
+  "dram_size" : 32,
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
 
   "icnt_type" : "simple",
   "icnt_latency" : 7,
-  "icnt_freq" : 7000,
+  "icnt_freq" : 15000,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt",
 
   "precision" : 4,
diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
index 72f52318..c7ef15f7 100644
--- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
+++ b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json
@@ -2,24 +2,25 @@
   "core_type" : ["stonne"],
   "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
   "num_cores" : 1,
-  "core_freq" : 700,
+  "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
   "num_stonne_per_core" : 8,
   "num_stonne_port" : 64,
 
   "dram_type" : "ramulator2",
-  "dram_freq" : 700,
+  "dram_freq" : 940,
   "dram_channels": 8,
   "dram_req_size": 32,
   "dram_latency" : 10,
+  "dram_size" : 32,
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
 
   "icnt_type" : "simple",
   "icnt_latency" : 7,
-  "icnt_freq" : 7000,
+  "icnt_freq" : 15000,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt",
 
   "precision" : 4,
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json
similarity index 70%
rename from PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json
rename to PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json
index ab5266c5..69ec8bd0 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2_half.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json
@@ -1,21 +1,23 @@
 {
   "num_cores" : 1,
-  "core_freq" : 700,
+  "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 2,
 
   "dram_type" : "ramulator2",
-  "dram_freq" : 700,
+  "dram_freq" : 940,
   "dram_channels": 8,
   "dram_req_size": 32,
   "dram_latency" : 10,
+  "dram_size" : 32,
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
  
   "icnt_type" : "simple",
   "icnt_latency" : 7,
-  "icnt_freq" : 7000,
+  "icnt_freq" : 15000,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
  
   "precision" : 4,
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 39841185..08297b38 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -168,6 +168,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
     /* Finish stonne core */
     if (coreBusy.at(subcore_id) && stonneCore->isFinished()) {
       stonneCore->finish();
+      spdlog::info("[SparseCore][{}] Operation finished at {}", _id, _core_cycle);
       std::shared_ptr<Tile> target_tile = percore_tiles.at(subcore_id).front();
       SST_STONNE::StonneOpDesc *opDesc = static_cast<SST_STONNE::StonneOpDesc*>(target_tile->get_custom_data());
       if (opDesc->trace_path != "")
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 6950375d..22a727c5 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -31,7 +31,7 @@
         "children": [2],
         "loop_index": "loop_arg000",
         "loop_start": 0,
-        "loop_end": 8,  # FIXME. this is a trick that generate multiple tile.
+        "loop_end": 4,  # FIXME. this is a trick that generate multiple tile.
         "loop_step": 1,
         "loop_type": "outer_loop"
     },
diff --git a/scripts/stonne_experiment/run.sh b/scripts/stonne_experiment/run.sh
index f456658b..b856b492 100755
--- a/scripts/stonne_experiment/run.sh
+++ b/scripts/stonne_experiment/run.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.json --mode 0 > hetero/big_sparse.log
-python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv2_half.json --mode 1 > hetero/big.log
+python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv3_half.json --mode 1 > hetero/big.log
 python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.json --mode 2 > hetero/hetero.log
 
 echo "All processes completed!"
diff --git a/tests/test_hetro.py b/tests/test_hetro.py
index f2ae6ba7..5e36d730 100644
--- a/tests/test_hetro.py
+++ b/tests/test_hetro.py
@@ -45,14 +45,14 @@ def custom_matmul(a, b):
         SchedulerDNNModel.register_model("spmm", opt_model2)
 
         # Init input data
-        for i in range(4):
+        for i in range(1):
             dense_input1 = torch.randn(M, K)
             dense_input2 = torch.randn(K, N)
 
             sparse_input1 = torch.randn(128, 128)
             sparse_input2 = torch.randn(128, 128)
-            mask1 = torch.randn(sparse_input1.shape) > sparsity
-            mask2 = torch.randn(sparse_input2.shape) > sparsity
+            mask1 = torch.rand(sparse_input1.shape) > sparsity
+            mask2 = torch.rand(sparse_input2.shape) > sparsity
 
             sparse_input1 = sparse_input1 * mask1
             sparse_input2 = sparse_input2 * mask2

From 766aef0c5ff90198635059582031ddbdefcf78e5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 10 Apr 2025 06:19:44 +0000
Subject: [PATCH 276/432] [Config] Remove unused configs and update chiplet
 config

---
 ...tolic_ws_128x128_c16_simple_noc_tpuv4.json | 22 ---------------
 ...systolic_ws_128x128_c2_booksim_tpuv3.json} | 11 ++++----
 ...systolic_ws_128x128_c2_chiplet_tpuv3.json} | 12 ++++----
 ...ic_ws_128x128_c2_chiplet_tpuv3_xnuma.json} | 11 ++++----
 ...tolic_ws_128x128_c32_simple_noc_tpuv4.json | 22 ---------------
 .../systolic_ws_128x128_c4_booksim_tpuv2.json | 25 -----------------
 ...tolic_ws_128x128_c4_chiplet_map_tpuv2.json | 27 ------------------
 .../systolic_ws_128x128_c4_chiplet_tpuv2.json | 26 -----------------
 ...stolic_ws_128x128_c4_simple_noc_tpuv4.json | 28 -------------------
 ...128x128_c4_simple_noc_tpuv4_partition.json | 28 -------------------
 ...stolic_ws_128x128_c8_simple_noc_tpuv4.json | 22 ---------------
 11 files changed, 19 insertions(+), 215 deletions(-)
 delete mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c16_simple_noc_tpuv4.json
 rename PyTorchSimBackend/configs/{systolic_ws_128x128_c2_booksim_tpuv2.json => systolic_ws_128x128_c2_booksim_tpuv3.json} (72%)
 rename PyTorchSimBackend/configs/{systolic_ws_128x128_c2_chiplet_tpuv2.json => systolic_ws_128x128_c2_chiplet_tpuv3.json} (70%)
 rename PyTorchSimBackend/configs/{systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json => systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json} (73%)
 delete mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c32_simple_noc_tpuv4.json
 delete mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c4_booksim_tpuv2.json
 delete mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_map_tpuv2.json
 delete mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_tpuv2.json
 delete mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json
 delete mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4_partition.json
 delete mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c8_simple_noc_tpuv4.json

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c16_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c16_simple_noc_tpuv4.json
deleted file mode 100644
index 17fe87bc..00000000
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c16_simple_noc_tpuv4.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "num_cores" : 16,  
-  "core_freq" : 1000,
-  "sram_size" : 65536,
-
-  "dram_type" : "ramulator",
-  "dram_freq" : 877,
-  "dram_channels": 32,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_print_interval" : 10000,
-  "dram_config_path" : "../configs/ramulator_configs/HBM-config.cfg",
-
-  "icnt_type" : "simple",
-  "icnt_latency" : 1,
-  "icnt_freq" : 2000,
-  "icnt_print_interval" : 10000,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c16_m32.icnt",
-
-  "precision" : 2,
-  "scheduler" : "simple"
-}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
similarity index 72%
rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv2.json
rename to PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
index f81e2472..bd355976 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
@@ -1,22 +1,23 @@
 {
   "num_cores" : 2,
-  "core_freq" : 700,
+  "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 2,
 
   "dram_type" : "ramulator2",
-  "dram_freq" :700,
+  "dram_freq" : 940,
   "dram_channels": 32,
   "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_size" : 16,
+  "dram_size" : 32,
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
  
   "icnt_type" : "booksim2",
   "icnt_latency" : 1,
-  "icnt_freq" : 1000,
+  "icnt_freq" : 5000,
   "icnt_node_per_core" : 16,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json
similarity index 70%
rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2.json
rename to PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json
index d9161d67..e96a81ce 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json
@@ -1,25 +1,27 @@
 {
   "num_cores" : 2,
-  "core_freq" : 700,
+  "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 2,
 
   "dram_type" : "ramulator2",
-  "dram_freq" :700,
+  "dram_freq" : 940,
   "dram_channels": 32,
   "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_size" : 16,
+  "dram_size" : 32,
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_num_partitions" : 2,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
  
   "icnt_type" : "booksim2",
   "icnt_latency" : 1,
-  "icnt_freq" : 1000,
+  "icnt_freq" : 5000,
   "icnt_node_per_core" : 16,
   "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt",
+  "icnt_print_interval" : 1000,
  
   "precision" : 4,
   "scheduler" : "simple",
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json
similarity index 73%
rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json
rename to PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json
index 856e9c9c..4f667ecb 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json
@@ -1,23 +1,24 @@
 {
   "num_cores" : 2,
-  "core_freq" : 700,
+  "core_freq" : 940,
   "sram_size" : 65536,
   "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 2,
 
   "dram_type" : "ramulator2",
-  "dram_freq" :700,
+  "dram_freq" : 940,
   "dram_channels": 32,
   "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_size" : 16,
+  "dram_size" : 32,
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_num_partitions" : 1,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
  
   "icnt_type" : "booksim2",
   "icnt_latency" : 1,
-  "icnt_freq" : 1000,
+  "icnt_freq" : 5000,
   "icnt_node_per_core" : 16,
   "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c32_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c32_simple_noc_tpuv4.json
deleted file mode 100644
index 80814c42..00000000
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c32_simple_noc_tpuv4.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "num_cores" : 32,
-  "core_freq" : 1000,
-  "sram_size" : 65536,
-
-  "dram_type" : "ramulator",
-  "dram_freq" : 877,
-  "dram_channels": 32,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_print_interval" : 10000,
-  "dram_config_path" : "../configs/ramulator_configs/HBM-config.cfg",
-
-  "icnt_type" : "simple",
-  "icnt_latency" : 1,
-  "icnt_freq" : 2000,
-  "icnt_print_interval" : 10000,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt",
-
-  "precision" : 2,
-  "scheduler" : "simple"
-}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_booksim_tpuv2.json
deleted file mode 100644
index 842d5ab0..00000000
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_booksim_tpuv2.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "num_cores" : 4,
-  "core_freq" : 700,
-  "sram_size" : 65536,
-  "core_print_interval" : 10000,
-
-  "dram_type" : "ramulator2",
-  "dram_freq" :700,
-  "dram_channels": 64,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_size" : 16,
-  "dram_nbl" : 2,
-  "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
- 
-  "icnt_type" : "booksim2",
-  "icnt_latency" : 1,
-  "icnt_freq" : 1000,
-  "icnt_node_per_core" : 16,
-  "icnt_config_path" : "booksim2_configs/fly_128.icnt",
- 
-  "precision" : 4,
-  "scheduler" : "simple"
-}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_map_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_map_tpuv2.json
deleted file mode 100644
index 080599f2..00000000
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_map_tpuv2.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-  "num_cores" : 4,
-  "core_freq" : 700,
-  "sram_size" : 65536,
-  "core_print_interval" : 10000,
-
-  "dram_type" : "ramulator2",
-  "dram_freq" :700,
-  "dram_channels": 64,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_size" : 16,
-  "dram_nbl" : 2,
-  "dram_print_interval": 10000,
-  "dram_num_partitions" : 4,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
- 
-  "icnt_type" : "booksim2",
-  "icnt_latency" : 1,
-  "icnt_freq" : 1000,
-  "icnt_node_per_core" : 16,
-  "icnt_config_path" : "../configs/booksim2_configs/chiplet_16_16_4.icnt",
- 
-  "precision" : 4,
-  "scheduler" : "simple",
-  "num_partition" : 0
-}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_tpuv2.json
deleted file mode 100644
index 5d4fa211..00000000
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_chiplet_tpuv2.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
-  "num_cores" : 4,
-  "core_freq" : 700,
-  "sram_size" : 65536,
-  "core_print_interval" : 10000,
-
-  "dram_type" : "ramulator2",
-  "dram_freq" :700,
-  "dram_channels": 64,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_size" : 16,
-  "dram_nbl" : 2,
-  "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
- 
-  "icnt_type" : "booksim2",
-  "icnt_latency" : 1,
-  "icnt_freq" : 1000,
-  "icnt_node_per_core" : 16,
-  "icnt_config_path" : "../configs/booksim2_configs/chiplet_16_16_4.icnt",
- 
-  "precision" : 4,
-  "scheduler" : "simple",
-  "num_partition" : 0
-}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json
deleted file mode 100644
index 9606281d..00000000
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq" : 700,
-  "sram_size" : 65536,
-  "core_print_interval" : 10000,
-
-  "dram_type" : "ramulator",
-  "dram_freq" : 700,
-  "dram_channels": 32,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_print_interval" : 10000,
-  "dram_config_path" : "../configs/ramulator_configs/HBM-config.cfg",
-
-  "icnt_type" : "simple",
-  "icnt_latency" : 1,
-  "icnt_freq" : 2000,
-  "icnt_print_interval" : 0,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
-
-  "precision" : 4,
-  "scheduler" : "simple",
-  "num_partition" : 2,
-  "partition": {
-    "core_0":0,
-    "core_1":0
-  }
-}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4_partition.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4_partition.json
deleted file mode 100644
index f705506a..00000000
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4_partition.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq" : 1000,
-  "sram_size" : 65536,
-  "core_print_interval" : 10000,
-
-  "dram_type" : "ramulator",
-  "dram_freq" : 877,
-  "dram_channels": 32,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_print_interval" : 10000,
-  "dram_config_path" : "../configs/ramulator_configs/HBM-config.cfg",
-
-  "icnt_type" : "simple",
-  "icnt_latency" : 1,
-  "icnt_freq" : 2000,
-  "icnt_print_interval" : 0,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
-
-  "precision" : 2,
-  "scheduler" : "simple",
-  "num_partition" : 2,
-  "partition": {
-    "core_0":0,
-    "core_1":1
-  }
-}
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c8_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c8_simple_noc_tpuv4.json
deleted file mode 100644
index 496531a5..00000000
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c8_simple_noc_tpuv4.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "num_cores" : 8,  
-  "core_freq" : 1000,
-  "sram_size" : 65536,
-
-  "dram_type" : "ramulator",
-  "dram_freq" : 877,
-  "dram_channels": 32,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_print_interval" : 10000,
-  "dram_config_path" : "../configs/ramulator_configs/HBM-config.cfg",
-
-  "icnt_type" : "simple",
-  "icnt_latency" : 1,
-  "icnt_freq" : 2000,
-  "icnt_print_interval" : 10000,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c8_m32.icnt",
-
-  "precision" : 2,
-  "scheduler" : "simple"
-}
\ No newline at end of file

From 045e5638388765ec069ec5970f43eda0d7fb9f9b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 10 Apr 2025 06:22:59 +0000
Subject: [PATCH 277/432] [Backend+script] Update Numa stat for togsim and
 upload chiplet script

---
 .../configs/booksim2_configs/fly_c32_m4.icnt  |  18 +++
 .../configs/booksim2_configs/fly_c32_m8.icnt  |  18 +++
 .../systolic_ws_128x128_c2_booksim_tpuv3.json |   2 +-
 .../systolic_ws_128x128_c2_chiplet_tpuv3.json |   4 +-
 ...lic_ws_128x128_c2_chiplet_tpuv3_xnuma.json |   2 +-
 PyTorchSimBackend/include/Core.h              |   4 +
 PyTorchSimBackend/include/Dram.h              |   2 +-
 PyTorchSimBackend/src/Core.cc                 |   1 +
 PyTorchSimBackend/src/Simulator.cc            |   6 +
 PyTorchSimBackend/src/TMA.cc                  |   4 +-
 scripts/chiplet.sh                            |  32 +++--
 scripts/chiplet_prep.py                       | 120 ++++++++++++++++++
 scripts/chiplet_prep.sh                       |  14 ++
 13 files changed, 211 insertions(+), 16 deletions(-)
 create mode 100644 PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt
 create mode 100644 PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt
 create mode 100644 scripts/chiplet_prep.py
 create mode 100755 scripts/chiplet_prep.sh

diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt b/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt
new file mode 100644
index 00000000..e5765207
--- /dev/null
+++ b/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt
@@ -0,0 +1,18 @@
+[config]
+use_map = 0
+flit_size = 32
+topology = fly
+k = 36
+n = 1
+routing_function = dest_tag
+subnets = 1
+
+vc_buf_size = 256
+input_buffer_size = 256
+ejection_buffer_size = 256
+boundary_buffer_size = 256
+wait_for_tail_credit = 0
+vc_allocator = islip
+sw_allocator = islip
+alloc_iters = 1
+deadlock_warn_timeout = 10000
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt b/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt
new file mode 100644
index 00000000..29e573cb
--- /dev/null
+++ b/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt
@@ -0,0 +1,18 @@
+[config]
+use_map = 0
+flit_size = 32
+topology = fly
+k = 40
+n = 1
+routing_function = dest_tag
+subnets = 1
+
+vc_buf_size = 256
+input_buffer_size = 256
+ejection_buffer_size = 256
+boundary_buffer_size = 256
+wait_for_tail_credit = 0
+vc_allocator = islip
+sw_allocator = islip
+alloc_iters = 1
+deadlock_warn_timeout = 10000
\ No newline at end of file
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
index bd355976..5e30fd43 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
@@ -17,7 +17,7 @@
  
   "icnt_type" : "booksim2",
   "icnt_latency" : 1,
-  "icnt_freq" : 5000,
+  "icnt_freq" : 1000,
   "icnt_node_per_core" : 16,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt",
  
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json
index e96a81ce..b2661894 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json
@@ -18,10 +18,10 @@
  
   "icnt_type" : "booksim2",
   "icnt_latency" : 1,
-  "icnt_freq" : 5000,
+  "icnt_freq" : 1000,
   "icnt_node_per_core" : 16,
   "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt",
-  "icnt_print_interval" : 1000,
+  "icnt_print_interval" : 10000,
  
   "precision" : 4,
   "scheduler" : "simple",
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json
index 4f667ecb..922ede5b 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json
@@ -18,7 +18,7 @@
  
   "icnt_type" : "booksim2",
   "icnt_latency" : 1,
-  "icnt_freq" : 5000,
+  "icnt_freq" : 1000,
   "icnt_node_per_core" : 16,
   "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt",
  
diff --git a/PyTorchSimBackend/include/Core.h b/PyTorchSimBackend/include/Core.h
index c151be4f..a9c201ea 100644
--- a/PyTorchSimBackend/include/Core.h
+++ b/PyTorchSimBackend/include/Core.h
@@ -28,6 +28,8 @@ class Core {
   virtual mem_fetch* top_memory_request() { return _request_queue.front(); }
   virtual void push_memory_response(mem_fetch* response);
   void check_tag() { _tma.check_table(); }
+  void inc_numa_hit() { _stat_numa_hit++; }
+  void inc_numa_miss() { _stat_numa_miss++; }
 
   std::queue<std::shared_ptr<Instruction>>& get_compute_pipeline(int compute_type);
   enum {
@@ -67,6 +69,8 @@ class Core {
   std::vector<uint64_t> _stat_tot_sa_inst;
   uint64_t _stat_gemm_inst = 0;
   uint64_t _stat_skip_dma = 0;
+  uint64_t _stat_numa_hit = 0;
+  uint64_t _stat_numa_miss = 0;
 
   cycle_type _stat_vu_compute_cycle = 0;
   std::vector<cycle_type> _stat_sa_compute_cycle;
diff --git a/PyTorchSimBackend/include/Dram.h b/PyTorchSimBackend/include/Dram.h
index e600bbfb..f4615d0a 100644
--- a/PyTorchSimBackend/include/Dram.h
+++ b/PyTorchSimBackend/include/Dram.h
@@ -27,7 +27,7 @@ class Dram {
   virtual void pop(uint32_t cid) = 0;
   uint32_t get_channel_id(mem_fetch* request);
   virtual void print_stat() {}
-
+  uint32_t get_channels_per_partition() { return _n_ch_per_partition; }
  protected:
   SimulationConfig _config;
   CacheConfig _m_cache_config;
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index ec84a368..10d5a647 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -420,6 +420,7 @@ void Core::print_stats() {
   spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle);
   spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id,
     static_cast<float>(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle);
+  spdlog::info("Core [{}] : Numa hit count : {}, Numa miss count : {}", _id, _stat_numa_hit, _stat_numa_miss);
   spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle);
 }
 
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index b4637513..99e71964 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -49,6 +49,7 @@ Simulator::Simulator(SimulationConfig config)
   }
 
   // Create interconnect object
+  spdlog::info("[Config/Interconnect] Inerconnect freq: {} MHz", config.icnt_freq);
   if (config.icnt_type == IcntType::SIMPLE) {
     spdlog::info("[Config/Interconnect] SimpleInerconnect selected");
     _icnt = std::make_unique<SimpleInterconnect>(config);
@@ -113,6 +114,11 @@ void Simulator::icnt_cycle() {
         mem_fetch *front = _cores[core_id]->top_memory_request();
         front->set_core_id(core_id);
         if (!_icnt->is_full(port_id, front)) {
+          //int node_id = _dram->get_channel_id(front) / 16;
+          //if (core_id == node_id)
+          //  _cores[core_id]->inc_numa_hit();
+          //else
+          //  _cores[core_id]->inc_numa_miss();
           _icnt->push(port_id , get_dest_node(front), front);
           _cores[core_id]->pop_memory_request();
           _nr_from_core++;
diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc
index 5b028123..d60e7149 100644
--- a/PyTorchSimBackend/src/TMA.cc
+++ b/PyTorchSimBackend/src/TMA.cc
@@ -23,8 +23,8 @@ std::shared_ptr<std::vector<mem_fetch*>> TMA::get_memory_access() {
   auto access_vec = std::make_shared<std::vector<mem_fetch *>>();
   Tile* owner = (Tile*)_current_inst->get_owner();
   std::shared_ptr<TileSubGraph> owner_subgraph = owner->get_owner();
-  spdlog::trace("[NUMA Trace] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
-    owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write());
+  spdlog::trace("[NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
+    _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write());
   for (auto addr: *addr_set) {
     mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R;
     mf_type type = _current_inst->is_dma_write() ? mf_type::WRITE_REQUEST : mf_type::READ_REQUEST;
diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh
index f0404eb6..d6f27853 100755
--- a/scripts/chiplet.sh
+++ b/scripts/chiplet.sh
@@ -13,19 +13,22 @@ if [ $# -lt 1 ]; then
 fi
 
 GEMM_PATH="$1"
+INDEX_NAME="$2"
 SIMULATOR_PATH="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator"
 GEMM_DIR_NAME=$(basename "$GEMM_PATH")
 echo "GEMM Directory Name: $GEMM_DIR_NAME"
 
 CONFIG_LIST=(
-    "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json"
-    "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv2.json"
-    "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2.json"
-    "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv2_xnuma.json"
+    "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json"
 )
+CONFIG_LIST2=(
+    "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json"
+    "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json"
+)
+shift
 shift
 for ATTRIBUTE in "$@"; do
-    ATTRIBUTE_FILE="$GEMM_PATH/attribute/$ATTRIBUTE"
+    ATTRIBUTE_FILE="$GEMM_PATH/runtime_0000/attribute/$ATTRIBUTE"
     if [ ! -f "$ATTRIBUTE_FILE" ]; then
         echo "Error: Attribute file '$ATTRIBUTE_FILE' does not exist."
         exit 1
@@ -33,7 +36,7 @@ for ATTRIBUTE in "$@"; do
     ATTRIBUTE_FILES+=("$ATTRIBUTE_FILE")
 done
 MODELS_LIST="$GEMM_PATH/tile_graph.onnx"
-ATTRIBUTE_PATH="$GEMM_PATH/attribute"
+ATTRIBUTE_PATH="$GEMM_PATH/runtime_0000/attribute"
 
 for CONFIG in "${CONFIG_LIST[@]}"; do
     CONFIG_NAME=$(basename "$CONFIG" .json)
@@ -41,16 +44,27 @@ for CONFIG in "${CONFIG_LIST[@]}"; do
     for ATTRIBUTE_FILE in "${ATTRIBUTE_FILES[@]}"; do
         ATTRIBUTE_NAME=$(basename "$ATTRIBUTE_FILE")
 
-        RESULTS_DIR="./results/$GEMM_DIR_NAME/$ATTRIBUTE_NAME"
+        RESULTS_DIR="./chiplet_results$INDEX_NAME/$GEMM_DIR_NAME/$ATTRIBUTE_NAME"
         mkdir -p "$RESULTS_DIR"
         OUTPUT_FILE="$RESULTS_DIR/${CONFIG_NAME}_result.txt"
 
         # Run Simulator
         echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME"
         "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" &
-
-        echo "===== Simulation for $CONFIG completed. Results saved to $OUTPUT_FILE ====="
+        echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\""
     done
 done
 
+for CONFIG in "${CONFIG_LIST2[@]}"; do
+    CONFIG_NAME=$(basename "$CONFIG" .json)
+    ATTRIBUTE_NAME=0
+    RESULTS_DIR="./chiplet_results$INDEX_NAME/$GEMM_DIR_NAME/$ATTRIBUTE_NAME"
+    mkdir -p "$RESULTS_DIR"
+    OUTPUT_FILE="$RESULTS_DIR/${CONFIG_NAME}_result.txt"
+
+    # Run Simulator
+    echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME"
+    "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" &
+    echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\""
+done
 wait
\ No newline at end of file
diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py
new file mode 100644
index 00000000..9b82ea39
--- /dev/null
+++ b/scripts/chiplet_prep.py
@@ -0,0 +1,120 @@
+import os
+import json
+import shutil
+import argparse
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    """Print a pass/fail banner for `out` vs `cpu_out`; exit with status 1 on mismatch."""
+    matched = torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol)
+    message = f"|{name} Test Passed|" if matched else f"|{name} Test Failed|"
+    rule = "-" * len(message)
+    print(rule)
+    print(message)
+    print(rule)
+    if matched:
+        return
+    # On mismatch, dump both tensors before aborting so the difference can be inspected.
+    print("custom out: ", out.cpu())
+    print("cpu out: ", cpu_out)
+    exit(1)
+
+def test_matmul(device, input_size=128, hidden_size=128, output_size=128):
+    """Check a torch.compile'd matmul on `device` against eager matmul on CPU."""
+    def custom_matmul(a, b):
+        return torch.matmul(a, b)
+    torch.manual_seed(0)  # fixed seed so every run draws the same operands
+    lhs = torch.randn(input_size, hidden_size)
+    rhs = torch.randn(hidden_size, output_size)
+    dev_lhs, dev_rhs = lhs.to(device=device), rhs.to(device=device)
+    cpu_lhs, cpu_rhs = lhs.to("cpu"), rhs.to("cpu")
+    # Run the compiled function on the target device first ...
+    compiled_matmul = torch.compile(dynamic=False)(custom_matmul)
+    actual = compiled_matmul(dev_lhs, dev_rhs)
+    # ... then compare it with the eager CPU result.
+    test_result("Matmul Forward", actual, custom_matmul(cpu_lhs, cpu_rhs))
+
+def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None):
+    """Load attribute file '0' under `dump_path`, add the given fields, save it as `name`."""
+    attribute_dir = os.path.join(dump_path, 'runtime_0000', 'attribute')
+    file_path = os.path.join(attribute_dir, '0')
+    if not os.path.exists(file_path):
+        print(f"File {file_path} does not exist.")
+        return
+    with open(file_path, 'r') as src:
+        data = json.load(src)
+    # Add address_numa_stride and subgraph_map; a falsy argument leaves its key untouched.
+    for key, value in (('address_numa_stride', address_numa_stride),
+                       ('subgraph_map', subgraph_map)):
+        if value:
+            data[key] = value
+    os.makedirs(attribute_dir, exist_ok=True)
+    output_file = os.path.join(attribute_dir, name)
+    with open(output_file, 'w') as dst:
+        json.dump(data, dst, indent=4)
+    print(f"Modified file saved to {output_file}")
+
+if __name__ == "__main__":
+    # Run one square matmul of the given size, then write "best"/"worst" attribute variants.
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    parser = argparse.ArgumentParser(description='Process folder argument.')
+    # nargs='?' makes the positional optional; without it argparse never applies default=256.
+    parser.add_argument('size', type=int, nargs='?', default=256, help='Square matmul dimension')
+    args = parser.parse_args()
+    size = args.size
+    print("Taget size: ", size)
+    # Index instead of .get() so an unset variable fails with a KeyError that names it.
+    folder_path = os.environ["TORCHSIM_DUMP_PATH"]
+    print(folder_path)
+    os.makedirs(folder_path, exist_ok=True)
+    test_matmul(device, size, size, size)
+
+    # Descend two levels (first listdir entry each); NOTE(review): presumably one entry per level.
+    dump_path = folder_path
+    for _ in range(2):
+        dump_path = os.path.join(dump_path, os.listdir(dump_path)[0])
+    subgraph_map_best = { "0": 0, "1": 0, "2": 1, "3": 1 }
+    subgraph_map_worst = { "0": 1, "1": 1, "2": 0, "3": 0 }
+    numa_stride = { "arg0" : [1], "arg1" : [1] , "arg2": [0 , 2] }
+
+    subgraph_map_best1k = { "0": 0, "1": 0, "2": 1, "3": 1 }
+    subgraph_map_worst1k = { "0": 1, "1": 1, "2": 0, "3": 0 }
+    numa_stride_1k = { "arg0" : [1], "arg1" : [1] , "arg2": [0 , 2] }
+
+    subgraph_map_best2k = {
+        "0": 0,
+        "1": 0,
+        "2": 0,
+        "3": 0,
+        "4": 1,
+        "5": 1,
+        "6": 1,
+        "7": 1
+    }
+    subgraph_map_worst2k = {
+        "0": 1,
+        "1": 1,
+        "2": 1,
+        "3": 1,
+        "4": 0,
+        "5": 0,
+        "6": 0,
+        "7": 0
+    }
+    numa_stride_2k = { "arg0" : [2], "arg1" : [1] , "arg2": [0 , 4] }
+    # Pick the (stride, best map, worst map) triple for this size; other sizes use the defaults.
+    variants = {
+        1024: (numa_stride_1k, subgraph_map_best1k, subgraph_map_worst1k),
+        2048: (numa_stride_2k, subgraph_map_best2k, subgraph_map_worst2k),
+    }
+    stride, best_map, worst_map = variants.get(size, (numa_stride, subgraph_map_best, subgraph_map_worst))
+    modify_file(dump_path, "best", stride, best_map)
+    modify_file(dump_path, "worst", stride, worst_map)
+
diff --git a/scripts/chiplet_prep.sh b/scripts/chiplet_prep.sh
new file mode 100755
index 00000000..6976b198
--- /dev/null
+++ b/scripts/chiplet_prep.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+sizes=(256 512 1024 2048)
+# Run chiplet_prep.py once for each size
+for size in "${sizes[@]}"; do
+    echo "Processing size: $size"
+
+    # Set environment variables for this run (exported so the python3 child process sees them)
+    export TORCHSIM_FORCE_TIME_M=$((size / 2))
+    export TORCHSIM_FORCE_TIME_K=$((size / 2))
+    export TORCHSIM_FORCE_TIME_N=$((size / 2))
+    export TORCHSIM_DUMP_PATH=$(pwd)/chiplet_result/$size
+    python3 chiplet_prep.py $size
+done
\ No newline at end of file

From ae2cdd9f417383877a8c7f0d4b2095ded5864899 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 15 Apr 2025 07:11:49 +0000
Subject: [PATCH 278/432] [Scheduler] Fix parallel execution mechanism

---
 .github/workflows/docker-image.yml            | 19 ++++++-
 .github/workflows/pull-request.yml            | 18 ++++++-
 ...stolic_ws_128x128_c1_simple_noc_tpuv3.json |  7 +--
 PyTorchSimBackend/src/Simulator.cc            | 19 +++++--
 PyTorchSimBackend/src/TileGraphParser.cc      |  1 +
 Scheduler/scheduler.py                        | 50 +++++++++++--------
 Simulator/simulator.py                        |  7 ++-
 tests/test_scheduler.py                       | 12 +++--
 8 files changed, 98 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index c30a09c7..8c35edc7 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -575,6 +575,22 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_indirect_access.py
 
+  test_scheduler:
+    name: Run test_scheduler
+    runs-on: self-hosted
+    needs: build
+    env:
+      GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+    steps:
+      - name: Run test_scheduler.py
+        run: |
+          echo "Running test_scheduler.py"
+          echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_scheduler.py
+
   test_cleanup:
     name: Clean test cases
     runs-on: self-hosted
@@ -583,7 +599,8 @@ jobs:
             test_transpose2D, test_view3D_2D, test_layernorm,
             test_mlp, test_resnet, test_transformer, test_transpose3D,
             test_sparsity, test_activation, test_pool, test_perceptron,
-            test_fusion, test_mistral, test_moe, test_indirect]
+            test_fusion, test_mistral, test_moe, test_indirect, test_scheduler]
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 35d45e6c..a1078031 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -597,6 +597,22 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_indirect_access.py
 
+  test_scheduler:
+    name: Run test_scheduler
+    runs-on: self-hosted
+    needs: build
+    env:
+      GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+    steps:
+      - name: Run test_scheduler.py
+        run: |
+          echo "Running test_scheduler.py"
+          echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_scheduler.py
+
   test_cleanup:
     name: Clean test cases
     runs-on: self-hosted
@@ -605,7 +621,7 @@ jobs:
             test_transpose2D, test_view3D_2D, test_layernorm,
             test_mlp, test_resnet, test_transformer, test_transpose3D,
             test_sparsity, test_activation, test_pool, test_perceptron,
-            test_fusion, test_mistral, test_moe, test_indirect]
+            test_fusion, test_mistral, test_moe, test_indirect, test_scheduler]
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
index cd8396ff..55c04b92 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
@@ -21,10 +21,5 @@
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
 
   "precision" : 4,
-  "scheduler" : "simple",
-  "num_partition" : 2,
-  "partition": {
-    "core_0":0,
-    "core_1":0
-  }
+  "scheduler" : "simple"
 }
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 99e71964..aaed8bda 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -185,17 +185,30 @@ int Simulator::until(cycle_type until_cycle) {
 
     // Check if core status has changed
     if (_core_cycles % 10 == 0) {
+      int bitmap = 0;
       for (int i=0; i<_partition_scheduler.size(); i++) {
         /* Skip this */
         if (partition_scheudler_status.at(i))
           continue;
 
-        if (_partition_scheduler.at(i)->empty())
-          return i;
+        if (_partition_scheduler.at(i)->empty()) {
+          bitmap |= (1 << i);
+        }
       }
+      if (bitmap)
+        return bitmap;
     }
   }
-  return -1;
+  int bitmap = 0;
+  for (int i=0; i<_partition_scheduler.size(); i++) {
+    /* Skip this */
+    if (partition_scheudler_status.at(i))
+      continue;
+
+    if (_partition_scheduler.at(i)->empty())
+      bitmap |= (1ULL << i);
+  }
+  return bitmap;
 }
 
 void Simulator::cycle() {
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 1dc2c632..b1bff65f 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -218,6 +218,7 @@ TileMemoryNode::TileMemoryNode(onnx::NodeProto& node) : TileNode(node) {
       _is_async = attribute.i();
     } else if (attribute.name() == "torchsim_indirect_mode") {
       _is_indirect = attribute.i();
+    } else if (attribute.name() == "torchsim_name") {
     } else {
       spdlog::info("Unknown attribute: {}", attribute.name());
     }
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 89f9ded9..7aa3e931 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -118,9 +118,7 @@ def find_model(self, model_name : str):
         if model_name in SchedulerDNNModel.MODEL_MAP:
             return SchedulerDNNModel.MODEL_MAP[model_name]
         else:
-            print(f'[Scheduler] Requested model "{model_name}"is not registered...')
-            return None
-
+            raise KeyError(f'[Scheduler] Requested model "{model_name}" is not registered...')
 
     def get_batchable_input(self):
         batched_input_tensor = []
@@ -217,7 +215,10 @@ def get_compiled_model(self, batched_req: List[Request], request_queue_idx):
     def is_partition_idle(self, partition_idx):
         return len(self.launch_model_dicts[partition_idx]) == 0
 
-    def is_idle(self):
+    def is_any_idle(self, skip_list):
+        return any([self.is_partition_idle(i) and not skip_list[i] for i in range(self.num_partion)])
+
+    def is_all_idle(self):
         return all([self.is_partition_idle(i) for i in range(self.num_partion)])
 
     def prepare_model(self, req_model: SchedulerDNNModel):
@@ -249,10 +250,10 @@ def prepare_launch_kernel(self, kernel, inputs):
     def launch_kernel(self, current_cycle, partion_idx=0):
         # Check partition is busy
         if self.partition_state[partion_idx] != self.PARTITION_IDLE:
-            return None
+            return self.partition_state[partion_idx]
         result = self.select_kernel(partion_idx)
         if result == self.SELECT_NOTHING:
-            return None
+            return self.SELECT_NOTHING
         kernel, inputs = result
         if not isinstance(kernel, str):
             onnx_path, attribute_path = self.prepare_launch_kernel(kernel, inputs)
@@ -367,11 +368,16 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE,
             exit(1)
 
     def add_request(self, request: Request, request_time=-1):
-        """register model at timestamp time"""
+        """register model at timestamp time
+            request_time : msec
+        """
         request_time = self.current_time() if request_time == -1 else request_time
         request.arrival_time = request_time
         self.request_queue[request.request_queue_idx].append(request)
 
+    def request_empty(self, request_queue_idx):
+        return len(self.request_queue[request_queue_idx])==0
+
     def select(self, request_queue_idx=0) -> List[Request]:
         """
         Select 1 request from request_queue in FCFS manner.
@@ -457,12 +463,12 @@ def schedule(self):
 
         # Try move to next nearest request time
         next_req, next_time = self.nearest_next_reqeust_time()
-        if next_req is None and self.execution_engine.is_idle():
+        if next_req is None and self.execution_engine.is_all_idle():
             # No request remained...
             return
 
         # Need to forward the time until next_arrival_time
-        if self.execution_engine.is_idle():
+        if self.execution_engine.is_all_idle():
             reason = self.backend_simulator.until(self.msec_to_cycle(next_time))
             self.current_cycle = self.backend_simulator.cycle()
         else:
@@ -470,41 +476,45 @@ def schedule(self):
         return
 
     def run(self, until_time):
+        req_empty_info = [self.request_empty(i) for i in range(self.execution_engine.num_partion)]
         def execute_cycle():
+            launch_ret_info = []
             for i in range(self.execution_engine.num_partion):
                 if self.execution_engine.partition_state[i] == ExecutionEngine.PARTITION_IDLE:
                     ret = self.execution_engine.launch_kernel(self.current_cycle, i)
+                    launch_ret_info.append(ret)
 
             self.check_finish_request()
             # Check if the stop condition is met
-            if self.execution_engine.is_idle():
-                return -1
+            if self.execution_engine.is_any_idle(req_empty_info) or self.execution_engine.is_all_idle(): # Ignore empty request queue
+                return []
 
             # Schedule jobs and update the current time
-            result = self.backend_simulator.until(self.msec_to_cycle(until_time))
+            result_list = self.backend_simulator.until(self.msec_to_cycle(until_time))
             self.current_cycle = self.backend_simulator.cycle()
 
-            if result != -1:
+            for core_idx in result_list:
                 # Kernel is finished. So set idle state
-                self.execution_engine.partition_state[result] = ExecutionEngine.PARTITION_IDLE
+                self.execution_engine.partition_state[core_idx] = ExecutionEngine.PARTITION_IDLE
 
-            return result
+            return result_list
 
         if self.current_cycle >= self.msec_to_cycle(until_time):
             until_time = -1
 
         if until_time == -1:
-            while not self.execution_engine.is_idle():
+            while not self.execution_engine.is_any_idle(req_empty_info):
                 result = execute_cycle()
+                req_empty_info = [self.request_empty(i) for i in range(self.execution_engine.num_partion)]
                 # if result is not -1, schedule new request
-                if result == -1:
+                if len(result)==0:
                     break
 
         else:
-            while self.current_cycle <= self.msec_to_cycle(until_time) and not self.execution_engine.is_idle():
+            while self.current_cycle <= self.msec_to_cycle(until_time) and not self.execution_engine.is_all_idle():
                 result = execute_cycle()
                 # if result is not -1, schedule new request
-                if result == -1:
+                if len(result)==0:
                     break
         return
 
@@ -515,7 +525,7 @@ def is_request_queue_empty(self):
         return result
 
     def is_finished(self):
-        if self.is_request_queue_empty() and self.execution_engine.is_idle():
+        if self.is_request_queue_empty() and self.execution_engine.is_all_idle():
             self.backend_simulator.wait()
             return True
         return False
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 40351b14..de92663e 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -296,7 +296,12 @@ def cycle(self):
     def until(self, until_cycle):
         command = f"until {until_cycle}"
         ret = self.send_command(command)
-        return int(ret.split(" ")[-1])
+        bitmap = int(ret.split(" ")[-1])
+        indices = []
+        for i in range(64):
+            if (bitmap >> i) & 1:
+                indices.append(i)
+        return indices
 
     def quit(self):
         command = "quit"
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index cec07fb3..e05fa392 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -4,14 +4,16 @@
 from torchvision.models import resnet18 as model1
 from test_transformer import DecoderBlock as model2
 
-sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.append(base_path)
 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+config = f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json'
 
 target_model1 = model1().eval()
 target_model2 = model2(768, 12).eval()
 
 # Init scheduler
-scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE)
+scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
 # Register compiled model
 opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last))
 opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device()))
@@ -20,15 +22,19 @@
 
 # Init input data
 model_input1 = torch.randn(1, 3, 224, 224)
-model_input2 = torch.randn(512, 768)
+model_input2 = torch.randn(128, 768)
 
 # Init request
 new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0)
 new_request2 = Request("bert", [model_input2], [], request_queue_idx=1)
+new_request3 = Request("resnet18", [model_input1], [], request_queue_idx=0)
+new_request4 = Request("bert", [model_input2], [], request_queue_idx=1)
 
 # Add request to scheduler
 scheduler.add_request(new_request1, request_time=0)
 scheduler.add_request(new_request2, request_time=0)
+scheduler.add_request(new_request3, request_time=0)
+scheduler.add_request(new_request4, request_time=0)
 
 # Run scheduler
 while not scheduler.is_finished():

From 71af9f9ef000cfb702545ba2a873b8f8e91070f2 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 15 Apr 2025 11:19:44 +0000
Subject: [PATCH 279/432] [Frontend/Template] Use torch.tensor&affine_map
 instead of is_transposed()

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 49 +++++++------------
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 47 ++++++------------
 tests/test_transformer.py                     |  2 +-
 3 files changed, 33 insertions(+), 65 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 588c6f42..493a7a4b 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -1,4 +1,5 @@
 import os
+from torch import empty_strided
 from typing import List, Optional, cast
 
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
@@ -20,8 +21,8 @@
 // TILE_K = {{ TILE_K }}
 // SUB_TILE_M = {{ SUB_TILE_M }}
 // SUB_TILE_N = {{ SUB_TILE_N }}
-{% if X_transposed %}#map0 = affine_map<(d0, d1, d2) -> (d0 * {{ K * M }} + d2 * {{ M }} + d1)>{% else %}#map0 = affine_map<(d0, d1, d2) -> (d0 * {{ M * K }} + d1 * {{ K }} + d2)>{% endif %}
-{% if W_transposed %}#map1 = affine_map<(d0, d1, d2) -> (d0 * {{ N * K }} + d2 * {{ K }} + d1)>{% else %}#map1 = affine_map<(d0, d1, d2) -> (d0 * {{ K * N }} + d1 * {{ N }} + d2)>{% endif %}
+#map0 = affine_map<(d0, d1, d2) -> ({{ X_map }})>
+#map1 = affine_map<(d0, d1, d2) -> ({{ W_map }})>
 #map2 = affine_map<(d0, d1, d2) -> (d0 * {{ M * N }} + d1 * {{ N }} + d2)>
 memref.global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
@@ -86,30 +87,6 @@ class MLIRBMMTemplate(MLIRTemplate):
     def __init__(self, input_nodes, layout, input_reorder=None):
         super().__init__("kernel", input_nodes, layout, input_reorder)
 
-    def is_transposed(self, node):
-        if isinstance(node, ReinterpretView):
-            unsqueezed_layout_stride = [s for s, size in zip(node.layout.stride, node.layout.size) if size > 1]
-            unsqueezed_data_stride = [s for s, size in zip(node.data.layout.stride, node.data.layout.size) if size > 1]
-
-            if 0 in node.layout.stride: # [MoE] Temporary solution
-                if node.layout.stride[1] == 0:
-                    return True
-            if len(node.layout.stride) == len(node.data.layout.stride):
-                if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
-                    return True
-                else:
-                    raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
-            elif len(node.layout.stride) < len(node.data.layout.stride):
-                # Squeezed case
-                if node.layout.stride == node.data.layout.stride[-len(node.layout.stride):]:
-                    return False
-                if len(unsqueezed_layout_stride) < len(unsqueezed_data_stride):
-                    if unsqueezed_layout_stride == unsqueezed_data_stride[-len(unsqueezed_layout_stride):]:
-                        return False
-                raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
-
-        return False
-
     def render(self,
                kernel: MLIRTemplateKernel,
                template_buffer_node = None,
@@ -124,7 +101,18 @@ def render(self,
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
-        B, M, N, K = X.get_size()[0], X.get_size()[1], W.get_size()[2], X.get_size()[2]
+        W_tensor =  empty_strided(W.layout.size, W.layout.stride)
+        X_tensor =  empty_strided(X.layout.size, X.layout.stride)
+        if len(W_tensor.size()) > 3:
+          W_tensor = W_tensor.view([-1, W_tensor.shape[-2], W_tensor.shape[-1]])
+        if len(X_tensor.size()) > 3:
+          X_tensor = X_tensor.view([-1, X_tensor.shape[-2], X_tensor.shape[-1]])
+        W_stride = W_tensor.stride()
+        X_stride = X_tensor.stride()
+        W_map = " + ".join([f"d{idx}*{s}" for idx, s in enumerate(W_stride)])
+        X_map = " + ".join([f"d{idx}*{s}" for idx, s in enumerate(X_stride)])
+
+        B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2]
         TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
         TOG_latency = M if TILE_M > M else TILE_M
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
@@ -132,9 +120,6 @@ def render(self,
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
 
-        W_transposed = self.is_transposed(W)
-        X_transposed = self.is_transposed(X)
-
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -155,8 +140,8 @@ def render(self,
             Y = Y,
             Bias = Bias,
             Bias_rank = len(Bias.data.get_size()) if Bias is not None else 0,
-            X_transposed = X_transposed,
-            W_transposed = W_transposed,
+            X_map = X_map,
+            W_map = W_map,
             Y_numel = B * M * N,
             input_reorder = self.input_reorder
         )
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 00d23e76..442e533a 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -1,4 +1,5 @@
 import os
+from torch import empty_strided
 from typing import List, Optional, cast
 
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
@@ -20,8 +21,8 @@
 // TILE_K = {{ TILE_K }}
 // SUB_TILE_M = {{ SUB_TILE_M }}
 // SUB_TILE_N = {{ SUB_TILE_N }}
-{% if X_transposed %}#map0 = affine_map<(d0, d1) -> (d1 * {{ M }} + d0)>{% else %}#map0 = affine_map<(d0, d1) -> (d0 * {{ K }} + d1)>{% endif %}
-{% if W_transposed %}#map1 = affine_map<(d0, d1) -> (d1 * {{ K }} + d0)>{% else %}#map1 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>{% endif %}
+#map0 = affine_map<(d0, d1) -> ({{ X_map }})>
+#map1 = affine_map<(d0, d1) -> ({{ W_map }})>
 #map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
 memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
@@ -87,30 +88,6 @@ class MLIRGemmTemplate(MLIRTemplate):
     def __init__(self, input_nodes, layout, input_reorder=None):
         super().__init__("kernel", input_nodes, layout, input_reorder)
 
-    def is_transposed(self, node):
-        if isinstance(node, ReinterpretView):
-            unsqueezed_layout_stride = [s for s, size in zip(node.layout.stride, node.layout.size) if size > 1]
-            unsqueezed_data_stride = [s for s, size in zip(node.data.layout.stride, node.data.layout.size) if size > 1]
-
-            if 0 in node.layout.stride: # [MoE] Temporary solution
-                if node.layout.stride[1] == 0:
-                    return True
-            if len(node.layout.stride) == len(node.data.layout.stride):
-                if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
-                    return True
-                else:
-                    raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
-            elif len(node.layout.stride) < len(node.data.layout.stride):
-                # Squeezed case
-                if node.layout.stride == node.data.layout.stride[-len(node.layout.stride):]:
-                    return False
-                if len(unsqueezed_layout_stride) < len(unsqueezed_data_stride):
-                    if unsqueezed_layout_stride == unsqueezed_data_stride[-len(unsqueezed_layout_stride):]:
-                        return False
-                raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
-
-        return False
-
     def render(self,
                kernel: MLIRTemplateKernel,
                template_buffer_node = None,
@@ -125,7 +102,16 @@ def render(self,
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
-        M, N, K = X.get_size()[0], W.get_size()[1], X.get_size()[1]
+        W_tensor =  empty_strided(W.layout.size, W.layout.stride)
+        X_tensor =  empty_strided(X.layout.size, X.layout.stride)
+        if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2:
+            raise NotImplementedError("Please report this case to us...")
+        W_stride = W_tensor.stride()
+        X_stride = X_tensor.stride()
+        W_map = " + ".join([f"d{idx}*{s}" for idx, s in enumerate(W_stride)])
+        X_map = " + ".join([f"d{idx}*{s}" for idx, s in enumerate(X_stride)])
+
+        M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1]
         n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
         if (M == 0) or (N == 0) or (K == 0):
             TILE_M, TILE_N, TILE_K = 1, 1, 1
@@ -142,9 +128,6 @@ def render(self,
         TOG_latency = M if SUB_TILE_M > M else SUB_TILE_M
         kernel.loop_size =[TOG_latency, SUB_TILE_N, SUB_TILE_K]
 
-        W_transposed = self.is_transposed(W)
-        X_transposed = self.is_transposed(X)
-
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -164,8 +147,8 @@ def render(self,
             Y = Y,
             Bias = Bias,
             Bias_rank = len(Bias.data.get_size()) if Bias is not None else 0,
-            X_transposed = X_transposed,
-            W_transposed = W_transposed,
+            X_map = X_map,
+            W_map = W_map,
             Y_numel = M * N,
             epilogue_nodes = epilogue_nodes,
             input_reorder = self.input_reorder
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index c760008b..83ed5850 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -36,7 +36,7 @@ def __init__(self, h, d_model, dropout=0.1):
     def forward(self, query, key, value):
         # 1) Do all the linear projections in batch from d_model => h x d_k
         query, key, value = [
-            lin(x).view(-1, self.h, self.d_k).transpose(0, 1).contiguous()
+            lin(x).view(-1, self.h, self.d_k).transpose(0, 1)
             for lin, x in zip(self.linears, (query, key, value))
         ]
 

From 44affc39e35cec4d0a5a7dbfc4e42a10c69c481e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 15 Apr 2025 11:48:24 +0000
Subject: [PATCH 280/432] [Config] Add multitenancy experiment

---
 ...128x128_c2_simple_noc_tpuv3_partition.json | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json
new file mode 100644
index 00000000..132a52e6
--- /dev/null
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json
@@ -0,0 +1,30 @@
+{
+  "num_cores" : 2,
+  "core_freq" : 940,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+  "num_systolic_array_per_core" : 2,
+
+  "dram_type" : "ramulator2",
+  "dram_freq" : 940,
+  "dram_channels": 32,
+  "dram_req_size": 32,
+  "dram_latency" : 10,
+  "dram_size" : 32,
+  "dram_nbl" : 2,
+  "dram_print_interval": 10000,
+  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
+
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 28000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
+
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 2,
+  "partition": {
+    "core_0":0,
+    "core_1":1
+  }
+}
\ No newline at end of file

From 34909a4825a8d4eb3f7ee01195bbb1ac64fbef11 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 17 Apr 2025 08:41:14 +0000
Subject: [PATCH 281/432] [Fix] Maximize vlane usage when decreasing tile size

---
 PyTorchSimFrontend/mlir/mlir_common.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index d743b187..c64fabac 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -454,11 +454,19 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         vlane_stride = 8
 
         # FIXME: Naive tile size decrement
-        def decrease_tile_size(tile_size):
+        def decrease_tile_size(tile_size, vlane_split_axis):
+            is_decreased = False
+            # Decrease tile size
             for i in range(len(tile_size)):
+                if i == vlane_split_axis:
+                    continue
                 if tile_size[i] > 1:
                     tile_size[i] = int(tile_size[i] // 2)
+                    is_decreased = True
                     break
+            if not is_decreased:
+                if tile_size[vlane_split_axis] > 1:
+                    tile_size[vlane_split_axis] = int(tile_size[vlane_split_axis] // 2)
             return tile_size
 
         # FIXME: Not considering removed buffers
@@ -490,14 +498,10 @@ def decrease_tile_size(tile_size):
             padded_size = used_vlane * vlane_stride
             tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size
 
-            used_vlane = min((tile_size[vlane_split_axis] + vlane_stride - 1) // vlane_stride, self.vector_lane)
-            padded_size = used_vlane * vlane_stride
-            tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size
-
             # Check spad overflow
             spad_usage_per_vlane = n_buffer * math.prod(tile_size) * self.precision // used_vlane
             if spad_usage_per_vlane >= self.spad_info["spad_size"]:
-                new_tile_size = decrease_tile_size(tile_size.copy())
+                new_tile_size = decrease_tile_size(tile_size.copy(), vlane_split_axis)
                 if new_tile_size == tile_size:
                     raise NotImplementedError("Error: Cannot find proper tile size")
                 tile_size = new_tile_size

From 2dac5ae7bda9e5e8806035cc9221296d6dbf4dba Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 16 Apr 2025 02:15:41 +0000
Subject: [PATCH 282/432] [Frontend] adjust fine-grained DMA

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 5 ++++-
 PyTorchSimFrontend/mlir/mlir_lowering.py      | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 3e69f979..7a19a8a1 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -588,7 +588,7 @@ def is_transposed(self, node):
         return False
 
     def is_multi_tile(self, I_C):
-        return I_C < 16 # 16 is hard-coded for now. This should be changed to a better heuristic.
+        return I_C < (self.kernel.vector_lane // 8) # 8 is hard-coded for now. This should be changed to a better heuristic.
 
     def is_single_batch(self, BATCH):
         return BATCH == 1
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 442e533a..3152aee2 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -123,7 +123,10 @@ def render(self,
         TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
         TILE_K = min(extension_config.CONFIG_FORCE_TILE_K, TILE_K)
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_N = TILE_N
+        if (TILE_M == M and TILE_N == N):
+            SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+        else: # Avoid Row Conflict of weights
+            SUB_TILE_N = TILE_N
         SUB_TILE_K = TILE_K
         TOG_latency = M if SUB_TILE_M > M else SUB_TILE_M
         kernel.loop_size =[TOG_latency, SUB_TILE_N, SUB_TILE_K]
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index fc96a255..df816cd1 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -159,4 +159,4 @@ def sparse_addmm(*args, **kwargs):
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
 lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
 lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
-# lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
\ No newline at end of file
+lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
\ No newline at end of file

From 237acacd4b8e375e9922cc750a6840731db7fcb2 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 16 Apr 2025 02:17:56 +0000
Subject: [PATCH 283/432] [Experiments] simulation cycle & tpuv3 booksim config

---
 .../configs/booksim2_configs/fly_c2_m32.icnt  | 17 ++++++++++++
 .../systolic_ws_128x128_c2_booksim_tpuv3.json |  2 +-
 experiments/BERT.py                           |  1 +
 experiments/conv.py                           |  1 +
 experiments/gemm.py                           |  1 +
 experiments/resnet18.py                       |  1 +
 experiments/resnet50.py                       |  1 +
 scripts/sim_time.sh                           | 27 +++++++++++++++++++
 8 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt
 create mode 100755 scripts/sim_time.sh

diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt b/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt
new file mode 100644
index 00000000..f8874f20
--- /dev/null
+++ b/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt
@@ -0,0 +1,17 @@
+[config]
+use_map = 0
+flit_size = 64
+topology = fly
+k = 34
+n = 1
+routing_function = dest_tag
+subnets = 1
+
+vc_buf_size = 64
+input_buffer_size = 256
+ejection_buffer_size = 64
+boundary_buffer_size = 64
+wait_for_tail_credit = 0
+vc_allocator = islip
+sw_allocator = islip
+alloc_iters = 1
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
index 5e30fd43..7115b475 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
@@ -16,7 +16,7 @@
   "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
  
   "icnt_type" : "booksim2",
-  "icnt_latency" : 1,
+  "icnt_latency" : 7,
   "icnt_freq" : 1000,
   "icnt_node_per_core" : 16,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt",
diff --git a/experiments/BERT.py b/experiments/BERT.py
index e954131b..e7d6fb35 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -123,3 +123,4 @@ def run_BERT(device, size, input_seq, validation):
     run_BERT(device, size, input_seq, args.validation)
     # compute cycles with shell script
     subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
+    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
diff --git a/experiments/conv.py b/experiments/conv.py
index bd587edc..de2c9128 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -68,3 +68,4 @@ def custom_conv2d(a, b, bias):
     run_conv2d(device, size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], args.validation)
     # compute cycles with shell script
     subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
+    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
diff --git a/experiments/gemm.py b/experiments/gemm.py
index b9c24fed..4fcf2c38 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -66,3 +66,4 @@ def custom_matmul(a, b):
     run_matmul(device, size[0], size[1], size[2], args.validation)
     # compute cycles with shell script
     subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
+    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index 4d5c4c6e..1f74df4d 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -42,3 +42,4 @@ def run_resnet(device, batch):
     run_resnet(device, batch)
     # compute cycles with shell script
     subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
+    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index da8aa710..788fd591 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -42,3 +42,4 @@ def run_resnet(device, batch):
     run_resnet(device, batch)
     # compute cycles with shell script
     subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
+    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
diff --git a/scripts/sim_time.sh b/scripts/sim_time.sh
new file mode 100755
index 00000000..15c60736
--- /dev/null
+++ b/scripts/sim_time.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Base directory
+BASE_PATH=$1 # Input as the first argument
+
+# Initialize total_sum as string for awk processing
+total_sum=0.0
+
+# Find all backendsim_result folders
+mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result")
+
+# Iterate over each backendsim_result folder
+for backend_folder in "${backend_folders[@]}"; do
+  mapfile -t files < <(find "$backend_folder" -type f)
+
+  for file in "${files[@]}"; do
+    sim_time=$(grep "Simulation time:" "$file" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+(\.[0-9]+)?).*/\1/')
+    echo "file: $file total_cycle: $sim_time"
+
+    if [[ -n "$sim_time" ]]; then
+      total_sum=$(awk -v a="$total_sum" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}')
+    fi
+  done
+done
+
+# Print the total simulation time
+echo "simulation time: $total_sum"

From e0fe70f10d7618d0d995a1bb0cbe1fbea390bea2 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 16 Apr 2025 02:18:47 +0000
Subject: [PATCH 284/432] [TOGSim] stats re-order

---
 PyTorchSimBackend/src/Simulator.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index aaed8bda..9893b60e 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -282,9 +282,10 @@ uint32_t Simulator::get_dest_node(mem_fetch *access) {
 
 void Simulator::print_core_stat()
 {
+  _icnt->print_stats();
+  _dram->print_stat();
   for (int core_id = 0; core_id < _n_cores; core_id++) {
     _cores[core_id]->print_stats();
   }
-  _icnt->print_stats();
-  _dram->print_stat();
+  spdlog::info("Total execution cycle: {}", _core_cycles);
 }
\ No newline at end of file

From e9f1c60e742456bd8a178e56b0c358b0c274fb01 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 16 Apr 2025 12:42:28 +0000
Subject: [PATCH 285/432] [Frontend] Comment out template maxpool

---
 PyTorchSimFrontend/mlir/mlir_lowering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index df816cd1..ba4cce44 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -159,4 +159,4 @@ def sparse_addmm(*args, **kwargs):
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
 lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
 lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
-lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
\ No newline at end of file
+#lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
\ No newline at end of file

From 183c1895d95bb8bc40eaaf38f755da31879e10d5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 16 Apr 2025 12:42:51 +0000
Subject: [PATCH 286/432] [Config] Make scratchpad size configurable

---
 PyTorchSimFrontend/extension_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 4e035db4..e053f8d5 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -7,7 +7,7 @@
 CONFIG_SPAD_INFO = {
   "spad_vaddr" : 0xD0000000,
   "spad_paddr" : 0x2000000000,
-  "spad_size" : 128 << 10
+  "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane
 }
 CONFIG_PRECISION = 4 # 32bit
 CONFIG_NUM_CORES = 1

From 1ce67893431dae485b712e0bc8e3f023b5b086b6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 16 Apr 2025 12:56:02 +0000
Subject: [PATCH 287/432] [CI] Add mobile config CI

---
 .github/workflows/pull-request_mobile.yml | 632 ++++++++++++++++++++++
 1 file changed, 632 insertions(+)
 create mode 100644 .github/workflows/pull-request_mobile.yml

diff --git a/.github/workflows/pull-request_mobile.yml b/.github/workflows/pull-request_mobile.yml
new file mode 100644
index 00000000..d3698133
--- /dev/null
+++ b/.github/workflows/pull-request_mobile.yml
@@ -0,0 +1,632 @@
+name: PR test CI for mobile
+
+on:
+  pull_request:
+    branches: [ "master", "develop" ]
+
+jobs:
+  build:
+    runs-on: [self-hosted, Linux]
+
+    permissions:
+      contents: read
+      packages: write
+      attestations: write
+      id-token: write
+
+    steps:
+      # Step 1: Checkout the repository
+      - name: Checkout Code
+        uses: actions/checkout@v4
+      # Step 2: Log in to GitHub Container Registry (optional)
+      # If you need to push the built image, authenticate here.
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+
+      # Step 3: Pull the cached base image and export the variables used by the build step
+      - name: Pull Cached Image & Set environment
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available"
+          echo "IMAGE_TAG=torchsim-ci:${GITHUB_SHA}" >> $GITHUB_ENV  # written before GITHUB_SHA is re-exported on the next line
+          echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV
+          echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}"
+          gem5_response_file=/tmp/releases-gem5-latest.json
+          response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} )
+          GEM5_ASSET_ID=$(cat ${gem5_response_file} | jq ".assets[0]."id"")
+          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID"
+          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV
+
+          llvm_response_file=/tmp/releases-llvm-latest.json  # own file, so the GEM5 response above is not overwritten
+          response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} )
+          LLVM_ASSET_ID=$(cat ${llvm_response_file} | jq ".assets[0]."id"")
+          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID"
+          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV
+
+          mkdir -p /tmp/torchsim-ci/${GITHUB_SHA}
+          echo "DUMP_PATH=/tmp/torchsim-ci/${GITHUB_SHA}"
+
+      # Step 4: Build and Push Docker Image
+      - name: Build and Push Docker Image
+        uses: docker/build-push-action@v4
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          build-args: |
+            GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
+            LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
+            GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
+            TORCHSIM_SHA=${{ env.GITHUB_SHA }}
+          tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}}
+
+  test_add:
+    name: Run test_add.py
+    runs-on: self-hosted
+
+    permissions:
+      contents: read
+      packages: write
+      attestations: write
+      id-token: write
+    needs: build
+
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_add.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_add.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_add.py
+
+  test_activation:
+    name: Run test_activation.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_activation.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_activation.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_activation.py
+
+  test_batchnorm:
+    name: Run test_batchnorm.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_batchnorm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_batchnorm.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_batchnorm.py
+
+  test_bmm:
+    name: Run test_bmm.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_bmm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_bmm.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_bmm.py
+
+  test_cnn:
+    name: Run test_cnn.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_cnn.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_cnn.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_cnn.py
+
+  test_conv2d:
+    name: Run test_conv2d.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_conv2d.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_conv2d.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_conv2d.py
+
+  test_matmul:
+    name: Run test_matmul.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_matmul.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_matmul.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_matmul.py
+
+  test_reduce:
+    name: Run test_reduce.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_reduce.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_reduce.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_reduce.py
+
+  test_softmax:
+    name: Run test_softmax.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_softmax.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_softmax.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_softmax.py
+
+  test_transpose2D:
+    name: Run test_transpose2D.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_transpose2D.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_transpose2D.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose2D.py
+
+  test_view3D_2D:
+    name: Run test_view3D_2D.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_view3D_2D.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_view3D_2D.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_view3D_2D.py
+
+  test_layernorm:
+    name: Run test_layernorm.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_layernorm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_layernorm.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_layernorm.py
+
+  test_mlp:
+    name: Run test_mlp.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_mlp.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_mlp.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_mlp.py
+
+  test_resnet:
+    name: Run test_resnet.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_resnet.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_resnet.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_resnet.py
+
+  test_transformer:
+    name: Run test_transformer.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_transformer.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_transformer.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transformer.py
+
+  test_transpose3D:
+    name: Run test_transpose3D.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_transpose3D.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_transpose3D.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose3D.py
+
+  test_sparsity:
+    name: Run test_sparsity.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_sparsity.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_sparsity.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_sparsity.py
+
+  test_pool:
+    name: Run test_pool.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_pool.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_pool.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_pool.py
+
+  test_perceptron:
+    name: Run test_perceptron.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_single_perceptron.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_single_perceptron.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_single_perceptron.py
+
+  test_fusion:
+    name: Run test_fusion
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_addmm_residual.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_addmm_residual.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_matmul_activation.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_matmul_activation.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_matmul_scalar.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_matmul_scalar.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_conv_fusion.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_conv_fusion.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py
+
+  test_moe:
+    name: Run test_moe
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_moe.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_moe.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/MoE/test_moe.py
+
+  test_mistral:
+    name: Run test_mistral
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+      - name: Run test_mistral.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_mistral.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py
+
+  test_indirect:
+    name: Run test_indirect
+    runs-on: self-hosted
+    needs: build
+    env:
+      GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+    steps:
+      - name: Run test_indirect.py
+        run: |
+          echo "Running test_indirect.py"
+          echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_indirect_access.py
+
+  test_scheduler:
+    name: Run test_scheduler
+    runs-on: self-hosted
+    needs: build
+    env:
+      GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+    steps:
+      - name: Run test_scheduler.py
+        run: |
+          echo "Running test_scheduler.py"
+          echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_scheduler.py
+
+  test_cleanup:  # chowns the shared dump directory to the runner user after the test jobs
+    name: Clean test cases
+    runs-on: self-hosted
+    needs: [test_add, test_batchnorm, test_bmm, test_cnn, test_conv2d,
+            test_matmul, test_reduce, test_softmax,
+            test_transpose2D, test_view3D_2D, test_layernorm,
+            test_mlp, test_resnet, test_transformer, test_transpose3D,
+            test_sparsity, test_activation, test_pool, test_perceptron,
+            test_fusion, test_mistral, test_moe, test_indirect, test_scheduler]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+      - name: Clean test case  # $(id -u):$(id -g) is expanded by the runner shell, not inside the container
+        run: |
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump

From 65a8494c7f831c7415e3e976d82c38a82f301b61 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 17 Apr 2025 11:12:09 +0000
Subject: [PATCH 288/432] [Frontend] Rework indexed operation

---
 .../mlir/mlir_caller_codegen.py               |   2 +
 .../mlir/mlir_codegen_backend.py              | 159 +++++++++++++-----
 tests/test_cnn.py                             |   2 +-
 tests/test_pool.py                            |   5 +-
 4 files changed, 119 insertions(+), 49 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
index 9da276f6..3fff9958 100644
--- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
+++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
@@ -50,6 +50,8 @@ def generate_kernel_declare(self):
 
     def generate_args_define(self):
         name_set = set()
+        if self.validation:
+            self.writeline(f'int padding[0x100000]{self.ending}') # FIXME. For pooling operation... Some pooling layer use negative offset
         for arg_name, (_, arg_type, arg_size, arg_sizes, arg_stride) in self.arg_attributes:
             if not arg_name in name_set:
                 if self.validation:
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 772e2338..759da7d4 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -193,6 +193,16 @@ def truediv(operand1, operand2, *args, var_info=None, **kwargs):
             opcode = f'arith.divui'
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
+    @staticmethod
+    def modular(operand1, operand2, *args, var_info=None, **kwargs):  # Build the MLIR remainder op text for operand1 % operand2
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type  # vector type only when more than one element
+        if ret_type[0] == "f":  # type names starting with "f" are treated as floating point
+            raise NotImplementedError("Not support remainder operation for floating point")
+        else:
+            opcode = f'arith.remui'  # NOTE(review): unsigned remainder is used for every non-float type - confirm signed inputs never reach here
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]  # (op text, [tile_size, element type])
+
     @staticmethod
     def minimum(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
@@ -747,8 +757,8 @@ def __init__(self, kernel_group):
         self.welford_reduce_out = None
         self.reduce_iterator = {}
         self.is_template_kernel = False
-        self.index_set = set()
         self.spad_buffer_dict = dict()
+        self.base_vector_initialized = False
 
     # padding type 0: zero-padding 1: negative-padding(-inf) ...
     def get_padding_type(self):
@@ -893,7 +903,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             # Generate vector store instruction
             store_size, operand_type = self.var_info[value]
             if mlir_dtype != operand_type:
-                value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
+                value = ops.custom_cast(value, mlir_dtype, var_info=self.var_info)
 
             if compute_vec_size > 1 and store_size > 1:
                 operation = "affine.vector_store"
@@ -1067,62 +1077,119 @@ def store_reduction(self, name, index, value):
     def indirect_indexing(self, index_var, size, check=True):
         return str(index_var)
 
-    def _index_expr(self, tile_size, buffer, renamed_expression, index):
-        str_tile_size = [str(dim) for dim in tile_size]
-        shape = "x".join(str_tile_size)
+    def _index_expr(self, tile_size, renamed_expression, index, base_vector_index):
+        tile_desc = self.kernel_group.tile_desc
+        compute_vec_size = tile_desc.get_compute_vec_size()
 
-        dim = ["%d"+str(i) for i in range(len(tile_size))]
-        sym_dim = ["d"+str(i) for i in range(len(tile_size))]
-        start_dim = [str(0) for i in tile_size]
-        end_dim = [str(i) for i in tile_size]
-        indices = [str(i) for i in index.free_symbols]
+        strides = [1] * len(tile_size)
+        for i in range(len(tile_size) - 2, -1, -1):
+            strides[i] = strides[i + 1] * tile_size[i + 1]
+
+        # Create vector index
+        compute_vec = self.cse.generate(self.compute, f"vector.broadcast %{self.compute_idx} : index to vector<{compute_vec_size}xindex>")
+        self.register_var_info(compute_vec, [compute_vec_size, "index"])
+        vector_index = ops.add(base_vector_index, compute_vec)
+
+        # Create tile_dim index
+        dim_list = []
+        for idx in range(len(tile_size)):
+            div_coeff = self.get_const_cse(strides[idx], "index")
+            mod_coeff = self.get_const_cse(tile_size[idx], "index")
+            div_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{div_coeff} : index to vector<{compute_vec_size}xindex>")
+            mod_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{mod_coeff} : index to vector<{compute_vec_size}xindex>")
+            self.register_var_info(div_vec, [compute_vec_size, "index"])
+            self.register_var_info(mod_vec, [compute_vec_size, "index"])
+            dim = ops.modular(ops.div(vector_index, div_vec), mod_vec)
+            if idx == tile_desc.vlane_split_axis: # Need to add vector lane offset
+                offset = tile_desc.vlane_stride * strides[idx]
+                vlane_coeff = self.get_const_cse(0, "i64")
+                vlane_vec_size = 4
+                vlane_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_coeff} : i64 to vector<{vlane_vec_size}xi64>")
+                vlane_offset = self.const_cse.generate(self.const_buffer, f"arith.addi %{vlane_vec}, %{vlane_vec} {{ vlane_offset={offset} }} : vector<{vlane_vec_size}xi64> // vlane offset")
+                self.register_var_info(vlane_offset, [vlane_vec_size, "i64"])
+                vlane_offset = ops.index_cast(vlane_offset, "index")
+                self.register_var_info(vlane_offset, [vlane_vec_size, "index"])
+                dim = ops.add(dim, vlane_offset)
+            dim_list.append(dim)
 
-        affine_map_str = "(" + ", ".join(sym_dim) + ") -> ("
-        affine_map_str += sympy.printing.ccode(renamed_expression) + ")"
-        affine_offset_map = "(d0, d1) -> (d0 + d1)"
-        offset_vars = dim.copy()
-        parallel_map = f"affine.parallel ({','.join(dim)}) = ({','.join(start_dim)}) to ({','.join(end_dim)}) {{"
-        self.indexed_buffer.writeline(parallel_map)
-        with self.indexed_buffer.indent():
-            for idx in indices:
-                i = int(idx[5:])
-                self.indexed_buffer.writeline(f"%offset{i} = affine.apply affine_map<{affine_offset_map}>(%{idx}, {dim[i]})")
-                offset_vars[i] = f"%offset{i}"
-            apply_map = f"affine.apply affine_map<{affine_map_str}>({', '.join(offset_vars)}) {{global_idx=1}}"
-            apply_map_var = self.indexed_cse.generate(self.indexed_buffer, apply_map)
-            broadcast = f"vector.broadcast %{apply_map_var} : index to vector<2xindex>"
-            broadcast_var = self.indexed_cse.generate(self.indexed_buffer, broadcast)
-            cast_i64 = f"arith.index_cast %{broadcast_var} : vector<2xindex> to vector<2xi64>"
-            cast_i64_var = self.indexed_cse.generate(self.indexed_buffer, cast_i64)
-            affine_store = f"affine.vector_store %{cast_i64_var}, %{buffer}[{','.join(dim)}] : memref<{shape}xi64, 1>, vector<2xi64>"
-            self.cse.generate(self.indexed_buffer, affine_store, assignment=False)
-        self.indexed_buffer.writeline("}")
-        return buffer
+        indices = [str(i) for i in index.free_symbols]
+        for idx in indices:
+            i = int(idx[5:])
+            index_vec = self.cse.generate(self.compute, f"vector.broadcast %{idx} : index to vector<{compute_vec_size}xindex>")
+            self.register_var_info(index_vec, [compute_vec_size, "index"])
+            offset = ops.add(index_vec, dim_list[i])
+            dim_list[i] = offset
+        arg_lists = []
+        for arg in renamed_expression.args:
+            if isinstance(arg, sympy.Integer):
+                offset = self.get_const_cse(int(arg))
+                offset_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{offset} : index to vector<{compute_vec_size}xindex>")
+                self.register_var_info(offset_vec, [compute_vec_size, "index"])
+                arg_lists.append(offset_vec)
+            elif isinstance(arg, sympy.Mul):
+                if isinstance(arg.args[0], sympy.Integer) and isinstance(arg.args[1], sympy.Symbol):
+                    coeff = self.get_const_cse(int(arg.args[0]))
+                    coeff_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{coeff} : index to vector<{compute_vec_size}xindex>")
+                    self.register_var_info(coeff_vec, [compute_vec_size, "index"])
+                    result = ops.mul(dim_list[int(str(arg.args[1])[1:])], coeff_vec)
+                    arg_lists.append(result)
+                elif isinstance(arg.args[1], sympy.Integer) and isinstance(arg.args[0], sympy.Symbol):
+                    coeff = self.get_const_cse(int(arg.args[1]))
+                    coeff_vec = self.cse.generate(self.compute, f"vector.broadcast %{coeff} : index to vector<{compute_vec_size}xindex>")
+                    self.register_var_info(coeff_vec, [compute_vec_size, "index"])
+                    result = ops.mul(dim_list[int(str(arg.args[0])[1:])], coeff_vec)
+                    arg_lists.append(result)
+                else:
+                    raise NotImplementedError("Not supporting format")
+            elif isinstance(arg, sympy.Symbol):
+                arg_lists.append(dim_list[int(str(arg)[1:])])
+            else:
+                raise NotImplementedError("Not supporting format")
+        accum = arg_lists[0]
+        for arg in arg_lists[1:]:
+            accum = ops.add(accum, arg)
+        return accum
 
     def index_expr(self, index, dtype):
         tile_desc = self.kernel_group.tile_desc
         tile_size = tile_desc.get_tile_size_per_lane()
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        tile_numel_per_lane = tile_desc.get_numel_per_lane()
         str_tile_size = [str(dim) for dim in tile_size]
-        tile_shape = f"memref<{'x'.join(str_tile_size)}xi64, 1>"
-        vshape = tile_desc.get_mlir_vshape(mlir_dtype)
         compute_vec_size = tile_desc.get_compute_vec_size()
-
-        # Define scratch pad buffer
-        sram_var, _, _ = self.get_scratchpad_buffer(dtype, "index_buffer", tile_numel_per_lane, tile_shape, None, index)
+        tile_shape = f"memref<{compute_vec_size}xi64, 1>"
+        vshape = f"vector<{compute_vec_size}xi64>"
+
+        # Create base_vector index var
+        c_type = "uint64_t"
+        new_name = f"index_expr_{compute_vec_size}"
+        if new_name not in self.global_vars_dict:
+            self.header.writeline(f"{c_type} {new_name}_spad[{compute_vec_size}] __attribute__ ((section(\".spad\")));")
+            self.gem5_header.writeline(f"{c_type} {new_name}_spad[{compute_vec_size}] __attribute__((aligned(64)));")
+            self.global_vars.writeline(f"memref.global @{new_name}_spad : {tile_shape}")
+            self.global_vars_dict[new_name] = []
+        sram_var = self.spad_cse.generate(self.spad_buffer, f"memref.get_global @{new_name}_spad : {tile_shape}")
+        # Initialize base vector
+        if not self.base_vector_initialized:
+            init_iter = "iter"
+            parallel_map = f"affine.parallel (%{init_iter}) = ({0}) to ({compute_vec_size}) {{ // Base vector initializer"
+            self.spad_buffer.writeline(parallel_map)
+            with self.spad_buffer.indent():
+                self.spad_buffer.writeline(f"%init_vec = vector.broadcast %{init_iter} : index to vector<2xindex>")
+                self.spad_buffer.writeline(f"%init_cvt_vec = arith.index_cast %init_vec : vector<2xindex> to vector<2xi64>")
+                self.spad_buffer.writeline(f"affine.vector_store %init_cvt_vec, %{sram_var}[%{init_iter}] : {tile_shape}, vector<2xi64>")
+            self.spad_buffer.writeline("}")
+            self.base_vector_initialized = True
+
+        line = f"affine.vector_load %{sram_var}[0] : {tile_shape}, {vshape}"
+        out = self.cse.generate(self.compute, line)
+        self.register_var_info(out, [compute_vec_size, "i64"])
+        base_vector_index = ops.index_cast(out, "index")
+        self.register_var_info(base_vector_index, [compute_vec_size, "index"])
 
         renamed_symbols = {symbol: "d"+str(symbol)[5:] for symbol in index.free_symbols}
         renamed_expression = index.subs(renamed_symbols)
-        if index not in self.index_set:
-            # Register this operand
-            self.index_set.add(index)
-            ops._index_expr(tile_size, sram_var, renamed_expression, index)
-
-        line = f"affine.vector_load %{sram_var}[0, 0, %{self.compute_idx}] : {tile_shape}, {vshape} // {renamed_expression}"
-        out = self.cse.generate(self.compute, line)
-        self.register_var_info(out, [compute_vec_size, mlir_dtype])
-        return out
+        result = self._index_expr(tile_size, renamed_expression, index, base_vector_index)
+        return result
 
     def codegen_global_init(self):
         return self.global_vars
diff --git a/tests/test_cnn.py b/tests/test_cnn.py
index 978243d8..aaad2836 100644
--- a/tests/test_cnn.py
+++ b/tests/test_cnn.py
@@ -28,9 +28,9 @@ def __init__(self):
 
     def forward(self, x):
         x = self.conv1(x)
+        x = self.maxpool(x)
         x = self.norm(x)
         x = self.conv2(x)
-        # x = self.maxpool(x)
         x = torch.nn.functional.relu(x)
         return x
 
diff --git a/tests/test_pool.py b/tests/test_pool.py
index e8d99a57..e94df65b 100644
--- a/tests/test_pool.py
+++ b/tests/test_pool.py
@@ -30,11 +30,11 @@ def test_maxpool(device, b=1, c=64, h=112, w=112):
     out = model(x2)
     test_result("Maxpool Forward", res, out) # TODO: MaxPool Functionality is not working
 
-def test_avgpool(device):
+def test_avgpool(device, b=1, c=64, h=112, w=112):
     def avgpool(a):
         return torch.nn.AdaptiveAvgPool2d((1, 1))(a)
     torch.manual_seed(0)
-    input = torch.randn(1, 16, 64, 64).to(device=device) #FIXME: channel 8 does not work (range padding issue)
+    input = torch.randn(b, c, h, w).to(device=device) #FIXME: channel 8 does not work (range padding issue)
     x1 = input.to(device=device)
     x2 = input.to("cpu")
     opt_fn = torch.compile(dynamic=False)(avgpool)
@@ -51,4 +51,5 @@ def avgpool(a):
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
     test_maxpool(device, b=1, c=8, h=16, w=16)
+    test_maxpool(device, b=1, c=8, h=112, w=112)
     test_avgpool(device)

From a67eef0f0e47f05b65c296fd500b5ed16479b6fd Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 18 Apr 2025 10:01:25 +0000
Subject: [PATCH 289/432] [Frontend] Fix reduction initial value

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 759da7d4..5ff2efee 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -27,9 +27,19 @@ def reduction_init(reduction_type, dtype):
     if reduction_type == "prod":
         return float(1) if dtype.is_floating_point else int(1)
     if reduction_type in {"max", "argmax"}:
-        return "0.0"
+        if dtype == torch.float32:
+            return f"0x{mlir_common.MLIR_INF['-inf']['f32']:x}"
+        elif dtype == torch.float64:
+            return f"0x{mlir_common.MLIR_INF['-inf']['f64']:x}"
+        else:
+            return "0.0"
     if reduction_type in {"min", "argmin"}:
-        return "0.0"
+        if dtype == torch.float32:
+            return f"0x{mlir_common.MLIR_INF['inf']['f32']:x}"
+        elif dtype == torch.float64:
+            return f"0x{mlir_common.MLIR_INF['inf']['f64']:x}"
+        else:
+            return "0.0"
     if reduction_type in {"welford_reduce"}:
         return f"0.0"
     raise AssertionError(reduction_type)
@@ -965,7 +975,6 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
             init_vec = self.cse.generate(self.reduction_prefix, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
         acc_var = init_vec
 
-
         # Reduction body prepare
         body_acc = self.reduction_cse.generate(
             self.compute, f"reduction {reduction_key}body_acc", write=False

From 9e80488a42f57b1550dfa8a9d72597c2e483c5e8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Apr 2025 05:15:59 +0000
Subject: [PATCH 290/432] [Frontend] Add spad usage tracking logic

---
 PyTorchSimFrontend/extension_codecache.py     | 27 +++++++++-
 .../llvm/llvm_caller_codegen.py               | 49 +++++++++++++++++++
 .../mlir/mlir_codegen_backend.py              |  9 ++--
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  5 +-
 4 files changed, 82 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index afe569e6..1b85ad28 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -31,6 +31,21 @@ def dump_metadata(args, arg_attributes, path):
             file.write(f'{arg_name}=({arg_attribute[0]}, {arg.dtype}, {arg.shape})\n')
     return
 
+def parse_stack_sizes(file_path):
+    meta_path = file_path.split(".")[0]+".meta"
+    cmd = ["riscv64-unknown-elf-objcopy", "--dump-section", f".stack_sizes={meta_path}", file_path, "/dev/null"]
+    subprocess.run(cmd, check=True)
+
+    with open(meta_path, 'rb') as f:
+        stack_sizes_data = list(f.read())
+    if len(stack_sizes_data) <= 17:
+        raise ValueError("Invalid .stack_sizes section size")
+
+    stack_size_bytes = stack_sizes_data[8:-9]
+    stack_size = int.from_bytes(stack_size_bytes, byteorder='little')
+    return stack_size
+
+
 def llvm_compile_command(input, output):
     opt_output = f"{input[:-3]}_opt.ll"
     return [re.sub(r"[ \n]+", " ",
@@ -77,7 +92,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
             re.sub(r"[ \n]+", " ",
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
-                -relocation-model=pic -march=riscv64 -O3 \
+                -relocation-model=pic -march=riscv64 -O3 --stack-size-section \
                 -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
                 {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \
                 -O2 {filename}.ll -o {filename}.s
@@ -118,7 +133,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             re.sub(r"[ \n]+", " ",
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
-                -relocation-model=pic -march=riscv64 -O3 \
+                -relocation-model=pic -march=riscv64 -O3 --stack-size-section \
                 -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
                 {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \
                 -O2 {sample_filename}.ll -o {sample_filename}.s
@@ -178,6 +193,14 @@ def load(cls, source_code,
                 val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name)
                 val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
                                                    validation_binary_name, new_link_option)
+                target = os.path.join(write_path, validation_binary_name)
+                stack_size = val_llvm_caller.parse_stack_sizes(target)
+                spad_size =  val_llvm_caller.get_spad_size(target)
+                spad_usage = stack_size + spad_size # Spad usage per lane
+                if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage:
+                    print(f"[Warning] Scratchpad size exceeded: required {spad_usage} bytes, "
+                        f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available.")
+
         # Launch tile graph generator
         gem5_sample_cmd = shlex.split(gem5_cmds[0])
         gem5_translate_cmd = shlex.split(gem5_cmds[1])
diff --git a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
index 06c20a45..273685d5 100644
--- a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
+++ b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
@@ -1,6 +1,7 @@
 import os
 import subprocess
 import shlex
+import re
 
 from torch._inductor.utils import IndentedBuffer
 from torch._inductor.codegen import cpp
@@ -174,3 +175,51 @@ def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, l
             print("Command failed with exit code", e.returncode)
             print("Error output:", e.output)
             assert(0)
+
+    def parse_stack_sizes(self, file_path):
+        meta_path = file_path.split(".")[0]+".meta"
+        cmd = ["riscv64-unknown-elf-objcopy", "--dump-section", f".stack_sizes={meta_path}", file_path, "/dev/null"]
+        subprocess.run(cmd, check=True)
+
+        with open(meta_path, 'rb') as f:
+            stack_sizes_data = bytearray(list(f.read()))
+        # Wrapper kernel serach
+        cmd = ["riscv64-unknown-elf-readelf", "-s", file_path]
+        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"Readelf error: {result.stderr}")
+        output = result.stdout
+        for line in output.splitlines():
+            if 'wrapper_kernel' in line:
+                sym_addr = int(line.split()[1], 16)
+                byte_array = sym_addr.to_bytes(8, byteorder='little')
+                break
+        wrapper_pos = stack_sizes_data.find(byte_array)
+
+        if len(stack_sizes_data) <= 17:
+            raise ValueError("Invalid .stack_sizes section size")
+        stack_size_bytes = stack_sizes_data[8:wrapper_pos]
+        stack_size = int.from_bytes(stack_size_bytes, byteorder='little')
+        return stack_size
+
+    def get_spad_size(self, binary_path):
+        cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path]
+        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"Readelf error: {result.stderr}")
+
+        output = result.stdout
+        spad_start = None
+        spad_end = None
+        for line in output.splitlines():
+            if '.spad' in line and 'SECTION' in line:
+                parts = line.split()
+                spad_start = int(parts[1], 16)
+            elif 'spad_end' in line:
+                parts = line.split()
+                spad_end = int(parts[1], 16)
+
+        if spad_start is None or spad_end is None:
+            raise ValueError("Could not find .spad addresses")
+        spad_size = spad_end - spad_start
+        return spad_size
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 5ff2efee..3c299bac 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -962,7 +962,7 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
             self.loads, f"reduction {reduction_key}", write=False
         )
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
-        init = self.cse.generate(self.reduction_prefix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
+        init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
         vec_len = self.kernel_group.tile_desc.get_compute_vec_size()
         reduced_shape = self.kernel_group.tile_desc.get_mlir_vshape(type_name)
 
@@ -972,7 +972,7 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
             init_vec = init
         else:
             # Adjust shape and inital value
-            init_vec = self.cse.generate(self.reduction_prefix, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
+            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
         acc_var = init_vec
 
         # Reduction body prepare
@@ -1263,8 +1263,9 @@ def codegen_nodes(self, nodes, kernel_name):
         spike_write_path = os.path.join(write_path, "global_var.h")
         gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
         if not os.path.exists(spike_write_path):
-            spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
-            write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol)
+            spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
+            spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
+            write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
         if not os.path.exists(gem5_write_path):
             write_atomic(gem5_write_path, self.gem5_header.getvalue())
         return src_code
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 88313eff..174c322c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -180,8 +180,9 @@ def codegen_template(self, template_node, epilogue_nodes):
             src_code = self.codegen_template_code(kernel, render, template_node, epilogue_nodes)
 
         with V.set_kernel_handler(kernel):
-            spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\"), aligned({kernel.spad_info['spad_size']*kernel.vector_lane})));"
-            codegen_header(src_code, (kernel.header.getvalue()+spad_end_symbol, kernel.gem5_header.getvalue()))
+            spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));"
+            spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({kernel.spad_info['spad_size']*kernel.vector_lane})));"
+            codegen_header(src_code, (kernel.header.getvalue()+spad_end_symbol+spad_section_end_symbol, kernel.gem5_header.getvalue()))
             kernel.meta_kernel()
             kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
                                              kernel.loop_size, origins={str(i) for i in template_node.node.origins})

From e4aa268a363a66550a05abcb3884fd445f0e32a1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Apr 2025 07:15:00 +0000
Subject: [PATCH 291/432] [Frontend] Fix indexed operation for mistral case

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 3c299bac..a1b644e2 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -999,7 +999,7 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-1]
         assert(vec_len % reduction_size==0)
         if vec_len > reduction_size:
-            init = self.cse.generate(self.reductions_suffix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
+            init = self.const_cse.generate(self.reductions_suffix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
             if reduction_size == 1:
                 final_reduced_shape = f"{type_name}"
                 out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, acc, init, axis=0, shape=reduced_shape, reduced_shape=final_reduced_shape))
@@ -1154,6 +1154,8 @@ def _index_expr(self, tile_size, renamed_expression, index, base_vector_index):
                 arg_lists.append(dim_list[int(str(arg)[1:])])
             else:
                 raise NotImplementedError("Not supporting format")
+        if isinstance(renamed_expression, sympy.Symbol):
+            arg_lists.append(dim_list[int(str(renamed_expression)[1:])])
         accum = arg_lists[0]
         for arg in arg_lists[1:]:
             accum = ops.add(accum, arg)

From 3a73492948714e5aef56601f26fb3a36be0ccd33 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Apr 2025 08:32:40 +0000
Subject: [PATCH 292/432] [CI] Fix cleanup job

---
 .github/workflows/docker-image.yml        | 2 +-
 .github/workflows/pull-request.yml        | 2 +-
 .github/workflows/pull-request_mobile.yml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 8c35edc7..e5bab560 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -608,4 +608,4 @@ jobs:
         run: |
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump
\ No newline at end of file
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump
\ No newline at end of file
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index a1078031..3dbb3e36 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -629,4 +629,4 @@ jobs:
         run: |
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump
\ No newline at end of file
diff --git a/.github/workflows/pull-request_mobile.yml b/.github/workflows/pull-request_mobile.yml
index d3698133..945bac3b 100644
--- a/.github/workflows/pull-request_mobile.yml
+++ b/.github/workflows/pull-request_mobile.yml
@@ -629,4 +629,4 @@ jobs:
         run: |
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump

From 7efbbba6d48209c9f19a6ac9f664603fbe992827 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Apr 2025 14:02:08 +0000
Subject: [PATCH 293/432] [Frontend] Fix stack usage calculation

---
 PyTorchSimFrontend/extension_codecache.py     |  6 +-
 PyTorchSimFrontend/extension_config.py        |  2 +-
 .../llvm/llvm_caller_codegen.py               | 61 +++++++++++--------
 .../mlir/mlir_codegen_backend.py              |  2 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  3 +-
 5 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 1b85ad28..772bb75f 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -156,6 +156,8 @@ def load(cls, source_code,
              cycle_binary_name="cycle_bin",
              arg_attributes=[], vectorlane_size=16,
              spad_info=None, origins=None, **kwargs):
+        vlen = kwargs['vlen']
+        vlenb = vlen // 8
         write_path = get_write_path(source_code)
         key, input_path = write(source_code, "mlir", specified_dir=write_path)
         new_input_path = os.path.splitext(input_path)[0]
@@ -175,7 +177,7 @@ def load(cls, source_code,
         if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE:
             # Use custom malloc to avoid size error
             new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
-            cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=256)
+            cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
             opt_cmd = shlex.split(cmds[0])
             translate_cmd = shlex.split(cmds[1])
             llc_cmd = shlex.split(cmds[2])
@@ -194,7 +196,7 @@ def load(cls, source_code,
                 val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
                                                    validation_binary_name, new_link_option)
                 target = os.path.join(write_path, validation_binary_name)
-                stack_size = val_llvm_caller.parse_stack_sizes(target)
+                stack_size = val_llvm_caller.parse_stack_sizes(f"{write_path}/{key}.s", vlenb=vlenb)
                 spad_size =  val_llvm_caller.get_spad_size(target)
                 spad_usage = stack_size + spad_size # Spad usage per lane
                 if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage:
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index e053f8d5..09fad275 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -11,7 +11,7 @@
 }
 CONFIG_PRECISION = 4 # 32bit
 CONFIG_NUM_CORES = 1
-CONFIG_VLEN = 32 // CONFIG_PRECISION # 256bits / 32bits = 8 [elements]
+CONFIG_VLEN = 256 # 256bits / 32bits = 8 [elements]
 
 # Tile size config
 CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
diff --git a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
index 273685d5..835d9b80 100644
--- a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
+++ b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
@@ -176,31 +176,42 @@ def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, l
             print("Error output:", e.output)
             assert(0)
 
-    def parse_stack_sizes(self, file_path):
-        meta_path = file_path.split(".")[0]+".meta"
-        cmd = ["riscv64-unknown-elf-objcopy", "--dump-section", f".stack_sizes={meta_path}", file_path, "/dev/null"]
-        subprocess.run(cmd, check=True)
-
-        with open(meta_path, 'rb') as f:
-            stack_sizes_data = bytearray(list(f.read()))
-        # Wrapper kernel serach
-        cmd = ["riscv64-unknown-elf-readelf", "-s", file_path]
-        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-        if result.returncode != 0:
-            raise RuntimeError(f"Readelf error: {result.stderr}")
-        output = result.stdout
-        for line in output.splitlines():
-            if 'wrapper_kernel' in line:
-                sym_addr = int(line.split()[1], 16)
-                byte_array = sym_addr.to_bytes(8, byteorder='little')
-                break
-        wrapper_pos = stack_sizes_data.find(byte_array)
-
-        if len(stack_sizes_data) <= 17:
-            raise ValueError("Invalid .stack_sizes section size")
-        stack_size_bytes = stack_sizes_data[8:wrapper_pos]
-        stack_size = int.from_bytes(stack_size_bytes, byteorder='little')
-        return stack_size
+    def parse_stack_sizes(self, file_path, vlenb=256):
+        with open(file_path, 'r') as f:
+            stack_sizes_data = f.readlines()
+
+        in_proc = False
+        stack_base = None
+        dynamic_expr = None
+        max_offset = 0
+
+        for line in stack_sizes_data:
+            line = line.strip()
+            if line.startswith(".cfi_startproc"):
+                in_proc = True
+                continue
+            elif line.startswith(".cfi_endproc") and in_proc:
+                if dynamic_expr:
+                    total_stack = eval(dynamic_expr, {"vlenb": vlenb})
+                    return total_stack
+                elif stack_base:
+                    return stack_base
+                else:
+                    return max_offset
+
+            # Skip outer function
+            if not in_proc:
+                continue
+
+            if line.startswith(".cfi_def_cfa_offset"):
+                stack_base = int(line.split()[-1])
+
+            if ".cfi_escape" in line and "#" in line:
+                comment = line.split("#")[-1].strip()
+                m = re.search(r"sp \+ (\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment)
+                if m:
+                    base, scale = int(m.group(1)), int(m.group(2))
+                    dynamic_expr = f"{base} + {scale} * vlenb"
 
     def get_spad_size(self, binary_path):
         cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path]
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index a1b644e2..9aab47bc 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1432,7 +1432,7 @@ def adjust_tile_size(self):
         # Case 1. vector kernel
         if len(self.itervars) == 1:
             tile_size = self.tile_desc.get_tile_size() if self.tile_desc.get_tile_size() < self.ranges[0] else self.ranges[0]
-            min_tile_size_unit = self.vector_lane * self.vlen # TODO: VCIX widening is not implemented
+            min_tile_size_unit = self.vector_lane * self.vlen // (8 * self.precision) # TODO: VCIX widening is not implemented
             self.tile_desc.n_col = math.ceil(tile_size / min_tile_size_unit) * min_tile_size_unit # padding
             self.tile_desc.n_row = 1
         elif len(self.itervars) == 0:
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 174c322c..2911c51d 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -135,7 +135,8 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size
             codecache_def.writeline(f"loop_size={loop_size},")
             codecache_def.writeline(f"spad_info={spad_info},")
             codecache_def.writeline(f"origins={origins},")
-            codecache_def.writeline("arg_attributes=arg_attributes)")
+            codecache_def.writeline("arg_attributes=arg_attributes,")
+            codecache_def.writeline(f"vlen={extension_config.CONFIG_VLEN})")
             wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False)
         return kernel_name
 

From 9ae513014608fc50199b655db44e3177044f3900 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Apr 2025 15:39:04 +0000
Subject: [PATCH 294/432] [Frontend] Introduce autotuning infrastructure

---
 PyTorchSimFrontend/extension_codecache.py     |  7 ++-
 PyTorchSimFrontend/mlir/mlir_autotune.py      | 53 ++++++++-----------
 .../mlir/mlir_codegen_backend.py              |  6 +++
 PyTorchSimFrontend/mlir/mlir_common.py        | 27 ++++++++++
 4 files changed, 60 insertions(+), 33 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 772bb75f..d1588ab5 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -1,5 +1,3 @@
-import getpass
-import tempfile
 import os
 import re
 import shlex
@@ -140,6 +138,10 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
         """,
     ).strip()]
 
+class SpadOverflowError(Exception):
+    def __init__(self, message="SPAD overflow occurred."):
+        super().__init__(message)
+
 class MLIRCodeCache:
     cache = dict()
     clear = staticmethod(cache.clear)   # Todo: Cache
@@ -202,6 +204,7 @@ def load(cls, source_code,
                 if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage:
                     print(f"[Warning] Scratchpad size exceeded: required {spad_usage} bytes, "
                         f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available.")
+                    raise SpadOverflowError()
 
         # Launch tile graph generator
         gem5_sample_cmd = shlex.split(gem5_cmds[0])
diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
index cea9834b..804cd2e8 100644
--- a/PyTorchSimFrontend/mlir/mlir_autotune.py
+++ b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -1,8 +1,8 @@
 import functools
 import torch
+import dataclasses
 from torch._inductor.autotune_process import BenchmarkRequest
 from torch._inductor.autotune_process import TensorMeta
-from torch._inductor.codecache import CUDACodeCache
 
 from typing import (
     Any,
@@ -15,8 +15,8 @@
     TYPE_CHECKING,
     Union,
 )
-
-class MLIRBenchmarkRequest(BenchmarkRequest):
+@dataclasses.dataclass
+class MLIRBenchmarkRequest():
     def __init__(
         self,
         kernel_name: str,
@@ -25,50 +25,41 @@ def __init__(
         extra_args: Iterable[Any],
         source_code: str,
     ):
-        super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args)
+        self.kernel_name = kernel_name
+        if isinstance(input_tensor_meta, TensorMeta):
+            input_tensor_meta = [input_tensor_meta]
+        self.input_tensor_meta = input_tensor_meta
+
+        if isinstance(output_tensor_meta, TensorMeta):
+            output_tensor_meta = [output_tensor_meta]
+        self.output_tensor_meta = output_tensor_meta
         self.source_code = source_code
         self.workspace_size: int = 0
         self.workspace: Optional[torch.Tensor] = None
         self.hash_key: str = ""
         self.source_file: str = ""
+        self.extra_args = extra_args
         #self.hash_key, self.source_file = CUDACodeCache.write(self.source_code, "so")
 
     def make_run_fn(
-        self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor
+        self, input_tensors: torch.Tensor, output_tensors: torch.Tensor
     ) -> Callable[[], None]:
-        self.DLL, self.hash_key, self.source_file = CUDACodeCache.load(
-            self.source_code, "so"
-        )
+        from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile
+        custom_async_compile = CustomAsyncCompile()
+        run_method = custom_async_compile.mlir(
+            self.source_code, vectorlane_size=self.extra_args["vector_lane"],
+            loop_size=None, spad_info=self.extra_args["spad_info"],
+            vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"],
+            origins="Unknown")
 
         args = [
-            tensor.data_ptr()
-            for tensor in list(input_tensors) + [output_tensor]
+            tensor
+            for tensor in list(input_tensors) + list(output_tensors)
         ]
-
-        print(
-            "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, args=%s, self.extra_args=%s",
-            self.kernel_name,
-            self.source_file,
-            self.hash_key,
-            args,
-            self.extra_args,
-        )
-
-        run_method = getattr(self.DLL, self.kernel_name)
-
-        # Retrieve workspace_size and initialize workspace.
-        run_method(
-            *args,  # input ptrs and output ptrs
-            *self.extra_args,
-        )
-
         # Generate partial function.
         return functools.partial(
             run_method,
             *args,
-            *self.extra_args,
-            None,  # null workspace size ptr
-            None,  # set workspace ptr, TODO: update it to a real ptr if workspace_size > 0
         )
 
     def __str__(self) -> str:
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 9aab47bc..a847fd73 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1270,6 +1270,12 @@ def codegen_nodes(self, nodes, kernel_name):
             write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
         if not os.path.exists(gem5_write_path):
             write_atomic(gem5_write_path, self.gem5_header.getvalue())
+
+        try:
+            bench_runner = self.run_bench(nodes, kernel_name, src_code)
+            bench_runner()
+        except extension_codecache.SpadOverflowError:
+            print("Overflowed...")
         return src_code
 
     def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffer=None): # Need more argument?
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index c64fabac..55600fb0 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -6,6 +6,8 @@
 from functools import reduce
 from operator import mul
 import torch
+from torch._dynamo.testing import rand_strided
+from torch._inductor.autotune_process import TensorMeta
 from torch._inductor.codegen import common
 from torch._inductor.codegen import cpp
 from torch._inductor.virtualized import V
@@ -30,6 +32,7 @@
     unique,
 )
 from PyTorchSimFrontend import extension_config
+from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
 schedule_log = torch._logging.getArtifactLogger(__name__, "schedule")
 
 DTYPE_TO_MLIR = {
@@ -555,6 +558,29 @@ def codegen_nodes(self, nodes, kernel_name):
         self.meta_kernel()
         return src_code
 
+    def run_bench(self, nodes, kernel_name, src_code):
+        _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
+        input_call_args = tuple(self.args.input_buffers.keys())
+        output_call_args = tuple(self.args.output_buffers.keys())
+        full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args])
+        full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args])
+
+        bmreq = MLIRBenchmarkRequest(
+            kernel_name=kernel_name,
+            input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes),
+            output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes),
+            extra_args={
+                "vector_lane" : self.vector_lane,
+                "spad_info": self.spad_info,
+                "vlen" : self.vlen,
+                "arg_attributes" : arg_attributes
+            },
+            source_code=src_code,
+        )
+        dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta]
+        dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta]
+        return bmreq.make_run_fn(dummy_inputs, dummy_outputs)
+
     def codegen_kernel(self, kernel_name):
         arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs()
         arg_defs = ",\n".ljust(25).join(arg_defs)
@@ -580,6 +606,7 @@ def meta_kernel(self):
         wrapper.add_import_once(f'\ncustom_async_compile = CustomAsyncCompile()')
         # Dump loop and load/store information
         wrapper.add_import_once(f"arg_attributes = {arg_attributes}")
+        return arg_attributes
 
     def get_constant_vector(self, expr):
         constant_vector = [[int(expr.coeff(var)),None] for var in self.itervars]

From 821a25925ca9ca5e4f5da5bb7749a487b2826df5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 22 Apr 2025 07:18:43 +0000
Subject: [PATCH 295/432] [Backendsim] Rework block sparse mechanism

---
 PyTorchSimBackend/include/Core.h            |  3 +-
 PyTorchSimBackend/include/Instruction.h     |  4 +++
 PyTorchSimBackend/include/TMA.h             |  9 ++++-
 PyTorchSimBackend/include/TileGraphParser.h | 28 ++++++++++++++++
 PyTorchSimBackend/src/Core.cc               | 37 +++++++++++++--------
 PyTorchSimBackend/src/TileGraphParser.cc    |  7 ++++
 Simulator/simulator.py                      |  1 +
 7 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/PyTorchSimBackend/include/Core.h b/PyTorchSimBackend/include/Core.h
index a9c201ea..6585e222 100644
--- a/PyTorchSimBackend/include/Core.h
+++ b/PyTorchSimBackend/include/Core.h
@@ -66,7 +66,8 @@ class Core {
   cycle_type _stat_tot_tma_idle_cycle = 0;
   cycle_type _stat_tot_vu_compute_idle_cycle = 0;
   std::vector<cycle_type> _stat_tot_sa_compute_idle_cycle;
-  std::vector<uint64_t> _stat_tot_sa_inst;
+  std::vector<uint64_t> _stat_inst_count;
+  std::vector<uint64_t> _stat_tot_skipped_inst;
   uint64_t _stat_gemm_inst = 0;
   uint64_t _stat_skip_dma = 0;
   uint64_t _stat_numa_hit = 0;
diff --git a/PyTorchSimBackend/include/Instruction.h b/PyTorchSimBackend/include/Instruction.h
index 45fe983e..e0e904af 100644
--- a/PyTorchSimBackend/include/Instruction.h
+++ b/PyTorchSimBackend/include/Instruction.h
@@ -78,6 +78,9 @@ class Instruction {
   int get_nr_inner_loop() { return _nr_inner_loop; }
   void set_is_async(bool is_async) { _is_async_dma = is_async; }
   void prepare_tag_key();
+  bool is_sparse_inst() { return _is_sparse_inst; }
+  void set_sparse_state(bool state) { _is_sparse_inst = state; }
+  std::set<std::shared_ptr<Instruction>>& get_child_inst() { return child_inst; }
 
   cycle_type start_cycle;
   cycle_type finish_cycle;
@@ -113,5 +116,6 @@ class Instruction {
   int _nr_inner_loop = 0;
   bool _is_async_dma=false;
   bool _is_indirect_mode=false;
+  bool _is_sparse_inst=false;
   std::string _indirect_index_path="";
 };
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/TMA.h b/PyTorchSimBackend/include/TMA.h
index 964969a8..f8355470 100644
--- a/PyTorchSimBackend/include/TMA.h
+++ b/PyTorchSimBackend/include/TMA.h
@@ -38,6 +38,13 @@ class TMA {
     tag_table[subgraph_id][key] = 1;
   }
 
+  void set_tag_sparse(int subgraph_id, std::vector<int>& key) {
+    if (tag_table.find(subgraph_id) == tag_table.end()) {
+      throw std::runtime_error("Subgraph does not exist in tag_table");
+    }
+    tag_table[subgraph_id][key] = -1;
+  }
+
   void mark_tag_used(int subgraph_id, std::vector<int>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
       throw std::runtime_error("Subgraph does not exist in tag_table");
@@ -70,7 +77,7 @@ class TMA {
     auto key_it = key_map.find(key);
     return key_it != key_map.end();
   }
-  bool get_tag_finish(int subgraph_id, std::vector<int>& key) {
+  uint32_t get_tag_finish(int subgraph_id, std::vector<int>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index 97e808c6..16393325 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -93,7 +93,33 @@ class TileGraphParser {
     fs::path new_path = base_folder / "indirect_access" / (std::string("indirect_index") + std::to_string(indirect_counter) + ".raw");
     return new_path.string();
   }
+  std::string get_sparse_tile_meta_path() {
+    namespace fs = std::filesystem;
+    fs::path original(_attribute_path);
+    fs::path base_folder = original.parent_path().parent_path();
+    fs::path new_path = base_folder / "dma_access" / (std::string("sparse_tile.raw"));
+    return new_path.string();
+  }
+  void load_sparse_meta_data() {
+    /* Prepare runtime attribute */
+    std::string sparse_meta_path = get_sparse_tile_meta_path();
+    std::ifstream file(sparse_meta_path, std::ios::binary);
+    if (file) {
+      file.seekg(0, std::ios::end);
+      std::streamsize size = file.tellg();
+      file.seekg(0, std::ios::beg);
+      size_t count = size / sizeof(int64_t);
+      for (size_t i = 0; i < count; ++i) {
+          int64_t val;
+          file.read(reinterpret_cast<char*>(&val), sizeof(int64_t));
+          sparse_tile_set.insert(val);
+      }
+    }
+  }
   void inc_indirect_counter() { indirect_counter++; }
+  uint64_t get_dma_counter() { return dma_counter; }
+  void inc_dma_counter() { dma_counter++; }
+  bool is_sparse_tile(uint64_t idx) { return sparse_tile_set.find(idx) != sparse_tile_set.end(); }
   int register_addr_name(const std::string& addr_name) {
     if (_addr_name_map.find(addr_name) == _addr_name_map.end())
       _addr_name_map[addr_name] = _addr_name_map.size();
@@ -112,6 +138,8 @@ class TileGraphParser {
   std::string _tog_path;
   std::string _attribute_path;
   uint64_t indirect_counter = 0;
+  uint64_t dma_counter = 0;
+  std::set<uint64_t> sparse_tile_set;
   std::map<std::string, std::shared_ptr<TileNode>> _output_map;
   std::vector<std::vector<std::shared_ptr<TileNode>>> _loop_nodes;
   std::vector<std::shared_ptr<TileNode>> _tile_vec;
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 10d5a647..32fabd63 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -14,8 +14,8 @@ Core::Core(uint32_t id, SimulationConfig config)
   _stat_sa_compute_cycle.resize(_num_systolic_array_per_core);
   _stat_tot_sa_compute_idle_cycle.resize(_num_systolic_array_per_core);
   _stat_sa_compute_idle_cycle.resize(_num_systolic_array_per_core);
-  _stat_tot_sa_inst.resize(_num_systolic_array_per_core);
-  _stat_tot_sa_inst.resize(static_cast<size_t>(Opcode::COUNT), 0);
+  _stat_inst_count.resize(static_cast<size_t>(Opcode::COUNT), 0);
+  _stat_tot_skipped_inst.resize(static_cast<size_t>(Opcode::COUNT), 0);
 }
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
@@ -214,7 +214,14 @@ void Core::cycle() {
           {
             /* Check another MOVIN with same tag is issued */
             auto& key = inst->get_tag_id();
-            if (inst->is_async_dma() && _tma.tag_key_exist(inst->subgraph_id, key)) {
+            if (inst->is_sparse_inst()) {
+              _tma.register_tag(inst->subgraph_id, key);
+              _tma.set_tag_sparse(inst->subgraph_id, key);
+              finish_instruction(inst);
+              issued = true;
+              _stat_tot_skipped_inst.at(static_cast<size_t>(inst->get_opcode()))++;
+              break;
+            } else if (inst->is_async_dma() && _tma.tag_key_exist(inst->subgraph_id, key)) {
               bool finished = _tma.get_tag_finish(inst->subgraph_id, key);
               if (finished)
                 finish_instruction(inst);
@@ -227,7 +234,7 @@ void Core::cycle() {
                             fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
                             fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
               issued = true;
-              _stat_skip_dma++;
+              _stat_tot_skipped_inst.at(static_cast<size_t>(inst->get_opcode()))++;
               break;
             } else {
               spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle,
@@ -260,11 +267,9 @@ void Core::cycle() {
               inst->bubble_cycle = bubble_cycle;
             }
             if (inst->get_compute_cycle() == 0) {
-              spdlog::trace("[Core {}][SA {}][{}] {} SKIPPED", _id, _systolic_array_rr, _core_cycle,
-                            opcode_to_string(inst->get_opcode()));
               inst->finish_instruction();
               static_cast<Tile*>(inst->get_owner())->inc_finished_inst();
-              _stat_tot_sa_inst.at(static_cast<size_t>(inst->get_opcode()))++;
+              _stat_tot_skipped_inst.at(static_cast<size_t>(inst->get_opcode()))++;
               auto it = instructions.begin() + j; // Position 2 is the third element
               instructions.erase(it);
             } else {
@@ -281,8 +286,15 @@ void Core::cycle() {
         case Opcode::BAR:
           {
             auto& key = inst->get_tag_id();
-            bool finished = _tma.get_tag_finish(inst->subgraph_id, key);
-            if (finished) {
+            uint32_t finished = _tma.get_tag_finish(inst->subgraph_id, key);
+            if (finished == -1) {
+              for (auto child_inst : inst->get_child_inst()) {
+                if (child_inst->get_opcode() == Opcode::COMP && child_inst->get_compute_type() == MATMUL) {
+                  child_inst->set_compute_cycle(0);
+                }
+              }
+              finish_instruction(inst);
+            } else if (finished != 0) {
               _tma.mark_tag_used(inst->subgraph_id, key);
               finish_instruction(inst);
             } else {
@@ -302,7 +314,7 @@ void Core::cycle() {
       }
 
       if (issued) {
-        _stat_tot_sa_inst.at(static_cast<size_t>(inst->get_opcode()))++;
+        _stat_inst_count.at(static_cast<size_t>(inst->get_opcode()))++;
         auto it = instructions.begin() + j; // Position 2 is the third element
         instructions.erase(it);
         break;
@@ -406,11 +418,10 @@ void Core::print_stats() {
   spdlog::info("===== Instructions count =====");
   for (int i=0; i < static_cast<size_t>(Opcode::COUNT); i++) {
     if (i == static_cast<size_t>(Opcode::COMP))
-      spdlog::info("Core [{}] : {} inst count {} (GEMM: {}, Vector: {})", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_tot_sa_inst.at(i), _stat_gemm_inst, _stat_tot_sa_inst.at(i) - _stat_gemm_inst);
+      spdlog::info("Core [{}] : {} inst count {} (GEMM: {}, Vector: {}), skipped inst count {}", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_inst_count.at(i), _stat_gemm_inst, _stat_inst_count.at(i) - _stat_gemm_inst, _stat_tot_skipped_inst.at(i));
     else
-      spdlog::info("Core [{}] : {} inst count {}", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_tot_sa_inst.at(i));
+      spdlog::info("Core [{}] : {} inst count {}, skipped inst count {}", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_inst_count.at(i), _stat_tot_skipped_inst.at(i));
   }
-  spdlog::trace("Core [{}] : SKipped MOVIN inst count {}", _id, _stat_skip_dma);
   spdlog::info("========= Core stat =========");
   for (int i=0; i<_num_systolic_array_per_core; i++)
     sa_utilization.push_back(static_cast<float>(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle);
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index b1bff65f..27ad4ea5 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -428,6 +428,12 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       if (mem_node->is_indirect()) {
         inst->set_indirect_index_path(tog_parser->get_indirect_path());
         tog_parser->inc_indirect_counter();
+      } else {
+        bool is_sparse_tile = tog_parser->is_sparse_tile(tog_parser->get_dma_counter());
+        tog_parser->inc_dma_counter();
+        if (is_sparse_tile) {
+          inst->set_sparse_state(is_sparse_tile);
+        }
       }
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);
@@ -760,6 +766,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
       spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", "));
     }
   }
+  load_sparse_meta_data();
 
   /* ONNX file parsing */
   _tog_path = onnx_path;
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index de92663e..b3349318 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -96,6 +96,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
         kernel_address = f"--kernel-addr={kernel_start_addr}:{kernel_end_addr}"
         base_path= f"--base-path={runtime_path}"
         os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True)
+        os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True)
         run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
 
         print("[SpikeSimulator] cmd> ", run)

From cd82ea52a85b64e81173ac0e3caa7afb8cbbeaa5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 22 Apr 2025 07:26:02 +0000
Subject: [PATCH 296/432] [Script] Update sparsity script

---
 PyTorchSimBackend/src/TileGraphParser.cc | 12 -----
 Simulator/simulator.py                   | 56 ------------------------
 scripts/sparsity_experiment/run.sh       |  6 ++-
 3 files changed, 4 insertions(+), 70 deletions(-)

diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 27ad4ea5..4da85362 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -556,18 +556,6 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       inst->set_overlapping_cycle(compute_node->get_overlapping_cycle());
       inst->set_compute_type(compute_node->get_compute_type());
 
-      // FIXME: double free error
-      /* Check should we have to skip */
-      // auto output_idx_list = calc_output_idx(tog_parser, iter); // (M,N,K) order
-      // if (compute_node->get_compute_type() == 1 && output_idx_list.size() == 3) { // FIXME. hardcoded type
-      //   bool skip = find_output_idx(tog_parser, output_idx_list);
-      //   if (skip) {
-      //     inst->set_compute_cycle(0);
-      //     inst->set_overlapping_cycle(0);
-      //     spdlog::trace("[TOGParser/Sparse] Skip output tile index: {}", fmt::join(output_idx_list, ","));
-      //   }
-      // }
-
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);
     } else if (tile_node->get_type() == TileType::LOOP_INDEX_NODE) {
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index b3349318..48f16cbd 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -320,62 +320,6 @@ def create_attribute_file(self, attribute_path, inputs, **kwargs):
             address_info[f"arg{idx}"] = tensor.data_ptr()
         json_content["address_info"] = address_info
 
-        if extension_config.CONFIG_BLOCK_SPARSE and "loop_size" in kwargs and len(kwargs['loop_size'])==3 and kwargs['loop_size'][0] != 1:
-            # GEMM
-            import copy
-            zero_skip = {}
-            input, weight = inputs[:2]
-            M, N, K = kwargs['loop_size']
-
-            padded_input = copy.deepcopy(input.cpu())
-            padded_weight = copy.deepcopy(weight.cpu())
-
-            original_input_shape = input.shape
-            original_weight_shape = weight.shape
-
-            # Initialize padding for all dimensions
-            pad_input = [(0, 0)] * input.ndim
-            pad_weight = [(0, 0)] * weight.ndim
-
-            if input.ndim == 2:
-                # 2D tensor: (Height, Width)
-                pad_input[0] = (0, M - original_input_shape[0] if original_input_shape[0] < M else 0)
-                pad_input[1] = (0, K - original_input_shape[1] if original_input_shape[1] < K else 0)
-            elif input.ndim == 3:
-                # 3D tensor: (Depth, Height, Width)
-                pad_input[1] = (0, M - original_input_shape[1] if original_input_shape[1] < M else 0)
-                pad_input[2] = (0, K - original_input_shape[2] if original_input_shape[2] < K else 0)
-
-            if weight.ndim == 2:
-                # 2D tensor: (Height, Width)
-                pad_weight[0] = (0, K - original_weight_shape[0] if original_weight_shape[0] < K else 0)
-                pad_weight[1] = (0, N - original_weight_shape[1] if original_weight_shape[1] < N else 0)
-            elif weight.ndim == 3:
-                # 3D tensor: (Depth, Height, Width)
-                pad_weight[1] = (0, K - original_weight_shape[1] if original_weight_shape[1] < K else 0)
-                pad_weight[2] = (0, N - original_weight_shape[2] if original_weight_shape[2] < N else 0)
-
-            # Apply padding
-            padded_input = np.pad(
-                padded_input,
-                pad_width=pad_input,
-                mode='constant',
-                constant_values=0
-            )
-
-            padded_weight = np.pad(
-                padded_weight,
-                pad_width=pad_weight,
-                mode='constant',
-                constant_values=0
-            )
-
-            #input_zero_pos = self.find_zero_sub_tensors(padded_input)
-            weight_zero_pos = self.find_zero_sub_tensors(padded_weight)
-            #zero_skip["arg0"] = input_zero_pos
-            zero_skip["arg1"] = weight_zero_pos
-            json_content["zero_skip"] = zero_skip
-
         with open(attribute_path, "w") as f:
             json.dump(json_content, f, indent=4)
         return attribute_path
diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh
index 5644b768..0b7bc6f5 100755
--- a/scripts/sparsity_experiment/run.sh
+++ b/scripts/sparsity_experiment/run.sh
@@ -1,6 +1,8 @@
 export TORCHSIM_DUMP_PATH=$(pwd)/result
-export BLOCK_SPARSE=1
-export TORCHSIM_FORCE_TIME_M=32
+export SPIKE_DUMP_SPARSE_TILE=1
+export TORCHSIM_FORCE_TIME_K=8
+export TORCHSIM_FORCE_TIME_M=8
+export TORCHSIM_FORCE_TIME_N=8
 
 OUTPUT_DIR="12GB"
 export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json"

From 4befa5f41e36c9d06f5dae86ef5cade29d6d2436 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Tue, 22 Apr 2025 06:39:19 +0000
Subject: [PATCH 297/432] [Fix] Change initial tile size by number of vector
 lane and vlane_stride

---
 PyTorchSimFrontend/mlir/mlir_common.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 55600fb0..2609a0da 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -436,26 +436,26 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
                 if sz != target_operand[3][dim]:
                     raise NotImplementedError("Not supporting type...")
 
+        vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
+        vlane_stride = 8
+
         # Dummy tile size
         tile_size = [1] * (len(vars) + len(reduction_vars))
         if len(tile_size) == 2:
-            tile_size[-1] = 1024
-            tile_size[-2] = 2048
+            tile_size[-1] = vlane_stride * self.vector_lane
+            tile_size[-2] = 2 * vlane_stride * self.vector_lane
         elif len(tile_size) == 0: # Scalar
             tile_size = [1]
             self.ranges = [1]
         elif len(tile_size) == 1:
-            tile_size[0] = 128*128*2
+            tile_size[0] = 2 * vlane_stride * self.vector_lane
         elif len(tile_size) == 3:
-            tile_size[-1] = 128
-            tile_size[-2] = 128
+            tile_size[-1] = self.vector_lane
+            tile_size[-2] = self.vector_lane
             tile_size[-3] = 2
         else:
             raise NotImplementedError("dummy tile size fail!")
 
-        vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
-        vlane_stride = 8
-
         # FIXME: Naive tile size decrement
         def decrease_tile_size(tile_size, vlane_split_axis):
             is_decreased = False

From 7d4117161048bb2ec23cdb8d771e527988cf50e4 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 23 Apr 2025 07:47:46 +0000
Subject: [PATCH 298/432] [Frontend] Rebase with branch spad

---
 PyTorchSimFrontend/extension_codecache.py     |  3 +-
 PyTorchSimFrontend/extension_config.py        |  4 ++
 .../mlir/mlir_codegen_backend.py              | 61 +++++++++++++------
 PyTorchSimFrontend/mlir/mlir_common.py        | 49 +++++++++------
 Simulator/simulator.py                        | 10 ++-
 5 files changed, 85 insertions(+), 42 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index d1588ab5..ac37f35b 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -347,6 +347,7 @@ def task():
         else:
             loop_size = []
         def dummy_simulator(*args, **kwargs):
+            validate = kwargs.get('validate', False)
             # Wait for compilation
             key = future.result()
 
@@ -355,7 +356,7 @@ def dummy_simulator(*args, **kwargs):
             # Dump arguments and meta data
             dump_metadata(args, arg_attributes, result_path)
             runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path)
-            if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE:
+            if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate:
                 funcsim = FunctionalSimulator(result_path, key)
                 funcsim.run_spike(args, arg_attributes,
                                   runtime_path, self.validation_binary_name,
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 09fad275..2f4444cb 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -45,6 +45,10 @@
 CONFIG_GEM5_SCRIPT_PATH = os.environ.get('GEM5_SCRIPT_PATH',
                                   default=f"{CONFIG_TORCHSIM_DIR}/gem5_script/script_systolic.py")
 
+# AUTOTUNE config
+CONFIG_AUTOTUNE = int(os.environ.get('AUTOTUNE', default=True))
+CONFIG_MAX_AUTOTUNE_TRY = int(os.environ.get('MAX_AUTOTUNE_TRY', default=10))
+
 # For block sparse
 CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0))
 CONFIG_FORCE_TILE_M = int(os.environ.get("TORCHSIM_FORCE_TIME_M", default=sys.maxsize))
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index a847fd73..d164acd0 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -14,6 +14,7 @@
 from torch.utils._sympy.functions import ModularIndexing
 import PyTorchSimFrontend.extension_codecache as extension_codecache
 
+from PyTorchSimFrontend import extension_config
 from . import mlir_common
 from .mlir_common import LoopLevel, LoopNest
 
@@ -770,6 +771,9 @@ def __init__(self, kernel_group):
         self.spad_buffer_dict = dict()
         self.base_vector_initialized = False
 
+    def reset(self):
+        self.__init__(self.kernel_group)
+
     # padding type 0: zero-padding 1: negative-padding(-inf) ...
     def get_padding_type(self):
         ops = self.current_node.node.origins
@@ -1256,26 +1260,43 @@ def codegen_loops(self):
         return code
 
     def codegen_nodes(self, nodes, kernel_name):
-        src_code = super().codegen_nodes(nodes, kernel_name)
-
-        # Create extra headers for simulators
-        write_path = extension_codecache.get_write_path(src_code)
-        if not os.path.exists(write_path):
-            os.makedirs(write_path)
-        spike_write_path = os.path.join(write_path, "global_var.h")
-        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
-        if not os.path.exists(spike_write_path):
-            spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
-            spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
-            write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
-        if not os.path.exists(gem5_write_path):
-            write_atomic(gem5_write_path, self.gem5_header.getvalue())
-
-        try:
-            bench_runner = self.run_bench(nodes, kernel_name, src_code)
-            bench_runner()
-        except extension_codecache.SpadOverflowError:
-            print("Overflowed...")
+        src_code = ""
+        n_try = 0
+        while n_try < extension_config.CONFIG_MAX_AUTOTUNE_TRY:
+            src_code = super().codegen_nodes(nodes, kernel_name)
+
+            # Create extra headers for simulators
+            write_path = extension_codecache.get_write_path(src_code)
+            if not os.path.exists(write_path):
+                os.makedirs(write_path)
+            spike_write_path = os.path.join(write_path, "global_var.h")
+            gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
+            if not os.path.exists(spike_write_path):
+                spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
+                spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
+                write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
+            if not os.path.exists(gem5_write_path):
+                write_atomic(gem5_write_path, self.gem5_header.getvalue())
+
+            if not extension_config.CONFIG_AUTOTUNE:
+                break
+
+            try:
+                bench_runner = self.run_bench(nodes, kernel_name, src_code)
+                bench_runner(validate=True)
+                print("Benchmark succeeded.")
+                break
+            except RuntimeError as e:
+                if str(e) == "STACK_OVERFLOW":
+                    n_try += 1
+                    print(f"Benchmark failed due to stack overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}")
+                    self.reset()
+                else:
+                    print(f"Benchmark failed with error: {str(e)}")
+                    # raise e
+            if n_try == extension_config.CONFIG_MAX_AUTOTUNE_TRY:
+                print("Cannot find valid tile size.")
+                break
         return src_code
 
     def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffer=None): # Need more argument?
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 2609a0da..437e5aef 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -439,27 +439,15 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
         vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
         vlane_stride = 8
 
-        # Dummy tile size
-        tile_size = [1] * (len(vars) + len(reduction_vars))
-        if len(tile_size) == 2:
-            tile_size[-1] = vlane_stride * self.vector_lane
-            tile_size[-2] = 2 * vlane_stride * self.vector_lane
-        elif len(tile_size) == 0: # Scalar
-            tile_size = [1]
-            self.ranges = [1]
-        elif len(tile_size) == 1:
-            tile_size[0] = 2 * vlane_stride * self.vector_lane
-        elif len(tile_size) == 3:
-            tile_size[-1] = self.vector_lane
-            tile_size[-2] = self.vector_lane
-            tile_size[-3] = 2
-        else:
-            raise NotImplementedError("dummy tile size fail!")
-
-        # FIXME: Naive tile size decrement
+        # FIXME: Naive decrease tile size
         def decrease_tile_size(tile_size, vlane_split_axis):
             is_decreased = False
-            # Decrease tile size
+
+            # Decrease vlane_split_axis when it is too large
+            if tile_size[vlane_split_axis] > vlane_stride * self.vector_lane:
+                tile_size[vlane_split_axis] = int(tile_size[vlane_split_axis] // 2)
+                return tile_size
+
             for i in range(len(tile_size)):
                 if i == vlane_split_axis:
                     continue
@@ -467,11 +455,34 @@ def decrease_tile_size(tile_size, vlane_split_axis):
                     tile_size[i] = int(tile_size[i] // 2)
                     is_decreased = True
                     break
+
+            # Decrease vlane_split_axis at the end to maximize the vlane usage
             if not is_decreased:
                 if tile_size[vlane_split_axis] > 1:
                     tile_size[vlane_split_axis] = int(tile_size[vlane_split_axis] // 2)
             return tile_size
 
+        # Dummy tile size
+        if self.kernel_group.tile_desc:
+            tile_size = self.kernel_group.tile_desc.get_tile_size()
+            decrease_tile_size(tile_size, vlane_split_axis)
+        else:
+            tile_size = [1] * (len(vars) + len(reduction_vars))
+            if len(tile_size) == 2:
+                tile_size[-1] = vlane_stride * self.vector_lane
+                tile_size[-2] = 2 * vlane_stride * self.vector_lane
+            elif len(tile_size) == 0: # Scalar
+                tile_size = [1]
+                self.ranges = [1]
+            elif len(tile_size) == 1:
+                tile_size[0] = 2 * vlane_stride * self.vector_lane
+            elif len(tile_size) == 3:
+                tile_size[-1] = self.vector_lane
+                tile_size[-2] = self.vector_lane
+                tile_size[-3] = 2
+            else:
+                raise NotImplementedError("dummy tile size fail!")
+
         # FIXME: Not considering removed buffers
         n_buffer = sum(
             len(node.read_writes.reads) + len(node.read_writes.writes)
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 48f16cbd..54ebd04c 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -105,8 +105,14 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
             subprocess.check_call(run_cmd)
         except subprocess.CalledProcessError as e:
             print("[SpikeSimulator] Command failed with exit code", e.returncode)
-            print("[SpikeSimulator] Error output:", e.output)
-            assert(0)
+            error_msg = ""
+            if e.returncode == 200:
+                error_msg = "INVALID_SPAD_ACCESS"
+            elif e.returncode == 201:
+                error_msg = "STACK_OVERFLOW"
+            else:
+                error_msg = "UNKNOWN_ERROR"
+            raise RuntimeError(f"{error_msg}")
 
         for (arg_name, arg_attribute), arg, path in zip(arg_attributes, args, file_path):
             if LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]):

From b78d0e14082277fcec980c0d98026b938e765b12 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 28 Apr 2025 05:38:39 +0000
Subject: [PATCH 299/432] [Frontend] Fix tensor value dump logic to support
 non-linearized strides

---
 Simulator/simulator.py  |  5 ++++-
 tests/test_batchnorm.py | 13 +++++++++----
 tests/test_resnet.py    |  7 ++++++-
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 54ebd04c..04efb25a 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -1,5 +1,6 @@
 import os
 import shlex
+import ctypes
 import subprocess
 import re
 import sys
@@ -49,7 +50,9 @@ def write_arg(self, arg, path, name):
         if (isinstance(arg, torch.Tensor)):
             data_path = os.path.join(dump_path, f'{index}.raw')
             tensor = arg.cpu().detach()
-            t_arr = tensor.numpy().flatten()
+            buffer_size = tensor.untyped_storage().size()
+            buffer = (ctypes.c_char * buffer_size).from_address(tensor.data_ptr())
+            t_arr = np.frombuffer(buffer, dtype=tensor.numpy().dtype, count=buffer_size // tensor.element_size())
             t_arr.tofile(data_path)
         else:
             assert(0)
diff --git a/tests/test_batchnorm.py b/tests/test_batchnorm.py
index 8c78fb97..f7abacf5 100644
--- a/tests/test_batchnorm.py
+++ b/tests/test_batchnorm.py
@@ -20,10 +20,12 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
 def test_BatchNorm(device, size=(1, 16, 64, 64)):
     torch.manual_seed(0)
     model = torch.nn.BatchNorm2d(size[1]).eval()
-    model.to(device=device)
-    input = torch.randn(size)
-    x1 = input.to(device=device)
-    x2 = input.to("cpu")
+    model.to(device=device, memory_format=torch.channels_last)
+    input = torch.empty_strided(size, (size[1]*size[2]*size[3], 1, size[1], size[1]*size[2]))
+    input.uniform_(-1, 1)
+
+    x1 = input.to(device=device, memory_format=torch.channels_last)
+    x2 = input.to("cpu", memory_format=torch.channels_last)
     opt_fn = torch.compile(dynamic=False)(model)
     y = opt_fn(x1)
     cpu_model = model.to("cpu")
@@ -39,3 +41,6 @@ def test_BatchNorm(device, size=(1, 16, 64, 64)):
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
     test_BatchNorm(device)
+    test_BatchNorm(device, size=(1,64, 32, 32))
+    test_BatchNorm(device, size=(1, 8, 4, 4))
+    test_BatchNorm(device, size=(1,256, 32, 32))
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index d76ec4a8..e2a6f2a7 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -23,10 +23,15 @@ def test_resnet(device):
     # model = resnet._resnet(resnet.BasicBlock, [1, 1, 0, 0], weights=None, progress=False).eval()
     model = resnet18().eval()
     model.to(device, memory_format=torch.channels_last)
-    input = torch.randn(1, 3, 224, 224).to(device=device)
+    input = torch.randn(1, 3, 224, 224)
     x1 = input.to(device=device, memory_format=torch.channels_last)
+    x2 = input.cpu().to(memory_format=torch.channels_last)
     opt_fn = torch.compile(dynamic=False)(model)
     res = opt_fn(x1)
+    cpu_model = model.cpu().to(memory_format=torch.channels_last)
+    cpu_res = cpu_model(x2)
+    test_result("ResNet18 inference", res, cpu_res)
+    print("Max diff > ", torch.max(torch.abs(res.cpu() - cpu_res)))
     print("ResNet18 Simulation Done")
 
 if __name__ == "__main__":

From c1171613a601e706334bf0305c86f84dbe5f79d1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 28 Apr 2025 07:44:19 +0000
Subject: [PATCH 300/432] [Frontend] Catch unhandled SpadOverflowError exception

---
 .../mlir/mlir_codegen_backend.py              | 60 +++++++++----------
 PyTorchSimFrontend/mlir/mlir_common.py        |  2 +-
 2 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d164acd0..ed80a91a 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1260,44 +1260,40 @@ def codegen_loops(self):
         return code
 
     def codegen_nodes(self, nodes, kernel_name):
-        src_code = ""
-        n_try = 0
-        while n_try < extension_config.CONFIG_MAX_AUTOTUNE_TRY:
+        for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY):
             src_code = super().codegen_nodes(nodes, kernel_name)
-
-            # Create extra headers for simulators
-            write_path = extension_codecache.get_write_path(src_code)
-            if not os.path.exists(write_path):
-                os.makedirs(write_path)
-            spike_write_path = os.path.join(write_path, "global_var.h")
-            gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
-            if not os.path.exists(spike_write_path):
-                spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
-                spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
-                write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
-            if not os.path.exists(gem5_write_path):
-                write_atomic(gem5_write_path, self.gem5_header.getvalue())
-
+            self._prepare_simulator_headers(src_code)
             if not extension_config.CONFIG_AUTOTUNE:
-                break
+                return src_code
 
             try:
                 bench_runner = self.run_bench(nodes, kernel_name, src_code)
                 bench_runner(validate=True)
-                print("Benchmark succeeded.")
-                break
-            except RuntimeError as e:
-                if str(e) == "STACK_OVERFLOW":
-                    n_try += 1
-                    print(f"Benchmark failed due to stack overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}")
-                    self.reset()
-                else:
-                    print(f"Benchmark failed with error: {str(e)}")
-                    # raise e
-            if n_try == extension_config.CONFIG_MAX_AUTOTUNE_TRY:
-                print("Cannot find valid tile size.")
-                break
-        return src_code
+                return src_code
+            except (extension_codecache.SpadOverflowError, RuntimeError) as e:
+                if isinstance(e, RuntimeError) and str(e) != "STACK_OVERFLOW":
+                    print(f"Benchmark[trial-{n_try}] failed with unexpected error: {e}")
+                    raise
+                print(f"Benchmark failed due to stack overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}")
+                self.reset()
+        raise RuntimeError("Exceeded maximum number of autotuning attempts")
+
+    def _prepare_simulator_headers(self, src_code):
+        write_path = extension_codecache.get_write_path(src_code)
+        os.makedirs(write_path, exist_ok=True)
+
+        spike_write_path = os.path.join(write_path, "global_var.h")
+        gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
+
+        if not os.path.exists(spike_write_path):
+            spad_end_symbol = "int spad_end[0] __attribute__ ((section(\".spad\")));\n"
+            spad_section_end_symbol = (
+                f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
+            )
+            write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
+
+        if not os.path.exists(gem5_write_path):
+            write_atomic(gem5_write_path, self.gem5_header.getvalue())
 
     def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffer=None): # Need more argument?
         """
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 437e5aef..8cf079f1 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -400,7 +400,7 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
                     break
 
         if implicit_ranges:
-            print("This operation contina implicit dimension space!")
+            #print("This operation contain implicit dimension space!")
             linearized_stride = [1] * len(target_operand.var_names)
             for i in range(len(target_operand[3])-2, -1, -1):
                 linearized_stride[i] = linearized_stride[i+1] * target_operand[3][i+1]

From c4f39f716c0f148059e2ee249bd28b7b0b39a1a4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 29 Apr 2025 06:09:31 +0000
Subject: [PATCH 301/432] [Frontend/Template] Fix bmm fusion awareness tiling

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py    |  3 ++-
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 17 +++++++----------
 PyTorchSimFrontend/mlir/mlir_scheduling.py      |  2 +-
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 493a7a4b..709af4d7 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -113,7 +113,8 @@ def render(self,
         X_map = " + ".join([f"d{idx}*{s}" for idx, s in enumerate(X_stride)])
 
         B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2]
-        TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
+        n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
+        TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node)
         TOG_latency = M if TILE_M > M else TILE_M
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
         SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index ed80a91a..1de637ec 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1274,7 +1274,7 @@ def codegen_nodes(self, nodes, kernel_name):
                 if isinstance(e, RuntimeError) and str(e) != "STACK_OVERFLOW":
                     print(f"Benchmark[trial-{n_try}] failed with unexpected error: {e}")
                     raise
-                print(f"Benchmark failed due to stack overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}")
+                print(f"Benchmark failed due to spad overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}")
                 self.reset()
         raise RuntimeError("Exceeded maximum number of autotuning attempts")
 
@@ -1285,15 +1285,12 @@ def _prepare_simulator_headers(self, src_code):
         spike_write_path = os.path.join(write_path, "global_var.h")
         gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
 
-        if not os.path.exists(spike_write_path):
-            spad_end_symbol = "int spad_end[0] __attribute__ ((section(\".spad\")));\n"
-            spad_section_end_symbol = (
-                f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
-            )
-            write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
-
-        if not os.path.exists(gem5_write_path):
-            write_atomic(gem5_write_path, self.gem5_header.getvalue())
+        spad_end_symbol = "int spad_end[0] __attribute__ ((section(\".spad\")));\n"
+        spad_section_end_symbol = (
+            f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
+        )
+        write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
+        write_atomic(gem5_write_path, self.gem5_header.getvalue())
 
     def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffer=None): # Need more argument?
         """
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 2911c51d..841a8fad 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -181,7 +181,7 @@ def codegen_template(self, template_node, epilogue_nodes):
             src_code = self.codegen_template_code(kernel, render, template_node, epilogue_nodes)
 
         with V.set_kernel_handler(kernel):
-            spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));"
+            spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
             spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({kernel.spad_info['spad_size']*kernel.vector_lane})));"
             codegen_header(src_code, (kernel.header.getvalue()+spad_end_symbol+spad_section_end_symbol, kernel.gem5_header.getvalue()))
             kernel.meta_kernel()

From 4c1e5c9f5cddc593b1612f458c8c9c3cfc1ef84e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 29 Apr 2025 10:07:10 +0000
Subject: [PATCH 302/432] [Frontend/conv] Single batch conv fix

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 7a19a8a1..279a7112 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -299,6 +299,7 @@
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * (I_H + 2 * PADDING_W) * I_C }} + d1 * {{ (I_W + 2 * PADDING_W) * I_C }} + d2 * {{ I_C }} + d3)> // input (BATCH, I_H, I_W, I_C) Stride should be changed if kernel stride > 1
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
 #map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
+#map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
 #offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
 #offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W, TILE_K) }} + d1)>
 #offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
@@ -328,7 +329,7 @@
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   {{- kernel.def_local_vars() }}
-
+  affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
   affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
     affine.for %tile_m = 0 to {{ O_W }} step {{ TILE_M }} {
       affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
@@ -344,7 +345,8 @@
           affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
             affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
               %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
-              %index1 = affine.apply #map1(%c0, %index_i_h, %k_w, %tile_k) // input index
+              %index_i_w = affine.apply #map_I_W(%o_w, %k_w)
+              %index1 = affine.apply #map1(%c0, %index_i_h, %index_i_w, %tile_k) // input index
               %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
               // Load input matrix
               memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
@@ -377,6 +379,7 @@
       } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
+  } { outer_loop=true }
   return
 }
 """
@@ -465,7 +468,7 @@
           affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
             affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
               %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
-              %index1 = affine.apply #map1(%index_i_h, %k_w, %c0, %tile_k) // input index
+              %index1 = affine.apply #map1(%index_i_h, %k_w, %tile_m, %tile_k) // input index
               %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
               // Load input matrix
               memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride

From c1a84431b92bf562d3eb5e9d3757691fc2fc1b89 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 29 Apr 2025 10:15:48 +0000
Subject: [PATCH 303/432] [Test] Add various conv test cases

---
 tests/test_conv2d.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 55c4821e..9d8b855a 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -44,3 +44,5 @@ def custom_conv2d(a, b, bias):
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
     test_conv2d(device, batch_size=1, in_channels=128, out_channels=128, input_size=28, kernel_size=3, stride=1, padding=1)
+    test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64, kernel_size=7, stride=2, padding=3)
+    test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64, kernel_size=7, stride=1, padding=3)

From cd323d640a06e3623f657ba8c307deed2aa2443b Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 16 Apr 2025 02:17:56 +0000
Subject: [PATCH 304/432] [Experiments] simulation cycle & tpuv3 booksim config

---
 .../configs/systolic_ws_128x128_c2_booksim_tpuv3.json  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
index 7115b475..d51e9c5f 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
@@ -14,13 +14,13 @@
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
- 
+
   "icnt_type" : "booksim2",
   "icnt_latency" : 7,
-  "icnt_freq" : 1000,
-  "icnt_node_per_core" : 16,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt",
- 
+  "icnt_freq" : 28000,
+  "icnt_node_per_core" : 1,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt",
+
   "precision" : 4,
   "scheduler" : "simple",
   "num_partition" : 2,

From b75d272a813b68456f0140573997953b98358fce Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 29 Apr 2025 04:02:46 +0000
Subject: [PATCH 305/432] [Frontend] CONV mapping for multi-core

---
 PyTorchSimFrontend/mlir/mlir_template.py | 41 +++++++++++++-----------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index f870c443..31fc6068 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -200,24 +200,29 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
         max_used_spad_size = 0
         M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False)
         max_k_h_w = 1 # maximize kernel size
-        for o_h in sympy.divisors(O_H):
-            for o_w in sympy.divisors(O_W):
-                for k_h in sympy.divisors(K_H):
-                    for k_w in sympy.divisors(K_W):
-                        i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]
-                        i_w = 1 + (o_w - 1) * stride[1] + (k_w - 1) * dilation[1]
-                        weight_size = k_w * k_h * K * N
-                        input_size = i_w * i_h * M * K
-                        output_size = o_w * o_h * M * N
-                        used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
-                        weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
-                        input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
-                        output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M  * (1 + n_extra_node), N)
-                        used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
-                        if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w:
-                            max_used_spad_size = used_spad_size
-                            max_k_h_w = k_h * k_w
-                            mapping = (k_h, k_w, o_h, o_w, M, N, K)
+        max_o_h_w = 1 # maximize output size
+        k_index = K // self.vector_lane
+        for k in sympy.divisors(k_index): # k occupies the large space in scratchpad memory
+            K = k * self.vector_lane if K > self.vector_lane else k
+            for o_h in sympy.divisors(O_H):
+                for o_w in sympy.divisors(O_W):
+                    for k_h in sympy.divisors(K_H):
+                        for k_w in sympy.divisors(K_W):
+                            i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]
+                            i_w = 1 + (o_w - 1) * stride[1] + (k_w - 1) * dilation[1]
+                            weight_size = k_w * k_h * K * N
+                            input_size = i_w * i_h * M * K
+                            output_size = o_w * o_h * M * N
+                            used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
+                            weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
+                            input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
+                            output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M  * (1 + n_extra_node), N)
+                            used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                            if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w and max_o_h_w <= o_h * o_w:
+                                max_used_spad_size = used_spad_size
+                                max_k_h_w = k_h * k_w
+                                max_o_h_w = o_h * o_w
+                                mapping = (k_h, k_w, o_h, o_w, M, N, K)
 
         # FIXME: this should be implemented with auto-tuning
         mapping = self.pseudo_auto_tune(mapping, stride, dilation, n_extra_node=n_extra_node)

From d621ce81bc86a7913a0788765b615a4b6696255d Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 29 Apr 2025 04:06:47 +0000
Subject: [PATCH 306/432] [Log] Print L2 Cache log

---
 PyTorchSimBackend/include/Dram.h         |  2 ++
 PyTorchSimBackend/include/L2Cache.h      |  2 ++
 PyTorchSimBackend/src/Dram.cc            |  6 ++++
 PyTorchSimBackend/src/L2Cache.cc         | 10 ++++--
 PyTorchSimBackend/src/Simulator.cc       |  1 +
 PyTorchSimFrontend/mlir/mlir_template.py | 46 ++++++++++++------------
 6 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/PyTorchSimBackend/include/Dram.h b/PyTorchSimBackend/include/Dram.h
index f4615d0a..96c2397c 100644
--- a/PyTorchSimBackend/include/Dram.h
+++ b/PyTorchSimBackend/include/Dram.h
@@ -27,6 +27,7 @@ class Dram {
   virtual void pop(uint32_t cid) = 0;
   uint32_t get_channel_id(mem_fetch* request);
   virtual void print_stat() {}
+  virtual void print_cache_stats() {};
   uint32_t get_channels_per_partition() { return _n_ch_per_partition; }
  protected:
   SimulationConfig _config;
@@ -58,6 +59,7 @@ class DramRamulator2 : public Dram {
   virtual mem_fetch* top(uint32_t cid) override;
   virtual void pop(uint32_t cid) override;
   virtual void print_stat() override;
+  void print_cache_stats() override;
 
  private:
   std::vector<std::unique_ptr<Ramulator2>> _mem;
diff --git a/PyTorchSimBackend/include/L2Cache.h b/PyTorchSimBackend/include/L2Cache.h
index f8ca55d1..70977844 100644
--- a/PyTorchSimBackend/include/L2Cache.h
+++ b/PyTorchSimBackend/include/L2Cache.h
@@ -18,6 +18,7 @@ class L2Cache {
   // Pop memory request from Cache
   void pop() { l_to_mem_queue.pop(); }
   mem_fetch* top() { return l_to_mem_queue.empty() ? NULL : l_to_mem_queue.front(); }
+  virtual void print_stats() {};
 
 protected:
   cycle_type *l_core_cycle;   // Core cycle
@@ -48,4 +49,5 @@ class ReadOnlyL2Cache : public L2Cache {
     std::queue<mem_fetch*> *from_xbar_queue);
   void cycle() override;
   bool push(mem_fetch* req) override;  // Push memory response from DRAM
+  virtual void print_stats() override;
 };
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc
index 88475be0..62dd0ca1 100644
--- a/PyTorchSimBackend/src/Dram.cc
+++ b/PyTorchSimBackend/src/Dram.cc
@@ -123,3 +123,9 @@ void DramRamulator2::print_stat() {
     _mem[ch]->print(stdout);
   }
 }
+
+void DramRamulator2::print_cache_stats() {
+  for (int ch = 0; ch < _n_ch; ch++) {
+    _m_caches[ch]->print_stats();
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/L2Cache.cc b/PyTorchSimBackend/src/L2Cache.cc
index 69ec58bc..4fa80efe 100644
--- a/PyTorchSimBackend/src/L2Cache.cc
+++ b/PyTorchSimBackend/src/L2Cache.cc
@@ -12,7 +12,7 @@ void NoL2Cache::cycle() {
   }
 }
 
-ReadOnlyL2Cache::ReadOnlyL2Cache(std::string name,  CacheConfig &cache_config, uint32_t id, 
+ReadOnlyL2Cache::ReadOnlyL2Cache(std::string name,  CacheConfig &cache_config, uint32_t id,
   cycle_type *core_cycle, uint32_t l2d_hit_latency,
   std::queue<mem_fetch*> *to_xbar_queue, std::queue<mem_fetch*> *from_xbar_queue) :
   L2Cache(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue) {
@@ -60,7 +60,7 @@ void ReadOnlyL2Cache::cycle() {
       }
       l_from_xbar_queue->pop();
     } else if (status != RESERVATION_FAIL) {
-      if (req->is_write() &&
+      if (req->is_write() && // FIXME: req->is_write() already checked above 48 line.
           (l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
             l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
         req->set_reply();
@@ -96,3 +96,9 @@ void ReadOnlyL2Cache::cycle() {
     l_from_cache_queue.pop();
   }
 }
+
+void ReadOnlyL2Cache::print_stats() {
+  if (l_id == 0) {
+    l_cache->get_stats().print_stats(stdout, l_name.c_str());
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 9893b60e..0cb24b2d 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -284,6 +284,7 @@ void Simulator::print_core_stat()
 {
   _icnt->print_stats();
   _dram->print_stat();
+  _dram->print_cache_stats();
   for (int core_id = 0; core_id < _n_cores; core_id++) {
     _cores[core_id]->print_stats();
   }
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 31fc6068..de7c4fb8 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -201,34 +201,32 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
         M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False)
         max_k_h_w = 1 # maximize kernel size
         max_o_h_w = 1 # maximize output size
-        k_index = K // self.vector_lane
-        for k in sympy.divisors(k_index): # k occupies the large space in scratchpad memory
-            K = k * self.vector_lane if K > self.vector_lane else k
-            for o_h in sympy.divisors(O_H):
-                for o_w in sympy.divisors(O_W):
-                    for k_h in sympy.divisors(K_H):
-                        for k_w in sympy.divisors(K_W):
-                            i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]
-                            i_w = 1 + (o_w - 1) * stride[1] + (k_w - 1) * dilation[1]
-                            weight_size = k_w * k_h * K * N
-                            input_size = i_w * i_h * M * K
-                            output_size = o_w * o_h * M * N
-                            used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
-                            weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
-                            input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
-                            output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M  * (1 + n_extra_node), N)
-                            used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
-                            if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w and max_o_h_w <= o_h * o_w:
-                                max_used_spad_size = used_spad_size
-                                max_k_h_w = k_h * k_w
-                                max_o_h_w = o_h * o_w
-                                mapping = (k_h, k_w, o_h, o_w, M, N, K)
+        K = min(K, self.vector_lane)
+        for o_h in sympy.divisors(O_H):
+            for o_w in sympy.divisors(O_W):
+                for k_h in sympy.divisors(K_H):
+                    for k_w in sympy.divisors(K_W):
+                        i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0]
+                        i_w = 1 + (o_w - 1) * stride[1] + (k_w - 1) * dilation[1]
+                        weight_size = k_w * k_h * K * N
+                        input_size = i_w * i_h * M * K
+                        output_size = o_w * o_h * M * N
+                        used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
+                        weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
+                        input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
+                        output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M  * (1 + n_extra_node), N)
+                        used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                        if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w and max_o_h_w <= o_h * o_w:
+                            max_used_spad_size = used_spad_size
+                            max_k_h_w = k_h * k_w
+                            max_o_h_w = o_h * o_w
+                            mapping = (k_h, k_w, o_h, o_w, M, N, K)
+        if max_used_spad_size == 0:
+            raise RuntimeError("Cannot find a valid mapping")
 
         # FIXME: this should be implemented with auto-tuning
         mapping = self.pseudo_auto_tune(mapping, stride, dilation, n_extra_node=n_extra_node)
 
-        if max_used_spad_size == 0:
-            raise RuntimeError("Cannot find a valid mapping")
         return mapping
 
     def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0):

From b1c3359b2d570b45343fdad6b2b43ed6852ab3e6 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 8 May 2025 02:46:03 +0000
Subject: [PATCH 307/432] [Experiments] Use scheduler for multi-kernel
 simulation

---
 ...stolic_ws_128x128_c2_simple_noc_tpuv4.json |  10 +-
 experiments/BERT.py                           | 109 +++---------------
 experiments/conv.py                           |  51 +++-----
 experiments/gemm.py                           |  55 ++++-----
 experiments/layernorm.py                      |  47 ++++++++
 experiments/resnet18.py                       |  34 +++---
 experiments/resnet50.py                       |  34 +++---
 experiments/softmax.py                        |  47 ++++++++
 8 files changed, 192 insertions(+), 195 deletions(-)
 create mode 100644 experiments/layernorm.py
 create mode 100644 experiments/softmax.py

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
index f4a5172d..63b926d6 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
@@ -1,7 +1,7 @@
 {
   "num_cores" : 2,
   "core_freq" : 1050,
-  "sram_size" : 16777216,
+  "sram_size" : 32768,
   "core_print_interval" : 10000,
   "num_systolic_array_per_core" : 4,
 
@@ -10,21 +10,21 @@
   "dram_channels": 32,
   "dram_req_size": 32,
   "dram_latency" : 10,
-  "dram_size" : 16,
+  "dram_size" : 32,
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
   "l2d_type" : "readonly",
   "l2d_config" : "S:64:128:512,32,L:R:m:L:L,A:192:4,32:0,32",
- 
+
   "icnt_type" : "simple",
   "icnt_latency" : 7,
   "icnt_freq" : 38400,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
- 
+
   "precision" : 4,
   "scheduler" : "simple",
-  "num_partition" : 2,
+  "num_partition" : 1,
   "partition": {
     "core_0":0,
     "core_1":0
diff --git a/experiments/BERT.py b/experiments/BERT.py
index e7d6fb35..f42b8be1 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -1,126 +1,55 @@
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-import math
-import copy
 
 import argparse
-import subprocess
 import datetime
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
+def run_BERT(size, input_seq, config):
+    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+    from tests.test_transformer import DecoderBlock
+    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
+    device = scheduler.execution_engine.module.custom_device()
 
-def clones(module, N):
-    "Produce N identical layers."
-    return torch.nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
-
-class my_MultiheadAttention(torch.nn.Module):
-    def __init__(self, h, d_model, dropout=0.1):
-        super(my_MultiheadAttention, self).__init__()
-        assert d_model % h == 0
-        # We assume d_v always equals d_k
-        self.d_k = d_model // h
-        self.h = h
-        self.linears = clones(torch.nn.Linear(d_model, d_model), 4)
-        self.attn = None
-
-    def forward(self, query, key, value):
-        # 1) Do all the linear projections in batch from d_model => h x d_k
-        query, key, value = [
-            lin(x).view(-1, self.h, self.d_k).transpose(0, 1).contiguous()
-            for lin, x in zip(self.linears, (query, key, value))
-        ]
-
-        # 2) Apply attention on all the projected vectors in batch.
-        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
-        p_attn = scores.softmax(dim=-1)
-        x = torch.matmul(p_attn, value)
-        # 3) "Concat" using a view and apply a final linear.
-        x = (
-            x.transpose(0, 1)
-            .contiguous()
-            .view(-1, self.h * self.d_k)
-        )
-        del query
-        del key
-        del value
-        return self.linears[-1](x)
-
-class DecoderBlock(torch.nn.Module):
-    def __init__(self, embed_dim, num_heads):
-        super(DecoderBlock, self).__init__()
-        self.multihead_attn = my_MultiheadAttention(num_heads, embed_dim)
-        self.layer_norm = torch.nn.LayerNorm(embed_dim)
-        self.ffn1 = torch.nn.Linear(embed_dim, embed_dim*4)
-        self.act = torch.nn.ReLU()
-        self.ffn2 = torch.nn.Linear(embed_dim*4, embed_dim)
-
-    def forward(self, x):
-        result = self.multihead_attn(x, x, x)
-        result = self.layer_norm(result+x)
-
-        ffn1_result = self.ffn1(result)
-        act_result = self.act(ffn1_result)
-        ffn2_result = self.ffn2(act_result)
-        return self.layer_norm(ffn2_result + result)
-
-def run_BERT(device, size, input_seq, validation):
     hidden_dim = {'base': 768, 'large': 1024, 'xlarge': 2048}
     embedding_size = {'base': 768, 'large': 1024, 'xlarge': 2048}
     heads = {'base': 12, 'large': 16, 'xlarge': 32} # hidden/64 https://arxiv.org/pdf/1909.11942
     cpu_query = torch.randn(input_seq, hidden_dim[size])
     decoder_block = DecoderBlock(embedding_size[size], heads[size])
-    cpu_res = decoder_block(cpu_query)
 
     query = cpu_query.clone().to(device=device)
-    decoder_block.to(device=device)
-    opt_fn = torch.compile(dynamic=False)(decoder_block)
-    res = opt_fn(query)
+    opt_fn = torch.compile(dynamic=False)(decoder_block.to(device=device))
+
+    SchedulerDNNModel.register_model(f"BERT-{size}", opt_fn)
+    request = Request(f"BERT-{size}", [query], [], request_queue_idx=0)
+    scheduler.add_request(request, request_time=0)
+
+    # Run scheduler
+    while not scheduler.is_finished():
+        scheduler.schedule()
 
-    if validation:
-        test_result(f"BERT-{size} Forwrad", res, cpu_res)
     print(f"BERT-{size} Simulation Done")
 
 if __name__ == "__main__":
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', type=str, default='base')
     args.add_argument('--dump_path', type=str, default='results')
     args.add_argument('--input_size', type=int, default=512)
-    args.add_argument('--validation', type=int, default=0)
     args = args.parse_args()
     size = args.size
     input_seq = args.input_size
-    result_path = os.path.join(base_dir, args.dump_path, config, f"BERT_{size}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"BERT_{size}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = str(args.validation)
+    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
         del os.environ['BACKENDSIM_SPIKE_ONLY']
 
-    from Scheduler.scheduler import ExecutionEngine
-    module = ExecutionEngine.setup_device()
-    device = module.custom_device()
-    run_BERT(device, size, input_seq, args.validation)
-    # compute cycles with shell script
-    subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
-    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
+    run_BERT(size, input_seq, config)
diff --git a/experiments/conv.py b/experiments/conv.py
index de2c9128..115e4aac 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -3,25 +3,11 @@
 import torch.utils.cpp_extension
 
 import argparse
-import subprocess
 import datetime
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
 
-def run_conv2d(device, batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding, validation):
+def run_conv2d(batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding, config):
+    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
     def custom_conv2d(a, b, bias):
         i_c = a.shape[1]
         o_c = b.shape[0]
@@ -29,43 +15,42 @@ def custom_conv2d(a, b, bias):
         conv2d.weight = torch.nn.Parameter(b)
         # conv2d.bias = torch.nn.Parameter(bias)
         return conv2d(a)
-    torch.manual_seed(0)
+    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
+    device = scheduler.execution_engine.module.custom_device()
     conv_input = torch.randn(batch_size, i_c, i_h, i_w).to(memory_format=torch.channels_last, device=device)
     conv_kernel = torch.randn(o_c, i_c, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device)
     conv_bias = torch.randn(o_c).to(device=device)
     opt_fn = torch.compile(dynamic=False)(custom_conv2d)
-    res = opt_fn(conv_input, conv_kernel, conv_bias)
-    out = custom_conv2d(conv_input.cpu(), conv_kernel.cpu(), conv_bias.cpu())
-    if validation:
-        test_result("CONV Forward", res, y)
+
+    SchedulerDNNModel.register_model("CONV", opt_fn)
+    request = Request("CONV", [conv_input, conv_kernel, conv_bias], [], request_queue_idx=0)
+    scheduler.add_request(request, request_time=0)
+
+    # Run scheduler
+    while not scheduler.is_finished():
+        scheduler.schedule()
+
     print(f"CONV {batch_size}_{i_h}_{i_w}_{i_c}_{o_c}_{kernel_size}_{stride}_{padding} (B_H_W_I_C_O_C_K_S_P) Simulation Done")
 
 if __name__ == "__main__":
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[8, 28, 28, 128, 128, 3, 1, 1], help='B H W I_C O_C K S P')
     args.add_argument('--dump_path', type=str, default='results')
-    args.add_argument('--validation', type=int, default=0)
     args = args.parse_args()
     size = args.size
     size_str = "_".join([str(i) for i in size])
-    result_path = os.path.join(base_dir, args.dump_path, config, f"CONV_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"CONV_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = str(args.validation)
+    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
         del os.environ['BACKENDSIM_SPIKE_ONLY']
 
-    from Scheduler.scheduler import ExecutionEngine
-    module = ExecutionEngine.setup_device()
-    device = module.custom_device()
-    run_conv2d(device, size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], args.validation)
-    # compute cycles with shell script
-    subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
-    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
+    run_conv2d(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], config)
\ No newline at end of file
diff --git a/experiments/gemm.py b/experiments/gemm.py
index 4fcf2c38..a1fdcff6 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -3,67 +3,52 @@
 import torch.utils.cpp_extension
 
 import argparse
-import subprocess
 import datetime
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        message = f"|{name} Test Passed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-    else:
-        message = f"|{name} Test Failed|"
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
-        exit(1)
 
-def run_matmul(device, input_size, hidden_size, output_size, validation):
+def run_matmul(input_size, hidden_size, output_size, config):
+    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
     def custom_matmul(a, b):
         return torch.matmul(a, b)
+    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
+    device = scheduler.execution_engine.module.custom_device()
     torch.manual_seed(0)
-    input = torch.randn(input_size, hidden_size)
-    weight = torch.randn(hidden_size, output_size)
-    x1 = input.to(device=device)
-    w1 = weight.to(device=device)
-    x2 = input.to("cpu")
-    w2 = weight.to("cpu")
+    input = torch.randn(input_size, hidden_size).to(device=device)
+    weight = torch.randn(hidden_size, output_size).to(device=device)
     opt_fn = torch.compile(dynamic=False)(custom_matmul)
-    res = opt_fn(x1, w1)
-    y = custom_matmul(x2, w2)
-    if validation:
-        test_result("Matmul Forward", res, y)
+
+    SchedulerDNNModel.register_model("GEMM", opt_fn)
+    request = Request("GEMM", [input, weight], [], request_queue_idx=0)
+    scheduler.add_request(request, request_time=0)
+
+    # Run scheduler
+    while not scheduler.is_finished():
+        scheduler.schedule()
+
     print(f"GEMM {input_size}x{hidden_size}x{output_size} (MxKxN) Simulation Done")
 
 if __name__ == "__main__":
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[128, 128, 128], help='M K N')
     args.add_argument('--dump_path', type=str, default='results')
-    args.add_argument('--validation', type=int, default=0)
     args = args.parse_args()
     size = args.size
     size_str = "x".join([str(i) for i in size])
-    result_path = os.path.join(base_dir, args.dump_path, config, f"GEMM_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"GEMM_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = str(args.validation)
+    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
         del os.environ['BACKENDSIM_SPIKE_ONLY']
 
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    run_matmul(device, size[0], size[1], size[2], args.validation)
-    # compute cycles with shell script
-    subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
-    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
+    run_matmul(size[0], size[1], size[2], config)
diff --git a/experiments/layernorm.py b/experiments/layernorm.py
new file mode 100644
index 00000000..378833f7
--- /dev/null
+++ b/experiments/layernorm.py
@@ -0,0 +1,47 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+import argparse
+import datetime
+
+
+def run_layernorm(size, config):
+    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
+    device = scheduler.execution_engine.module.custom_device()
+    input = torch.randn(size).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(torch.nn.LayerNorm(size[-1]).to(device=device))
+
+    SchedulerDNNModel.register_model("LayerNorm", opt_fn)
+    request = Request("LayerNorm", [input], [], request_queue_idx=0)
+    scheduler.add_request(request, request_time=0)
+
+    # Run scheduler
+    while not scheduler.is_finished():
+        scheduler.schedule()
+
+    print(f"LayerNorm {str(size)} Simulation Done")
+
+if __name__ == "__main__":
+    import os
+    import sys
+    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
+    sys.path.append(base_dir)
+    args = argparse.ArgumentParser()
+    args.add_argument('--size', nargs='+', type=int, default=[512, 768], help='Tensor Shape')
+    args.add_argument('--dump_path', type=str, default='results')
+    args = args.parse_args()
+    size = args.size
+    size_str = "x".join([str(i) for i in size])
+    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"LayerNorm_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    # setting environment variables
+    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    # only timing simulation
+    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
+    if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
+        del os.environ['BACKENDSIM_SPIKE_ONLY']
+
+    run_layernorm(size, config)
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index 1f74df4d..202642d3 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -3,32 +3,40 @@
 import torch.utils.cpp_extension
 
 import argparse
-import subprocess
 import datetime
 
-def run_resnet(device, batch):
+def run_resnet(batch, config):
     from torchvision.models import resnet18
+    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
+    device = scheduler.execution_engine.module.custom_device()
     model = resnet18().eval()
-    model.to(device, memory_format=torch.channels_last)
     input = torch.randn(batch, 3, 224, 224).to(device=device)
-    x1 = input.to(device=device, memory_format=torch.channels_last)
-    opt_fn = torch.compile(dynamic=False)(model)
-    res = opt_fn(x1)
+    opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))
+
+    SchedulerDNNModel.register_model("resnet18", opt_fn)
+    request = Request("resnet18", [input], [], request_queue_idx=0)
+    scheduler.add_request(request, request_time=0)
+
+    # Run scheduler
+    while not scheduler.is_finished():
+        scheduler.schedule()
+
     print("ResNet18 Simulation Done")
 
 if __name__ == "__main__":
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--batch', type=int, default=1)
     args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
     batch = args.batch
-    result_path = os.path.join(base_dir, args.dump_path, config, f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
@@ -36,10 +44,4 @@ def run_resnet(device, batch):
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
         del os.environ['BACKENDSIM_SPIKE_ONLY']
 
-    from Scheduler.scheduler import ExecutionEngine
-    module = ExecutionEngine.setup_device()
-    device = module.custom_device()
-    run_resnet(device, batch)
-    # compute cycles with shell script
-    subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
-    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
+    run_resnet(batch, config)
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index 788fd591..915bee5f 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -3,32 +3,40 @@
 import torch.utils.cpp_extension
 
 import argparse
-import subprocess
 import datetime
 
-def run_resnet(device, batch):
+def run_resnet(batch, config):
     from torchvision.models import resnet50
+    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
+    device = scheduler.execution_engine.module.custom_device()
     model = resnet50().eval()
-    model.to(device, memory_format=torch.channels_last)
     input = torch.randn(batch, 3, 224, 224).to(device=device)
-    x1 = input.to(device=device, memory_format=torch.channels_last)
-    opt_fn = torch.compile(dynamic=False)(model)
-    res = opt_fn(x1)
+    opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))
+
+    SchedulerDNNModel.register_model("resnet50", opt_fn)
+    request = Request("resnet50", [input], [], request_queue_idx=0)
+    scheduler.add_request(request, request_time=0)
+
+    # Run scheduler
+    while not scheduler.is_finished():
+        scheduler.schedule()
+
     print("ResNet50 Simulation Done")
 
 if __name__ == "__main__":
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json')
-    config = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--batch', type=int, default=1)
     args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
     batch = args.batch
-    result_path = os.path.join(base_dir, args.dump_path, config, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
@@ -36,10 +44,4 @@ def run_resnet(device, batch):
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
         del os.environ['BACKENDSIM_SPIKE_ONLY']
 
-    from Scheduler.scheduler import ExecutionEngine
-    module = ExecutionEngine.setup_device()
-    device = module.custom_device()
-    run_resnet(device, batch)
-    # compute cycles with shell script
-    subprocess.run([f"{base_dir}/scripts/end2end.sh {result_path}"], shell=True)
-    subprocess.run([f"{base_dir}/scripts/sim_time.sh {result_path}"], shell=True)
+    run_resnet(batch, config)
diff --git a/experiments/softmax.py b/experiments/softmax.py
new file mode 100644
index 00000000..14d28fee
--- /dev/null
+++ b/experiments/softmax.py
@@ -0,0 +1,47 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+import argparse
+import datetime
+
+
+def run_softmax(size, config, dim=1):
+    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
+    device = scheduler.execution_engine.module.custom_device()
+    input = torch.randn(size).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(torch.nn.Softmax(dim=dim).to(device=device))
+
+    SchedulerDNNModel.register_model("Softmax", opt_fn)
+    request = Request("Softmax", [input], [], request_queue_idx=0)
+    scheduler.add_request(request, request_time=0)
+
+    # Run scheduler
+    while not scheduler.is_finished():
+        scheduler.schedule()
+
+    print(f"Softmax {str(size)} Simulation Done")
+
+if __name__ == "__main__":
+    import os
+    import sys
+    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
+    sys.path.append(base_dir)
+    args = argparse.ArgumentParser()
+    args.add_argument('--size', nargs='+', type=int, default=[512, 512], help='Tensor Shape')
+    args.add_argument('--dump_path', type=str, default='results')
+    args = args.parse_args()
+    size = args.size
+    size_str = "x".join([str(i) for i in size])
+    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"Softmax_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    # setting environment variables
+    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    # only timing simulation
+    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
+    if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
+        del os.environ['BACKENDSIM_SPIKE_ONLY']
+
+    run_softmax(size, config)

From 6efb8372abbc4e30da047dfadc6683141fbf1a07 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 9 May 2025 04:08:30 +0000
Subject: [PATCH 308/432] [Frontend] minimum number of tiles

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_template.py      | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 3152aee2..fe9f6946 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -117,7 +117,7 @@ def render(self,
             TILE_M, TILE_N, TILE_K = 1, 1, 1
             template = EMPTY_TEMPLATE
         else:
-            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node)
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node, min_tile=True)
             template = GEMM_TEMPLATE
         TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)
         TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index de7c4fb8..285915d9 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -118,11 +118,12 @@ def gemmini_gemm_mapping(self, M, N, K):
 
         return inner_I, inner_J, inner_K
 
-    def gemm_combination_mapping(self, M, N, K, n_extra_node=0, pad_k=True):
+    def gemm_combination_mapping(self, M, N, K, n_extra_node=0, pad_k=True, min_tile=False):
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
         max_spad_per_lane = spad_size_per_lane // 2 # double buffer
+        minimum_n_tile = self.num_cores if min_tile else 1
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
         k_pad_factor = self.vector_lane if K > self.vector_lane else (8 if pad_k else 1)
@@ -149,7 +150,8 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, pad_k=True):
                     input_size_per_lane = self.get_spad_size_per_lane(tile_M, tile_K)
                     output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N)
                     used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
-                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and maximize_i_j <= tile_M * tile_N:
+                    n_tile = math.ceil(M / tile_M) * math.ceil(N / tile_N)
+                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and maximize_i_j <= tile_M * tile_N and n_tile >= minimum_n_tile:
                         max_used_spad_size = used_spad_size
                         maximize_i_j = tile_M * tile_N
                         mapping = (tile_M, tile_N, tile_K)

From 8953aa390afb25c454722120365609336b1f7258 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 9 May 2025 07:27:16 +0000
Subject: [PATCH 309/432] [TOGSim] Core-level interleaving

---
 PyTorchSimBackend/src/Simulator.cc | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 0cb24b2d..5299efb4 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -73,14 +73,9 @@ void Simulator::run_simulator() {
 }
 
 void Simulator::core_cycle() {
-  for (int core_id = 0; core_id < _n_cores; core_id++) {
-    std::shared_ptr<Tile> finished_tile = _cores[core_id]->pop_finished_tile();
-    if (finished_tile->get_status() == Tile::Status::FINISH) {
-      get_partition_scheduler(core_id)->finish_tile(std::move(finished_tile));
-    }
-
+  for (int i=0; i<_max_slot; i++, _slot_id=(_slot_id + 1) % _max_slot) {
     // Issue new tile to core
-    for (int i=0; i<_max_slot; i++, _slot_id=(_slot_id + 1) % _max_slot) {
+    for (int core_id = 0; core_id < _n_cores; core_id++) {
       const std::shared_ptr<Tile> tile = get_partition_scheduler(core_id)->peek_tile(core_id, _slot_id, _config.core_type[core_id]);
       if (tile->get_status() != Tile::Status::EMPTY && _cores[core_id]->can_issue(tile))  {
         if (tile->get_status() == Tile::Status::INITIALIZED) {
@@ -92,6 +87,12 @@ void Simulator::core_cycle() {
         }
       }
     }
+  }
+  for (int core_id = 0; core_id < _n_cores; core_id++) {
+      std::shared_ptr<Tile> finished_tile = _cores[core_id]->pop_finished_tile();
+      if (finished_tile->get_status() == Tile::Status::FINISH) {
+        get_partition_scheduler(core_id)->finish_tile(std::move(finished_tile));
+      }
     _cores[core_id]->cycle();
   }
   /* L2 cache */

From 5dc3d161adec82faef88223ec70488f9ee6ba2e2 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 15 May 2025 07:20:40 +0000
Subject: [PATCH 310/432] [Fix] recompute single batch conv spad size

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 4 ++++
 PyTorchSimFrontend/mlir/mlir_template.py      | 8 ++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 279a7112..b8378397 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -664,8 +664,10 @@ def render(self,
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
           x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_I_H * TILE_M, TILE_K)
+          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_K_H * TILE_K, TILE_N)
           y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N)
           x_spad_size = TILE_K_W * TILE_I_H * TILE_M * TILE_K
+          w_spad_size = TILE_K_W * TILE_K_H * TILE_K * TILE_N
           y_spad_size = TILE_O_H * TILE_M * TILE_N
           SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
@@ -680,8 +682,10 @@ def render(self,
           SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
           SUB_TILE_K = TILE_K
           x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H, TILE_K)
+          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_K_H * TILE_K, TILE_N)
           y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H  * TILE_M, TILE_N)
           x_spad_size = TILE_I_W * TILE_I_H * TILE_K
+          w_spad_size = TILE_K_W * TILE_K_H * TILE_K * TILE_N
           y_spad_size = TILE_O_H * TILE_M * TILE_N
           TOG_latency = O_W if TILE_M > O_W else TILE_M
         TOG_latency = 8 if TOG_latency < 8 else TOG_latency
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 285915d9..aa6f82bc 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -185,11 +185,11 @@ def search_mapping_space(self, mapping, idx, increment, stride, dilation, n_extr
 
         return mapping
 
-    def pseudo_auto_tune(self, mapping, stride, dilation, n_extra_node=0):
+    def pseudo_auto_tune(self, mapping, stride, dilation, O_H, O_W, n_extra_node=0):
         # pseudo auto-tune
-        if mapping[2] == 1:
+        if mapping[2] == 1 and not (O_H == 1):
             mapping = self.search_mapping_space(mapping, 2, 1, stride, dilation, n_extra_node=n_extra_node)
-        if mapping[3] == 1:
+        if mapping[3] == 1 and not (O_W == 1):
             mapping = self.search_mapping_space(mapping, 3, 1, stride, dilation, n_extra_node=n_extra_node)
         return mapping
 
@@ -227,7 +227,7 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
             raise RuntimeError("Cannot find a valid mapping")
 
         # FIXME: this should be implemented with auto-tuning
-        mapping = self.pseudo_auto_tune(mapping, stride, dilation, n_extra_node=n_extra_node)
+        mapping = self.pseudo_auto_tune(mapping, stride, dilation, O_H, O_W, n_extra_node=n_extra_node)
 
         return mapping
 

From b63c8f728a39c5b1f2917a676ce63fdb7a067afb Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 12 May 2025 08:32:42 +0000
Subject: [PATCH 311/432] [Frontend] Add boilerplate for sram allocation plan

---
 PyTorchSimFrontend/extension_config.py        | 24 ++++-
 .../mlir/mlir_codegen_backend.py              | 87 ++++++++++++++++++-
 PyTorchSimFrontend/mlir/mlir_common.py        |  2 -
 PyTorchSimFrontend/mlir/mlir_template.py      |  2 -
 4 files changed, 108 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 2f4444cb..0c1e8b6a 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -1,6 +1,7 @@
 import os
 import sys
 import tempfile
+import importlib
 
 # Hardware info config
 CONFIG_VECTOR_LANE = int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128))
@@ -53,4 +54,25 @@
 CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0))
 CONFIG_FORCE_TILE_M = int(os.environ.get("TORCHSIM_FORCE_TIME_M", default=sys.maxsize))
 CONFIG_FORCE_TILE_N = int(os.environ.get("TORCHSIM_FORCE_TIME_N", default=sys.maxsize))
-CONFIG_FORCE_TILE_K = int(os.environ.get("TORCHSIM_FORCE_TIME_K", default=sys.maxsize))
\ No newline at end of file
+CONFIG_FORCE_TILE_K = int(os.environ.get("TORCHSIM_FORCE_TIME_K", default=sys.maxsize))
+
+# SRAM Buffer allocation plan
+def load_plan_from_module(module_path):
+    if module_path is None:
+      return None
+
+    try:
+        spec = importlib.util.spec_from_file_location("plan_module", module_path)
+        if spec is None:
+            return None
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        if hasattr(module, 'plan'):
+            return module.plan
+        return None
+    except Exception as e:
+        print(f"[Warning] Failed to load SRAM buffer plan from module: {e}")
+        return None
+
+CONFIG_SRAM_BUFFER_PLAN_PATH = os.environ.get("SRAM_BUFFER_PLAN_PATH", default=None)
+CONFIG_SRAM_BUFFER_PLAN = load_plan_from_module(CONFIG_SRAM_BUFFER_PLAN_PATH)
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 1de637ec..0eac1b48 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -4,7 +4,8 @@
 import os
 import math
 import torch
-from torch._inductor.codegen import cpp, wrapper, common
+from torch._dynamo.utils import dynamo_timed
+from torch._inductor.codegen import cpp, wrapper, common, memory_planning
 from torch._inductor.virtualized import V, _ops as ops
 from torch._inductor.codecache import write_atomic, write
 from torch._inductor.utils import (
@@ -91,6 +92,7 @@ def write_header(self):
 
                 from torch import device, empty, empty_strided
                 from {extension_codecache.__name__} import CustomAsyncCompile
+                from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN
                 from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer
                 from torch._inductor.select_algorithm import extern_kernels
 
@@ -99,10 +101,91 @@ def write_header(self):
                 assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                 alloc_from_pool = torch.ops.inductor._alloc_from_pool
                 reinterpret_tensor = torch.ops.aten._reinterpret_tensor
-                async_compile = CustomAsyncCompile()
+                custom_async_compile = CustomAsyncCompile()
                 os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__
             """
         )
+        self.header.splice(
+            f"""
+            def sram_allocate(buffer):
+                pass
+
+            def sram_deallocate(buffer):
+                pass
+
+            def host2device_memcopy(buffer):
+                pass
+
+            def device2host_memcpy(buffer):
+                pass
+            """
+        )
+
+    def write_prefix(self):
+        self.prefix.splice(
+            """
+            def call(args):
+            """
+        )
+        with self.prefix.indent():
+            inp_len = len(V.graph.graph_inputs.keys())
+            if inp_len != 0:
+                lhs = f"{', '.join(V.graph.graph_inputs.keys())}{'' if inp_len != 1 else ','}"
+                self.prefix.writeline(f"{lhs} = args")
+                self.prefix.writeline("args.clear()")
+
+            self.codegen_inputs(self.prefix, V.graph.graph_inputs)
+            self.codegen_input_size_asserts()
+
+    @dynamo_timed
+    def generate(self, is_inference):
+        result = IndentedBuffer()
+        result.splice(self.header)
+
+        with contextlib.ExitStack() as stack:
+            stack.enter_context(self.wrapper_call.indent())
+            self.memory_plan()
+            for line in self.lines:
+                if isinstance(line, wrapper.MemoryPlanningLine):
+                    line.codegen(self.wrapper_call)
+                else:
+                    self.wrapper_call.writeline(line)
+
+            output_refs = self.get_output_refs()
+            self.mark_output_type()
+            self.generate_return(output_refs)
+
+        self.append_precomputed_sizes_to_prefix()
+        self.finalize_prefix()
+        result.splice(self.prefix)
+
+        with result.indent():
+            result.splice(self.wrapper_call)
+
+        self.generate_end(result)
+        self.add_benchmark_harness(result)
+        return result.getvaluewithlinemap()
+
+    def memory_plan(self):
+        self.lines = memory_planning.MemoryPlanner(self).plan(self.lines)
+
+    def memory_plan_reuse(self):
+        out_names = V.graph.get_output_names()
+
+        while (
+            self.lines
+            and isinstance(self.lines[-1], wrapper.MemoryPlanningLine)
+            # TODO: this seems legit, NullLine has no node
+            and self.lines[-1].node.name not in out_names  # type: ignore[attr-defined]
+        ):
+            # these lines will be pointless
+            self.lines.pop()
+
+        # codegen allocations in two passes
+        planning_state = wrapper.MemoryPlanningState()
+        for i in range(len(self.lines)):
+            if isinstance(self.lines[i], wrapper.MemoryPlanningLine):
+                self.lines[i] = self.lines[i].plan(planning_state)
 
 class ExtensionOverrides(common.OpOverrides):
     # Binary element wise operations
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 8cf079f1..2e5eee0e 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -613,8 +613,6 @@ def meta_kernel(self):
         wrapper = V.graph.wrapper_code
         _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
         wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')')
-        wrapper.add_import_once(f'\nfrom PyTorchSimFrontend.extension_codecache import CustomAsyncCompile')
-        wrapper.add_import_once(f'\ncustom_async_compile = CustomAsyncCompile()')
         # Dump loop and load/store information
         wrapper.add_import_once(f"arg_attributes = {arg_attributes}")
         return arg_attributes
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index aa6f82bc..4229a266 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -301,8 +301,6 @@ def meta_kernel(self):
                     if arg_attributes[idx][0] == name:
                         arg_attributes[idx][1] = attr
         wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')')
-        wrapper.add_import_once(f'\nfrom PyTorchSimFrontend.extension_codecache import CustomAsyncCompile')
-        wrapper.add_import_once(f'\ncustom_async_compile = CustomAsyncCompile()')
         # Dump loop and load/store information
         wrapper.add_import_once(f"loop_info = {self.loop_info}")
         wrapper.add_import_once(f"load_tile_info = {self.load_desc}")

From 408e09a73a97c1ba81e952047c49a828bda8865e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 13 May 2025 05:10:21 +0000
Subject: [PATCH 312/432] [Frontend+Simulator] Implement sram buffer allocation
 infra

---
 .../mlir/mlir_codegen_backend.py              | 74 ++++++++++++-------
 Simulator/simulator.py                        | 15 ++++
 2 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 0eac1b48..dd5ba709 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -11,6 +11,7 @@
 from torch._inductor.utils import (
     IndentedBuffer,
     is_welford_reduction,
+    sympy_product
 )
 from torch.utils._sympy.functions import ModularIndexing
 import PyTorchSimFrontend.extension_codecache as extension_codecache
@@ -92,7 +93,8 @@ def write_header(self):
 
                 from torch import device, empty, empty_strided
                 from {extension_codecache.__name__} import CustomAsyncCompile
-                from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN
+                from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_BACKENDSIM_EAGER_MODE
+                from Simulator.simulator import BackendSimulator
                 from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer
                 from torch._inductor.select_algorithm import extern_kernels
 
@@ -107,11 +109,27 @@ def write_header(self):
         )
         self.header.splice(
             f"""
-            def sram_allocate(buffer):
-                pass
-
-            def sram_deallocate(buffer):
-                pass
+            def sram_plan_prefix(buffer_name, buffer):
+                #if CONFIG_SRAM_BUFFER_PLAN is None:
+                #    return
+                #elif buffer_name not in CONFIG_SRAM_BUFFER_PLAN:
+                #    return
+                buffer_size = buffer.element_size() * buffer.untyped_storage().size()
+                start = buffer.data_ptr()
+                end = start + buffer_size
+                # print(f'Alloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})')
+                BackendSimulator.sram_alloc(buffer_name, [start, end])
+
+            def sram_plan_postfix(buffer_name, buffer):
+                #if CONFIG_SRAM_BUFFER_PLAN is None:
+                #    return
+                #elif buffer_name not in CONFIG_SRAM_BUFFER_PLAN:
+                #    return
+                buffer_size = buffer.element_size() * buffer.untyped_storage().size()
+                start = buffer.data_ptr()
+                end = start + buffer_size
+                # print(f'Dealloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})')
+                BackendSimulator.sram_dealloc(buffer_name, [start, end])
 
             def host2device_memcopy(buffer):
                 pass
@@ -136,6 +154,19 @@ def call(args):
 
             self.codegen_inputs(self.prefix, V.graph.graph_inputs)
             self.codegen_input_size_asserts()
+            self.codegen_sram_plan_prefix()
+
+    def codegen_sram_plan_prefix(self):
+        for name, buf in V.graph.graph_inputs.items():
+            if isinstance(buf, sympy.Expr):
+                continue
+            if sympy_product(buf.get_size()) == 0:
+                continue
+            self.prefix.writeline(f"sram_plan_prefix('{name}', {name})")
+
+    def codegen_sram_plan_postfix(self, outputs):
+        for name in outputs:
+            self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})")
 
     @dynamo_timed
     def generate(self, is_inference):
@@ -146,12 +177,22 @@ def generate(self, is_inference):
             stack.enter_context(self.wrapper_call.indent())
             self.memory_plan()
             for line in self.lines:
+                # Add buffer plan hook for dealloc
+                if isinstance(line, memory_planning.DeallocFromPoolLine):
+                    self.wrapper_call.writeline(f"sram_plan_postfix('{line.node.get_name()}', {line.node.get_name()})")
+                elif isinstance(line, str) and "del" in line:
+                    name = line.split(" ")[1]
+                    self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})")
+
                 if isinstance(line, wrapper.MemoryPlanningLine):
                     line.codegen(self.wrapper_call)
                 else:
                     self.wrapper_call.writeline(line)
-
+                # Add buffer plan hook for alloc
+                if isinstance(line, memory_planning.AllocFromPoolLine):
+                    self.wrapper_call.writeline(f"sram_plan_prefix('{line.node.get_name()}', {line.node.get_name()})")
             output_refs = self.get_output_refs()
+            self.codegen_sram_plan_postfix(output_refs)
             self.mark_output_type()
             self.generate_return(output_refs)
 
@@ -168,25 +209,6 @@ def generate(self, is_inference):
 
     def memory_plan(self):
         self.lines = memory_planning.MemoryPlanner(self).plan(self.lines)
-
-    def memory_plan_reuse(self):
-        out_names = V.graph.get_output_names()
-
-        while (
-            self.lines
-            and isinstance(self.lines[-1], wrapper.MemoryPlanningLine)
-            # TODO: this seems legit, NullLine has no node
-            and self.lines[-1].node.name not in out_names  # type: ignore[attr-defined]
-        ):
-            # these lines will be pointless
-            self.lines.pop()
-
-        # codegen allocations in two passes
-        planning_state = wrapper.MemoryPlanningState()
-        for i in range(len(self.lines)):
-            if isinstance(self.lines[i], wrapper.MemoryPlanningLine):
-                self.lines[i] = self.lines[i].plan(planning_state)
-
 class ExtensionOverrides(common.OpOverrides):
     # Binary element wise operations
     @staticmethod
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 04efb25a..5d233986 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -188,6 +188,7 @@ def show_progress():
 class BackendSimulator():
     BACKEND_RESULT_PATH_KEY = "BACKEND_RESULT_PATH"
     FINISH_STR = "Simulation Finished"
+    ALLOC_POOL = dict() # For eagermode buffer plan
     def __init__(self, backend_path, config_path, vectorlane_size=-1) -> None:
         self.base_dir = backend_path
         self.config_path = config_path
@@ -318,8 +319,18 @@ def quit(self):
         ret = self.send_command(command)
         return
 
+    @classmethod
+    def sram_alloc(cls, buf_name, addr_range):
+        cls.ALLOC_POOL[buf_name] = addr_range
+
+    @classmethod
+    def sram_dealloc(cls, buf_name, addr_range):
+        # Drop the entry if one exists; addr_range is not consulted.
+        cls.ALLOC_POOL.pop(buf_name, None)
+
     def create_attribute_file(self, attribute_path, inputs, **kwargs):
         address_info = {}
+        sram_buffer = {}
         json_content = {}
         os.makedirs(attribute_path, exist_ok=True)
         index = str(len(os.listdir(attribute_path)))
@@ -329,6 +340,10 @@ def create_attribute_file(self, attribute_path, inputs, **kwargs):
             address_info[f"arg{idx}"] = tensor.data_ptr()
         json_content["address_info"] = address_info
 
+        for buf_name, range in self.ALLOC_POOL.items():
+            sram_buffer[buf_name] = range
+        json_content["sram_alloc"] = sram_buffer
+
         with open(attribute_path, "w") as f:
             json.dump(json_content, f, indent=4)
         return attribute_path

From 4202eab9009991b3a163684b289b5b3c18799629 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 19 May 2025 08:18:04 +0000
Subject: [PATCH 313/432] [TOGSim] Support manual l2 sram allocation

---
 ...stolic_ws_128x128_c1_simple_noc_tpuv4.json |   4 +-
 ...stolic_ws_128x128_c2_simple_noc_tpuv4.json |   6 +-
 PyTorchSimBackend/include/Cache.h             |  96 ++++++
 PyTorchSimBackend/include/Cache_defs.h        |   7 +-
 PyTorchSimBackend/include/Dram.h              |   2 +-
 PyTorchSimBackend/include/Instruction.h       |   1 +
 PyTorchSimBackend/include/L2Cache.h           |  14 +-
 PyTorchSimBackend/include/Memfetch.h          |   3 +
 PyTorchSimBackend/include/SimulationConfig.h  |   2 +-
 PyTorchSimBackend/include/TileGraph.h         |   7 +
 PyTorchSimBackend/include/TileGraphParser.h   |   2 +
 PyTorchSimBackend/src/Cache.cc                | 307 ++++++++++++++++--
 PyTorchSimBackend/src/Common.cc               |   4 +-
 PyTorchSimBackend/src/Dram.cc                 |   6 +-
 PyTorchSimBackend/src/L2Cache.cc              |  87 +++--
 PyTorchSimBackend/src/TMA.cc                  |   7 +
 PyTorchSimBackend/src/TileGraph.cc            |   7 +-
 PyTorchSimBackend/src/TileGraphParser.cc      |  13 +
 18 files changed, 481 insertions(+), 94 deletions(-)

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
index 2594f734..ed007c2b 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
@@ -14,8 +14,8 @@
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
-  "l2d_type" : "readonly",
-  "l2d_config" : "S:128:128:512,32,L:R:m:L:L,A:192:4,32:0,32",
+  "l2d_type" : "datacache",
+  "l2d_config" : "S:128:128:512,32,L:T:m:L:L,A:192:4,32:0,32",
  
   "icnt_type" : "simple",
   "icnt_latency" : 7,
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
index 63b926d6..1b593e7f 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
@@ -14,9 +14,9 @@
   "dram_nbl" : 2,
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
-  "l2d_type" : "readonly",
-  "l2d_config" : "S:64:128:512,32,L:R:m:L:L,A:192:4,32:0,32",
-
+  "l2d_type" : "datacache",
+  "l2d_config" : "S:64:128:512,32,L:T:m:L:L,A:192:4,32:0,32",
+ 
   "icnt_type" : "simple",
   "icnt_latency" : 7,
   "icnt_freq" : 38400,
diff --git a/PyTorchSimBackend/include/Cache.h b/PyTorchSimBackend/include/Cache.h
index e09af658..1d5927ac 100644
--- a/PyTorchSimBackend/include/Cache.h
+++ b/PyTorchSimBackend/include/Cache.h
@@ -372,4 +372,100 @@ class ReadOnlyCache : public Cache {
                                     std::deque<CacheEvent> &event) override;
 };
 
+class DataCache : public Cache {
+ public:
+  DataCache(std::string name, CacheConfig &config, int core_id, int type_id,
+            std::queue<mem_fetch*> *to_mem_queue, bool is_l1 = false)
+      : Cache(name, config, core_id, type_id, to_mem_queue) {
+    init();
+    m_write_alloc_type = L2_CACHE_WA;
+    m_write_back_type = L2_CACHE_WB;
+  }
+  virtual void init();
+  virtual void print_cache_stats();
+  virtual CacheRequestStatus access(uint64_t addr, uint32_t time, mem_fetch *mf,
+                                    std::deque<CacheEvent> &event) override;
+ protected:
+  mem_access_type m_write_alloc_type;
+  mem_access_type m_write_back_type;
+  CacheRequestStatus process_tag_probe(bool wr, CacheRequestStatus status,
+                                       uint64_t addr, uint32_t cache_index,
+                                       mem_fetch *mf, uint32_t time,
+                                       std::deque<CacheEvent> &events);
+  // Functions for data cache access
+  /// Sends write request to lower level memory (write or writeback)
+  void send_write_request(mem_fetch *mf, CacheEvent request, uint32_t time,
+                          std::deque<CacheEvent> &events);
+  void write_back(EvictedBlockInfo &evicted, uint32_t time, std::deque<CacheEvent> &events);
+
+  CacheRequestStatus (DataCache::*m_wr_hit)(uint64_t addr, uint32_t cache_index,
+                                            mem_fetch *mf, uint32_t time,
+                                            std::deque<CacheEvent> &event,
+                                            CacheRequestStatus status);
+  CacheRequestStatus (DataCache::*m_wr_miss)(uint64_t addr,
+                                             uint32_t cache_index,
+                                             mem_fetch *mf, uint32_t time,
+                                             std::deque<CacheEvent> &event,
+                                             CacheRequestStatus status);
+  CacheRequestStatus (DataCache::*m_rd_hit)(uint64_t addr, uint32_t cache_index,
+                                            mem_fetch *mf, uint32_t time,
+                                            std::deque<CacheEvent> &event,
+                                            CacheRequestStatus status);
+  CacheRequestStatus (DataCache::*m_rd_miss)(uint64_t addr,
+                                             uint32_t cache_index,
+                                             mem_fetch *mf, uint32_t time,
+                                             std::deque<CacheEvent> &event,
+                                             CacheRequestStatus status);
+
+  // Handlers that the member-function pointers above can be bound to
+  // Write hit
+  CacheRequestStatus wr_hit_wb(
+      uint64_t addr, uint32_t cache_index, mem_fetch *mf, uint32_t time,
+      std::deque<CacheEvent> &event,
+      CacheRequestStatus status);  // write hit with write back
+  CacheRequestStatus wr_hit_wt(
+      uint64_t addr, uint32_t cache_index, mem_fetch *mf, uint32_t time,
+      std::deque<CacheEvent> &event,
+      CacheRequestStatus status);  // write hit with write through
+  CacheRequestStatus wr_hit_we(
+      uint64_t addr, uint32_t cache_index, mem_fetch *mf, uint32_t time,
+      std::deque<CacheEvent> &event,
+      CacheRequestStatus status);  // write hit with write evict
+  CacheRequestStatus wr_hit_global_we_local_wb(
+      uint64_t addr, uint32_t cache_index, mem_fetch *mf, uint32_t time,
+      std::deque<CacheEvent> &event,
+      CacheRequestStatus status);  // write hit with write evict for global and
+                                   // write back for local
+  // Write miss
+  CacheRequestStatus wr_miss_wa_naive(
+      uint64_t addr, uint32_t cache_index, mem_fetch *mf, uint32_t time,
+      std::deque<CacheEvent> &event,
+      CacheRequestStatus status);  // write allocate: send write and read requests
+  CacheRequestStatus wr_miss_wa_lazy_fetch_on_read(
+      uint64_t addr, uint32_t cache_index, mem_fetch *mf, uint32_t time,
+      std::deque<CacheEvent> &event,
+      CacheRequestStatus status);  // write allocate with read-fetch-only
+  CacheRequestStatus wr_miss_wa_write_validate(
+      uint64_t addr, uint32_t cache_index, mem_fetch *mf, uint32_t time,
+      std::deque<CacheEvent> &event,
+      CacheRequestStatus
+          status);  // write-allocate that writes with no read fetch
+  CacheRequestStatus wr_miss_no_wa(
+      uint64_t addr, uint32_t cache_index, mem_fetch *mf, uint32_t time,
+      std::deque<CacheEvent> &event,
+      CacheRequestStatus status);  // no write allocate
+
+  // Read hit
+  CacheRequestStatus rd_hit_base(uint64_t addr, uint32_t cache_index,
+                                 mem_fetch *mf, uint32_t time,
+                                 std::deque<CacheEvent> &event,
+                                 CacheRequestStatus status);  // read hit base
+
+  // Read miss
+  CacheRequestStatus rd_miss_base(uint64_t addr, uint32_t cache_index,
+                                  mem_fetch *mf, uint32_t time,
+                                  std::deque<CacheEvent> &event,
+                                  CacheRequestStatus status);  // read miss base
+};
+
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/Cache_defs.h b/PyTorchSimBackend/include/Cache_defs.h
index 8cb75fce..af5035fc 100644
--- a/PyTorchSimBackend/include/Cache_defs.h
+++ b/PyTorchSimBackend/include/Cache_defs.h
@@ -8,7 +8,12 @@
 
 const int SECTOR_CHUNCK_SIZE = 4;
 typedef std::bitset<SECTOR_CHUNCK_SIZE> SectorMask;
-enum CacheBlockState { INVALID, RESERVED, VALID, MODIFIED };
+enum CacheBlockState {
+  INVALID,  // Initial state
+  RESERVED, // Reserved state (alloc())
+  VALID,    // Filled state (fill)
+  MODIFIED  // Filled with modified data (fill)
+};
 enum CacheRequestStatus {
   HIT,
   HIT_RESERVED,
diff --git a/PyTorchSimBackend/include/Dram.h b/PyTorchSimBackend/include/Dram.h
index 96c2397c..0b129aef 100644
--- a/PyTorchSimBackend/include/Dram.h
+++ b/PyTorchSimBackend/include/Dram.h
@@ -43,7 +43,7 @@ class Dram {
   std::vector<std::queue<mem_fetch*>> m_from_crossbar_queue;
   std::vector<std::queue<mem_fetch*>> m_to_crossbar_queue;
   std::vector<std::queue<mem_fetch*>> m_to_mem_queue;
-  std::vector<L2Cache*> _m_caches;
+  std::vector<L2CacheBase*> _m_caches;
 };
 
 class DramRamulator2 : public Dram {
diff --git a/PyTorchSimBackend/include/Instruction.h b/PyTorchSimBackend/include/Instruction.h
index e0e904af..6544c930 100644
--- a/PyTorchSimBackend/include/Instruction.h
+++ b/PyTorchSimBackend/include/Instruction.h
@@ -60,6 +60,7 @@ class Instruction {
     int offset = std::inner_product(_idx_list.begin(), _idx_list.end(), _stride_list.begin(), 0);
     dram_addr += offset * _precision;
   }
+  addr_type get_base_dram_address() { return dram_addr; }
   void set_free_sram_size(size_t sram_size) { _free_sram_size=sram_size; }
   void* get_owner() { return _owner; }
   void set_owner(void *owner) { _owner = owner;}
diff --git a/PyTorchSimBackend/include/L2Cache.h b/PyTorchSimBackend/include/L2Cache.h
index 70977844..e822e6be 100644
--- a/PyTorchSimBackend/include/L2Cache.h
+++ b/PyTorchSimBackend/include/L2Cache.h
@@ -3,10 +3,11 @@
 #include "Memfetch.h"
 #include "Cache.h"
 #include "Instruction.h"
+#include "IntervalTree.h"
 
-class L2Cache {
+class L2CacheBase {
 public:
-  L2Cache(std::string name, CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
+  L2CacheBase(std::string name, CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
     uint32_t l2d_hit_latency, std::queue<mem_fetch*> *to_xbar_queue,
     std::queue<mem_fetch*> *from_xbar_queue) : 
     l_name(name), l_cache_config(cache_config), l_id(id), l_core_cycle(core_cycle),
@@ -33,18 +34,19 @@ class L2Cache {
   std::unique_ptr<Cache> l_cache;
 };
 
-class NoL2Cache : public L2Cache {
+class NoL2Cache : public L2CacheBase {
 public:
   NoL2Cache(std::string name,  CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
     std::queue<mem_fetch*> *to_xbar_queue, std::queue<mem_fetch*> *from_xbar_queue) : 
-    L2Cache(name, cache_config, id, core_cycle, 0, to_xbar_queue, from_xbar_queue) {}
+    L2CacheBase(name, cache_config, id, core_cycle, 0, to_xbar_queue, from_xbar_queue) {}
   void cycle() override;
   bool push(mem_fetch* req) override;  // Push memory response from DRAM
 };
 
-class ReadOnlyL2Cache : public L2Cache {
+class L2DataCache : public L2CacheBase {
 public:
-  ReadOnlyL2Cache(std::string name,  CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
+  typedef IntervalTree<new_addr_type, int> CachePlan;
+  L2DataCache(std::string name,  CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
     uint32_t l2d_hit_latency, std::queue<mem_fetch*> *to_xbar_queue,
     std::queue<mem_fetch*> *from_xbar_queue);
   void cycle() override;
diff --git a/PyTorchSimBackend/include/Memfetch.h b/PyTorchSimBackend/include/Memfetch.h
index 5eb659cf..128d81b2 100644
--- a/PyTorchSimBackend/include/Memfetch.h
+++ b/PyTorchSimBackend/include/Memfetch.h
@@ -59,6 +59,8 @@ class mem_fetch {
   mem_fetch* get_original_mf() { return m_original_mf; }
   bool is_atomic() { return false; }
   bool is_request() { return m_type == mf_type::READ_REQUEST || m_type == mf_type::WRITE_REQUEST; }
+  void set_cacheable(bool cacheable) { m_cacheable = cacheable; }
+  bool is_cacheable() { return m_cacheable; }
   void set_reply() {
     if (m_type == mf_type::READ_REQUEST)
       m_type = mf_type::READ_REPLY;
@@ -91,6 +93,7 @@ class mem_fetch {
   mem_fetch* m_original_mf;
   void* m_custom_data = NULL;
   uint64_t m_start_cycle = 0ULL;
+  bool m_cacheable = true;
 };
 
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h
index b647b3f9..8f011d00 100644
--- a/PyTorchSimBackend/include/SimulationConfig.h
+++ b/PyTorchSimBackend/include/SimulationConfig.h
@@ -11,7 +11,7 @@ enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 };
 
 enum class IcntType { SIMPLE, BOOKSIM2 };
 
-enum class L2CacheType { NOCACHE, READONLY };
+enum class L2CacheType { NOCACHE, DATACACHE };
 
 struct SimulationConfig {
   /* Core config */
diff --git a/PyTorchSimBackend/include/TileGraph.h b/PyTorchSimBackend/include/TileGraph.h
index 6bd281d5..770bee06 100644
--- a/PyTorchSimBackend/include/TileGraph.h
+++ b/PyTorchSimBackend/include/TileGraph.h
@@ -5,6 +5,7 @@
 #include <queue>
 #include <set>
 #include "Tile.h"
+#include "IntervalTree.h"
 
 class TileSubGraph {
  public:
@@ -17,6 +18,8 @@ class TileSubGraph {
   int get_id() { return _id; }
   void set_core_id(int core_id) { _core_id = core_id; }
   int get_core_id() { return _core_id; }
+  void set_owner(void* owner) { _owner = owner; }
+  void* get_owner() { return _owner; }
   struct CompareReadyTile {
     bool operator()(const std::shared_ptr<Tile>& a, const std::shared_ptr<Tile>& b) const {
       return a->get_required_sram_size() > b->get_required_sram_size();
@@ -29,6 +32,7 @@ class TileSubGraph {
   int _id;
   int _core_id = -1;
   static int _next_id;
+  void* _owner=NULL;
 };
 
 class TileGraph {
@@ -63,6 +67,8 @@ class TileGraph {
   std::string get_name() { return _name; }
   void set_arrival_time(cycle_type arrival_time) { _arrival_time = arrival_time; }
   cycle_type get_arrival_time() { return _arrival_time; }
+  void init_cache_plan(IntervalTree<unsigned long long, int>::interval_vector it) { _cache_plan = IntervalTree<unsigned long long, int>(std::move(it)); }
+  bool is_cacheable(unsigned long long start, unsigned long long end);
   bool StonneGraph = false;
 
   class Iterator {
@@ -128,6 +134,7 @@ class TileGraph {
   std::vector<std::shared_ptr<TileSubGraph>> _subgraph_vec;
   std::vector<std::shared_ptr<TileSubGraph>> _finished_subgraph_vec;
   std::map<int, std::map<int, std::shared_ptr<TileSubGraph>>> _cpu_graph_map;
+  IntervalTree<unsigned long long, int> _cache_plan;
   cycle_type _arrival_time;
   static std::shared_ptr<Tile> null_tile;
 };
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index 16393325..b5322b76 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -8,6 +8,7 @@
 #include "TileGraph.h"
 #include "Instruction.h"
 #include "sstStonne.h"
+#include "IntervalTree.h"
 #include "onnx/defs/schema.h"
 #include "onnx/onnx-operators_pb.h"
 #include "onnx/onnx_pb.h"
@@ -146,6 +147,7 @@ class TileGraphParser {
   std::unique_ptr<TileGraph> _tile_graph;
   std::map<std::string, addr_type> _arg_to_address;
   std::map<std::string, std::vector<uint32_t>> _arg_numa_stride;
+  std::vector<Interval<unsigned long long, int>> _cache_plan;
   std::map<std::string, std::tuple<int, int, LoopType>> _loop_size_map;
   std::map<std::string, std::string> _tog_meta;
   std::map<std::pair<std::string, std::vector<int>>, uint32_t> _tag_table;
diff --git a/PyTorchSimBackend/src/Cache.cc b/PyTorchSimBackend/src/Cache.cc
index b7423424..8bbab112 100644
--- a/PyTorchSimBackend/src/Cache.cc
+++ b/PyTorchSimBackend/src/Cache.cc
@@ -112,7 +112,9 @@ void LineCacheBlock::fill(uint32_t time, SectorMask) {
 
 SectorMask LineCacheBlock::get_dirty_mask() {
   SectorMask dirty_mask;
-  dirty_mask.set();
+  dirty_mask.reset();
+  if (m_status == MODIFIED)
+    dirty_mask.set();
   return dirty_mask;
 }
 
@@ -126,13 +128,8 @@ void SectorCacheBlock::allocate(uint64_t tag, uint64_t block_addr,
   uint32_t sidx = get_sector_index(sector_mask);
   m_sector_alloc_time[sidx] = time;
   m_sector_last_access_time[sidx] = time;
-  m_sector_fill_time[sidx] = 0;
-  m_status[sidx] = RESERVED;
-  m_ignore_on_fill_status[sidx] = false;
-  m_set_modified_on_fill_status[sidx] = false;
   m_line_alloc_time = time;
   m_line_last_access_time = time;
-  m_line_fill_time = 0;
 }
 
 void SectorCacheBlock::allocate_sector(uint32_t time, SectorMask sector_mask) {
@@ -140,16 +137,11 @@ void SectorCacheBlock::allocate_sector(uint32_t time, SectorMask sector_mask) {
   uint32_t sidx = get_sector_index(sector_mask);
   m_sector_alloc_time[sidx] = time;
   m_sector_last_access_time[sidx] = time;
-  m_sector_fill_time[sidx] = 0;
-  if (m_status[sidx] == MODIFIED)
-    m_set_modified_on_fill_status[sidx] = true;
-  else
-    m_set_modified_on_fill_status[sidx] = false;
+  m_line_last_access_time = time;
+  m_set_modified_on_fill_status[sidx] = m_status[sidx] == MODIFIED ? true : false; 
   m_status[sidx] = RESERVED;
   m_ignore_on_fill_status[sidx] = false;
   m_readable[sidx] = true;
-  m_line_last_access_time = time;
-  m_line_fill_time = 0;
 }
 
 void SectorCacheBlock::fill(uint32_t time, SectorMask sector_mask) {
@@ -158,6 +150,7 @@ void SectorCacheBlock::fill(uint32_t time, SectorMask sector_mask) {
   m_sector_fill_time[sidx] = time;
   m_line_fill_time = time;
 }
+
 bool SectorCacheBlock::is_valid_line() { return !(is_invalid_line()); }
 
 bool SectorCacheBlock::is_invalid_line() {
@@ -300,47 +293,51 @@ CacheRequestStatus TagArray::probe(uint64_t addr, uint32_t &idx,
   for (uint32_t way = 0; way < m_config.get_num_assoc(); way++) {
     uint32_t index = set_index * m_config.get_num_assoc() + way;
     CacheBlock *line = m_lines[index];
-    if (line->match_tag(tag)) {  // tag matched
+
+    // Handle tag matched case
+    if (line->match_tag(tag)) {
+      idx = index;
       if (line->get_status(mask) == RESERVED) {
-        idx = index;
         return HIT_RESERVED;
       } else if (line->get_status(mask) == VALID ||
                  (line->get_status(mask) == MODIFIED &&
                   line->is_readable(mask))) {
-        idx = index;
         return HIT;
       } else if ((line->get_status(mask) == MODIFIED &&
                   !line->is_readable(mask)) ||
                  (line->is_valid_line() && line->get_status(mask) == INVALID)) {
-        idx = index;
         return SECTOR_MISS;
       } else {
         assert(line->get_status(mask) == INVALID);
       }
-    }
-    if (!line->is_reserved_line()) {
+    } else if (!line->is_reserved_line()) {
       all_reserved = false;
       if (line->is_invalid_line()) {
         invalid_line = index;
-      } else {
-        if (m_config.get_evict_policy() == LRU) {
-          if (line->get_last_access_time() < valid_timestamp) {
-            valid_timestamp = line->get_last_access_time();
-            valid_line = index;
-          }
-        } else if (m_config.get_evict_policy() == FIFO) {
-          if (line->get_alloc_time() < valid_timestamp) {
-            valid_timestamp = line->get_alloc_time();
-            valid_line = index;
-          }
+        continue;
+      }
+
+      // Choose cacheline for eviction
+      if (m_config.get_evict_policy() == LRU) {
+        if (line->get_last_access_time() < valid_timestamp) {
+          valid_timestamp = line->get_last_access_time();
+          valid_line = index;
+        }
+      } else if (m_config.get_evict_policy() == FIFO) {
+        if (line->get_alloc_time() < valid_timestamp) {
+          valid_timestamp = line->get_alloc_time();
+          valid_line = index;
         }
       }
     }
   }
+
+  // All target cachelines are reserved
   if (all_reserved) {
     assert(m_config.get_alloc_policy() == ON_MISS);
     return RESERVATION_FAIL;
   }
+
   if (invalid_line != (uint32_t)-1) {
     idx = invalid_line;
   } else if (valid_line != (uint32_t)-1) {
@@ -710,3 +707,253 @@ CacheRequestStatus ReadOnlyCache::access(uint64_t addr, uint32_t time,
   m_bandwidth_management.use_data_port(mf, cache_status, events);
   return cache_status;
 }
+
+/* Data Cache */
+void DataCache::init() {
+  m_rd_hit = &DataCache::rd_hit_base;
+  m_rd_miss = &DataCache::rd_miss_base;
+  switch (m_config.get_write_policy()) {
+    case READ_ONLY:
+      assert(0);  // Data cache cannot be read only (with NDEBUG this falls through to WRITE_BACK)
+    case WRITE_BACK:
+      m_wr_hit = &DataCache::wr_hit_wb;
+      break;
+    case WRITE_THROUGH:
+      m_wr_hit = &DataCache::wr_hit_wt;
+      break;
+    case WRITE_EVICT:
+      m_wr_hit = &DataCache::wr_hit_we;
+      break;
+    default:
+      assert(0);
+  }
+  switch (m_config.get_write_alloc_policy()) {
+    case NO_WRITE_ALLOCATE:
+      m_wr_miss = &DataCache::wr_miss_no_wa;
+      break;
+    case WRITE_ALLOCATE:
+      m_wr_miss = &DataCache::wr_miss_wa_naive;
+      break;
+    default:
+      assert(0);
+  }
+}
+
+void DataCache::print_cache_stats() {  // Log the interval hit/miss counters and the derived hit ratio
+  uint64_t hit = m_stats.get_interval_hit();
+  uint64_t miss = m_stats.get_interval_miss();
+  // Report 0% instead of NaN when the interval recorded no hit or miss.
+  float ratio = (hit + miss) ? ((float)hit) / (hit + miss) * 100 : 0.0f;
+  if (m_id == 0) {  // only the instance with id 0 logs at info level; the others log at debug
+    spdlog::info("NDP {:2}: average Data Cache Hit : {}, Miss : {} , Hit Raito : {:.2f}%", m_id, hit, miss, ratio);
+  } else {
+    spdlog::debug("NDP {:2}: average Data Cache Hit : {}, Miss : {} , Hit Raito : {:.2f}%", m_id, hit, miss, ratio);
+  }
+}
+
+CacheRequestStatus DataCache::access(uint64_t addr, uint32_t time,
+                                     mem_fetch *mf,
+                                     std::deque<CacheEvent> &events) {
+  bool wr = mf->is_write();
+  uint64_t block_addr = m_config.get_block_addr(addr);
+  uint32_t cache_index = (uint32_t)-1;
+  CacheRequestStatus probe_status =
+      m_tag_array->probe(block_addr, cache_index, mf, true);
+  CacheRequestStatus access_status =
+      process_tag_probe(wr, probe_status, addr, cache_index, mf, time, events);
+  m_stats.inc_stats(mf->get_access_type(),
+                    m_stats.select_stats_status(probe_status, access_status));
+  return access_status;
+}
+
+CacheRequestStatus DataCache::process_tag_probe(bool wr,
+                                                CacheRequestStatus probe_status,
+                                                uint64_t addr,
+                                                uint32_t cache_index,
+                                                mem_fetch *mf, uint32_t time,
+                                                std::deque<CacheEvent> &events) {
+  CacheRequestStatus access_status = probe_status;
+  if (wr) {  // Write
+    if (probe_status == HIT) {
+      access_status =
+          (this->*m_wr_hit)(addr, cache_index, mf, time, events, probe_status);
+    } else if (probe_status != RESERVATION_FAIL ||
+               (probe_status == RESERVATION_FAIL &&
+                m_config.get_write_alloc_policy() == NO_WRITE_ALLOCATE)) {
+      access_status =
+          (this->*m_wr_miss)(addr, cache_index, mf, time, events, probe_status);
+    } else {
+      m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL);
+    }
+  } else {  // Read
+    if (probe_status == HIT) {
+      access_status =
+          (this->*m_rd_hit)(addr, cache_index, mf, time, events, probe_status);
+    } else if (probe_status != RESERVATION_FAIL) {
+      access_status =
+          (this->*m_rd_miss)(addr, cache_index, mf, time, events, probe_status);
+    } else {
+      m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL);
+    }
+  }
+  m_bandwidth_management.use_data_port(mf, access_status, events);
+  return access_status;
+}
+
+void DataCache::send_write_request(mem_fetch *mf, CacheEvent request,
+                                   uint32_t time,
+                                   std::deque<CacheEvent> &events) {
+  events.push_back(request);
+  m_miss_queue.push_back(mf);
+}
+
+void DataCache::write_back(EvictedBlockInfo &evicted, uint32_t time, std::deque<CacheEvent> &events) {
+  auto packet_size = m_config.get_atom_size();
+  for(int i = 0; i < evicted.m_modified_size / packet_size; i++) {
+    uint64_t evicted_addr = evicted.m_block_addr + i * packet_size;
+    mem_fetch *wb_mf =
+        new mem_fetch(evicted_addr, m_write_back_type, WRITE_REQUEST,
+                      packet_size);
+    wb_mf->set_dirty_mask(evicted.m_dirty_mask);
+    send_write_request(wb_mf, CacheEvent(WRITE_BACK_REQUEST_SENT, evicted),
+                       time, events);
+  }
+}
+
+/*** WRITE-hit functions (Set by config file) ***/
+// Write hit: Write back
+CacheRequestStatus DataCache::wr_hit_wb(uint64_t addr, uint32_t cache_index,
+                                        mem_fetch *mf, uint32_t time,
+                                        std::deque<CacheEvent> &events,
+                                        CacheRequestStatus status) {
+  uint64_t block_addr = m_config.get_block_addr(addr);
+  m_tag_array->access(block_addr, time, cache_index, mf);
+  CacheBlock *block = m_tag_array->get_block(cache_index);
+  block->set_status(MODIFIED, mf->get_access_sector_mask());
+  return HIT;
+}
+
+// Write hit: Write through
+CacheRequestStatus DataCache::wr_hit_wt(uint64_t addr, uint32_t cache_index,
+                                        mem_fetch *mf, uint32_t time,
+                                        std::deque<CacheEvent> &events,
+                                        CacheRequestStatus status) {
+  if (miss_queue_full(0)) {
+    m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL);
+    return RESERVATION_FAIL;
+  }
+  uint64_t block_addr = m_config.get_block_addr(addr);
+  m_tag_array->access(block_addr, time, cache_index, mf);
+  CacheBlock *block = m_tag_array->get_block(cache_index);
+  block->set_status(MODIFIED, mf->get_access_sector_mask());
+
+  // Generate a write-through
+  send_write_request(mf, CacheEvent(WRITE_REQUEST_SENT), time, events);
+  return HIT;
+}
+
+// Write hit: Write evict
+CacheRequestStatus DataCache::wr_hit_we(uint64_t addr, uint32_t cache_index,
+                                        mem_fetch *mf, uint32_t time,
+                                        std::deque<CacheEvent> &events,
+                                        CacheRequestStatus status) {
+  if (miss_queue_full(0)) {
+    m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL);
+    return RESERVATION_FAIL;
+  }
+  CacheBlock *block = m_tag_array->get_block(cache_index);
+  send_write_request(mf, CacheEvent(WRITE_REQUEST_SENT), time, events);
+  block->set_status(INVALID, mf->get_access_sector_mask());
+  return HIT;
+}
+
+/*** WRITE-miss functions (Set by config file) ***/
+// Write miss: Write allocate naive
+CacheRequestStatus DataCache::wr_miss_wa_naive(uint64_t addr,
+                                               uint32_t cache_index,
+                                               mem_fetch *mf, uint32_t time,
+                                               std::deque<CacheEvent> &events,
+                                               CacheRequestStatus status) {
+  uint64_t block_addr = m_config.get_block_addr(addr);
+  uint64_t mshr_addr = m_config.get_mshr_addr(addr);
+  bool mshr_hit = m_mshrs->probe(mshr_addr);
+  bool mshr_avail = !m_mshrs->full(mshr_addr);
+  if (miss_queue_full(2)) {
+    m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL);
+    return RESERVATION_FAIL;
+  } else if (mshr_hit && !mshr_avail) {
+    m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENTRY_FAIL);
+    return RESERVATION_FAIL;
+  } else if (!mshr_hit && !mshr_avail) {
+    m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENTRY_FAIL);
+    return RESERVATION_FAIL;
+  }
+  send_write_request(mf, CacheEvent(WRITE_REQUEST_SENT), time, events);
+  mem_fetch *new_mf = new mem_fetch(
+      mf->get_addr(), m_write_alloc_type, READ_REQUEST, m_config.get_atom_size());
+  bool do_miss = false;
+  bool wb = false;
+  EvictedBlockInfo evicted;
+
+  // Send read request resulting from write miss
+  send_read_request(addr, block_addr, cache_index, new_mf, time, do_miss, wb,
+                    evicted, events, false, true);
+  if (do_miss) {
+    if (wb && (m_config.get_write_policy() != WRITE_THROUGH)) {
+      assert(status == MISS);
+      write_back(evicted, time, events);
+    }
+    return MISS;
+  }
+  return RESERVATION_FAIL;
+}
+
+// Write miss: No write allocate (the write is only queued via send_write_request)
+CacheRequestStatus DataCache::wr_miss_no_wa(uint64_t addr, uint32_t cache_index,
+                                            mem_fetch *mf, uint32_t time,
+                                            std::deque<CacheEvent> &events,
+                                            CacheRequestStatus status) {
+  if (miss_queue_full(0)) {
+    m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL);
+    return RESERVATION_FAIL;
+  }
+  send_write_request(mf, CacheEvent(WRITE_REQUEST_SENT), time, events);
+  return MISS;
+}
+
+CacheRequestStatus DataCache::rd_hit_base(uint64_t addr, uint32_t cache_index,
+                                          mem_fetch *mf, uint32_t time,
+                                          std::deque<CacheEvent> &events,
+                                          CacheRequestStatus status) {
+  uint64_t block_addr = m_config.get_block_addr(addr);
+  m_tag_array->access(block_addr, time, cache_index, mf);
+  if (mf->is_atomic()) {
+    CacheBlock *block = m_tag_array->get_block(cache_index);
+    block->set_status(MODIFIED, mf->get_access_sector_mask());
+  }
+  return HIT;
+}
+
+CacheRequestStatus DataCache::rd_miss_base(uint64_t addr, uint32_t cache_index,
+                                           mem_fetch *mf, uint32_t time,
+                                           std::deque<CacheEvent> &events,
+                                           CacheRequestStatus status) {
+  if (miss_queue_full(1)) {
+    mf->current_state = "MISS_QUEUE_FULL";
+    m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL);
+    return RESERVATION_FAIL;
+  }
+  uint64_t block_addr = m_config.get_block_addr(addr);
+  bool do_miss = false;
+  bool wb = false;
+  EvictedBlockInfo evicted;
+  send_read_request(addr, block_addr, cache_index, mf, time, do_miss, wb,
+                    evicted, events, false, true);
+  if (do_miss) {
+    if (wb && (m_config.get_write_policy() != WRITE_THROUGH)) {
+      write_back(evicted, time, events);
+    }
+    return MISS;
+  }
+  return RESERVATION_FAIL;
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc
index e160a83b..5581f8bd 100644
--- a/PyTorchSimBackend/src/Common.cc
+++ b/PyTorchSimBackend/src/Common.cc
@@ -82,8 +82,8 @@ SimulationConfig initialize_config(json config) {
   if (config.contains("l2d_type")) {
     if ((std::string)config["l2d_type"] == "nocache")
       parsed_config.l2d_type = L2CacheType::NOCACHE;
-    else if ((std::string)config["l2d_type"] == "readonly")
-      parsed_config.l2d_type = L2CacheType::READONLY;
+    else if ((std::string)config["l2d_type"] == "datacache")
+      parsed_config.l2d_type = L2CacheType::DATACACHE;
     else
       throw std::runtime_error(fmt::format("Not implemented l2 cache type {} ",
                                           (std::string)config["l2d_type"]));
diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc
index 62dd0ca1..944068cb 100644
--- a/PyTorchSimBackend/src/Dram.cc
+++ b/PyTorchSimBackend/src/Dram.cc
@@ -34,15 +34,15 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) {
     spdlog::info("[Config/L2Cache] No L2 cache");
     for (int ch = 0; ch < _n_ch; ch++)
       _m_caches[ch] = new NoL2Cache(name, _m_cache_config, ch, _core_cycles, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]);
-  } else if (config.l2d_type == L2CacheType::READONLY) {
-    std::string name = "L2 ReadOnly cache";
+  } else if (config.l2d_type == L2CacheType::DATACACHE) {
+    std::string name = "L2 cache";
     _m_cache_config.init(config.l2d_config_str);
     spdlog::info("[Config/L2Cache] Total Size: {} KB, Partition Size: {} KB, Set: {}, Assoc: {}, Line Size: {}B Sector Size: {}B",
             _m_cache_config.get_total_size_in_kb() * _n_ch, _m_cache_config.get_total_size_in_kb(),
             _m_cache_config.get_num_sets(), _m_cache_config.get_num_assoc(),
             _m_cache_config.get_line_size(), _m_cache_config.get_sector_size());
     for (int ch = 0; ch < _n_ch; ch++)
-      _m_caches[ch] = new ReadOnlyL2Cache(name, _m_cache_config, ch, _core_cycles, _config.l2d_hit_latency, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]);
+      _m_caches[ch] = new L2DataCache(name, _m_cache_config, ch, _core_cycles, _config.l2d_hit_latency, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]);
   } else {
     spdlog::error("[Config/L2D] Invalid L2 cache type...!");
     exit(EXIT_FAILURE);
diff --git a/PyTorchSimBackend/src/L2Cache.cc b/PyTorchSimBackend/src/L2Cache.cc
index 4fa80efe..db8a5921 100644
--- a/PyTorchSimBackend/src/L2Cache.cc
+++ b/PyTorchSimBackend/src/L2Cache.cc
@@ -12,79 +12,78 @@ void NoL2Cache::cycle() {
   }
 }
 
-ReadOnlyL2Cache::ReadOnlyL2Cache(std::string name,  CacheConfig &cache_config, uint32_t id,
+L2DataCache::L2DataCache(std::string name,  CacheConfig &cache_config, uint32_t id,
   cycle_type *core_cycle, uint32_t l2d_hit_latency,
   std::queue<mem_fetch*> *to_xbar_queue, std::queue<mem_fetch*> *from_xbar_queue) :
-  L2Cache(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue) {
+  L2CacheBase(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue) {
   l_cache = std::make_unique<ReadOnlyCache>(name, cache_config, id, 0, &l_to_mem_queue);
   l_from_cache_queue = DelayQueue<mem_fetch*>(l_name + "_latency_queue", true, 0);
 }
 
-bool ReadOnlyL2Cache::push(mem_fetch* req) {
+bool L2DataCache::push(mem_fetch* req) {
   if (l_cache->waiting_for_fill(req)) {
     if (!l_cache->fill_port_free())
       return false;
     l_cache->fill(req, *l_core_cycle);
   } else {
-    if (req->get_access_type() == L2_CACHE_WB && req->get_type() == WRITE_ACK) {
-      delete req;
-    } else if (req->get_access_type() == GLOBAL_ACC_W && req->get_type() == WRITE_ACK) {
-      l_to_xbar_queue->push(req);
-    }
+    l_to_xbar_queue->push(req);
   }
   return true;
 }
 
-void ReadOnlyL2Cache::cycle() {
+void L2DataCache::cycle() {
   l_from_cache_queue.cycle();
   l_cache->cycle();
 
-  // Mem to Cache. Read Only cache
+  // Mem to Cache
   uint32_t line_size = l_cache_config.get_line_size();
   uint32_t sector_size = l_cache_config.get_sector_size();
 
-  /* Read request*/
-  if (!l_from_xbar_queue->empty() && !l_from_xbar_queue->front()->is_write() &&
-        l_cache->data_port_free()) {
+  /* Pass a request to cache */
+  if (!l_from_xbar_queue->empty()) {
     mem_fetch* req = l_from_xbar_queue->front();
-    req->set_access_sector_mask(line_size, sector_size);
-    std::deque<CacheEvent> events;
-    CacheRequestStatus status = l_cache->access(
-        req->get_addr(), *l_core_cycle, req, events);
-    bool write_sent = CacheEvent::was_write_sent(events);
-    bool read_sent = CacheEvent::was_read_sent(events);
-    if (status == HIT) {
-      if (!write_sent) {
-        req->set_reply();
-        l_from_cache_queue.push(req, l2d_hit_latency);
-      }
-      l_from_xbar_queue->pop();
-    } else if (status != RESERVATION_FAIL) {
-      if (req->is_write() && // FIXME: req->is_write() already checked above 48 line.
-          (l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
-            l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
-        req->set_reply();
-        l_from_cache_queue.push(req, l2d_hit_latency);
+    /* Check cache plan */
+    bool is_cacheable = req->is_cacheable();
+
+    /* Go to l2 cache */
+    if (is_cacheable && l_cache->data_port_free()) {
+      req->set_access_sector_mask(line_size, sector_size);
+      std::deque<CacheEvent> events;
+      CacheRequestStatus status = l_cache->access(
+          req->get_addr(), *l_core_cycle, req, events);
+      bool write_sent = CacheEvent::was_write_sent(events);
+      bool read_sent = CacheEvent::was_read_sent(events);
+      if (status == HIT) {
+        if (!write_sent) {
+          req->set_reply();
+          req->current_state = "L2HIT";
+          l_from_cache_queue.push(req, l2d_hit_latency);
+        }
+        l_from_xbar_queue->pop();
+      } else if (status != RESERVATION_FAIL) {
+        req->current_state = "L2MISS";
+        if (req->is_write() &&
+            (l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
+              l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
+          req->set_reply();
+          req->current_state = "L2MISS-WRITE";
+          l_from_cache_queue.push(req, l2d_hit_latency);
+        }
+        l_from_xbar_queue->pop();
+      } else {
+        // Status Reservation fail, Retry it
+        assert(!write_sent);
+        assert(!read_sent);
       }
+    } else if (!is_cacheable) {
+      l_to_mem_queue.push(req);
       l_from_xbar_queue->pop();
-    } else {
-      // Status Reservation fail
-      assert(!write_sent);
-      assert(!read_sent);
     }
   }
 
-  /* Write request is go mem directly */
-  if(!l_from_xbar_queue->empty() && l_from_xbar_queue->front()->is_write()) {
-    mem_fetch* req = l_from_xbar_queue->front();
-    l_to_mem_queue.push(req);
-    l_from_xbar_queue->pop();
-  }
-
   if (l_cache->access_ready() &&
       !l_from_cache_queue.full()) {
     mem_fetch* req = l_cache->top_next_access();
-    req->current_state = "L2 top next access";
     if (req->is_request()) req->set_reply();
     l_from_cache_queue.push(req, l2d_hit_latency);
     l_cache->pop_next_access();
@@ -97,7 +96,7 @@ void ReadOnlyL2Cache::cycle() {
   }
 }
 
-void ReadOnlyL2Cache::print_stats() {
+void L2DataCache::print_stats() {
   if (l_id == 0) {
     l_cache->get_stats().print_stats(stdout, l_name.c_str());
   }
diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc
index d60e7149..46aad385 100644
--- a/PyTorchSimBackend/src/TMA.cc
+++ b/PyTorchSimBackend/src/TMA.cc
@@ -23,12 +23,19 @@ std::shared_ptr<std::vector<mem_fetch*>> TMA::get_memory_access() {
   auto access_vec = std::make_shared<std::vector<mem_fetch *>>();
   Tile* owner = (Tile*)_current_inst->get_owner();
   std::shared_ptr<TileSubGraph> owner_subgraph = owner->get_owner();
+  TileGraph* owner_graph = static_cast<TileGraph*>(owner_subgraph->get_owner());
+  unsigned long long base_daddr = _current_inst->get_base_dram_address();
+  // Todo. We use a ternsor level buffer allocation, so we don't need to check all memfetch
+  bool is_cacheable = owner_graph->is_cacheable(base_daddr, base_daddr + _dram_req_size);
+  spdlog::trace("[SRAM Trace] Core-{}, Address: 0x{:016x}, Is_cacheable: {}", _id, base_daddr, is_cacheable);
   spdlog::trace("[NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
     _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write());
+
   for (auto addr: *addr_set) {
     mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R;
     mf_type type = _current_inst->is_dma_write() ? mf_type::WRITE_REQUEST : mf_type::READ_REQUEST;
     mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, _current_inst->get_numa_id(), static_cast<void*>(_current_inst.get()));
+    access->set_cacheable(is_cacheable);
     _current_inst->inc_waiting_request();
     access_vec->push_back(access);
   }
diff --git a/PyTorchSimBackend/src/TileGraph.cc b/PyTorchSimBackend/src/TileGraph.cc
index 2a36b78d..cced5378 100644
--- a/PyTorchSimBackend/src/TileGraph.cc
+++ b/PyTorchSimBackend/src/TileGraph.cc
@@ -47,6 +47,7 @@ std::shared_ptr<Tile> TileSubGraph::get_tile() {
 
 
 void TileGraph::append_subgraph(std::shared_ptr<TileSubGraph> subgraph) {
+  subgraph->set_owner(this);
   _subgraph_vec.push_back(std::move(subgraph));
 }
 
@@ -62,7 +63,6 @@ bool TileGraph::is_finished() {
       if (tile_pair.second != nullptr)
         finished &= tile_pair.second->is_finished();
   }
-
   return finished;
 }
 
@@ -119,4 +119,9 @@ void TileGraph::allocate_subgraph(int core_id, int slot_id) {
     }
   }
   return;
+}
+
+bool TileGraph::is_cacheable(unsigned long long start, unsigned long long end) {
+  auto result = _cache_plan.findOverlapping(start, end);
+  return result.size() != 0;
 }
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 4da85362..2b13ebc1 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -754,6 +754,18 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
       spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", "));
     }
   }
+  if (_attribute_json.contains("sram_alloc")) {
+    auto sram_alloc_list = _attribute_json["sram_alloc"];
+    spdlog::info("[TOGParser/Attribute] ================= SRAM Alloc Plan ================");
+    for (auto it = sram_alloc_list.begin(); it != sram_alloc_list.end(); ++it) {
+      auto value_list = it.value();
+      unsigned long long start = value_list.at(0);
+      unsigned long long end = value_list.at(1);
+      spdlog::info("[TOGParser/Attribute] {:16s}: 0x{:016x} ~ 0x{:016x}", it.key(), start, end);
+      Interval<unsigned long long, int> entry = {start, end, 0};
+      _cache_plan.push_back(entry);
+    }
+  }
   load_sparse_meta_data();
 
   /* ONNX file parsing */
@@ -829,6 +841,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
   }
 
   _tile_graph = std::make_unique<TileGraph>(TileGraph(onnx_path, graph_name));
+  _tile_graph->init_cache_plan(_cache_plan);
   if (std::stoi(this->getMetaByName("stonneGraph")))
     _tile_graph->StonneGraph=true;
 

From d54c9f09c41d7ad1d66c4fce77d5fd15db826e77 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 19 May 2025 08:44:09 +0000
Subject: [PATCH 314/432] [TOGSim] Fix missing header file for interval tree

---
 PyTorchSimBackend/include/IntervalTree.h | 344 +++++++++++++++++++++++
 1 file changed, 344 insertions(+)
 create mode 100644 PyTorchSimBackend/include/IntervalTree.h

diff --git a/PyTorchSimBackend/include/IntervalTree.h b/PyTorchSimBackend/include/IntervalTree.h
new file mode 100644
index 00000000..ddc2b915
--- /dev/null
+++ b/PyTorchSimBackend/include/IntervalTree.h
@@ -0,0 +1,344 @@
+#ifndef __INTERVAL_TREE_H
+#define __INTERVAL_TREE_H
+
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <cassert>
+#include <limits>
+
+#ifdef USE_INTERVAL_TREE_NAMESPACE
+namespace interval_tree {
+#endif
+template <class Scalar, typename Value>
+class Interval {
+public:
+    Scalar start;
+    Scalar stop;
+    Value value;
+    Interval(const Scalar& s, const Scalar& e, const Value& v)
+    : start(std::min(s, e))
+    , stop(std::max(s, e))
+    , value(v) 
+    {}
+};
+
+template <class Scalar, typename Value>  // Scalar: coordinate type, Value: payload type
+Scalar intervalStart(const Interval<Scalar,Value>& i) {
+    return i.start;  // start is a Scalar coordinate; returning it as Value would convert/narrow it
+}
+
+template <class Scalar, typename Value>  // Scalar: coordinate type, Value: payload type
+Scalar intervalStop(const Interval<Scalar, Value>& i) {
+    return i.stop;  // stop is a Scalar coordinate; returning it as Value would convert/narrow it
+}
+
+template <class Scalar, typename Value>
+std::ostream& operator<<(std::ostream& out, const Interval<Scalar, Value>& i) {
+    out << "Interval(" << i.start << ", " << i.stop << "): " << i.value;
+    return out;
+}
+
+template <class Scalar, class Value>
+class IntervalTree {
+public:
+    typedef Interval<Scalar, Value> interval;
+    typedef std::vector<interval> interval_vector;
+
+
+    struct IntervalStartCmp {
+        bool operator()(const interval& a, const interval& b) {
+            return a.start < b.start;
+        }
+    };
+
+    struct IntervalStopCmp {
+        bool operator()(const interval& a, const interval& b) {
+            return a.stop < b.stop;
+        }
+    };
+
+    IntervalTree()
+        : left(nullptr)
+        , right(nullptr)
+        , center(0)
+    {}
+
+    ~IntervalTree() = default;
+
+    std::unique_ptr<IntervalTree> clone() const {
+        return std::unique_ptr<IntervalTree>(new IntervalTree(*this));
+    }
+
+    IntervalTree(const IntervalTree& other)
+    :   intervals(other.intervals),
+        left(other.left ? other.left->clone() : nullptr),
+        right(other.right ? other.right->clone() : nullptr),
+        center(other.center)
+    {}
+
+    IntervalTree& operator=(IntervalTree&&) = default;
+    IntervalTree(IntervalTree&&) = default;
+
+    IntervalTree& operator=(const IntervalTree& other) {
+        center = other.center;
+        intervals = other.intervals;
+        left = other.left ? other.left->clone() : nullptr;
+        right = other.right ? other.right->clone() : nullptr;
+        return *this;
+    }
+
+    IntervalTree(
+            interval_vector&& ivals,        // intervals to index; consumed (sorted / moved from)
+            std::size_t depth = 16,         // levels left before the node is forced to be a leaf
+            std::size_t minbucket = 64,     // inputs smaller than both bucket sizes become a leaf
+            std::size_t maxbucket = 512, 
+            Scalar leftextent = 0,          // non-zero extents mark a recursive call whose
+            Scalar rightextent = 0)         // input is already sorted by start
+      : left(nullptr)
+      , right(nullptr)
+    {
+        --depth;
+        const auto minmaxStop = std::minmax_element(ivals.begin(), ivals.end(), 
+                                                    IntervalStopCmp());
+        const auto minmaxStart = std::minmax_element(ivals.begin(), ivals.end(), 
+                                                     IntervalStartCmp());
+        // Always give center a defined value: a tree built from no intervals is still copied/moved.
+        center = ivals.empty() ? Scalar(0)
+               : (minmaxStart.first->start + minmaxStop.second->stop) / 2;
+        if (leftextent == 0 && rightextent == 0) {
+            // top-level call: sort intervals by start
+            std::sort(ivals.begin(), ivals.end(), IntervalStartCmp());
+        } else {
+            assert(std::is_sorted(ivals.begin(), ivals.end(), IntervalStartCmp()));
+        }
+        if (depth == 0 || (ivals.size() < minbucket && ivals.size() < maxbucket)) {
+            std::sort(ivals.begin(), ivals.end(), IntervalStartCmp());
+            intervals = std::move(ivals);
+            assert(is_valid().first);
+            return;
+        } else {
+            Scalar leftp = 0;
+            Scalar rightp = 0;
+
+            if (leftextent || rightextent) {
+                leftp = leftextent;
+                rightp = rightextent;
+            } else {
+                leftp = ivals.front().start;
+                rightp = std::max_element(ivals.begin(), ivals.end(),
+                                          IntervalStopCmp())->stop;
+            }
+
+            interval_vector lefts;
+            interval_vector rights;
+
+            for (typename interval_vector::const_iterator i = ivals.begin(); 
+                 i != ivals.end(); ++i) {
+                const interval& interval = *i;
+                if (interval.stop < center) {
+                    lefts.push_back(interval);
+                } else if (interval.start > center) {
+                    rights.push_back(interval);
+                } else {
+                    assert(interval.start <= center);
+                    assert(center <= interval.stop);
+                    intervals.push_back(interval);
+                }
+            }
+
+            if (!lefts.empty()) {
+                left.reset(new IntervalTree(std::move(lefts), 
+                                            depth, minbucket, maxbucket,
+                                            leftp, center));
+            }
+            if (!rights.empty()) {
+                right.reset(new IntervalTree(std::move(rights), 
+                                             depth, minbucket, maxbucket, 
+                                             center, rightp));
+            }
+        }
+        assert(is_valid().first);
+    }
+
+    // Call f on all intervals near the range [start, stop]:
+    template <class UnaryFunction>
+    void visit_near(const Scalar& start, const Scalar& stop, UnaryFunction f) const {
+        if (!intervals.empty() && ! (stop < intervals.front().start)) {
+            for (auto & i : intervals) {
+              f(i);
+            }
+        }
+        if (left && start <= center) {
+            left->visit_near(start, stop, f);
+        }
+        if (right && stop >= center) {
+            right->visit_near(start, stop, f);
+        }
+    }
+
+    // Call f on all intervals crossing pos
+    template <class UnaryFunction>
+    void visit_overlapping(const Scalar& pos, UnaryFunction f) const {
+        visit_overlapping(pos, pos, f);
+    }
+
+    // Call f on all intervals overlapping [start, stop]
+    template <class UnaryFunction>
+    void visit_overlapping(const Scalar& start, const Scalar& stop, UnaryFunction f) const {
+        auto filterF = [&](const interval& interval) {
+            if (interval.stop >= start && interval.start <= stop) {
+                // Only apply f if overlapping
+                f(interval);
+            }
+        };
+        visit_near(start, stop, filterF);
+    }
+
+    // Call f on all intervals contained within [start, stop]
+    template <class UnaryFunction>
+    void visit_contained(const Scalar& start, const Scalar& stop, UnaryFunction f) const {
+        auto filterF = [&](const interval& interval) {
+            if (start <= interval.start && interval.stop <= stop) {
+                f(interval);
+            }
+        };
+        visit_near(start, stop, filterF);
+    }
+
+    interval_vector findOverlapping(const Scalar& start, const Scalar& stop) const {
+        interval_vector result;
+        visit_overlapping(start, stop,
+                          [&](const interval& interval) { 
+                            result.emplace_back(interval); 
+                          });
+        return result;
+    }
+
+    interval_vector findContained(const Scalar& start, const Scalar& stop) const {
+        interval_vector result;
+        visit_contained(start, stop,
+                        [&](const interval& interval) { 
+                          result.push_back(interval); 
+                        });
+        return result;
+    }
+    bool empty() const {
+        if (left && !left->empty()) {
+            return false;
+        }
+        if (!intervals.empty()) { 
+            return false;
+        }
+        if (right && !right->empty()) {
+            return false;
+        }
+        return true;
+    }
+
+    template <class UnaryFunction>
+    void visit_all(UnaryFunction f) const {
+        if (left) {
+            left->visit_all(f);
+        }
+        std::for_each(intervals.begin(), intervals.end(), f);
+        if (right) {
+            right->visit_all(f);
+        }
+    }
+
+    std::pair<Scalar, Scalar> extentBruitForce() const {
+        struct Extent {
+            // Seed with an inverted (max, lowest) range so the first visited interval replaces both bounds.
+            std::pair<Scalar, Scalar> x = {std::numeric_limits<Scalar>::max(), std::numeric_limits<Scalar>::lowest()};
+            void operator()(const interval & interval) {
+                x.first  = std::min(x.first,  interval.start);
+                x.second = std::max(x.second, interval.stop);
+            }
+        };
+        Extent extent;
+
+        visit_all([&](const interval & interval) { extent(interval); });
+        return extent.x;
+    }
+
+    // Check the tree invariants; returns {valid, {smallest start, largest stop}} for this subtree.
+    // If first is false, second is invalid (the walk returns early with a partial extent).
+    std::pair<bool, std::pair<Scalar, Scalar>> is_valid() const {
+        const auto minmaxStop = std::minmax_element(intervals.begin(), intervals.end(), 
+                                                    IntervalStopCmp());
+        const auto minmaxStart = std::minmax_element(intervals.begin(), intervals.end(), 
+                                                     IntervalStartCmp());
+        // Seed with an inverted (max, lowest) range; the lower bound shrinks via min, the upper grows via max.
+        std::pair<bool, std::pair<Scalar, Scalar>> result = {true, { std::numeric_limits<Scalar>::max(),
+                                                                     std::numeric_limits<Scalar>::lowest() }};
+        if (!intervals.empty()) {
+            result.second.first   = std::min(result.second.first,  minmaxStart.first->start);
+            result.second.second  = std::max(result.second.second, minmaxStop.second->stop);
+        }
+        if (left) {
+            auto valid = left->is_valid();
+            result.first &= valid.first;
+            result.second.first   = std::min(result.second.first,  valid.second.first);
+            result.second.second  = std::max(result.second.second, valid.second.second);
+            if (!result.first) { return result; }
+            if (valid.second.second >= center) {  // left subtree must end strictly before center
+                result.first = false;
+                return result;
+            }
+        }
+        if (right) {
+            auto valid = right->is_valid();
+            result.first &= valid.first;
+            result.second.first   = std::min(result.second.first,  valid.second.first);
+            result.second.second  = std::max(result.second.second, valid.second.second);
+            if (!result.first) { return result; }
+            if (valid.second.first <= center) {  // right subtree must begin strictly after center
+                result.first = false;
+                return result;
+            }
+        }
+        if (!std::is_sorted(intervals.begin(), intervals.end(), IntervalStartCmp())) {
+            result.first = false;
+        }
+        return result;        
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const IntervalTree& itree) {
+        return writeOut(os, itree);
+    }
+
+    friend std::ostream& writeOut(std::ostream& os, const IntervalTree& itree, 
+                                  std::size_t depth = 0) {
+        auto pad = [&]() { for (std::size_t i = 0; i != depth; ++i) { os << ' '; } };
+        pad(); os << "center: " << itree.center << '\n';
+        for (const interval & inter : itree.intervals) {
+            pad(); os << inter << '\n';
+        }
+        if (itree.left) {
+            pad(); os << "left:\n";
+            writeOut(os, *itree.left, depth + 1);
+        } else {
+            pad(); os << "left: nullptr\n";
+        }
+        if (itree.right) {
+            pad(); os << "right:\n";
+            writeOut(os, *itree.right, depth + 1);
+        } else {
+            pad(); os << "right: nullptr\n";
+        }
+        return os;
+    }
+
+private:
+    interval_vector intervals;
+    std::unique_ptr<IntervalTree> left;
+    std::unique_ptr<IntervalTree> right;
+    Scalar center;
+};
+#ifdef USE_INTERVAL_TREE_NAMESPACE
+}
+#endif
+
+#endif
\ No newline at end of file

From 19cea2fd091b328e4a83eda7a664c091569c03d7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 19 May 2025 09:06:34 +0000
Subject: [PATCH 315/432] [TOGSim] Remove sram allocation size checking
 mechanism

---
 PyTorchSimBackend/src/Core.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 32fabd63..5f42ecb0 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -20,8 +20,7 @@ Core::Core(uint32_t id, SimulationConfig config)
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
   /* Check SRAM is enough to run tile */
-  assert(op->get_required_sram_size() <= _sram_size);
-  return op->get_required_sram_size() + _used_sram_size <= _sram_size &&  _tiles.size() < 2  && !op->is_stonne_tile();
+  return _tiles.size() < 2  && !op->is_stonne_tile();
 }
 
 void Core::issue(std::shared_ptr<Tile> op) {

From ab57bda6178d2da8f400e66126e23807fc4c227a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 19 May 2025 12:58:08 +0000
Subject: [PATCH 316/432] [TOGSim] Fix data cache write back logic

---
 .../systolic_ws_128x128_c1_simple_noc_tpuv4.json    |  2 +-
 .../systolic_ws_128x128_c2_simple_noc_tpuv4.json    |  2 +-
 PyTorchSimBackend/include/Memfetch.h                |  1 +
 PyTorchSimBackend/include/TileGraph.h               | 13 +++++++------
 PyTorchSimBackend/src/Cache.cc                      |  2 ++
 PyTorchSimBackend/src/L2Cache.cc                    |  8 +++++---
 PyTorchSimBackend/src/TMA.cc                        |  3 +--
 PyTorchSimBackend/src/TileGraph.cc                  |  7 +------
 8 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
index ed007c2b..bff4e224 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
@@ -15,7 +15,7 @@
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
   "l2d_type" : "datacache",
-  "l2d_config" : "S:128:128:512,32,L:T:m:L:L,A:192:4,32:0,32",
+  "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32",
  
   "icnt_type" : "simple",
   "icnt_latency" : 7,
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
index 1b593e7f..4b4df4e6 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
@@ -15,7 +15,7 @@
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
   "l2d_type" : "datacache",
-  "l2d_config" : "S:64:128:512,32,L:T:m:L:L,A:192:4,32:0,32",
+  "l2d_config" : "S:64:128:512,32,L:T:m:W:L,A:192:4,32:0,32",
  
   "icnt_type" : "simple",
   "icnt_latency" : 7,
diff --git a/PyTorchSimBackend/include/Memfetch.h b/PyTorchSimBackend/include/Memfetch.h
index 128d81b2..8934d5c7 100644
--- a/PyTorchSimBackend/include/Memfetch.h
+++ b/PyTorchSimBackend/include/Memfetch.h
@@ -53,6 +53,7 @@ class mem_fetch {
   void set_request_id(unsigned id) { m_request_id = id; }
   unsigned get_request_id() { return m_request_id; }
   void set_access_sector_mask(uint32_t line_size, uint32_t sector_size) { m_sector_mask.set((m_addr % line_size) / sector_size); }
+  void set_access_sector_mask(SectorMask mask) { m_sector_mask = mask; }
   SectorMask get_access_sector_mask() { return m_sector_mask; }
   void set_dirty_mask(SectorMask dirty_mask) { m_dirty_mask = dirty_mask; }
   SectorMask get_dirty_mask() { return m_dirty_mask; }
diff --git a/PyTorchSimBackend/include/TileGraph.h b/PyTorchSimBackend/include/TileGraph.h
index 770bee06..990c107d 100644
--- a/PyTorchSimBackend/include/TileGraph.h
+++ b/PyTorchSimBackend/include/TileGraph.h
@@ -18,8 +18,8 @@ class TileSubGraph {
   int get_id() { return _id; }
   void set_core_id(int core_id) { _core_id = core_id; }
   int get_core_id() { return _core_id; }
-  void set_owner(void* owner) { _owner = owner; }
-  void* get_owner() { return _owner; }
+  void init_cache_plan(std::shared_ptr<IntervalTree<unsigned long long,int>> plan) { _cache_plan = plan; }
+  bool is_cacheable(unsigned long long start, unsigned long long end) { return _cache_plan->findOverlapping(start, end).size() != 0; }
   struct CompareReadyTile {
     bool operator()(const std::shared_ptr<Tile>& a, const std::shared_ptr<Tile>& b) const {
       return a->get_required_sram_size() > b->get_required_sram_size();
@@ -32,7 +32,7 @@ class TileSubGraph {
   int _id;
   int _core_id = -1;
   static int _next_id;
-  void* _owner=NULL;
+  std::shared_ptr<IntervalTree<unsigned long long, int>> _cache_plan;
 };
 
 class TileGraph {
@@ -67,8 +67,9 @@ class TileGraph {
   std::string get_name() { return _name; }
   void set_arrival_time(cycle_type arrival_time) { _arrival_time = arrival_time; }
   cycle_type get_arrival_time() { return _arrival_time; }
-  void init_cache_plan(IntervalTree<unsigned long long, int>::interval_vector it) { _cache_plan = IntervalTree<unsigned long long, int>(std::move(it)); }
-  bool is_cacheable(unsigned long long start, unsigned long long end);
+  void init_cache_plan(IntervalTree<unsigned long long, int>::interval_vector it) {
+    _cache_plan = std::make_shared<IntervalTree<unsigned long long, int>>(std::move(it));
+  }
   bool StonneGraph = false;
 
   class Iterator {
@@ -134,7 +135,7 @@ class TileGraph {
   std::vector<std::shared_ptr<TileSubGraph>> _subgraph_vec;
   std::vector<std::shared_ptr<TileSubGraph>> _finished_subgraph_vec;
   std::map<int, std::map<int, std::shared_ptr<TileSubGraph>>> _cpu_graph_map;
-  IntervalTree<unsigned long long, int> _cache_plan;
+  std::shared_ptr<IntervalTree<unsigned long long, int>> _cache_plan;
   cycle_type _arrival_time;
   static std::shared_ptr<Tile> null_tile;
 };
\ No newline at end of file
diff --git a/PyTorchSimBackend/src/Cache.cc b/PyTorchSimBackend/src/Cache.cc
index 8bbab112..8346fae8 100644
--- a/PyTorchSimBackend/src/Cache.cc
+++ b/PyTorchSimBackend/src/Cache.cc
@@ -891,6 +891,8 @@ CacheRequestStatus DataCache::wr_miss_wa_naive(uint64_t addr,
   send_write_request(mf, CacheEvent(WRITE_REQUEST_SENT), time, events);
   mem_fetch *new_mf = new mem_fetch(
       mf->get_addr(), m_write_alloc_type, READ_REQUEST, m_config.get_atom_size());
+  new_mf->set_access_sector_mask(mf->get_access_sector_mask());
+  new_mf->set_core_id(mf->get_core_id());
   bool do_miss = false;
   bool wb = false;
   EvictedBlockInfo evicted;
diff --git a/PyTorchSimBackend/src/L2Cache.cc b/PyTorchSimBackend/src/L2Cache.cc
index db8a5921..14c9b9da 100644
--- a/PyTorchSimBackend/src/L2Cache.cc
+++ b/PyTorchSimBackend/src/L2Cache.cc
@@ -16,7 +16,7 @@ L2DataCache::L2DataCache(std::string name,  CacheConfig &cache_config, uint32_t
   cycle_type *core_cycle, uint32_t l2d_hit_latency,
   std::queue<mem_fetch*> *to_xbar_queue, std::queue<mem_fetch*> *from_xbar_queue) :
   L2CacheBase(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue) {
-  l_cache = std::make_unique<ReadOnlyCache>(name, cache_config, id, 0, &l_to_mem_queue);
+  l_cache = std::make_unique<DataCache>(name, cache_config, id, 0, &l_to_mem_queue);
   l_from_cache_queue = DelayQueue<mem_fetch*>(l_name + "_latency_queue", true, 0);
 }
 
@@ -26,7 +26,8 @@ bool L2DataCache::push(mem_fetch* req) {
       return false;
     l_cache->fill(req, *l_core_cycle);
   } else {
-    l_to_xbar_queue->push(req);
+    if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
+      l_to_xbar_queue->push(req);
   }
   return true;
 }
@@ -91,7 +92,8 @@ void L2DataCache::cycle() {
 
   if (l_from_cache_queue.arrived()) {
     mem_fetch* req = l_from_cache_queue.top();
-    l_to_xbar_queue->push(req);
+    if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
+      l_to_xbar_queue->push(req);
     l_from_cache_queue.pop();
   }
 }
diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc
index 46aad385..7744b0f5 100644
--- a/PyTorchSimBackend/src/TMA.cc
+++ b/PyTorchSimBackend/src/TMA.cc
@@ -23,10 +23,9 @@ std::shared_ptr<std::vector<mem_fetch*>> TMA::get_memory_access() {
   auto access_vec = std::make_shared<std::vector<mem_fetch *>>();
   Tile* owner = (Tile*)_current_inst->get_owner();
   std::shared_ptr<TileSubGraph> owner_subgraph = owner->get_owner();
-  TileGraph* owner_graph = static_cast<TileGraph*>(owner_subgraph->get_owner());
   unsigned long long base_daddr = _current_inst->get_base_dram_address();
   // Todo. We use a ternsor level buffer allocation, so we don't need to check all memfetch
-  bool is_cacheable = owner_graph->is_cacheable(base_daddr, base_daddr + _dram_req_size);
+  bool is_cacheable = owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size);
   spdlog::trace("[SRAM Trace] Core-{}, Address: 0x{:016x}, Is_cacheable: {}", _id, base_daddr, is_cacheable);
   spdlog::trace("[NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
     _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write());
diff --git a/PyTorchSimBackend/src/TileGraph.cc b/PyTorchSimBackend/src/TileGraph.cc
index cced5378..33e995e9 100644
--- a/PyTorchSimBackend/src/TileGraph.cc
+++ b/PyTorchSimBackend/src/TileGraph.cc
@@ -47,7 +47,7 @@ std::shared_ptr<Tile> TileSubGraph::get_tile() {
 
 
 void TileGraph::append_subgraph(std::shared_ptr<TileSubGraph> subgraph) {
-  subgraph->set_owner(this);
+  subgraph->init_cache_plan(_cache_plan);
   _subgraph_vec.push_back(std::move(subgraph));
 }
 
@@ -119,9 +119,4 @@ void TileGraph::allocate_subgraph(int core_id, int slot_id) {
     }
   }
   return;
-}
-
-bool TileGraph::is_cacheable(unsigned long long start, unsigned long long end) {
-  auto result = _cache_plan.findOverlapping(start, end);
-  return result.size() != 0;
 }
\ No newline at end of file

From b68cb630a15b29cea555fde76cfca752bcc7f98e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 19 May 2025 14:14:50 +0000
Subject: [PATCH 317/432] [Frontend] Fix spad option

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index dd5ba709..1ea03da9 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -162,10 +162,14 @@ def codegen_sram_plan_prefix(self):
                 continue
             if sympy_product(buf.get_size()) == 0:
                 continue
+            if buf is None:
+                continue
             self.prefix.writeline(f"sram_plan_prefix('{name}', {name})")
 
     def codegen_sram_plan_postfix(self, outputs):
         for name in outputs:
+            if name is None or name == "None":
+                continue
             self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})")
 
     @dynamo_timed
@@ -175,7 +179,7 @@ def generate(self, is_inference):
 
         with contextlib.ExitStack() as stack:
             stack.enter_context(self.wrapper_call.indent())
-            self.memory_plan()
+            self.memory_plan_reuse()
             for line in self.lines:
                 # Add buffer plan hook for dealloc
                 if isinstance(line, memory_planning.DeallocFromPoolLine):
@@ -189,7 +193,7 @@ def generate(self, is_inference):
                 else:
                     self.wrapper_call.writeline(line)
                 # Add buffer plan hook for alloc
-                if isinstance(line, memory_planning.AllocFromPoolLine):
+                if isinstance(line, memory_planning.AllocFromPoolLine) or isinstance(line, wrapper.AllocateLine):
                     self.wrapper_call.writeline(f"sram_plan_prefix('{line.node.get_name()}', {line.node.get_name()})")
             output_refs = self.get_output_refs()
             self.codegen_sram_plan_postfix(output_refs)

From 877c52abf15ec7f8aaa0bd6517b9beb52cb4a7d1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 11 Jun 2025 04:59:54 +0000
Subject: [PATCH 318/432] [Frontend+Script] Add some script + timestamp for
 experiment

---
 PyTorchSimFrontend/extension_codecache.py     |  2 +-
 PyTorchSimFrontend/extension_config.py        |  5 ++-
 .../mlir/mlir_codegen_backend.py              |  2 +-
 Simulator/simulator.py                        |  2 +-
 gem5_script/script_systolic.py                |  9 ++--
 scripts/ILS_experiment/ils_parser.sh          | 43 +++++++++++++++++++
 6 files changed, 55 insertions(+), 8 deletions(-)
 create mode 100755 scripts/ILS_experiment/ils_parser.sh

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index ac37f35b..b56d8295 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -105,7 +105,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -global-idx='vlen={vlen}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
-            -test-tile-operation-graph='vectorlane={vectorlane_size}' \
+            -test-tile-operation-graph='vectorlane={vectorlane_size} tls_mode={extension_config.CONFIG_TLS_MODE}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
             -convert-linalg-to-loops \
             -lower-affine \
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 0c1e8b6a..d1babd47 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -75,4 +75,7 @@ def load_plan_from_module(module_path):
         return None
 
 CONFIG_SRAM_BUFFER_PLAN_PATH = os.environ.get("SRAM_BUFFER_PLAN_PATH", default=None)
-CONFIG_SRAM_BUFFER_PLAN = load_plan_from_module(CONFIG_SRAM_BUFFER_PLAN_PATH)
\ No newline at end of file
+CONFIG_SRAM_BUFFER_PLAN = load_plan_from_module(CONFIG_SRAM_BUFFER_PLAN_PATH)
+
+# For ILS experiment
+CONFIG_TLS_MODE = int(os.environ.get('TORCHSIM_TLS_MODE', default=1))
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 1ea03da9..07c2f7a5 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1372,7 +1372,7 @@ def codegen_nodes(self, nodes, kernel_name):
         for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY):
             src_code = super().codegen_nodes(nodes, kernel_name)
             self._prepare_simulator_headers(src_code)
-            if not extension_config.CONFIG_AUTOTUNE:
+            if not extension_config.CONFIG_AUTOTUNE or not extension_config.CONFIG_TORCHSIM_VALIDATION_MODE:
                 return src_code
 
             try:
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 5d233986..bc74ebce 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -157,7 +157,7 @@ def show_progress():
             print("")
 
         dir_path = os.path.join(os.path.dirname(target_binary), "m5out")
-        gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)]
+        gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)]
         try:
             # Create progress thread
             is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
diff --git a/gem5_script/script_systolic.py b/gem5_script/script_systolic.py
index 4dad11ac..d5d3a92d 100644
--- a/gem5_script/script_systolic.py
+++ b/gem5_script/script_systolic.py
@@ -1,3 +1,4 @@
+import time
 import argparse
 import sys
 import math
@@ -185,11 +186,11 @@ def connectCPU(self, cpu):
 # Simulation
 root = Root(full_system=False, system=system)
 m5.instantiate()
+start_time = time.time()
 exit_event = m5.simulate()
 
 if exit_event.getCause() != "exiting with last active thread context":
     exit(1)
-
-# print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")
-print(f"{m5.curTick() / 1000}")
-print(f"{m5.curTick()}")
\ No newline at end of file
+end_time = time.time()
+elapsed_seconds = end_time - start_time
+print(f"Simulation time: {elapsed_seconds:.6f} seconds")
diff --git a/scripts/ILS_experiment/ils_parser.sh b/scripts/ILS_experiment/ils_parser.sh
new file mode 100755
index 00000000..bc271f5d
--- /dev/null
+++ b/scripts/ILS_experiment/ils_parser.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+ignore_rest=false
+gem5_cmd=""
+result_path=""
+gem5_time=""
+togsim_time=""
+
+total_gem5=0
+total_togsim=0
+
+while IFS= read -r line; do
+  if [[ "$line" == "Wrapper Codegen Path ="* ]]; then
+    ignore_rest=true
+  fi
+
+  if ! $ignore_rest; then
+    continue
+  fi
+
+  if [[ "$line" == "[Gem5Simulator] cmd>"* ]]; then
+    gem5_cmd=$(echo "$line" | sed 's/^\[Gem5Simulator\] cmd>  *//')
+    dir=$(echo "$line" | sed -n 's/.*-d \([^ ]*\).*/\1/p')/sto.log
+    echo $dir
+    gem5_time=$(grep "Simulation time:" "$dir" | \
+                sed -E 's/^Simulation time: ([0-9.]+) seconds$/\1/')
+    echo "GEM5: $gem5_time" 
+    total_gem5=$(echo "$total_gem5 + $gem5_time" | bc)
+  fi
+
+  if [[ "$line" == *"[BackendSimulator] Simulation of"* ]]; then
+    result_path=$(echo "$line" | awk -F' ' '{gsub(/"/, "", $8); print $8}')
+    togsim_time=$(grep "\[info\] Simulation time:" "$result_path" | \
+                  sed -E 's/^\[[^]]+\] \[info\] Simulation time: ([0-9.]+) seconds$/\1/')
+    echo "TOGSim: $togsim_time"
+    total_togsim=$(echo "$total_togsim + $togsim_time" | bc)
+  fi
+done
+
+if [[ -n "$total_gem5" && -n "$total_time" ]]; then
+  total_time=$(python3 -c "print(round($total_gem5 + $total_togsim, 6))")
+  echo "Simulation time: $total_time seconds"
+fi
\ No newline at end of file

From 40f7c3e2859670a1f45c55d67a3f5ae48bd269ab Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 12 Jun 2025 06:01:37 +0000
Subject: [PATCH 319/432] [Frontend] Fix convolution fusion case for resnet
 inference

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  4 +-
 PyTorchSimFrontend/mlir/mlir_lowering.py      |  4 ++
 PyTorchSimFrontend/mlir/mlir_template.py      |  8 +--
 scripts/ILS_experiment/ils_parser.sh          | 38 ++++-------
 scripts/ILS_experiment/test_matmul.py         | 66 +++++++++++++++++++
 tests/test_resnet.py                          | 19 +++---
 6 files changed, 100 insertions(+), 39 deletions(-)
 create mode 100644 scripts/ILS_experiment/test_matmul.py

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index b8378397..adaa2e7b 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -770,7 +770,7 @@ def outer_func_render(self, kernel_name, input_args):
         options = dict(
             kernel=self.kernel,
             KERNEL_NAME=kernel_name,
-            FUNC_NAME=self.function_name,
+            FUNC_NAME=self.function_name + f"_{len(input_args)}",
             INPUT=X,
             WEIGHT=W,
             BIAS=Bias,
@@ -784,7 +784,7 @@ def outer_func_render(self, kernel_name, input_args):
             input_reorder=self.input_reorder
         )
         code = self._template_from_string(WRAPPER_TEMPLATE).render(**options)
-        return code, self.function_name
+        return code, self.function_name + f"_{len(input_args)}"
 
     def get_arg_attributes(self):
         arg_attributes = []
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index ba4cce44..d18df4ce 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -93,6 +93,10 @@ def convolution(
         "groups": groups,
     }
 
+    x.realize()
+    weight.realize()
+    x = ir.ExternKernel.require_channels_last(x)
+    weight = ir.ExternKernel.require_channels_last(weight)
     layout = conv_layout(x, weight, None, **kwargs)
     mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs)
     return mlir_template.generate().output_node()
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 4229a266..d0761611 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -10,7 +10,7 @@
 from typing import List, Optional
 from unittest.mock import patch
 
-from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE
+from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE, DeferredLine
 from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, Pointwise
 from torch._inductor.select_algorithm import PartialRender
 from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
@@ -312,7 +312,7 @@ def call_kernel(self, kernel_name):
         _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
         # generate the code to call this
         wrapper.generate_kernel_call(
-            kernel_name if self.outer_func_name is None else self.outer_func_name,
+            kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}",
             call_args, cuda=False)
 
     def codegen_body(self):
@@ -611,12 +611,12 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         else:
             operation = "affine.store"
             line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}"
-        self.cse.generate(self.stores, line, assignment = False)
+        self.stores.writeline(DeferredLine(name, line))
 
         # Generate DMA instruction
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
-        self.cse.generate(self.dma_stores, code, assignment = False)
+        self.dma_stores.writeline(DeferredLine(name, code))
 
     def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index):
         return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, True)
diff --git a/scripts/ILS_experiment/ils_parser.sh b/scripts/ILS_experiment/ils_parser.sh
index bc271f5d..eeaea5cd 100755
--- a/scripts/ILS_experiment/ils_parser.sh
+++ b/scripts/ILS_experiment/ils_parser.sh
@@ -10,34 +10,24 @@ total_gem5=0
 total_togsim=0
 
 while IFS= read -r line; do
-  if [[ "$line" == "Wrapper Codegen Path ="* ]]; then
-    ignore_rest=true
-  fi
-
-  if ! $ignore_rest; then
-    continue
-  fi
-
-  if [[ "$line" == "[Gem5Simulator] cmd>"* ]]; then
-    gem5_cmd=$(echo "$line" | sed 's/^\[Gem5Simulator\] cmd>  *//')
-    dir=$(echo "$line" | sed -n 's/.*-d \([^ ]*\).*/\1/p')/sto.log
-    echo $dir
-    gem5_time=$(grep "Simulation time:" "$dir" | \
-                sed -E 's/^Simulation time: ([0-9.]+) seconds$/\1/')
-    echo "GEM5: $gem5_time" 
-    total_gem5=$(echo "$total_gem5 + $gem5_time" | bc)
+  if [[ "$line" == launch* ]]; then
+    tile_path=$(echo "$line" | awk '{print $2}')
+    base_dir=$(dirname "$tile_path")
+    result_path="$base_dir/m5out/sto.log"
+    echo $result_path
+    togsim_time=$(grep "Simulation time:" "$result_path" | \
+                  sed -E 's/Simulation time: ([0-9.]+) seconds$/\1/')
+    echo "GEM5: $togsim_time"
+    total_togsim=$(echo "$total_togsim + $togsim_time" | bc)
   fi
 
-  if [[ "$line" == *"[BackendSimulator] Simulation of"* ]]; then
-    result_path=$(echo "$line" | awk -F' ' '{gsub(/"/, "", $8); print $8}')
-    togsim_time=$(grep "\[info\] Simulation time:" "$result_path" | \
-                  sed -E 's/^\[[^]]+\] \[info\] Simulation time: ([0-9.]+) seconds$/\1/')
+  if [[ "$line" == *"[info] Simulation time:"* ]]; then
+    togsim_time=$(echo $line | sed -E 's/^\[[^]]+\] \[info\] Simulation time: ([0-9.]+) seconds$/\1/')
     echo "TOGSim: $togsim_time"
-    total_togsim=$(echo "$total_togsim + $togsim_time" | bc)
   fi
 done
 
-if [[ -n "$total_gem5" && -n "$total_time" ]]; then
-  total_time=$(python3 -c "print(round($total_gem5 + $total_togsim, 6))")
-  echo "Simulation time: $total_time seconds"
+if [[ -n "$total_gem5" && -n "$total_togsim" ]]; then
+  #total_time=$(python3 -c "print(round($total_gem5 + $total_togsim, 6))")
+  echo "Simulation time: $togsim_time seconds"
 fi
\ No newline at end of file
diff --git a/scripts/ILS_experiment/test_matmul.py b/scripts/ILS_experiment/test_matmul.py
new file mode 100644
index 00000000..09cc407d
--- /dev/null
+++ b/scripts/ILS_experiment/test_matmul.py
@@ -0,0 +1,66 @@
+import torch
+import argparse
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_matmul(device, input_size=128, hidden_size=128, output_size=128):
+    def custom_matmul(a, b):
+        return torch.matmul(a, b)
+    torch.manual_seed(0)
+    input = torch.randn(input_size, hidden_size)
+    weight = torch.randn(hidden_size, output_size)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(custom_matmul)
+    res = opt_fn(x1, w1)
+    y = custom_matmul(x2, w2)
+    test_result("Matmul Forward", res, y)
+
+def test_addmm(device, input_size=128, hidden_size=128, output_size=128, bias_rank=1):
+    def custom_matmul(bias, a, b):
+        return torch.addmm(bias, a, b)
+    torch.manual_seed(0)
+    input = torch.randn(input_size, hidden_size)
+    weight = torch.randn(hidden_size, output_size)
+    bias = torch.randn(output_size) if bias_rank == 1 else torch.randn(input_size, output_size)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    b1 = bias.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    b2 = bias.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(custom_matmul)
+    res = opt_fn(b1, x1, w1)
+    y = custom_matmul(b2, x2, w2)
+    test_result("Addmm Forward", res, y)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+    parser = argparse.ArgumentParser(description="Run matmul with given shape") 
+    parser.add_argument('--shape', type=str, default="(512,512,512)")
+    args = parser.parse_args()
+    shape = tuple(map(int, args.shape.strip('()').split(',')))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_matmul(device, *shape)
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index e2a6f2a7..5e96b922 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -21,15 +21,16 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
 def test_resnet(device):
     from torchvision.models import resnet
     # model = resnet._resnet(resnet.BasicBlock, [1, 1, 0, 0], weights=None, progress=False).eval()
-    model = resnet18().eval()
-    model.to(device, memory_format=torch.channels_last)
-    input = torch.randn(1, 3, 224, 224)
-    x1 = input.to(device=device, memory_format=torch.channels_last)
-    x2 = input.cpu().to(memory_format=torch.channels_last)
-    opt_fn = torch.compile(dynamic=False)(model)
-    res = opt_fn(x1)
-    cpu_model = model.cpu().to(memory_format=torch.channels_last)
-    cpu_res = cpu_model(x2)
+    with torch.no_grad():
+        model = resnet18().eval()
+        model.to(device, memory_format=torch.channels_last)
+        input = torch.randn(1, 3, 224, 224)
+        x1 = input.to(device=device, memory_format=torch.channels_last)
+        x2 = input.cpu().to(memory_format=torch.channels_last)
+        opt_fn = torch.compile(dynamic=False)(model)
+        res = opt_fn(x1)
+        cpu_model = model.cpu().to(memory_format=torch.channels_last)
+        cpu_res = cpu_model(x2)
     test_result("ResNet18 inference", res, cpu_res)
     print("Max diff > ", torch.max(torch.abs(res.cpu() - cpu_res)))
     print("ResNet18 Simulation Done")

From 9424b7f5faebaaea281d7d5541d4c953e3d8c591 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 12 Jun 2025 06:35:53 +0000
Subject: [PATCH 320/432] [Frontend/Fusion] Don't fuse maxpool now

---
 PyTorchSimFrontend/mlir/mlir_lowering.py   | 4 +++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index d18df4ce..b1e1ba0e 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -146,7 +146,9 @@ def custom_maxpool(
     }
     layout = maxpool_layout(x, kernel_size, stride, padding, dilation, ceil_mode)
     mlir_template = MLIRMaxPoolTemplate([x], layout, **kwargs)
-    return mlir_template.generate().output_node(), x # FIXME: x is dummy IRNode, indices are not used in our case
+    x.realize()
+    template_node = mlir_template.generate().output_node()
+    return template_node, x # FIXME: x is dummy IRNode, indices are not used in our case
 
 def sparse_addmm(*args, **kwargs):
     _, sp_mat1, sp_mat2 = args
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 841a8fad..d4aa2359 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -49,6 +49,12 @@ def can_fuse_horizontal(self, node1, node2):
 
         # Check template node fusion
         if node1.is_template() or node2.is_template():
+            # Don't fuse maxpool template code
+            from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
+            if node1.is_template() and isinstance(node1.node.template, MLIRMaxPoolTemplate) or \
+                node2.is_template() and isinstance(node2.node.template, MLIRMaxPoolTemplate):
+                return False
+
             # Different layout is not supported
             if node1.get_nodes()[0].node.layout.dtype != node2.get_nodes()[0].node.layout.dtype:
                 return False

From 09185334dab2b8909253f37f938350bf8721c9d7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 12 Jun 2025 07:23:40 +0000
Subject: [PATCH 321/432] [Frontend/Fusion] Fix naming convention for
 scheduling

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index d4aa2359..c90b20b8 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -198,10 +198,10 @@ def codegen_template(self, template_node, epilogue_nodes):
         kernel.call_kernel(kernel_name)
         V.graph.removed_buffers |= kernel.removed_buffers
         _, args, _, _ = self.kernel_group.args.mlir_argdefs()
-        args = ", ".join(args)
         eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False))
         if (eager_mode):
-            target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name
+            target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name + f"_{len(args)}"
+            args = ", ".join(args)
             V.graph.wrapper_code.writeline(
                 f"yield ({target_kernel_name}, ({args}))"
             )

From 818b6db2624e57f7961b3d541c049ae6179f1273 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 13 Jun 2025 04:57:01 +0000
Subject: [PATCH 322/432] [Fix] tile size for fusion

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 27 ++++++++++---------
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  1 +
 PyTorchSimFrontend/mlir/mlir_template.py      |  3 ++-
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index adaa2e7b..7a3b4b19 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -647,19 +647,7 @@ def render(self,
         y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
         conv_template = CONV_TEMPLATE
         TOG_latency = BATCH if TILE_M > BATCH else TILE_M
-        if self.is_multi_tile(I_C):
-          conv_template = MULTI_TILE_CONV_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
-          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
-          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-          SUB_TILE_K = TILE_K
-          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
-          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_H * TILE_K, TILE_N)
-          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
-          x_spad_size = TILE_I_W * TILE_I_H * TILE_M * TILE_K
-          w_spad_size = TILE_K_H * TILE_K * TILE_N
-          y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
-        elif self.is_single_batch(BATCH) and self.stride[0] != 1:
+        if self.is_single_batch(BATCH) and self.stride[0] != 1:
           conv_template = SINGLE_BATCH_CONV_STRIDE_TEMPLATE
           TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
           TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
@@ -688,6 +676,19 @@ def render(self,
           w_spad_size = TILE_K_W * TILE_K_H * TILE_K * TILE_N
           y_spad_size = TILE_O_H * TILE_M * TILE_N
           TOG_latency = O_W if TILE_M > O_W else TILE_M
+        elif self.is_multi_tile(I_C):
+          conv_template = MULTI_TILE_CONV_TEMPLATE
+          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
+          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
+          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
+          SUB_TILE_K = TILE_K
+          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
+          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_H * TILE_K, TILE_N)
+          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
+          x_spad_size = TILE_I_W * TILE_I_H * TILE_M * TILE_K
+          w_spad_size = TILE_K_H * TILE_K * TILE_N
+          y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
+        SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N
         TOG_latency = 8 if TOG_latency < 8 else TOG_latency
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
 
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index fe9f6946..290c61b9 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -127,6 +127,7 @@ def render(self,
             SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
         else: # Avoid Row Conflict of weights
             SUB_TILE_N = TILE_N
+        SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N # FIXME: hardcoded & 126 line has same feature
         SUB_TILE_K = TILE_K
         TOG_latency = M if SUB_TILE_M > M else SUB_TILE_M
         kernel.loop_size =[TOG_latency, SUB_TILE_N, SUB_TILE_K]
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index d0761611..1b36c0f9 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -123,7 +123,8 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, pad_k=True, min_tile
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
         max_spad_per_lane = spad_size_per_lane // 2 # double buffer
-        minimum_n_tile = self.num_cores if min_tile else 1
+        force_double_buffer = 2 if n_extra_node > 0 else 1 # In fusion case, double buffer should be forced
+        minimum_n_tile = self.num_cores * force_double_buffer if min_tile else 1
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
         k_pad_factor = self.vector_lane if K > self.vector_lane else (8 if pad_k else 1)

From 1aa37ca8f7efca4a558db46fc2049ec7efb52926 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 13 Jun 2025 04:59:34 +0000
Subject: [PATCH 323/432] [fix] experiments for inference

---
 experiments/BERT.py      |  5 ++--
 experiments/attention.py | 56 ++++++++++++++++++++++++++++++++++++++++
 experiments/conv.py      |  3 ++-
 experiments/resnet18.py  |  3 ++-
 experiments/resnet50.py  |  3 ++-
 5 files changed, 65 insertions(+), 5 deletions(-)
 create mode 100644 experiments/attention.py

diff --git a/experiments/BERT.py b/experiments/BERT.py
index f42b8be1..e111908e 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -15,7 +15,7 @@ def run_BERT(size, input_seq, config):
     embedding_size = {'base': 768, 'large': 1024, 'xlarge': 2048}
     heads = {'base': 12, 'large': 16, 'xlarge': 32} # hidden/64 https://arxiv.org/pdf/1909.11942
     cpu_query = torch.randn(input_seq, hidden_dim[size])
-    decoder_block = DecoderBlock(embedding_size[size], heads[size])
+    decoder_block = DecoderBlock(embedding_size[size], heads[size]).eval()
 
     query = cpu_query.clone().to(device=device)
     opt_fn = torch.compile(dynamic=False)(decoder_block.to(device=device))
@@ -26,7 +26,8 @@ def run_BERT(size, input_seq, config):
 
     # Run scheduler
     while not scheduler.is_finished():
-        scheduler.schedule()
+        with torch.no_grad():
+            scheduler.schedule()
 
     print(f"BERT-{size} Simulation Done")
 
diff --git a/experiments/attention.py b/experiments/attention.py
new file mode 100644
index 00000000..acfed848
--- /dev/null
+++ b/experiments/attention.py
@@ -0,0 +1,56 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+import argparse
+import datetime
+
+
+def run_attention(size, config):
+    def attention(query, key, value):
+        import math
+        d_k = query.size(-1)
+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
+        p_attn = scores.softmax(dim=-1)
+        return torch.matmul(p_attn, value)
+    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
+    device = scheduler.execution_engine.module.custom_device()
+    query = torch.randn(size).to(device=device)
+    key = torch.randn(size).to(device=device)
+    value = torch.randn(size).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(attention)
+
+    SchedulerDNNModel.register_model("attention", opt_fn)
+    request = Request("attention", [query, key, value], [], request_queue_idx=0)
+    scheduler.add_request(request, request_time=0)
+
+    # Run scheduler
+    while not scheduler.is_finished():
+        with torch.no_grad():
+            scheduler.schedule()
+
+    print(f"Attention {str(size)} Simulation Done")
+
+if __name__ == "__main__":
+    import os
+    import sys
+    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json')
+    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
+    sys.path.append(base_dir)
+    args = argparse.ArgumentParser()
+    args.add_argument('--size', nargs='+', type=int, default=[12, 512, 64], help='Tensor Shape')
+    args.add_argument('--dump_path', type=str, default='results')
+    args = args.parse_args()
+    size = args.size
+    size_str = "x".join([str(i) for i in size])
+    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"attention_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
+    # setting environment variables
+    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    # only timing simulation
+    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
+    if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
+        del os.environ['BACKENDSIM_SPIKE_ONLY']
+
+    run_attention(size, config)
diff --git a/experiments/conv.py b/experiments/conv.py
index 115e4aac..e8b97906 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -28,7 +28,8 @@ def custom_conv2d(a, b, bias):
 
     # Run scheduler
     while not scheduler.is_finished():
-        scheduler.schedule()
+        with torch.no_grad():
+            scheduler.schedule()
 
     print(f"CONV {batch_size}_{i_h}_{i_w}_{i_c}_{o_c}_{kernel_size}_{stride}_{padding} (B_H_W_I_C_O_C_K_S_P) Simulation Done")
 
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index 202642d3..c12cc930 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -20,7 +20,8 @@ def run_resnet(batch, config):
 
     # Run scheduler
     while not scheduler.is_finished():
-        scheduler.schedule()
+        with torch.no_grad():
+            scheduler.schedule()
 
     print("ResNet18 Simulation Done")
 
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index 915bee5f..ec2e26ff 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -20,7 +20,8 @@ def run_resnet(batch, config):
 
     # Run scheduler
     while not scheduler.is_finished():
-        scheduler.schedule()
+        with torch.no_grad():
+            scheduler.schedule()
 
     print("ResNet50 Simulation Done")
 

From e50972e5d7800a48adea68d65d36412129497380 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 13 Jun 2025 05:00:02 +0000
Subject: [PATCH 324/432] [fix] vlane stride for vlane util

---
 PyTorchSimFrontend/mlir/mlir_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 2e5eee0e..1ecf6954 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -437,7 +437,7 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
                     raise NotImplementedError("Not supporting type...")
 
         vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
-        vlane_stride = 8
+        vlane_stride = 2
 
         # FIXME: Naive decrease tile size
         def decrease_tile_size(tile_size, vlane_split_axis):

From bca7ba07c0ed9a618d1170242b0a3f146e62d890 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Sat, 14 Jun 2025 09:47:01 +0000
Subject: [PATCH 325/432] [Frontend/Fusion] Fix step size of epilogue

---
 PyTorchSimFrontend/mlir/mlir_common.py   | 5 ++++-
 PyTorchSimFrontend/mlir/mlir_template.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 1ecf6954..2035ed6a 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -178,9 +178,10 @@ def set_info(outer, inner, arg_type):
         return arg_defs, call_args, arg_attributes, buffer_types
 
 class MLIRMultiDimTile():
-    def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None):
+    def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None, vec_size=None):
         self._tile_size = list(tile_size)
         self.tile_axis_order = list(range(len(tile_size)))
+        self.vec_size = vec_size
 
         # Vector lane mapping config
         self.vector_lane = vector_lane
@@ -272,6 +273,8 @@ def get_vlane_stride(self):
 
     def get_compute_vec_size(self):
         # Granule size used in compute loop
+        if self.vec_size is not None:
+            return self.vec_size
         if self.nr_rdim:
             assert self.nr_rdim==1
             val = self.get_numel_per_lane() // self._tile_size[-1]
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 1b36c0f9..946c26db 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -626,7 +626,7 @@ def set_tile_size(self, template_store_info):
         tile_desc = mlir_common.MLIRMultiDimTile(template_store_info['tile_size'],
             self.vector_lane,
             vlane_split_axis=template_store_info['vlane_split_axis'],
-            vlane_stride=template_store_info['vlane_stride'])
+            vlane_stride=template_store_info['vlane_stride'], vec_size=64)
         self.compute_body_loop.size = tile_desc.get_numel_per_lane()
         self.compute_body_loop.step = tile_desc.get_compute_vec_size()
         return tile_desc

From b6e1020e6de9ab15ef868f44bd86b72b91189889 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 13 Jun 2025 04:18:24 +0000
Subject: [PATCH 326/432] [Tests] Add Matmul reduction fusion case

---
 tests/Fusion/test_matmul_reduction.py | 46 +++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 tests/Fusion/test_matmul_reduction.py

diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py
new file mode 100644
index 00000000..289d2ebd
--- /dev/null
+++ b/tests/Fusion/test_matmul_reduction.py
@@ -0,0 +1,46 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_matmul_reduce(device):
+    def matmul_fused(a, b, c):
+        result = torch.matmul(a, b)
+        return result, result.max(dim=-1).values
+    torch.manual_seed(0)
+    input = torch.randn(512, 128)
+    weight = torch.randn(128, 512)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    c = 7
+    opt_fn = torch.compile(dynamic=False)(matmul_fused)
+    res = opt_fn(x1, w1, c)
+    y = matmul_fused(x2, w2, c)
+    test_result("Matmul Scalar Fusion Forward", res[0], y[0])
+    test_result("Matmul Scalar Fusion Forward", res[1], y[1])
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_matmul_reduce(device)

From 6677057857c13d5ccf90992fea62b8abd76a916f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 13 Jun 2025 07:35:46 +0000
Subject: [PATCH 327/432] [Frontend/Fusion] Add matmul reduction fusion beta
 version

---
 .../mlir/mlir_codegen_backend.py              |   7 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  74 +++++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  12 +-
 PyTorchSimFrontend/mlir/mlir_template.py      | 216 +++++++++++++++++-
 4 files changed, 295 insertions(+), 14 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 07c2f7a5..91072a18 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1155,8 +1155,7 @@ def store_reduction(self, name, index, value):
             vshape = f"{mlir_dtype}"
         else:
             vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var,
-                                                                         index, buffer=self.reduction_suffix)
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
@@ -1596,7 +1595,7 @@ def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape
         tile_size = max(tile_size_per_lane, 2) * self.vector_lane
 
         if buffer is None:
-            buffer = self.loads
+            buffer = self.spad_buffer
 
         if name not in self.global_vars_dict:
             self.global_vars_dict[name] = list()
@@ -1610,7 +1609,7 @@ def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape
             self.global_vars_dict[name].append(str(raw_index))
         else:
             new_name = f"{name}_{self.global_vars_dict[name].index(str(raw_index))}"
-        sram_var = self.spad_cse.generate(self.spad_buffer, f"memref.get_global @{new_name}_spad : {dram_tile_shape}")
+        sram_var = self.spad_cse.generate(buffer, f"memref.get_global @{new_name}_spad : {dram_tile_shape}")
 
         zero_cse = self.get_const_cse(0)
         sram_dims = len(dram_tile_shape.split("x")) - 1
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 290c61b9..1d14f6c6 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -84,6 +84,74 @@
 }
 """
 
+GEMM_REDUCTION_TEMPLATE = r"""
+// GEMM kernel
+// M = {{ M }}
+// N = {{ N }}
+// K = {{ K }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// SUB_TILE_M = {{ SUB_TILE_M }}
+// SUB_TILE_N = {{ SUB_TILE_N }}
+#map0 = affine_map<(d0, d1) -> ({{ X_map }})>
+#map1 = affine_map<(d0, d1) -> ({{ W_map }})>
+#map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
+memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
+  %c_mvin = arith.constant 2 : index
+  %c_mvin2 = arith.constant 1 : index{% if Bias %}
+  %c_mvin3 = arith.constant 14 : index{% endif %}
+  %c_mvout = arith.constant 3 : index
+  %vstride = arith.constant 1 : index
+  %axis = arith.constant 1 : index
+  %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+  %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+  %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+  %tag = memref.alloc() : memref<1xi32>
+  %tag0 = memref.alloc() : memref<1xi32>
+  %tag1 = memref.alloc() : memref<1xi32>
+  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+  %c0 = arith.constant 0 : index
+{{ kernel.def_local_vars() }}
+
+  affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
+    {{kernel.reduction_acc()}} affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {{kernel.reduction_iter_arg()}} {
+      %index2 = affine.apply #map2(%t_m, %t_n)
+      {%- if Bias %}
+      memref.dma_start %Bias[
+        {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
+        ], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
+        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
+        , %vstride : memref<
+        {%- if Bias_rank == 2 -%}  {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
+        xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
+      {%- else %}
+      affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+      {%- endif %}
+      affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
+        %index0 = affine.apply #map0(%t_m, %t_k)
+        %index1 = affine.apply #map1(%t_k, %t_n)
+        memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
+           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+        memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
+           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+        linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
+                outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
+      } { accumulation_loop=true }
+      {{kernel.store_output(indent_size=6)}}
+    } { outer_loop=true }
+    {{kernel.reduction_output(indent_size=4)}}
+  } { outer_loop=true }
+  return
+}
+"""
+
 class MLIRGemmTemplate(MLIRTemplate):
     def __init__(self, input_nodes, layout, input_reorder=None):
         super().__init__("kernel", input_nodes, layout, input_reorder)
@@ -116,6 +184,9 @@ def render(self,
         if (M == 0) or (N == 0) or (K == 0):
             TILE_M, TILE_N, TILE_K = 1, 1, 1
             template = EMPTY_TEMPLATE
+        elif n_extra_node==1 and epilogue_nodes[0].is_reduction():
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node, min_tile=True)
+            template = GEMM_REDUCTION_TEMPLATE
         else:
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node, min_tile=True)
             template = GEMM_TEMPLATE
@@ -170,7 +241,8 @@ def render(self,
             mlir_dtype = kernel.render_options['DATA_STYPE'],
             dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
             tile_size = (TILE_M, TILE_N),
-            tile_stride = [1, TILE_M]
+            tile_stride = [1, TILE_M],
+            nr_rdim = '1'
         )
         code = self._template_from_string(template).render(**kernel.render_options)
         kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index c90b20b8..f286b2f5 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -4,7 +4,7 @@
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
 from torch._inductor import config
-from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode
+from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode, BaseSchedulerNode
 from torch._inductor.utils import IndentedBuffer
 from torch._inductor.virtualized import V
 
@@ -16,12 +16,22 @@ class MLIRScheduling(BaseScheduling):
     target_kernel = MLIRKernel
     def __init__(self, scheduler):
         self.scheduler = scheduler
+        self.scheduler.can_fuse = self.can_fuse_with_exceptions
         self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
         self._ready_to_flush = False
         self.outer_function = set()
         config.inplace_buffers = False # FIXME. inout kernel makes trouble.. So disabled it!
         self.max_fusion_size = 5
 
+    def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool:
+        if node1.get_device() == node2.get_device():
+            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
+            # For matmul+reduction case
+            if node1.is_template() and isinstance(node1.node.template, MLIRGemmTemplate) and node2.is_reduction():
+                possible = node1.node.get_size()[:-1] == node2.node.get_size()
+                return True
+        return self.scheduler.can_fuse(node1, node2)
+
     def _set_flush_status(self, status: bool):
         self._ready_to_flush = status
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 946c26db..b8b62cb1 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -20,7 +20,7 @@
 
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
 from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo
-from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
+from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, reduction_partial_combine_vec, reduction_combine_vec, is_welford_reduction
 from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode
 
 from . import mlir_common
@@ -54,10 +54,15 @@ def __init__(self,
         self.map_cse = CSE("#", self.suffix, name_prefix="template_map")
         self.const_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_const")
         self.alloc_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_alloc")
+        self.reduction_epilogue_suffix = IndentedBuffer()
+        self.reduction_body_loop = None # For reduction fusion
+        self.reduction_idx = "reduction_idx"
 
         # Overwrite ops
         self.load = self.load_epilogue
         self.store = self.store_epilogue
+        self.store_reduction = self.store_reduction_epilogue
+        self.reduction = self.reduction_epilogue
 
     def add_loop_info(self, mat_size, tile_size):
         for idx, (loop_size, stride) in enumerate(zip(mat_size, tile_size)):
@@ -340,6 +345,9 @@ def template_store():
         compute_body = mlir_common.ParallelLoopBuffer()
         with contextlib.ExitStack() as stack:
             stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
+            if self.reduction_body_loop is not None:
+                compute_body.writelines(self.reduction_body_loop.lines())
+                stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.reduction_body_loop.epilogue_line()))
             compute_body.splice(self.loads)
             compute_body.splice(self.compute)
             if len(self.stores._lines) == 0:
@@ -347,6 +355,7 @@ def template_store():
             compute_body.splice(self.stores)
         self.body.splice(compute_body)
         self.body.splice(self.dma_stores)
+        self.body.splice(self.reduction_epilogue_suffix)
         self.loads.clear()
         self.compute.clear()
         self.stores.clear()
@@ -481,6 +490,37 @@ def hook():
         self.render_hooks.move_to_end("<STORE_OUTPUT>", last=False) # Force order to be triggered first
         return "<STORE_OUTPUT>"
 
+    def reduction_output(self, indent_size: int = 0):
+        def hook():
+            return textwrap.indent(self.reductions_suffix.getvalue(), " "*indent_size).strip()
+
+        assert "<REDUCTION_OUTPUT>" not in self.render_hooks
+        self.render_hooks["<REDUCTION_OUTPUT>"] = hook
+        return "<REDUCTION_OUTPUT>"
+
+    def reduction_iter_arg(self):
+        def hook():
+            if len(self.reduction_vars):
+                args = ', '.join([f"%{iter.name} = %{init.name}" for (_, iter, init, _) in self.reduction_vars.values()])
+                dtype = ', '.join([f"{dtype}" for (_, _, _, dtype) in self.reduction_vars.values()])
+                return f"iter_args({args}) -> ({dtype})"
+            return ""
+
+        assert "<REDUCTION_ITER_ARG>" not in self.render_hooks
+        self.render_hooks["<REDUCTION_ITER_ARG>"] = hook
+        return "<REDUCTION_ITER_ARG>"
+
+    def reduction_acc(self):
+        def hook():
+            if len(self.reduction_vars):
+                acc = ', '.join([f"%{acc.name}" for acc in self.reduction_vars.keys()])
+                return f"{acc} ="
+            return ""
+
+        assert "<REDUCTION_ACC>" not in self.render_hooks
+        self.render_hooks["<REDUCTION_ACC>"] = hook
+        return "<REDUCTION_ACC>"
+
     def def_function(self):
         _, call_args, _ = self.kernel_group.args.python_argdefs()
         if self.outer_func_render is not None:
@@ -561,7 +601,10 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         # Load vector from sram
         sram_var = self.buffer_names[name]
         zero_var = self.get_const_cse(0)
-        compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
+        if self.reduction_body_loop is None:
+            compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
+        else:
+            compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-2) + [f"%{self.reduction_idx}"] + [f"%{self.compute_idx}"])
         if compute_vec_size > 1:
             operation = "affine.vector_load"
             line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
@@ -603,7 +646,6 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         _, operand_type = self.var_info[value]
         if mlir_dtype != operand_type:
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
-
         compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
         # Generate vector load instruction
         if compute_vec_size > 1:
@@ -619,16 +661,174 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
                                  f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.dma_stores.writeline(DeferredLine(name, code))
 
-    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index):
-        return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, True)
+    def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
+        argmax_or_argmin = reduction_type in {"argmax", "argmin"}
+        if argmax_or_argmin or is_welford_reduction(reduction_type):
+            raise NotImplementedError() #TODO: argmin, argmax
+
+        # Prepare reduction loop
+        reduction_key = src_dtype, reduction_type, value
+        acc = self.reduction_cse.generate(
+            self.loads, f"reduction {reduction_key}", write=False
+        )
+        iterator = self.iterator_cse.generate(
+            self.loads, f"reduction {reduction_key}", write=False
+        )
+        init = self.init_cse.generate(
+            self.loads, f"reduction {reduction_key}", write=False
+        )
+        init_vec = self.init_vec_cse.generate(
+            self.loads, f"reduction {reduction_key}", write=False
+        )
+        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
+        init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
+        vec_len = self.kernel_group.tile_desc.get_compute_vec_size()
+        reduced_shape = self.kernel_group.tile_desc.get_mlir_vshape(type_name)
+
+        # Set accumulation var
+        if vec_len == 1: # 1-D vector to scalar
+            # Edge case for scalar
+            init_vec = init
+        else:
+            # Adjust shape and inital value
+            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
+        acc_var = init_vec
+
+        # Reduction body prepare
+        body_acc = self.reduction_cse.generate(
+            self.compute, f"reduction {reduction_key}body_acc", write=False
+        )
+        body_iter_arg = self.iterator_cse.generate(
+            self.compute, f"reduction {reduction_key}body_iter_arg", write=False
+        )
+        self.register_var_info(body_iter_arg, [vec_len, type_name])
+
+        self.reduction_vars[acc] = (reduction_type, iterator, acc_var, reduced_shape)
+        self.affine_yield[body_acc] = reduced_shape
+        self.reduction_cse.reduction_cache[reduction_key] = acc
+        self.iterator_cse.reduction_cache[reduction_key] = iterator
+        self.init_cse.reduction_cache[reduction_key] = init_vec
+
+        # Reduction body codegen
+        result = reduction_partial_combine_vec(reduction_type, value, body_iter_arg)
+        self.compute_body_loop.reduction_vars[body_acc] = (reduction_type, body_iter_arg, iterator, reduced_shape)
+        self.compute_body_loop.affine_yield[result] = reduced_shape
+
+        # Final reduction
+        reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-1]
+        assert(vec_len % reduction_size==0)
+        if vec_len > reduction_size:
+            init = self.const_cse.generate(self.reductions_suffix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
+            if reduction_size == 1:
+                final_reduced_shape = f"{type_name}"
+                out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, acc, init, axis=0, shape=reduced_shape, reduced_shape=final_reduced_shape))
+            else:
+                final_reduced_shape = f"vector<{reduction_size}x{type_name}>"
+                init_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{init} : {type_name} to {final_reduced_shape}")
+                new_vshape= f"vector<{reduction_size}x{vec_len//reduction_size}x{type_name}>"
+                value = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{acc} : {reduced_shape} to {new_vshape}")
+                out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, value, init_vec, axis=1, shape=new_vshape, reduced_shape=final_reduced_shape))
+            acc = out
+
+        # reigster reduction output
+        var_info = [reduction_size, mlir_common.DTYPE_TO_MLIR[dtype]]
+        self.register_var_info(acc, var_info)
+
+        # Specail handling for fusion
+        self.reduction_epilogue_suffix.writeline(f"affine.yield %{body_acc} : {self.affine_yield[body_acc]}")
+        self.reduction_body_loop.affine_yield = dict(self.compute_body_loop.affine_yield)
+        self.compute_body_loop.affine_yield.clear()
+
+        reduction_attr= self.compute_body_loop.reduction_vars[body_acc]
+        reduction_key = "reduction_epilogue"
+        new_body_acc = self.reduction_cse.generate(
+            self.compute, f"reduction {reduction_key}body_acc", write=False
+        )
+        body_iter_arg = self.iterator_cse.generate(
+            self.compute, f"reduction {reduction_key}body_iter_arg", write=False
+        )
+        iterator = self.iterator_cse.generate(
+            self.loads, f"reduction {reduction_key}", write=False
+        )
+
+        self.reduction_body_loop.reduction_vars[new_body_acc] = (reduction_attr[0], reduction_attr[1], body_iter_arg, reduction_attr[3])
+        self.compute_body_loop.reduction_vars[body_acc] = (reduction_attr[0], body_iter_arg, reduction_attr[2], reduction_attr[3])
+        self.compute_body_loop.affine_yield[new_body_acc] = reduction_attr[3]
+        return acc
+
+    def store_reduction_epilogue(self, name, index, value):
+        index = "t_n" # TODO. conversion required...
+        tmp_cse = self.cse
+        self.cse = self.reduction_cse
+
+        dram_var = self.kernel_group.args.output(name)
+        dtype = V.graph.get_dtype(name)
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        index = self.rename_indexing(index)
+
+        # Tile is always reuduced in inner loop
+        numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
+        reduction_axis_size = self.kernel_group.tile_desc.get_tile_size()[-1]
+        nr_outer_loop = numel_per_lane // reduction_axis_size
+
+        vlane_split_axis = 0
+        vlane_stride = self.kernel_group.tile_desc.vlane_stride
+        tile_numel_per_lane = vlane_stride * nr_outer_loop
+
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
+        tile_shape = f"memref<{self.kernel_group.tile_desc.get_tile_size()[0]}x{mlir_dtype}, 1>"
+        tile_stride = [1]
+        compute_vec_size = self.var_info[value][0]
+        if compute_vec_size == 1:
+            vshape = f"{mlir_dtype}"
+        else:
+            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index,
+                                                                         index, buffer=self.const_buffer)
+
+        if self.welford_reduce_out is not None:
+            raise NotImplementedError()
+
+        # Select src type
+        if compute_vec_size == 1:
+            operation = "affine.store"
+            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}"
+        else:
+            operation =  "affine.vector_store"
+            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
+        self.reductions_suffix.writeline(DeferredLine(name, line))
+
+        # MVOUT Encoding
+        # Generate DMA instruction
+        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                 f"{name}_tag", dram_shape, tile_shape, tile_stride)
+        self.reductions_suffix.writeline(DeferredLine(name, code))
+
+        # Restore origin cse
+        self.cse = tmp_cse
+
+    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, buffer=None):
+        return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, True, buffer=buffer)
 
     def set_tile_size(self, template_store_info):
         tile_desc = mlir_common.MLIRMultiDimTile(template_store_info['tile_size'],
             self.vector_lane,
             vlane_split_axis=template_store_info['vlane_split_axis'],
-            vlane_stride=template_store_info['vlane_stride'], vec_size=64)
-        self.compute_body_loop.size = tile_desc.get_numel_per_lane()
-        self.compute_body_loop.step = tile_desc.get_compute_vec_size()
+            vlane_stride=template_store_info['vlane_stride'])
+
+        if 'nr_rdim' in template_store_info:
+            tile_desc.nr_rdim = 1
+            numel_per_lane = tile_desc.get_numel_per_lane()
+            reduction_axis_size = tile_desc.get_tile_size()[-1]
+            nr_outer_loop = numel_per_lane // reduction_axis_size
+
+            self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_idx, nr_outer_loop, 0 , nr_outer_loop)
+            self.compute_body_loop.size = reduction_axis_size
+            self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop
+        else:
+            tile_desc.vec_size=64
+            self.compute_body_loop.size = tile_desc.get_numel_per_lane()
+            self.compute_body_loop.step = tile_desc.get_compute_vec_size()
         return tile_desc
 
 class MLIRTemplateCaller(CUDATemplateCaller):

From 21ee34dc391913343722ccf3679411e867e1241b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 13 Jun 2025 15:04:19 +0000
Subject: [PATCH 328/432] [Frontend/Fusion] Implement matmul+reduction fusion

---
 PyTorchSimFrontend/extension_codecache.py  |  2 ++
 PyTorchSimFrontend/mlir/mlir_scheduling.py |  3 ++-
 PyTorchSimFrontend/mlir/mlir_template.py   | 21 +++++++++++++++++----
 tests/Fusion/test_matmul_reduction.py      |  9 ++++++---
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index b56d8295..e4101e66 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -67,6 +67,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size}" \
             -convert-linalg-to-loops \
+            -convert-vector-to-scf='full-unroll' \
             -lower-affine \
             -finalize-memref-to-llvm \
             -lower-vector-multi-reduction \
@@ -108,6 +109,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             -test-tile-operation-graph='vectorlane={vectorlane_size} tls_mode={extension_config.CONFIG_TLS_MODE}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
             -convert-linalg-to-loops \
+            -convert-vector-to-scf='full-unroll' \
             -lower-affine \
             -finalize-memref-to-llvm \
             -lower-vector-multi-reduction \
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index f286b2f5..f92b3114 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -16,6 +16,7 @@ class MLIRScheduling(BaseScheduling):
     target_kernel = MLIRKernel
     def __init__(self, scheduler):
         self.scheduler = scheduler
+        self.scheduler.can_fuse_origin = self.scheduler.can_fuse
         self.scheduler.can_fuse = self.can_fuse_with_exceptions
         self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
         self._ready_to_flush = False
@@ -30,7 +31,7 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
             if node1.is_template() and isinstance(node1.node.template, MLIRGemmTemplate) and node2.is_reduction():
                 possible = node1.node.get_size()[:-1] == node2.node.get_size()
                 return True
-        return self.scheduler.can_fuse(node1, node2)
+        return self.scheduler.can_fuse_origin(node1, node2)
 
     def _set_flush_status(self, status: bool):
         self._ready_to_flush = status
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index b8b62cb1..a64ae7a7 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -604,15 +604,22 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         if self.reduction_body_loop is None:
             compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
         else:
+            reduce_size = self.reduction_body_loop.size
             compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-2) + [f"%{self.reduction_idx}"] + [f"%{self.compute_idx}"])
+            vshape = f"vector<{reduce_size}x{compute_vec_size//reduce_size}x{mlir_dtype}>"
+
         if compute_vec_size > 1:
-            operation = "affine.vector_load"
-            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+            pad = self.const_cse.generate(self.const_buffer, f"arith.constant 0.0 : {mlir_dtype}")
+            operation = "vector.transfer_read"
+            line = f"{operation} %{sram_var}[{compute_index_var}], %{pad} : {tile_shape}, {vshape}"
         else:
             operation = "affine.load"
             line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
 
         out = self.cse.generate(self.loads, line)
+        if self.reduction_body_loop is not None:
+            new_vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
+            out = self.cse.generate(self.loads, f"vector.shape_cast %{out} : {vshape} to {new_vshape}")
         self.register_var_info(out, [compute_vec_size, mlir_dtype])
         return out
 
@@ -718,7 +725,7 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-1]
         assert(vec_len % reduction_size==0)
         if vec_len > reduction_size:
-            init = self.const_cse.generate(self.reductions_suffix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
+            init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
             if reduction_size == 1:
                 final_reduced_shape = f"{type_name}"
                 out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, acc, init, axis=0, shape=reduced_shape, reduced_shape=final_reduced_shape))
@@ -726,8 +733,14 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
                 final_reduced_shape = f"vector<{reduction_size}x{type_name}>"
                 init_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{init} : {type_name} to {final_reduced_shape}")
                 new_vshape= f"vector<{reduction_size}x{vec_len//reduction_size}x{type_name}>"
+                partial_vshape= f"vector<{vec_len//reduction_size}x{type_name}>"
                 value = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{acc} : {reduced_shape} to {new_vshape}")
-                out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, value, init_vec, axis=1, shape=new_vshape, reduced_shape=final_reduced_shape))
+                # FIXME: I want to use an N-rank multi-reduction here, but we can't: it is currently lowered to scalar operations.
+                for i in range(reduction_size):
+                    partial_value = self.cse.generate(self.reductions_suffix, f"vector.extract %{value}[{i}] : {partial_vshape} from {new_vshape}")
+                    out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, partial_value, init, axis=0, shape=partial_vshape, reduced_shape=type_name))
+                    init_vec = self.cse.generate(self.reductions_suffix, f"vector.insert %{out}, %{init_vec}[{i}] : {type_name} into {final_reduced_shape}")
+                out = init_vec
             acc = out
 
         # reigster reduction output
diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py
index 289d2ebd..9b7afca1 100644
--- a/tests/Fusion/test_matmul_reduction.py
+++ b/tests/Fusion/test_matmul_reduction.py
@@ -20,10 +20,13 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
 def test_matmul_reduce(device):
     def matmul_fused(a, b, c):
         result = torch.matmul(a, b)
-        return result, result.max(dim=-1).values
+        return result, result.max(dim=-2).values
     torch.manual_seed(0)
-    input = torch.randn(512, 128)
-    weight = torch.randn(128, 512)
+    input = torch.randn(512, 256)
+    weight = torch.randn(256, 512)
+    #N = 256
+    #input = torch.arange(1, N * N + 1, dtype=torch.float32).reshape(N, N).to(dtype=torch.float32)
+    #weight = torch.eye(256, dtype=torch.float32)
     x1 = input.to(device=device)
     w1 = weight.to(device=device)
     x2 = input.to("cpu")

From a5f26606a0842e087001044a30a274afa055b506 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 13 Jun 2025 15:20:37 +0000
Subject: [PATCH 329/432] [Frontend/Fusion] Fix node accessing

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index f92b3114..ffc2d88f 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -28,7 +28,7 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
         if node1.get_device() == node2.get_device():
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             # For matmul+reduction case
-            if node1.is_template() and isinstance(node1.node.template, MLIRGemmTemplate) and node2.is_reduction():
+            if node1.is_template() and len(node1.get_nodes())==1 and isinstance(node1.node.template, MLIRGemmTemplate) and node2.is_reduction():
                 possible = node1.node.get_size()[:-1] == node2.node.get_size()
                 return True
         return self.scheduler.can_fuse_origin(node1, node2)
@@ -62,8 +62,8 @@ def can_fuse_horizontal(self, node1, node2):
         if node1.is_template() or node2.is_template():
             # Don't fuse maxpool template code
             from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
-            if node1.is_template() and isinstance(node1.node.template, MLIRMaxPoolTemplate) or \
-                node2.is_template() and isinstance(node2.node.template, MLIRMaxPoolTemplate):
+            if node1.is_template() and len(node1.get_nodes())==1 and isinstance(node1.node.template, MLIRMaxPoolTemplate) or \
+                node2.is_template() and len(node1.get_nodes())==1 and isinstance(node2.node.template, MLIRMaxPoolTemplate):
                 return False
 
             # Different layout is not supported

From f5d6e5bb95df44193e37ef8292e352c1498fd275 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 14 Jun 2025 05:07:34 +0000
Subject: [PATCH 330/432] [Frontend/Fusion] Fix minor bugs for matmul+reduction

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  4 +++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  4 ++--
 PyTorchSimFrontend/mlir/mlir_template.py      | 12 +++++++++---
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 1d14f6c6..de9d1f47 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -181,12 +181,14 @@ def render(self,
 
         M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1]
         n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
+        nr_rdim = 0
         if (M == 0) or (N == 0) or (K == 0):
             TILE_M, TILE_N, TILE_K = 1, 1, 1
             template = EMPTY_TEMPLATE
         elif n_extra_node==1 and epilogue_nodes[0].is_reduction():
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node, min_tile=True)
             template = GEMM_REDUCTION_TEMPLATE
+            nr_rdim = 1
         else:
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node, min_tile=True)
             template = GEMM_TEMPLATE
@@ -242,7 +244,7 @@ def render(self,
             dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
             tile_size = (TILE_M, TILE_N),
             tile_stride = [1, TILE_M],
-            nr_rdim = '1'
+            nr_rdim = nr_rdim
         )
         code = self._template_from_string(template).render(**kernel.render_options)
         kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index ffc2d88f..cec882fc 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -28,9 +28,9 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
         if node1.get_device() == node2.get_device():
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             # For matmul+reduction case
-            if node1.is_template() and len(node1.get_nodes())==1 and isinstance(node1.node.template, MLIRGemmTemplate) and node2.is_reduction():
+            if node1.is_template() and len(node1.get_nodes())==1 and isinstance(node1.node.template, MLIRGemmTemplate) and node2.is_reduction() and len(node2.get_nodes())==1:
                 possible = node1.node.get_size()[:-1] == node2.node.get_size()
-                return True
+                return possible
         return self.scheduler.can_fuse_origin(node1, node2)
 
     def _set_flush_status(self, status: bool):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index a64ae7a7..8d6b4b29 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -348,17 +348,23 @@ def template_store():
             if self.reduction_body_loop is not None:
                 compute_body.writelines(self.reduction_body_loop.lines())
                 stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.reduction_body_loop.epilogue_line()))
+                if (self.compute.getvalue()==''):
+                    print('here')
             compute_body.splice(self.loads)
             compute_body.splice(self.compute)
             if len(self.stores._lines) == 0:
                 template_store()
             compute_body.splice(self.stores)
-        self.body.splice(compute_body)
+        if (compute_body.getvalue()):
+            self.body.splice(compute_body)
         self.body.splice(self.dma_stores)
         self.body.splice(self.reduction_epilogue_suffix)
+
+        # Clear buffers
         self.loads.clear()
         self.compute.clear()
         self.stores.clear()
+        self.reduction_body_loop = None
 
     def def_kernel(
         self,
@@ -829,11 +835,11 @@ def set_tile_size(self, template_store_info):
             vlane_split_axis=template_store_info['vlane_split_axis'],
             vlane_stride=template_store_info['vlane_stride'])
 
-        if 'nr_rdim' in template_store_info:
+        if 'nr_rdim' in template_store_info and template_store_info['nr_rdim']==1:
             tile_desc.nr_rdim = 1
             numel_per_lane = tile_desc.get_numel_per_lane()
             reduction_axis_size = tile_desc.get_tile_size()[-1]
-            nr_outer_loop = numel_per_lane // reduction_axis_size
+            nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
 
             self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_idx, nr_outer_loop, 0 , nr_outer_loop)
             self.compute_body_loop.size = reduction_axis_size

From 45a6d92c201b7d9ee217b34754b8fdbedc5a5e22 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 14 Jun 2025 06:23:54 +0000
Subject: [PATCH 331/432] [Frontend/Fusion] Fix matmul-reduction fusion for
 non-square shape

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py      | 4 +++-
 PyTorchSimFrontend/mlir/mlir_template.py        | 8 ++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 91072a18..371e56a9 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1109,7 +1109,7 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         self.compute_body_loop.affine_yield[result] = reduced_shape
 
         # Final reduction
-        reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-1]
+        reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-2]
         assert(vec_len % reduction_size==0)
         if vec_len > reduction_size:
             init = self.const_cse.generate(self.reductions_suffix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index cec882fc..abf8919d 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -29,7 +29,9 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             # For matmul+reduction case
             if node1.is_template() and len(node1.get_nodes())==1 and isinstance(node1.node.template, MLIRGemmTemplate) and node2.is_reduction() and len(node2.get_nodes())==1:
-                possible = node1.node.get_size()[:-1] == node2.node.get_size()
+                reduction_axis = node2.node.origin_node.args[1]
+                output_dims = len(node1.node.get_size())
+                possible = node1.node.get_size()[:-1] == node2.node.get_size() and ((reduction_axis==0 and output_dims==2) or (reduction_axis==1 and output_dims==3))
                 return possible
         return self.scheduler.can_fuse_origin(node1, node2)
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 8d6b4b29..a2722496 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -728,7 +728,7 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         self.compute_body_loop.affine_yield[result] = reduced_shape
 
         # Final reduction
-        reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-1]
+        reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-2]
         assert(vec_len % reduction_size==0)
         if vec_len > reduction_size:
             init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
@@ -787,7 +787,7 @@ def store_reduction_epilogue(self, name, index, value):
 
         # Tile is always reuduced in inner loop
         numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
-        reduction_axis_size = self.kernel_group.tile_desc.get_tile_size()[-1]
+        reduction_axis_size = self.kernel_group.tile_desc.get_tile_size()[-2]
         nr_outer_loop = numel_per_lane // reduction_axis_size
 
         vlane_split_axis = 0
@@ -795,7 +795,7 @@ def store_reduction_epilogue(self, name, index, value):
         tile_numel_per_lane = vlane_stride * nr_outer_loop
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-        tile_shape = f"memref<{self.kernel_group.tile_desc.get_tile_size()[0]}x{mlir_dtype}, 1>"
+        tile_shape = f"memref<{self.kernel_group.tile_desc.get_tile_size()[1]}x{mlir_dtype}, 1>"
         tile_stride = [1]
         compute_vec_size = self.var_info[value][0]
         if compute_vec_size == 1:
@@ -838,7 +838,7 @@ def set_tile_size(self, template_store_info):
         if 'nr_rdim' in template_store_info and template_store_info['nr_rdim']==1:
             tile_desc.nr_rdim = 1
             numel_per_lane = tile_desc.get_numel_per_lane()
-            reduction_axis_size = tile_desc.get_tile_size()[-1]
+            reduction_axis_size = tile_desc.get_tile_size()[-2]
             nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
 
             self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_idx, nr_outer_loop, 0 , nr_outer_loop)

From 2437850b36d686b22f19bfa5fcf46a14b02329e0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 14 Jun 2025 12:06:54 +0000
Subject: [PATCH 332/432] [Frontend/Fusion] Fix matmul+reduction bugs

---
 .../mlir/mlir_codegen_backend.py              |  2 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  6 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  4 +
 PyTorchSimFrontend/mlir/mlir_template.py      | 80 ++++++++-----------
 tests/Fusion/test_matmul_reduction.py         | 14 ++--
 5 files changed, 50 insertions(+), 56 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 371e56a9..91072a18 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1109,7 +1109,7 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         self.compute_body_loop.affine_yield[result] = reduced_shape
 
         # Final reduction
-        reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-2]
+        reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-1]
         assert(vec_len % reduction_size==0)
         if vec_len > reduction_size:
             init = self.const_cse.generate(self.reductions_suffix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index de9d1f47..5257201d 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -143,11 +143,11 @@
            : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-      } { accumulation_loop=true }
+      } { accumulation_loop=true, loop_k=true }
       {{kernel.store_output(indent_size=6)}}
-    } { outer_loop=true }
+    } { outer_loop=true, loop_m=true}
     {{kernel.reduction_output(indent_size=4)}}
-  } { outer_loop=true }
+  } { outer_loop=true, loop_n=true }
   return
 }
 """
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index abf8919d..97b57495 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -30,7 +30,11 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
             # For matmul+reduction case
             if node1.is_template() and len(node1.get_nodes())==1 and isinstance(node1.node.template, MLIRGemmTemplate) and node2.is_reduction() and len(node2.get_nodes())==1:
                 reduction_axis = node2.node.origin_node.args[1]
+                if isinstance(reduction_axis, list):
+                    reduction_axis = reduction_axis[0]
                 output_dims = len(node1.node.get_size())
+                if reduction_axis < 0:
+                    reduction_axis = output_dims + reduction_axis
                 possible = node1.node.get_size()[:-1] == node2.node.get_size() and ((reduction_axis==0 and output_dims==2) or (reduction_axis==1 and output_dims==3))
                 return possible
         return self.scheduler.can_fuse_origin(node1, node2)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index a2722496..da8c2575 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -55,8 +55,7 @@ def __init__(self,
         self.const_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_const")
         self.alloc_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_alloc")
         self.reduction_epilogue_suffix = IndentedBuffer()
-        self.reduction_body_loop = None # For reduction fusion
-        self.reduction_idx = "reduction_idx"
+        self.reduction_fusion = False
 
         # Overwrite ops
         self.load = self.load_epilogue
@@ -345,11 +344,6 @@ def template_store():
         compute_body = mlir_common.ParallelLoopBuffer()
         with contextlib.ExitStack() as stack:
             stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
-            if self.reduction_body_loop is not None:
-                compute_body.writelines(self.reduction_body_loop.lines())
-                stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.reduction_body_loop.epilogue_line()))
-                if (self.compute.getvalue()==''):
-                    print('here')
             compute_body.splice(self.loads)
             compute_body.splice(self.compute)
             if len(self.stores._lines) == 0:
@@ -364,7 +358,6 @@ def template_store():
         self.loads.clear()
         self.compute.clear()
         self.stores.clear()
-        self.reduction_body_loop = None
 
     def def_kernel(
         self,
@@ -607,23 +600,38 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         # Load vector from sram
         sram_var = self.buffer_names[name]
         zero_var = self.get_const_cse(0)
-        if self.reduction_body_loop is None:
+        if not self.reduction_fusion:
             compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
-        else:
-            reduce_size = self.reduction_body_loop.size
-            compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-2) + [f"%{self.reduction_idx}"] + [f"%{self.compute_idx}"])
-            vshape = f"vector<{reduce_size}x{compute_vec_size//reduce_size}x{mlir_dtype}>"
-
-        if compute_vec_size > 1:
-            pad = self.const_cse.generate(self.const_buffer, f"arith.constant 0.0 : {mlir_dtype}")
-            operation = "vector.transfer_read"
-            line = f"{operation} %{sram_var}[{compute_index_var}], %{pad} : {tile_shape}, {vshape}"
-        else:
-            operation = "affine.load"
-            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
+            if compute_vec_size > 1:
+                operation = "affine.vector_load"
+                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+            else:
+                operation = "affine.load"
+                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
+            out = self.cse.generate(self.loads, line)
+        else: # For reduction case
+            reduce_size = self.reduction_nr_outer_loop
+            vsize = compute_vec_size//reduce_size
+            vshape = f"vector<{vsize}x{mlir_dtype}>"
+            tshape = f"vector<{reduce_size}x{vsize}x{mlir_dtype}>"
+
+            init = self.cse.generate(self.loads, f"arith.constant 0.0 : {mlir_dtype}")
+            init_vec = self.cse.generate(self.loads, f"vector.broadcast %{init} : {mlir_dtype} to {tshape}")
+            if compute_vec_size > 1:
+                for i in range(reduce_size):
+                    offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0) -> (d0 + {i*(self.reduction_axis_size)})>(%{self.compute_idx})")
+                    compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"])
+                    operation = "affine.vector_load"
+                    line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+                    out = self.cse.generate(self.loads, line)
+                    init_vec = self.cse.generate(self.loads, f"vector.insert %{out}, %{init_vec}[{i}] : {vshape} into {tshape}")
+                out = init_vec
+                vshape = tshape
+            else:
+                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
+                out = self.cse.generate(self.loads, line)
 
-        out = self.cse.generate(self.loads, line)
-        if self.reduction_body_loop is not None:
+        if self.reduction_fusion:
             new_vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
             out = self.cse.generate(self.loads, f"vector.shape_cast %{out} : {vshape} to {new_vshape}")
         self.register_var_info(out, [compute_vec_size, mlir_dtype])
@@ -728,8 +736,7 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         self.compute_body_loop.affine_yield[result] = reduced_shape
 
         # Final reduction
-        reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_tile_size()[-2]
-        assert(vec_len % reduction_size==0)
+        reduction_size = self.reduction_nr_outer_loop
         if vec_len > reduction_size:
             init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
             if reduction_size == 1:
@@ -755,24 +762,6 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
 
         # Specail handling for fusion
         self.reduction_epilogue_suffix.writeline(f"affine.yield %{body_acc} : {self.affine_yield[body_acc]}")
-        self.reduction_body_loop.affine_yield = dict(self.compute_body_loop.affine_yield)
-        self.compute_body_loop.affine_yield.clear()
-
-        reduction_attr= self.compute_body_loop.reduction_vars[body_acc]
-        reduction_key = "reduction_epilogue"
-        new_body_acc = self.reduction_cse.generate(
-            self.compute, f"reduction {reduction_key}body_acc", write=False
-        )
-        body_iter_arg = self.iterator_cse.generate(
-            self.compute, f"reduction {reduction_key}body_iter_arg", write=False
-        )
-        iterator = self.iterator_cse.generate(
-            self.loads, f"reduction {reduction_key}", write=False
-        )
-
-        self.reduction_body_loop.reduction_vars[new_body_acc] = (reduction_attr[0], reduction_attr[1], body_iter_arg, reduction_attr[3])
-        self.compute_body_loop.reduction_vars[body_acc] = (reduction_attr[0], body_iter_arg, reduction_attr[2], reduction_attr[3])
-        self.compute_body_loop.affine_yield[new_body_acc] = reduction_attr[3]
         return acc
 
     def store_reduction_epilogue(self, name, index, value):
@@ -840,8 +829,9 @@ def set_tile_size(self, template_store_info):
             numel_per_lane = tile_desc.get_numel_per_lane()
             reduction_axis_size = tile_desc.get_tile_size()[-2]
             nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
-
-            self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_idx, nr_outer_loop, 0 , nr_outer_loop)
+            self.reduction_fusion = True
+            self.reduction_axis_size =  tile_desc.get_tile_size()[-2]
+            self.reduction_nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
             self.compute_body_loop.size = reduction_axis_size
             self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop
         else:
diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py
index 9b7afca1..9f2cc7f3 100644
--- a/tests/Fusion/test_matmul_reduction.py
+++ b/tests/Fusion/test_matmul_reduction.py
@@ -17,16 +17,16 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-def test_matmul_reduce(device):
+def test_matmul_reduce(device, size=512):
     def matmul_fused(a, b, c):
         result = torch.matmul(a, b)
         return result, result.max(dim=-2).values
     torch.manual_seed(0)
-    input = torch.randn(512, 256)
-    weight = torch.randn(256, 512)
-    #N = 256
+    N = size
+    input = torch.randn(N, N)
+    weight = torch.randn(N, N)
     #input = torch.arange(1, N * N + 1, dtype=torch.float32).reshape(N, N).to(dtype=torch.float32)
-    #weight = torch.eye(256, dtype=torch.float32)
+    #weight = torch.eye(N, dtype=torch.float32)
     x1 = input.to(device=device)
     w1 = weight.to(device=device)
     x2 = input.to("cpu")
@@ -35,8 +35,8 @@ def matmul_fused(a, b, c):
     opt_fn = torch.compile(dynamic=False)(matmul_fused)
     res = opt_fn(x1, w1, c)
     y = matmul_fused(x2, w2, c)
-    test_result("Matmul Scalar Fusion Forward", res[0], y[0])
-    test_result("Matmul Scalar Fusion Forward", res[1], y[1])
+    test_result("Matmul Reduction Fusion activation", res[0], y[0])
+    test_result("Matmul Reduction Fusion reduction", res[1], y[1])
 
 if __name__ == "__main__":
     import os

From fb7b0e1b7a0be0d1ff38177fa9bec1a6246a9030 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 14 Jun 2025 16:38:39 +0000
Subject: [PATCH 333/432] [Frontend/Fusion] Support BMM+reduction fusion

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 88 ++++++++++++++++++-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  3 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    | 22 ++---
 PyTorchSimFrontend/mlir/mlir_template.py      |  9 +-
 4 files changed, 105 insertions(+), 17 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 709af4d7..d6917cad 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -83,6 +83,81 @@
 }
 """
 
+BMM_REDUCTION_TEMPLATE = r"""
+// BMM kernel
+// BATCH = {{ B }}
+// M = {{ M }}
+// N = {{ N }}
+// K = {{ K }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// SUB_TILE_M = {{ SUB_TILE_M }}
+// SUB_TILE_N = {{ SUB_TILE_N }}
+#map0 = affine_map<(d0, d1, d2) -> ({{ X_map }})>
+#map1 = affine_map<(d0, d1, d2) -> ({{ W_map }})>
+#map2 = affine_map<(d0, d1, d2) -> (d0 * {{ M * N }} + d1 * {{ N }} + d2)>
+memref.global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
+  %c_mvin = arith.constant 2 : index
+  %c_mvin2 = arith.constant 1 : index{% if Bias %}
+  %c_mvin3 = arith.constant 14 : index{% endif %}
+  %c_mvout = arith.constant 3 : index
+  %vstride = arith.constant 1 : index
+  %axis = arith.constant 2 : index
+  %X_buffer = memref.get_global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+  %W_buffer = memref.get_global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+  %Y_buffer = memref.get_global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+  %tag = memref.alloc() : memref<1xi32>
+  %tag0 = memref.alloc() : memref<1xi32>
+  %tag1 = memref.alloc() : memref<1xi32>
+  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+  %c0 = arith.constant 0 : index
+{{ kernel.def_local_vars() }}
+  affine.for %b=0 to {{ B }} {
+    affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
+      %red_idx = affine.apply affine_map<(d0, d1) -> ({{M}}*d0 + d1)>(%b, %t_n)
+      {{kernel.reduction_acc()}} affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {{kernel.reduction_iter_arg()}} {
+        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+
+        %index2 = affine.apply #map2(%b, %t_m, %t_n)
+        {% if Bias -%}
+        memref.dma_start %Bias[
+        {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
+          ], %Y_buffer2D[0, 0], %c_mvin3, %tag0[%c0], %
+        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
+          , %vstride : memref<
+        {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
+          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1 , {{ TILE_M }}] }
+        {%- else -%}
+        affine.vector_store %v0, %Y_buffer2D[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+        affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
+          %index0 = affine.apply #map0(%b, %t_m, %t_k)
+          %index1 = affine.apply #map1(%b, %t_k, %t_n)
+          memref.dma_start %X[%index0], %X_buffer2D[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
+             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
+          memref.dma_start %W[%index1], %W_buffer2D[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
+             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+
+          linalg.matmul ins(%X_buffer2D, %W_buffer2D : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
+                  outs(%Y_buffer2D : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
+        } { accumulation_loop=true, loop_k=true }
+        {{kernel.store_output(indent_size=8)}}
+      } { outer_loop=true, loop_m=true }
+      {{kernel.reduction_output(indent_size=6)}}
+    } { outer_loop=true, loop_n=true}
+  } { outer_loop=true }
+  return
+}
+"""
+
 class MLIRBMMTemplate(MLIRTemplate):
     def __init__(self, input_nodes, layout, input_reorder=None):
         super().__init__("kernel", input_nodes, layout, input_reorder)
@@ -121,6 +196,13 @@ def render(self,
         SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
         SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
 
+        if n_extra_node==1 and epilogue_nodes[0].is_reduction():
+          template = BMM_REDUCTION_TEMPLATE
+          nr_rdim = 1
+        else:
+          template = BMM_TEMPLATE
+          nr_rdim = 0
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -159,9 +241,11 @@ def render(self,
             mlir_dtype = kernel.render_options['DATA_STYPE'],
             dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
             tile_size = (1, TILE_M, TILE_N),
-            tile_stride = [1, 1, TILE_M]
+            tile_stride = [1, 1, TILE_M],
+            nr_rdim = nr_rdim,
+            reduction_idx = "red_idx"
         )
-        code = self._template_from_string(BMM_TEMPLATE).render(**kernel.render_options)
+        code = self._template_from_string(template).render(**kernel.render_options)
         kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
 
         self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 5257201d..f41c1893 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -244,7 +244,8 @@ def render(self,
             dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
             tile_size = (TILE_M, TILE_N),
             tile_stride = [1, TILE_M],
-            nr_rdim = nr_rdim
+            nr_rdim = nr_rdim,
+            reduction_idx = "t_n"
         )
         code = self._template_from_string(template).render(**kernel.render_options)
         kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 97b57495..65843e7c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -1,5 +1,6 @@
 import os
 import math
+from sympy import symbols, sympify
 from PyTorchSimFrontend import extension_config
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
@@ -27,16 +28,17 @@ def __init__(self, scheduler):
     def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool:
         if node1.get_device() == node2.get_device():
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
-            # For matmul+reduction case
-            if node1.is_template() and len(node1.get_nodes())==1 and isinstance(node1.node.template, MLIRGemmTemplate) and node2.is_reduction() and len(node2.get_nodes())==1:
-                reduction_axis = node2.node.origin_node.args[1]
-                if isinstance(reduction_axis, list):
-                    reduction_axis = reduction_axis[0]
-                output_dims = len(node1.node.get_size())
-                if reduction_axis < 0:
-                    reduction_axis = output_dims + reduction_axis
-                possible = node1.node.get_size()[:-1] == node2.node.get_size() and ((reduction_axis==0 and output_dims==2) or (reduction_axis==1 and output_dims==3))
-                return possible
+            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
+            if (node1.is_template() and len(node1.get_nodes())==1 and \
+                (isinstance(node1.node.template, MLIRGemmTemplate) or isinstance(node1.node.template, MLIRBMMTemplate)) and \
+                node2.is_reduction() and len(node2.get_nodes())==1):
+                # For matmul/bmm+reduction case
+                size_match =node1.node.get_size() == node2.node.get_size() + node2.node.get_reduction_size()
+                stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.node).split("\n") if "r0" in i][1]
+                target_symbol = symbols("r0")
+                # We can't fuse dim=-1
+                possible = int(sympify(stride).coeff(target_symbol)) != 1
+                return size_match and possible
         return self.scheduler.can_fuse_origin(node1, node2)
 
     def _set_flush_status(self, status: bool):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index da8c2575..c58e9056 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -56,6 +56,7 @@ def __init__(self,
         self.alloc_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_alloc")
         self.reduction_epilogue_suffix = IndentedBuffer()
         self.reduction_fusion = False
+        self.reduction_idx = None
 
         # Overwrite ops
         self.load = self.load_epilogue
@@ -349,8 +350,7 @@ def template_store():
             if len(self.stores._lines) == 0:
                 template_store()
             compute_body.splice(self.stores)
-        if (compute_body.getvalue()):
-            self.body.splice(compute_body)
+        self.body.splice(compute_body)
         self.body.splice(self.dma_stores)
         self.body.splice(self.reduction_epilogue_suffix)
 
@@ -765,7 +765,7 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         return acc
 
     def store_reduction_epilogue(self, name, index, value):
-        index = "t_n" # TODO. conversion required...
+        index = self.reduction_idx
         tmp_cse = self.cse
         self.cse = self.reduction_cse
 
@@ -779,7 +779,7 @@ def store_reduction_epilogue(self, name, index, value):
         reduction_axis_size = self.kernel_group.tile_desc.get_tile_size()[-2]
         nr_outer_loop = numel_per_lane // reduction_axis_size
 
-        vlane_split_axis = 0
+        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - 1
         vlane_stride = self.kernel_group.tile_desc.vlane_stride
         tile_numel_per_lane = vlane_stride * nr_outer_loop
 
@@ -832,6 +832,7 @@ def set_tile_size(self, template_store_info):
             self.reduction_fusion = True
             self.reduction_axis_size =  tile_desc.get_tile_size()[-2]
             self.reduction_nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
+            self.reduction_idx = template_store_info["reduction_idx"]
             self.compute_body_loop.size = reduction_axis_size
             self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop
         else:

From faf4988adcb281df096c702e00f6d3230a491457 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 15 Jun 2025 07:58:01 +0000
Subject: [PATCH 334/432] [Frontend/Fusion] Fix matmul+reduction bugs

---
 PyTorchSimFrontend/mlir/mlir_template.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index c58e9056..07625d55 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -613,27 +613,25 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             reduce_size = self.reduction_nr_outer_loop
             vsize = compute_vec_size//reduce_size
             vshape = f"vector<{vsize}x{mlir_dtype}>"
-            tshape = f"vector<{reduce_size}x{vsize}x{mlir_dtype}>"
+            flatten_tshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
 
             init = self.cse.generate(self.loads, f"arith.constant 0.0 : {mlir_dtype}")
-            init_vec = self.cse.generate(self.loads, f"vector.broadcast %{init} : {mlir_dtype} to {tshape}")
+            init_vec = self.cse.generate(self.loads, f"vector.broadcast %{init} : {mlir_dtype} to {flatten_tshape}")
             if compute_vec_size > 1:
+                out_list = []
                 for i in range(reduce_size):
                     offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0) -> (d0 + {i*(self.reduction_axis_size)})>(%{self.compute_idx})")
                     compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"])
                     operation = "affine.vector_load"
                     line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
                     out = self.cse.generate(self.loads, line)
-                    init_vec = self.cse.generate(self.loads, f"vector.insert %{out}, %{init_vec}[{i}] : {vshape} into {tshape}")
+                    out_list.append(out)
+                for idx, partial_out in enumerate(out_list):
+                    init_vec = self.cse.generate(self.loads, f"vector.insert_strided_slice %{partial_out}, %{init_vec} {{offsets=[{vsize*idx}],strides=[1]}} : {vshape} into {flatten_tshape}")
                 out = init_vec
-                vshape = tshape
             else:
                 line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
                 out = self.cse.generate(self.loads, line)
-
-        if self.reduction_fusion:
-            new_vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
-            out = self.cse.generate(self.loads, f"vector.shape_cast %{out} : {vshape} to {new_vshape}")
         self.register_var_info(out, [compute_vec_size, mlir_dtype])
         return out
 

From 8586a160ba522b8ab6320bd689d1b98bcf524d75 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 15 Jun 2025 11:24:48 +0000
Subject: [PATCH 335/432] [Frontend/Fusion] Add attention fusion case

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py |  4 +-
 PyTorchSimFrontend/mlir/mlir_template.py   |  2 +
 tests/Fusion/test_attention_fusion.py      | 84 ++++++++++++++++++++++
 3 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 tests/Fusion/test_attention_fusion.py

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 65843e7c..ec8de5a1 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -33,7 +33,9 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 (isinstance(node1.node.template, MLIRGemmTemplate) or isinstance(node1.node.template, MLIRBMMTemplate)) and \
                 node2.is_reduction() and len(node2.get_nodes())==1):
                 # For matmul/bmm+reduction case
-                size_match =node1.node.get_size() == node2.node.get_size() + node2.node.get_reduction_size()
+                size_match = node1.node.get_size() == node2.node.get_size() + node2.node.get_reduction_size()
+                if len(node1.node.get_size()) == len(node2.node.get_size()):
+                    size_match = node1.node.get_size() == [dim for dim in node2.node.get_size() if dim!=1] + node2.node.get_reduction_size()
                 stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.node).split("\n") if "r0" in i][1]
                 target_symbol = symbols("r0")
                 # We can't fuse dim=-1
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 07625d55..a0537201 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -827,6 +827,8 @@ def set_tile_size(self, template_store_info):
             numel_per_lane = tile_desc.get_numel_per_lane()
             reduction_axis_size = tile_desc.get_tile_size()[-2]
             nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
+            tile_desc.vec_size = nr_outer_loop * 2 # Why? Emprically selected, other option failed to functionality...
+
             self.reduction_fusion = True
             self.reduction_axis_size =  tile_desc.get_tile_size()[-2]
             self.reduction_nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py
new file mode 100644
index 00000000..a513b0bb
--- /dev/null
+++ b/tests/Fusion/test_attention_fusion.py
@@ -0,0 +1,84 @@
+import math
+import copy
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def clones(module, N):
+    "Produce N identical layers."
+    return torch.nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
+
+class my_MultiheadAttention(torch.nn.Module):
+    def __init__(self, h, d_model, dropout=0.1):
+        super(my_MultiheadAttention, self).__init__()
+        assert d_model % h == 0
+        # We assume d_v always equals d_k
+        self.d_k = d_model // h
+        self.h = h
+        self.linear = torch.nn.Linear(d_model, d_model)
+        self.attn = None
+
+    def forward(self, query, key, value):
+        # BMM + Max
+        scores = torch.matmul(key, query.transpose(-2, -1))
+        s_max = scores.max(dim=-2, keepdim=True).values
+
+        # Reduce Sum
+        scores = torch.exp(scores-s_max)
+        s_sum = scores.sum(dim=-2, keepdim=True)
+
+        # Elementwise + BMM
+        p_attn = scores/s_sum
+        x = torch.matmul(value.transpose(-1, -2), p_attn)
+        # 3) "Concat" using a view and apply a final linear.
+        x = (
+            x.contiguous()
+            .view(-1, self.h * self.d_k)
+        )
+        del query
+        del key
+        del value
+        return self.linear(x)
+
+def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
+    MHA = my_MultiheadAttention(num_heads, embed_dim)
+    cpu_query = torch.randn(num_heads, input_seq, embed_dim//num_heads)
+    cpu_key = torch.randn(num_heads, input_seq, embed_dim//num_heads)
+    cpu_value = torch.randn(num_heads, input_seq, embed_dim//num_heads)
+    cpu_res = MHA(cpu_query, cpu_key, cpu_value)
+
+    query = cpu_query.clone().to(device=device)
+    key = cpu_key.clone().to(device=device)
+    value = cpu_value.clone().to(device=device)
+    MHA.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(MHA)
+    res = opt_fn(query, key, value)
+
+    test_result("MHA Forward", res, cpu_res)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_MHA(device)
+    # test_Attention(device, head=16, seq=512, d_k=64)
+    # test_MHA(device, num_heads=12, embed_dim=768)

From 0732db80fa25dcac39d872cc99728d7c767dca4b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sun, 15 Jun 2025 16:12:07 +0000
Subject: [PATCH 336/432] [Frontend] Add masked load

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 91072a18..5425d6a2 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -982,9 +982,22 @@ def load(self, name: str, index: sympy.Expr):
         self.cse.generate(self.dma_loads, code, assignment = False) # FIXME: assignment = False does not support caching
         compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
         # Generate vector load instruction
+        needs_mask = self.compute_body_loop.size % self.compute_body_loop.step != 0
         if compute_vec_size > 1:
-            operation = "affine.vector_load"
-            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+            if needs_mask:
+                index_shape = f"vector<{self.compute_body_loop.step}xindex>"
+                mask_shape = f"vector<{compute_vec_size}xi1>"
+                step_vec = self.cse.generate(self.loads, f"vector.step : {index_shape}")
+                upper_bound = self.get_const_cse(self.compute_body_loop.size, "index")
+                gap = self.cse.generate(self.loads, f"arith.subi %{upper_bound}, %{self.compute_idx} : index")
+                gap_vec = self.cse.generate(self.loads, f"vector.broadcast %{gap} : index to {index_shape}")
+                mask_var = self.cse.generate(self.loads, f"arith.cmpi ult, %{step_vec}, %{gap_vec} : {index_shape}")
+                pad_val = self.get_const_cse(0, mlir_dtype)
+                pad_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{pad_val} : {mlir_dtype} to {vshape}")
+                line = f"vector.maskedload %{sram_var}[{compute_index_var}], %{mask_var}, %{pad_vec} : {tile_shape}, {mask_shape}, {vshape} into {vshape}"
+            else:
+                operation = "affine.vector_load"
+                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
         else:
             operation = "affine.load"
             line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"

From fefa193ce76dc96fc9b1d6ce4b503689f74bd0ca Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sat, 14 Jun 2025 11:11:02 +0000
Subject: [PATCH 337/432] [Frontend] naive Auto-tuning

---
 PyTorchSimFrontend/extension_codecache.py     | 76 +++++++++-------
 PyTorchSimFrontend/extension_config.py        |  1 +
 .../mlir/mlir_codegen_backend.py              | 90 ++++++++++++++-----
 PyTorchSimFrontend/mlir/mlir_common.py        | 31 +++++--
 Scheduler/scheduler.py                        |  2 +-
 Simulator/simulator.py                        |  5 +-
 6 files changed, 145 insertions(+), 60 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index e4101e66..79d2b9d0 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -226,37 +226,37 @@ def load(cls, source_code,
                 print("Error output:", e.output)
                 assert(0)
 
-        if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY:
-            return key
+            if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY:
+                return key
+
+            # Generate MLIR kernel calller and binary for cycle calculation
+            cycle_llvm_caller = MLIRKernelCallerCodeGen(False, arg_attributes, cycle_sim=True)
+            cycle_llvm_caller.generate_wrapper_file(write_path, cycle_wrapper_name)
+            cycle_llvm_caller.compile_wih_kernel(write_path, key + "_sample", cycle_wrapper_name, cycle_binary_name, link_option)
+            array_size = []
+            for (arg_name, arg_attribute) in arg_attributes:
+                array_size.append(str(arg_attribute[2]))
 
-        # Generate MLIR kernel calller and binary for cycle calculation
-        cycle_llvm_caller = MLIRKernelCallerCodeGen(False, arg_attributes, cycle_sim=True)
-        cycle_llvm_caller.generate_wrapper_file(write_path, cycle_wrapper_name)
-        cycle_llvm_caller.compile_wih_kernel(write_path, key + "_sample", cycle_wrapper_name, cycle_binary_name, link_option)
-        array_size = []
-        for (arg_name, arg_attribute) in arg_attributes:
-            array_size.append(str(arg_attribute[2]))
-
-        # Run cyclesim
-        cyclesim = CycleSimulator()
-        cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), " ".join(array_size), vectorlane_size)
-
-        # Create TOG
-        w_offset, x_offset = vectorlane_size, vectorlane_size
-        if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
-            x_offset = kwargs['loop_size'][-3]
-        if kwargs['loop_size'] is not None and kwargs['loop_size'][-1] < vectorlane_size:
-            w_offset = kwargs['loop_size'][-1]
-        w_offset = 0 # max(w_offset - x_offset, 0)
-        tile_graph_generator = tog_generator(origins)
-        tile_graph_generator.load_file(raw_tog_path)
-        tile_graph_generator.generate_tile_graph(
-            os.path.join(write_path, "tile_graph.onnx"),
-            cycle_list=cycle_list,
-            x_offset=x_offset, # FIXME.
-            w_offset=w_offset, # FIXME.
-            vector_lane=vectorlane_size
-        )
+            # Run cyclesim
+            cyclesim = CycleSimulator()
+            cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), " ".join(array_size), vectorlane_size)
+
+            # Create TOG
+            w_offset, x_offset = vectorlane_size, vectorlane_size
+            if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
+                x_offset = kwargs['loop_size'][-3]
+            if kwargs['loop_size'] is not None and kwargs['loop_size'][-1] < vectorlane_size:
+                w_offset = kwargs['loop_size'][-1]
+            w_offset = 0 # max(w_offset - x_offset, 0)
+            tile_graph_generator = tog_generator(origins)
+            tile_graph_generator.load_file(raw_tog_path)
+            tile_graph_generator.generate_tile_graph(
+                os.path.join(write_path, "tile_graph.onnx"),
+                cycle_list=cycle_list,
+                x_offset=x_offset, # FIXME.
+                w_offset=w_offset, # FIXME.
+                vector_lane=vectorlane_size
+            )
         return key
 
 class LLVMCodeCache:
@@ -378,12 +378,26 @@ def dummy_simulator(*args, **kwargs):
             return result
 
         def dryrun_simulator(*args, **kwargs):
+            autotune = kwargs.get('autotune', False)
             key = future.result()
              # Run simulator pass
             result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key))
             # Dump arguments and meta data
             dump_metadata(args, arg_attributes, result_path)
             runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path)
+            if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY:
+                return
+
+            if autotune:
+                onnx_path = os.path.join(result_path, "tile_graph.onnx")
+                attribute_path = os.path.join(runtime_path, "attribute")
+                backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
+                backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG)
+                backsim.vectorlane_size = vectorlane_size
+                attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size)
+                result_path_2 = backsim.simulation(onnx_path, attribute_path)
+                result = BackendSimulator.get_result_from_file(result_path_2)
+                return result_path, runtime_path, result
 
             # Todo. Support valude dependent mode for graph mode
             if False: # extension_config.CONFIG_TORCHSIM_VALIDATION_MODE:
@@ -392,7 +406,7 @@ def dryrun_simulator(*args, **kwargs):
                                   runtime_path, self.validation_binary_name,
                                   vectorlane_size=vectorlane_size, spad_info=spad_info,
                                   cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS)
-            return result_path, runtime_path
+            return result_path, runtime_path, None
 
         is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
         target_simulator = dryrun_simulator if is_dryrun else dummy_simulator
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index d1babd47..17fa74d9 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -5,6 +5,7 @@
 
 # Hardware info config
 CONFIG_VECTOR_LANE = int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128))
+CONFIG_VECTOR_LANE_STRIDE = int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2))
 CONFIG_SPAD_INFO = {
   "spad_vaddr" : 0xD0000000,
   "spad_paddr" : 0x2000000000,
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 5425d6a2..882160c6 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -4,6 +4,7 @@
 import os
 import math
 import torch
+from concurrent.futures import ThreadPoolExecutor
 from torch._dynamo.utils import dynamo_timed
 from torch._inductor.codegen import cpp, wrapper, common, memory_planning
 from torch._inductor.virtualized import V, _ops as ops
@@ -836,8 +837,8 @@ class MLIRKernel(mlir_common.BaseMLIRKernel):
     overrides = ExtensionOverrides
     newvar_prefix = "%"
 
-    def __init__(self, kernel_group):
-        super().__init__(kernel_group)
+    def __init__(self, kernel_group, reason=None):
+        super().__init__(kernel_group, reason=reason)
         self.const_buffer = IndentedBuffer()
         self.alloc_buffer = IndentedBuffer()
         self.spad_buffer = IndentedBuffer()
@@ -880,8 +881,8 @@ def __init__(self, kernel_group):
         self.spad_buffer_dict = dict()
         self.base_vector_initialized = False
 
-    def reset(self):
-        self.__init__(self.kernel_group)
+    def reset(self, reason):
+        self.__init__(self.kernel_group, reason=reason)
 
     # padding type 0: zero-padding 1: negative-padding(-inf) ...
     def get_padding_type(self):
@@ -1380,24 +1381,73 @@ def codegen_loops(self):
         code.writeline(f"return")
         return code
 
-    def codegen_nodes(self, nodes, kernel_name):
-        for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY):
-            src_code = super().codegen_nodes(nodes, kernel_name)
-            self._prepare_simulator_headers(src_code)
-            if not extension_config.CONFIG_AUTOTUNE or not extension_config.CONFIG_TORCHSIM_VALIDATION_MODE:
-                return src_code
+    def make_choices(self, nodes, kernel_name):
+        choices = []
+        initial_tile_size = self.kernel_group.tile_desc.get_tile_size()
+        previous_ranges = self.ranges
+        for vlane_stride in [2, 4, 8]:
+                os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = str(vlane_stride)
+                previous_tile_size = initial_tile_size
+                incrase_dim = -1 # only increase the last dimension
+                while previous_tile_size[incrase_dim] * 2 <= previous_ranges[incrase_dim]:
+                    src_code = super().codegen_nodes(nodes, kernel_name)
+                    print(f"[Auto-tune] Trying tile size: {self.kernel_group.tile_desc.get_tile_size()}, vlane_stride: {vlane_stride}")
+                    if self.stop_autotune:
+                        print(f"[Auto-tune] Skipping autotuning due to enough tile size: {self.kernel_group.tile_desc.get_tile_size()}")
+                        break
+                    previous_tile_size = self.kernel_group.tile_desc.get_tile_size()
+                    self._prepare_simulator_headers(src_code)
+                    bench_runner = self.run_bench(nodes, kernel_name, src_code)
+                    choices.append((bench_runner, src_code, self.kernel_group))
+                    self.reset(f"tile_size_{incrase_dim}")
+                self.reset("vlane_stride")
+        return choices
+
+    def autotune(self, nodes, kernel_name):
+        def get_cycle(choice):
+            bench_runner, src_code, kernel_group = choice
+            for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY): # TODO: make simple
+                try:
+                    # bench_runner = self.run_bench(nodes, kernel_name, src_code)
+                    if int(os.environ.get('BACKENDSIM_DRYRUN', default=False)):
+                        _, _, out = bench_runner(autotune=1)
+                    else:
+                        out = bench_runner(validate=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE)
+                    return out[-1]
+                except (extension_codecache.SpadOverflowError, RuntimeError) as e:
+                    if isinstance(e, RuntimeError) and str(e) != "STACK_OVERFLOW":
+                        print(f"Benchmark[trial-{n_try}] failed with unexpected error: {e}")
+                        raise
+                    print(f"Benchmark failed due to spad overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}")
+                    self.kernel_group = kernel_group # Reset to the original tile desc
+                    self.reset("spad_overflow")
+                    src_code = super().codegen_nodes(nodes, kernel_name)
+                    bench_runner = self.run_bench(nodes, kernel_name, src_code)
+                    kernel_group = self.kernel_group
+                    self._prepare_simulator_headers(src_code)
+            raise RuntimeError("[Auto-tune] Exceeded maximum number of autotuning attempts")
+
+        choices = self.make_choices(nodes, kernel_name)
+
+        if len(choices) == 0: # can't autotune
+            return None
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            results = list(executor.map(get_cycle, choices))
+        max_idx = results.index(min(results))
+        optimal_src_code = choices[max_idx][1]
+        return optimal_src_code
 
-            try:
-                bench_runner = self.run_bench(nodes, kernel_name, src_code)
-                bench_runner(validate=True)
+    def codegen_nodes(self, nodes, kernel_name):
+        src_code = super().codegen_nodes(nodes, kernel_name)
+        self._prepare_simulator_headers(src_code)
+        if not extension_config.CONFIG_AUTOTUNE or extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY:
+            return src_code
+        else:
+            optimal_src_code = self.autotune(nodes, kernel_name)
+            if optimal_src_code:
+                return optimal_src_code
+            else:
                 return src_code
-            except (extension_codecache.SpadOverflowError, RuntimeError) as e:
-                if isinstance(e, RuntimeError) and str(e) != "STACK_OVERFLOW":
-                    print(f"Benchmark[trial-{n_try}] failed with unexpected error: {e}")
-                    raise
-                print(f"Benchmark failed due to spad overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}")
-                self.reset()
-        raise RuntimeError("Exceeded maximum number of autotuning attempts")
 
     def _prepare_simulator_headers(self, src_code):
         write_path = extension_codecache.get_write_path(src_code)
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 2035ed6a..cd1a8600 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -322,7 +322,7 @@ class BaseMLIRKernel(common.Kernel, BaseMLIRHardwareInfo):
     load_format = None
     store_format = None
 
-    def __init__(self, kernel_group):
+    def __init__(self, kernel_group, reason=None):
         super().__init__(kernel_group.args)
         self.kernel_group = kernel_group
         # Kernel iteration range info
@@ -339,6 +339,8 @@ def __init__(self, kernel_group):
         self.buffer_types : dict = None # format: dtype, numel, size, stride
         self.compute_idx = "compute_idx"
         self.compute_body_loop = LoopLevel(self.compute_idx, 1)
+        self.recodegen = reason # spad overflow, tile size, vlane stride
+        self.stop_autotune = False
 
     def set_ranges(self, lengths, reduction_lengths):
         if self.call_ranges:
@@ -440,7 +442,6 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
                     raise NotImplementedError("Not supporting type...")
 
         vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop
-        vlane_stride = 2
 
         # FIXME: Naive decrease tile size
         def decrease_tile_size(tile_size, vlane_split_axis):
@@ -466,14 +467,11 @@ def decrease_tile_size(tile_size, vlane_split_axis):
             return tile_size
 
         # Dummy tile size
-        if self.kernel_group.tile_desc:
-            tile_size = self.kernel_group.tile_desc.get_tile_size()
-            decrease_tile_size(tile_size, vlane_split_axis)
-        else:
+        def dummy_tile_size():
             tile_size = [1] * (len(vars) + len(reduction_vars))
             if len(tile_size) == 2:
                 tile_size[-1] = vlane_stride * self.vector_lane
-                tile_size[-2] = 2 * vlane_stride * self.vector_lane
+                tile_size[-2] = 2 * self.vector_lane
             elif len(tile_size) == 0: # Scalar
                 tile_size = [1]
                 self.ranges = [1]
@@ -485,6 +483,23 @@ def decrease_tile_size(tile_size, vlane_split_axis):
                 tile_size[-3] = 2
             else:
                 raise NotImplementedError("dummy tile size fail!")
+            return tile_size
+
+        vlane_stride = extension_config.CONFIG_VECTOR_LANE_STRIDE
+        if self.recodegen is None:
+            tile_size = dummy_tile_size()
+        else:
+            if self.recodegen == "spad_overflow":
+                tile_size = self.kernel_group.tile_desc.get_tile_size()
+                decrease_tile_size(tile_size, vlane_split_axis)
+            elif self.recodegen == "vlane_stride":
+                tile_size = dummy_tile_size()
+            elif "tile_size" in self.recodegen:
+                dim = int(self.recodegen.split("_")[-1])
+                tile_size = self.kernel_group.tile_desc.get_tile_size() # TODO:
+                tile_size[dim] = tile_size[dim] * 2
+            else:
+                raise NotImplementedError(f"Unknown recodegen reason: {self.recodegen}")
 
         # FIXME: Not considering removed buffers
         n_buffer = sum(
@@ -503,6 +518,7 @@ def decrease_tile_size(tile_size, vlane_split_axis):
 
                 if tile_size[-i] > target_range:
                     remains = (target_range % vlane_stride)
+                    self.stop_autotune = True
                     tile_size[-i] = target_range
                     if remains:
                         tile_size[-i] += vlane_stride - remains
@@ -523,6 +539,7 @@ def decrease_tile_size(tile_size, vlane_split_axis):
                     raise NotImplementedError("Error: Cannot find proper tile size")
                 tile_size = new_tile_size
                 spad_overflow = True
+                self.stop_autotune = True # for auto-tune
                 continue
             else:
                 spad_overflow = False
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 7aa3e931..834698a6 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -240,7 +240,7 @@ def finish_model(self, model : SchedulerDNNModel, output : torch.Tensor):
             self.finish_req_dict[req] = RequestReturn(RequestReturn.FINISHED)
 
     def prepare_launch_kernel(self, kernel, inputs):
-        result_path, runtime_path = kernel(*inputs)
+        result_path, runtime_path, _ = kernel(*inputs)
         onnx_path = os.path.join(result_path, "tile_graph.onnx")
 
         attribute_path = os.path.join(runtime_path, "attribute")
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index bc74ebce..105edfa2 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -434,10 +434,13 @@ def get_result_from_file(result_path):
             if 'DRAM: AVG BW Util' in line:
                 avg_dram_bw = float(re.search(r'AVG BW Util (\d+\.?\d*)%', line).group(1))
 
+            if 'Total execution cycle' in line:
+                total_cycle = int(re.search(r'Total execution cycle: (\d+)', line).group(1))
+
             # Parse total simulation time
             if 'Simulation time' in line:
                 simulation_time = float(re.search(r'Simulation time: (\d+\.?\d*) seconds', line).group(1))
-        return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time
+        return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle
 
 if __name__ == "__main__":
     sim = BackendSimulator("/workspace/PyTorchSim/PyTorchSimBackend", "/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json")

From 9e5c29d62781ca9e98d2a61251acd019d0045c74 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sun, 15 Jun 2025 05:49:03 +0000
Subject: [PATCH 338/432] [fix] prohibit unsupported vlane size case

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 experiments/resnet18.py                         | 1 +
 experiments/resnet50.py                         | 1 +
 tests/Mixtral_8x7B/test_attention.py            | 1 +
 4 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 882160c6..4e3f9500 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1389,7 +1389,7 @@ def make_choices(self, nodes, kernel_name):
                 os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = str(vlane_stride)
                 previous_tile_size = initial_tile_size
                 incrase_dim = -1 # only increase the last dimension
-                while previous_tile_size[incrase_dim] * 2 <= previous_ranges[incrase_dim]:
+                while previous_tile_size[incrase_dim] * 2 <= previous_ranges[incrase_dim] and previous_tile_size[incrase_dim] <= 2 ** 13:
                     src_code = super().codegen_nodes(nodes, kernel_name)
                     print(f"[Auto-tune] Trying tile size: {self.kernel_group.tile_desc.get_tile_size()}, vlane_stride: {vlane_stride}")
                     if self.stop_autotune:
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index c12cc930..f4ce2dc5 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -42,6 +42,7 @@ def run_resnet(batch, config):
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
+    os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = "8"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
         del os.environ['BACKENDSIM_SPIKE_ONLY']
 
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index ec2e26ff..ee7a9208 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -40,6 +40,7 @@ def run_resnet(batch, config):
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = "8"
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index cc2adc96..0e06b2d0 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -143,6 +143,7 @@ def concat_tensors(a, b):
     import os
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+    os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = "8"
 
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()

From b76defb3b1b347fb883aa40cf3fd5fcb5bfef973 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sun, 15 Jun 2025 05:49:34 +0000
Subject: [PATCH 339/432] [experiments] ILS script fix

---
 scripts/ILS_experiment/ils_parser.sh | 27 ++++++++++++++-------------
 tests/test_resnet.py                 |  1 +
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/scripts/ILS_experiment/ils_parser.sh b/scripts/ILS_experiment/ils_parser.sh
index eeaea5cd..a02d8edb 100755
--- a/scripts/ILS_experiment/ils_parser.sh
+++ b/scripts/ILS_experiment/ils_parser.sh
@@ -11,23 +11,24 @@ total_togsim=0
 
 while IFS= read -r line; do
   if [[ "$line" == launch* ]]; then
-    tile_path=$(echo "$line" | awk '{print $2}')
-    base_dir=$(dirname "$tile_path")
-    result_path="$base_dir/m5out/sto.log"
-    echo $result_path
-    togsim_time=$(grep "Simulation time:" "$result_path" | \
-                  sed -E 's/Simulation time: ([0-9.]+) seconds$/\1/')
-    echo "GEM5: $togsim_time"
-    total_togsim=$(echo "$total_togsim + $togsim_time" | bc)
+    tile_graph_path=$(echo "$line" | awk '{for (i=1; i<=NF; i++) if ($i ~ /tile_graph\.onnx$/) print $i}')
+    if [[ -n "$tile_graph_path" ]]; then
+      dir_path=$(dirname "$tile_graph_path")
+      sto_log_path="$dir_path/m5out/sto.log"
+      echo "sto.log path: $sto_log_path"
+      gem5_time=$(grep "Simulation time:" "$sto_log_path" | \
+                sed -E 's/^Simulation time: ([0-9.]+) seconds$/\1/')
+      echo "GEM5: $gem5_time" 
+      total_gem5=$(echo "$total_gem5 + $gem5_time" | bc)
+    fi
   fi
-
-  if [[ "$line" == *"[info] Simulation time:"* ]]; then
-    togsim_time=$(echo $line | sed -E 's/^\[[^]]+\] \[info\] Simulation time: ([0-9.]+) seconds$/\1/')
+  if [[ "$line" == *"Simulation time:"* ]]; then
+    togsim_time=$(echo "$line" | sed -E 's/.*Simulation time: ([0-9.]+) seconds/\1/')
     echo "TOGSim: $togsim_time"
   fi
 done
 
 if [[ -n "$total_gem5" && -n "$total_togsim" ]]; then
-  #total_time=$(python3 -c "print(round($total_gem5 + $total_togsim, 6))")
-  echo "Simulation time: $togsim_time seconds"
+  total_time=$(python3 -c "print(round($total_gem5 + $total_togsim, 6))")
+  echo "Simulation time: $total_time seconds"
 fi
\ No newline at end of file
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index 5e96b922..8c1dfb29 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -39,6 +39,7 @@ def test_resnet(device):
     import os
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+    os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = "8"
 
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()

From c176d4ff614dfe683883505cd1b47a37a5c69471 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sun, 15 Jun 2025 16:36:48 +0000
Subject: [PATCH 340/432] [fix] use small vlane_stride for resnet

---
 experiments/resnet18.py              | 1 -
 experiments/resnet50.py              | 1 -
 tests/Fusion/test_conv_fusion.py     | 5 +++--
 tests/Mixtral_8x7B/test_attention.py | 1 -
 tests/test_resnet.py                 | 1 -
 5 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index f4ce2dc5..c12cc930 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -42,7 +42,6 @@ def run_resnet(batch, config):
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
-    os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = "8"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
         del os.environ['BACKENDSIM_SPIKE_ONLY']
 
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index ee7a9208..ec2e26ff 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -40,7 +40,6 @@ def run_resnet(batch, config):
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
-    os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = "8"
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py
index 62cab8d3..42210b13 100644
--- a/tests/Fusion/test_conv_fusion.py
+++ b/tests/Fusion/test_conv_fusion.py
@@ -76,7 +76,7 @@ def test_conv_bn_relu(device, batch_size=1, in_channels=8, out_channels=16, inpu
     def custom_conv_bn_relu(a, b, bias, c, d, e, f):
         i_c = a.shape[1]
         o_c = b.shape[0]
-        conv2d = torch.nn.Conv2d(in_channels, out_channels, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=True)
+        conv2d = torch.nn.Conv2d(in_channels, out_channels, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=True).eval()
         conv2d.weight = torch.nn.Parameter(b)
         conv2d.bias = torch.nn.Parameter(bias)
         # return torch.nn.functional.batch_norm(conv2d(a), c, d, weight=e, bias=f)
@@ -90,7 +90,8 @@ def custom_conv_bn_relu(a, b, bias, c, d, e, f):
     bn_mean = torch.zeros(out_channels).to(device=device)
     bn_var = torch.ones(out_channels).to(device=device)
     opt_fn = torch.compile(dynamic=False)(custom_conv_bn_relu)
-    res = opt_fn(conv_input, conv_kernel, conv_bias, bn_mean, bn_var, bn_weight, bn_bias)
+    with torch.no_grad():
+        res = opt_fn(conv_input, conv_kernel, conv_bias, bn_mean, bn_var, bn_weight, bn_bias)
     out = custom_conv_bn_relu(conv_input.cpu(), conv_kernel.cpu(), conv_bias.cpu(), bn_mean.cpu(), bn_var.cpu(), bn_weight.cpu(), bn_bias.cpu())
     test_result("Conv2d + BN + ReLU Fusion Forward", res, out, rtol=1e-3, atol=1e-3)
     print("Max diff > ", torch.max(torch.abs(res.cpu() - out)))
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index 0e06b2d0..cc2adc96 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -143,7 +143,6 @@ def concat_tensors(a, b):
     import os
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-    os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = "8"
 
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index 8c1dfb29..5e96b922 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -39,7 +39,6 @@ def test_resnet(device):
     import os
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-    os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = "8"
 
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()

From a750c0a93b6e8ebddf879ec562f4977f8014fc58 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 16 Jun 2025 03:15:14 +0000
Subject: [PATCH 341/432] [Frontend] Fix masked load for softmax case

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 4e3f9500..cc01b795 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -983,7 +983,7 @@ def load(self, name: str, index: sympy.Expr):
         self.cse.generate(self.dma_loads, code, assignment = False) # FIXME: assignment = False does not support caching
         compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
         # Generate vector load instruction
-        needs_mask = self.compute_body_loop.size % self.compute_body_loop.step != 0
+        needs_mask = self.compute_body_loop.size % self.compute_body_loop.step != 0 and len(index.free_symbols) == len(self.ranges)
         if compute_vec_size > 1:
             if needs_mask:
                 index_shape = f"vector<{self.compute_body_loop.step}xindex>"
@@ -993,7 +993,10 @@ def load(self, name: str, index: sympy.Expr):
                 gap = self.cse.generate(self.loads, f"arith.subi %{upper_bound}, %{self.compute_idx} : index")
                 gap_vec = self.cse.generate(self.loads, f"vector.broadcast %{gap} : index to {index_shape}")
                 mask_var = self.cse.generate(self.loads, f"arith.cmpi ult, %{step_vec}, %{gap_vec} : {index_shape}")
-                pad_val = self.get_const_cse(0, mlir_dtype)
+                if padding:
+                    pad_val = self.const_cse.generate(self.const_buffer, f"arith.constant 0x{mlir_common.MLIR_INF['-inf'][mlir_dtype]:x} : {mlir_dtype}")
+                else:
+                    pad_val = self.get_const_cse(0, mlir_dtype)
                 pad_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{pad_val} : {mlir_dtype} to {vshape}")
                 line = f"vector.maskedload %{sram_var}[{compute_index_var}], %{mask_var}, %{pad_vec} : {tile_shape}, {mask_shape}, {vshape} into {vshape}"
             else:

From f3643ead6d238fc12b2cc74c34c0381865d0ffe0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 17 Jun 2025 05:53:30 +0000
Subject: [PATCH 342/432] [Script+Stonne] Add script for stonne experiments

---
 .../stonne_validation_c1_simple_noc.json      | 31 +++++++
 PyTorchSimBackend/include/Dram.h              | 21 +++++
 PyTorchSimBackend/include/SparseCore.h        |  6 +-
 PyTorchSimBackend/src/Dram.cc                 | 82 +++++++++++++++++-
 PyTorchSimBackend/src/Simulator.cc            |  4 +-
 PyTorchSimBackend/src/SparseCore.cc           | 47 +++++-----
 scripts/stonne_experiment2/tog_gen.py         | 85 +++++++++++++++++++
 7 files changed, 252 insertions(+), 24 deletions(-)
 create mode 100644 PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json
 create mode 100644 scripts/stonne_experiment2/tog_gen.py

diff --git a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json
new file mode 100644
index 00000000..08548638
--- /dev/null
+++ b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json
@@ -0,0 +1,31 @@
+{
+  "core_type" : ["stonne"],
+  "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
+  "num_cores" : 1,
+  "core_freq" : 1000,
+  "sram_size" : 65536,
+  "core_print_interval" : 10000,
+  "num_stonne_per_core" : 1,
+  "num_stonne_port" : 32,
+
+  "dram_type" : "simple",
+  "dram_freq" : 1000,
+  "dram_channels": 1,
+  "dram_req_size": 32,
+  "dram_latency" : 100,
+  "dram_print_interval": 10000,
+  "l2d_type" : "datacache",
+  "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32",
+
+  "icnt_type" : "simple",
+  "icnt_latency" : 7,
+  "icnt_freq" : 7000,
+  "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt",
+
+  "precision" : 4,
+  "scheduler" : "simple",
+  "num_partition" : 1,
+  "partition": {
+    "core_0":0
+  }
+}
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/Dram.h b/PyTorchSimBackend/include/Dram.h
index 0b129aef..5e51b96d 100644
--- a/PyTorchSimBackend/include/Dram.h
+++ b/PyTorchSimBackend/include/Dram.h
@@ -67,4 +67,25 @@ class DramRamulator2 : public Dram {
   int _tx_log2;
 };
 
+class SimpleDRAM: public Dram {
+ public:
+  SimpleDRAM(SimulationConfig config, cycle_type *core_cycle);
+
+  virtual bool running() override;
+  virtual void cycle() override;
+  virtual void cache_cycle() override;
+  virtual bool is_full(uint32_t cid, mem_fetch* request) override;
+  virtual void push(uint32_t cid, mem_fetch* request) override;
+  virtual bool is_empty(uint32_t cid) override;
+  virtual mem_fetch* top(uint32_t cid) override;
+  virtual void pop(uint32_t cid) override;
+  virtual void print_stat() override;
+  void print_cache_stats() override;
+ private:
+  int _latency = 1;
+  int _tx_ch_log2;
+  int _tx_log2;
+  std::vector<std::unique_ptr<DelayQueue<mem_fetch*>>> _mem;
+};
+
 #endif
\ No newline at end of file
diff --git a/PyTorchSimBackend/include/SparseCore.h b/PyTorchSimBackend/include/SparseCore.h
index 55b19da3..9188b21d 100644
--- a/PyTorchSimBackend/include/SparseCore.h
+++ b/PyTorchSimBackend/include/SparseCore.h
@@ -63,7 +63,8 @@ class SparseCore : public Core {
   bool isTraceMode(int stonne_core_id) { return traceMode.at(stonne_core_id); }
   void setTraceMode(int stonne_core_id, bool mode) { traceMode.at(stonne_core_id) = mode; }
   void checkStatus(uint32_t subcore_id);
-  void registerMemfetch(const std::tuple<uint64_t, mem_access_type, mf_type>& key, std::function<void()> callback);
+  void registerMemfetch(const std::tuple<uint64_t, mem_access_type, mf_type, int>& key, std::function<void()> callback);
+  int allocTrafficID() { int id = traffic_id; traffic_id++; return 0; }
   uint32_t num_ms = 1;
   uint32_t r_port_nr = 1;
   uint32_t w_port_nr = 1;
@@ -82,8 +83,9 @@ class SparseCore : public Core {
   /* Interconnect queue */
   std::queue<mem_fetch*> _request_queue;
   std::queue<mem_fetch*> _response_queue;
-  std::map<std::tuple<uint64_t, mem_access_type, mf_type>, mem_fetch*> request_merge_table;
+  std::map<std::tuple<uint64_t, mem_access_type, mf_type, int>, mem_fetch*> request_merge_table;
   std::vector<MSwitchStats> percore_stat;
   std::vector<MSwitchStats> percore_total_stat;
+  int traffic_id=0;
 };
 
diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc
index 944068cb..e604f73f 100644
--- a/PyTorchSimBackend/src/Dram.cc
+++ b/PyTorchSimBackend/src/Dram.cc
@@ -128,4 +128,84 @@ void DramRamulator2::print_cache_stats() {
   for (int ch = 0; ch < _n_ch; ch++) {
     _m_caches[ch]->print_stats();
   }
-}
\ No newline at end of file
+}
+
+SimpleDRAM::SimpleDRAM(SimulationConfig config, cycle_type* core_cycle) : Dram(config, core_cycle) {
+  /* Initialize DRAM Channels */
+  spdlog::info("[SimpleDRAM] DRAM latecny: {}", config.dram_latency);
+  for (int ch = 0; ch < _n_ch; ch++) {
+    _mem.push_back(std::make_unique<DelayQueue<mem_fetch*>>("SimpleDRAM", true, -1));
+  }
+  _latency =  config.dram_latency;
+  _tx_log2 = log2(_req_size);
+  _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2;
+}
+
+bool SimpleDRAM::running() {
+  for (int ch = 0; ch < _n_ch; ch++) {
+    if (!_mem[ch]->queue_empty())
+      return true;
+    if (mem_fetch* req = _m_caches[ch]->top())
+      return true;
+  }
+  return false;
+}
+
+void SimpleDRAM::cycle() {
+  for (int ch = 0; ch < _n_ch; ch++) {
+    _mem[ch]->cycle();
+
+    // From Cache to DRAM
+    if (mem_fetch* req = _m_caches[ch]->top()) {
+      //spdlog::info("[Cache->DRAM] mem_fetch: addr={:#x}", req->get_addr());
+
+      _mem[ch]->push(req, _latency);
+      _m_caches[ch]->pop();
+    }
+
+    // From DRAM to Cache
+    if (_mem[ch]->arrived()) {
+      mem_fetch* req = _mem[ch]->top();
+      req->set_reply();
+      //spdlog::info("[DRAM->Cache] mem_fetch: addr={:#x}", req->get_addr());
+      if(_m_caches[ch]->push(req))
+        _mem[ch]->pop();
+    }
+  }
+}
+
+void SimpleDRAM::cache_cycle()  {
+  for (int ch = 0; ch < _n_ch; ch++) {
+    _m_caches[ch]->cycle();
+  }
+}
+
+bool SimpleDRAM::is_full(uint32_t cid, mem_fetch* request) {
+  return false; //m_from_crossbar_queue[cid].full(); Infinite length
+}
+
+void SimpleDRAM::push(uint32_t cid, mem_fetch* request) {
+  m_from_crossbar_queue[cid].push(request);
+}
+
+bool SimpleDRAM::is_empty(uint32_t cid) {
+  return m_to_crossbar_queue[cid].empty();
+}
+
+mem_fetch* SimpleDRAM::top(uint32_t cid) {
+  assert(!is_empty(cid));
+  return m_to_crossbar_queue[cid].front();
+}
+
+void SimpleDRAM::pop(uint32_t cid) {
+  assert(!is_empty(cid));
+  m_to_crossbar_queue[cid].pop();
+}
+
+void SimpleDRAM::print_stat() {}
+
+void SimpleDRAM::print_cache_stats() {
+  for (int ch = 0; ch < _n_ch; ch++) {
+    _m_caches[ch]->print_stats();
+  }
+}
diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc
index 5299efb4..6bc80286 100644
--- a/PyTorchSimBackend/src/Simulator.cc
+++ b/PyTorchSimBackend/src/Simulator.cc
@@ -35,7 +35,9 @@ Simulator::Simulator(SimulationConfig config)
     }
   }
 
-  if (config.dram_type == DramType::RAMULATOR2) {
+  if (config.dram_type == DramType::SIMPLE) {
+    _dram = std::make_unique<SimpleDRAM>(config, &_core_cycles);
+  } else if (config.dram_type == DramType::RAMULATOR2) {
     std::string ramulator_config = fs::path(onnxim_path)
                                        .append("configs")
                                        .append(config.dram_config_path)
diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 08297b38..2b4793cb 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -156,7 +156,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
       }
       req->request_time = _core_cycle;
       req->stonneId = subcore_id;
-      std::tuple<uint64_t, mem_access_type, mf_type> key = std::make_tuple(target_addr, acc_type, type);
+      std::tuple<uint64_t, mem_access_type, mf_type, int> key = std::make_tuple(target_addr, acc_type, type, allocTrafficID());
       registerMemfetch(key, [this, req, acc_type, type]() {
         spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \
               _core_cycle, _core_cycle - req->request_time, req->getAddress(), int(req->getcmd()), _config.dram_req_size);
@@ -202,22 +202,28 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
     }
 
     /* Check finished dma operation */
-    while(_dma_finished_queue.size()) {
-      std::shared_ptr<Instruction>& instruction = _dma_finished_queue.at(0);
-      /* Pass not finished instruction */
-      if (instruction->get_waiting_request())
-        continue;
-
-      /* Finish DMA read instruction */
-      if (instruction->is_dma_read())
-        finish_instruction(instruction);
-      /* Erase the instruction in DMA finished queue */
-      _dma_finished_queue.erase(_dma_finished_queue.begin());
+    bool retry=true;
+    while (retry) {
+      retry = false;
+      for (auto it=_dma_finished_queue.begin();it!=_dma_finished_queue.end();it++) {
+        std::shared_ptr<Instruction>& instruction = _dma_finished_queue.at(0);
+        /* Pass not finished instruction */
+        if (instruction->get_waiting_request())
+          continue;
+
+        /* Finish DMA read instruction */
+        if (instruction->is_dma_read())
+          finish_instruction(instruction);
+        /* Erase the instruction in DMA finished queue */
+        _dma_finished_queue.erase(_dma_finished_queue.begin());
+        retry = true;
+        break;
+      }
     }
 
     /* Peek instruction*/
     auto& inst = instructions.front();
-    if (!inst->is_ready())
+    if (instructions.empty() || !inst->is_ready())
       return;
 
     bool issued = false;
@@ -229,8 +235,9 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
           spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle,
                         opcode_to_string(inst->get_opcode()));
           for (auto addr : inst->get_trace_address()) {
+            addr = addr - (addr & _config.dram_req_size-1);
             inst->inc_waiting_request();
-            std::tuple<uint64_t, mem_access_type, mf_type> key = std::make_tuple(addr, acc_type, type);
+            std::tuple<uint64_t, mem_access_type, mf_type, int> key = std::make_tuple(addr, acc_type, type, allocTrafficID());
             uint64_t current_time = _core_cycle;
             registerMemfetch(key, [this, inst, addr, current_time, type]() {
               spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \
@@ -239,7 +246,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
             });
           }
           issued = true;
-          _dma_waiting_queue[inst.get()] = std::move(inst);
+          _dma_finished_queue.push_back(std::move(inst));
         }
         break;
       case Opcode::MOVOUT:
@@ -249,8 +256,9 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
           spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle,
                         opcode_to_string(inst->get_opcode()));
           for (auto addr : inst->get_trace_address()) {
+            addr = addr - (addr & _config.dram_req_size-1);
             inst->inc_waiting_request();
-            std::tuple<uint64_t, mem_access_type, mf_type> key = std::make_tuple(addr, acc_type, type);
+            std::tuple<uint64_t, mem_access_type, mf_type, int> key = std::make_tuple(addr, acc_type, type, allocTrafficID());
             uint64_t current_time = _core_cycle;
             registerMemfetch(key, [this, inst, addr, current_time, type]() {
               spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \
@@ -260,7 +268,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
           }
           issued = true;
           finish_instruction(inst);
-          _dma_waiting_queue[inst.get()] = std::move(inst);
+          _dma_finished_queue.push_back(std::move(inst));
         }
         break;
       case Opcode::COMP:
@@ -280,9 +288,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
         spdlog::error("Undefined instruction opcode type");
         exit(EXIT_FAILURE);
     }
-
     if (issued) {
-      instructions.erase(instructions.begin());
+      instructions.erase(std::find(instructions.begin(), instructions.end(), inst));
     }
   }
 }
@@ -400,7 +407,7 @@ void SparseCore::finish_instruction(std::shared_ptr<Instruction>& inst) {
   }
 }
 
-void SparseCore::registerMemfetch(const std::tuple<uint64_t, mem_access_type, mf_type>& key, std::function<void()> callback) {
+void SparseCore::registerMemfetch(const std::tuple<uint64_t, mem_access_type, mf_type, int>& key, std::function<void()> callback) {
   if (request_merge_table.find(key) == request_merge_table.end()) {
     mem_fetch* req_wrapper = new mem_fetch(std::get<0>(key), std::get<1>(key), std::get<2>(key), _config.dram_req_size, -1);
 
diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py
new file mode 100644
index 00000000..2f184f4c
--- /dev/null
+++ b/scripts/stonne_experiment2/tog_gen.py
@@ -0,0 +1,85 @@
+import os
+import sys
+import re
+import glob
+from collections import defaultdict
+sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+from AsmParser.tog_generator import tog_generator
+from Simulator.simulator import BackendSimulator
+from PyTorchSimFrontend import extension_config
+
+def extract_simulation_stats(result_path):
+    with open(result_path, "r") as f:
+        lines = f.readlines()[-4:]
+
+    nr_multiplications = None
+    total_cycle = None
+    sim_time = None
+
+    for line in lines:
+        if "nr_multiplications" in line:
+            nr_multiplications = line.strip().split(":")[-1].strip()
+        elif "Total execution cycle" in line:
+            total_cycle = line.strip().split(":")[-1].strip()
+        elif "Simulation time" in line:
+            sim_time = line.strip().split(":")[-1].replace("seconds", "").strip()
+    return nr_multiplications, total_cycle, sim_time
+
+if __name__ == "__main__":
+    base_dir = "/home/workspace/stonneResult"
+    trace_mode_paths = []
+    perf_mode_paths = []
+    for root, dirs, files in os.walk(base_dir):
+        if "raw_tog.py" in files:
+            raw_tog_path = os.path.join(root, "raw_tog.py")
+            tog_path = os.path.join(root, "tile_graph.onnx")
+            if not os.path.exists(tog_path):
+                tile_graph_generator = tog_generator([root])
+                tile_graph_generator.load_file(raw_tog_path)
+                tile_graph_generator.generate_tile_graph(
+                    tog_path,
+                    cycle_list=[0],
+                    x_offset=0,
+                    w_offset=0,
+                    vector_lane=0,
+                    stonneGraph=True
+                )
+                print(f"TOG genereted at {tog_path}")
+            rel_depth = os.path.relpath(root, base_dir).count(os.sep)
+            if rel_depth == 0:
+                trace_mode_paths.append(root)
+            else:
+                perf_mode_paths.append(root)
+    cycle_list = {}
+    simul_list = defaultdict(list)
+    for path in perf_mode_paths:
+        parent = os.path.dirname(path)
+        counter_files = glob.glob(os.path.join(path, "*.counters"))
+        for counter_file in counter_files:
+            with open(counter_file, 'r') as f:
+                first_line = f.readline().strip()
+                second_line = f.readline().strip()
+                if first_line.startswith("CYCLES="):
+                    cycle = int(first_line.split("=")[1])
+                    cycle_list[parent] = cycle
+                if second_line.startswith("Simulation time="):
+                    match = re.search(r'Simulation time=([0-9.]+)', second_line)
+                    simul_list[parent].append(float(match.group(1)))
+
+    print("\n=== Run TLS simulation ===")
+    for path in trace_mode_paths:
+        if "outerPro" in path:
+            continue
+        tog_path = os.path.join(path, "tile_graph.onnx")
+        backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
+        stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json'
+        backsim = BackendSimulator(backend_path, stonne_config_path)
+        result_path = backsim.simulation(tog_path)
+        nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path)
+        sim_time, total_cycle = float(sim_time), int(total_cycle)
+        print(f"[TLS] Cycle={total_cycle} Sim time={sim_time} nr_multiplications={nr_multiplications}")
+        avg_simul = sum(simul_list[path]) / len(simul_list[path])
+        print(f"[ILS] Cycle={cycle_list[path]} Sim time= {avg_simul} at {path}")
+        speedup = avg_simul / sim_time if avg_simul != 0 else float('inf')
+        error_rate = abs(cycle_list[path] - int(total_cycle)) / total_cycle if total_cycle != 0 else float('inf')
+        print(f"[EVAL] Speedup={speedup:.3f}x Error rate={error_rate:.4%}")
\ No newline at end of file

From a5abb85db15379bbf90ae3274c57ddcd8c90391c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 21 Jun 2025 06:59:39 +0000
Subject: [PATCH 343/432] [SparseCore] Fix segfault for outerproduct

---
 PyTorchSimBackend/src/SparseCore.cc | 31 ++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc
index 2b4793cb..64d3da55 100644
--- a/PyTorchSimBackend/src/SparseCore.cc
+++ b/PyTorchSimBackend/src/SparseCore.cc
@@ -180,17 +180,6 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
       coreBusy.at(subcore_id) = false;
     }
   } else {
-    auto& instructions = percore_tiles.at(subcore_id).front()->get_instructions();
-    /* Finish stonne core */
-    if (coreBusy.at(subcore_id) && instructions.empty()) {
-      std::shared_ptr<Tile> target_tile = percore_tiles.at(subcore_id).front();
-      target_tile->set_status(Tile::Status::FINISH);
-      _finished_tiles.push(target_tile);
-      percore_tiles.at(subcore_id).erase(percore_tiles.at(subcore_id).begin());
-      coreBusy.at(subcore_id) = false;
-      return;
-    }
-
     /* Check finished computation */
     auto& target_pipeline = get_compute_pipeline(0);
     if (!target_pipeline.empty()) {
@@ -221,11 +210,29 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
       }
     }
 
+    auto& tile_queue = percore_tiles.at(subcore_id);
+    if (tile_queue.empty())
+      return;
+    auto& instructions = tile_queue.front()->get_instructions();
+
+    /* Finish stonne core */
+    if (coreBusy.at(subcore_id) && instructions.empty()) {
+      std::shared_ptr<Tile> target_tile = percore_tiles.at(subcore_id).front();
+      target_tile->set_status(Tile::Status::FINISH);
+      _finished_tiles.push(target_tile);
+      percore_tiles.at(subcore_id).erase(percore_tiles.at(subcore_id).begin());
+      coreBusy.at(subcore_id) = false;
+      return;
+    }
+
     /* Peek instruction*/
+    if (instructions.empty())
+      return;
     auto& inst = instructions.front();
-    if (instructions.empty() || !inst->is_ready())
+    if (!inst->is_ready())
       return;
 
+
     bool issued = false;
     switch (inst->get_opcode()) {
       case Opcode::MOVIN:

From fc560281f2e2c78fe5f21f86f1a1e2dc4de36a5e Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 17 Jun 2025 14:42:33 +0000
Subject: [PATCH 344/432] [Frontend] 2D space autotune

---
 .../systolic_ws_128x128_c1_simple_noc_tpuv3.json   |  6 +++++-
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py    | 14 ++++++++++----
 PyTorchSimFrontend/mlir/mlir_common.py             |  2 +-
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
index 55c04b92..7348d5bc 100644
--- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
+++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
@@ -21,5 +21,9 @@
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
 
   "precision" : 4,
-  "scheduler" : "simple"
+  "scheduler" : "simple",
+  "num_partition" : 1,
+  "partition": {
+    "core_0": 0
+  }
 }
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index cc01b795..a038cf64 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1389,21 +1389,26 @@ def make_choices(self, nodes, kernel_name):
         initial_tile_size = self.kernel_group.tile_desc.get_tile_size()
         previous_ranges = self.ranges
         for vlane_stride in [2, 4, 8]:
-                os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = str(vlane_stride)
-                previous_tile_size = initial_tile_size
+            os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = str(vlane_stride)
+            previous_tile_size = initial_tile_size
+            increase_dim = 0 # increase the first dimension
+            while previous_tile_size[increase_dim] * 2 <= previous_ranges[increase_dim] and previous_tile_size[increase_dim] <= 2 ** 13:
                 incrase_dim = -1 # only increase the last dimension
                 while previous_tile_size[incrase_dim] * 2 <= previous_ranges[incrase_dim] and previous_tile_size[incrase_dim] <= 2 ** 13:
                     src_code = super().codegen_nodes(nodes, kernel_name)
-                    print(f"[Auto-tune] Trying tile size: {self.kernel_group.tile_desc.get_tile_size()}, vlane_stride: {vlane_stride}")
                     if self.stop_autotune:
                         print(f"[Auto-tune] Skipping autotuning due to enough tile size: {self.kernel_group.tile_desc.get_tile_size()}")
                         break
+                    print(f"[Auto-tune] Trying tile size: {self.kernel_group.tile_desc.get_tile_size()}, vlane_stride: {vlane_stride}")
                     previous_tile_size = self.kernel_group.tile_desc.get_tile_size()
                     self._prepare_simulator_headers(src_code)
                     bench_runner = self.run_bench(nodes, kernel_name, src_code)
                     choices.append((bench_runner, src_code, self.kernel_group))
                     self.reset(f"tile_size_{incrase_dim}")
-                self.reset("vlane_stride")
+                previous_tile_size[incrase_dim] = initial_tile_size[incrase_dim]
+                self.kernel_group.tile_desc.set_tile_size(previous_tile_size)
+                self.reset(f"tile_size_{increase_dim}")
+            self.reset("vlane_stride")
         return choices
 
     def autotune(self, nodes, kernel_name):
@@ -1437,6 +1442,7 @@ def get_cycle(choice):
         with ThreadPoolExecutor(max_workers=5) as executor:
             results = list(executor.map(get_cycle, choices))
         max_idx = results.index(min(results))
+        print(f"[Auto-tune] Optimal tile size: {choices[max_idx][2].tile_desc.get_tile_size()}, vlane_stride: {choices[max_idx][2].tile_desc.vlane_stride}, cycles: {results[max_idx]}")
         optimal_src_code = choices[max_idx][1]
         return optimal_src_code
 
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index cd1a8600..8ab94049 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -479,7 +479,7 @@ def dummy_tile_size():
                 tile_size[0] = 2 * vlane_stride * self.vector_lane
             elif len(tile_size) == 3:
                 tile_size[-1] = self.vector_lane
-                tile_size[-2] = self.vector_lane
+                tile_size[-2] = 2 * self.vector_lane
                 tile_size[-3] = 2
             else:
                 raise NotImplementedError("dummy tile size fail!")

From 4a4151931f46efdf9ede32a1ef709f8dbd2cf395 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 19 Jun 2025 05:35:03 +0000
Subject: [PATCH 345/432] [Fix] Update extra spad cal logic

---
 PyTorchSimBackend/src/Instruction.cc          |  2 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/src/Instruction.cc b/PyTorchSimBackend/src/Instruction.cc
index c4892284..b706ca8f 100644
--- a/PyTorchSimBackend/src/Instruction.cc
+++ b/PyTorchSimBackend/src/Instruction.cc
@@ -74,7 +74,7 @@ std::shared_ptr<std::set<addr_type>> Instruction::get_dram_address(addr_type dra
     tile_size.insert(tile_size.begin(), 1);
 
   while (_stride_list.size() < 4)
-    _stride_list.insert(_stride_list.begin(), 1);
+    _stride_list.insert(_stride_list.begin(), 0);
   if (_is_indirect_mode) {
     spdlog::trace("[Indirect Access] Indirect mode, dump_path: {}", _indirect_index_path);
     load_indirect_index(_indirect_index_path, indirect_index, tile_size);
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index f41c1893..a6b3423b 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -181,6 +181,14 @@ def render(self,
 
         M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1]
         n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
+        # Calculate extra reads: buffer names read by the epilogue nodes, excluding this template's own output
+        n_extra_read = set()
+        if epilogue_nodes is not None:
+          for enode in epilogue_nodes:
+            n_extra_read.update(enode.node.get_read_names())
+          if self.output_node.name in n_extra_read:
+            n_extra_read.remove(self.output_node.name)
+
         nr_rdim = 0
         if (M == 0) or (N == 0) or (K == 0):
             TILE_M, TILE_N, TILE_K = 1, 1, 1
@@ -190,7 +198,7 @@ def render(self,
             template = GEMM_REDUCTION_TEMPLATE
             nr_rdim = 1
         else:
-            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node, min_tile=True)
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, len(n_extra_read), min_tile=True)
             template = GEMM_TEMPLATE
         TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)
         TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)

From d30fef36221e40d06b7faab0fd3861f178b658bc Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 21 Jun 2025 11:26:03 +0000
Subject: [PATCH 346/432] [TOGSim] Change inst issue logic to make it efficient

---
 PyTorchSimBackend/include/Instruction.h | 10 ++++++++--
 PyTorchSimBackend/include/Tile.h        |  6 +++++-
 PyTorchSimBackend/src/Core.cc           | 19 +++++++++++--------
 PyTorchSimBackend/src/Tile.cc           |  1 +
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimBackend/include/Instruction.h b/PyTorchSimBackend/include/Instruction.h
index 6544c930..84b17d7c 100644
--- a/PyTorchSimBackend/include/Instruction.h
+++ b/PyTorchSimBackend/include/Instruction.h
@@ -3,6 +3,7 @@
 #include <robin_hood.h>
 #include <spdlog/fmt/ranges.h>
 #include <spdlog/spdlog.h>
+#include <list>
 #include <numeric>
 
 #include <set>
@@ -18,7 +19,7 @@ typedef uint64_t cycle_type;
 
 std::string opcode_to_string(Opcode opcode);
 
-class Instruction {
+class Instruction : public std::enable_shared_from_this<Instruction> {
  public:
   Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents, addr_type dram_addr,
               std::vector<size_t> tile_size, size_t precision, std::vector<int> &idx_list,
@@ -38,6 +39,9 @@ class Instruction {
   void dec_ready_counter() {
     assert(ready_counter!=0);
     ready_counter--;
+    if (!ready_counter && _owner_ready_queue_ref != nullptr) {
+      _owner_ready_queue_ref->push_back(shared_from_this());
+    }
   }
   size_t get_tile_numel() { return _tile_numel; }
   size_t get_precision() { return _precision; }
@@ -64,6 +68,7 @@ class Instruction {
   void set_free_sram_size(size_t sram_size) { _free_sram_size=sram_size; }
   void* get_owner() { return _owner; }
   void set_owner(void *owner) { _owner = owner;}
+  void set_owner_ready_queue(std::list<std::shared_ptr<Instruction>>* q) { _owner_ready_queue_ref = q; }
   void set_compute_type(int type) { _compute_type = type; }
   int get_compute_type() { return _compute_type; }
   void set_numa_id(int numa_id) { _numa_id = numa_id; }
@@ -90,7 +95,8 @@ class Instruction {
   bool finished=false;
   int subgraph_id;
  private:
-  void *_owner;
+  void *_owner = nullptr;
+  std::list<std::shared_ptr<Instruction>>* _owner_ready_queue_ref = nullptr;
   Opcode opcode;
   cycle_type compute_cycle;
   cycle_type overlapping_cycle;
diff --git a/PyTorchSimBackend/include/Tile.h b/PyTorchSimBackend/include/Tile.h
index 8db245be..d867a037 100644
--- a/PyTorchSimBackend/include/Tile.h
+++ b/PyTorchSimBackend/include/Tile.h
@@ -3,11 +3,12 @@
 
 #include <memory>
 #include <deque>
+#include <list>
 #include "Instruction.h"
 
 class TileSubGraph;
 
-class Tile {
+class Tile : public std::enable_shared_from_this<Tile> {
  public:
   enum class Status {
     INITIALIZED,
@@ -33,6 +34,8 @@ class Tile {
   void finish_tile();
   bool is_ready() { return _ready_counter==0; }
   std::deque<std::shared_ptr<Instruction>>& get_instructions() { return _instructions; } 
+  void enqueue_ready(const std::shared_ptr<Instruction>& inst) { _ready_queue.push_back(inst); }
+  std::list<std::shared_ptr<Instruction>>& get_ready_instructions() { return _ready_queue; }
   void print();
   size_t nr_insts() { return _nr_insts; }
   size_t nr_finshed_insts() { return _nr_finished_insts; }
@@ -53,6 +56,7 @@ class Tile {
   size_t _nr_insts = 0;
   size_t _nr_finished_insts = 0;
   std::deque<std::shared_ptr<Instruction>> _instructions;
+  std::list<std::shared_ptr<Instruction>> _ready_queue;
   std::vector<std::shared_ptr<Tile>> _child_tiles;
   void *_custom_data=NULL;
   bool _stonne_tile=false;
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 5f42ecb0..317908a3 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -20,7 +20,7 @@ Core::Core(uint32_t id, SimulationConfig config)
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
   /* Check SRAM is enough to run tile */
-  return _tiles.size() < 2  && !op->is_stonne_tile();
+  return _tiles.size() < 4  && !op->is_stonne_tile();
 }
 
 void Core::issue(std::shared_ptr<Tile> op) {
@@ -33,6 +33,10 @@ void Core::issue(std::shared_ptr<Tile> op) {
       _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size());
   }
   //_used_sram_size += op->get_required_sram_size();
+  for (const auto& inst : op->get_instructions()) {
+    if (inst->is_ready())
+      op->enqueue_ready(inst);
+  }
   _tiles.push_back(std::move(op));
 }
 
@@ -201,12 +205,12 @@ void Core::cycle() {
   bool issued = false;
 
   for (int i=0; i<_tiles.size() && !issued; i++) {
-    auto& instructions = _tiles[i]->get_instructions();
-    for (int j=0; j<instructions.size(); j++) {
-      auto& inst = instructions.at(j);
+    auto& instructions = _tiles[i]->get_ready_instructions();
+    for (auto it=instructions.begin(); it!=instructions.end();) {
+      auto& inst = *it;
       /* Skip instruction is not ready  */
-      if (!inst->is_ready())
-        continue;
+      //if (!inst->is_ready())
+      //  continue;
 
       switch (inst->get_opcode()) {
         case Opcode::MOVIN:
@@ -269,7 +273,6 @@ void Core::cycle() {
               inst->finish_instruction();
               static_cast<Tile*>(inst->get_owner())->inc_finished_inst();
               _stat_tot_skipped_inst.at(static_cast<size_t>(inst->get_opcode()))++;
-              auto it = instructions.begin() + j; // Position 2 is the third element
               instructions.erase(it);
             } else {
               spdlog::trace("[Core {}][SA {}][{}] {}-{} ISSUED, finsh at {}", _id, _systolic_array_rr, _core_cycle,
@@ -314,10 +317,10 @@ void Core::cycle() {
 
       if (issued) {
         _stat_inst_count.at(static_cast<size_t>(inst->get_opcode()))++;
-        auto it = instructions.begin() + j; // Position 2 is the third element
         instructions.erase(it);
         break;
       }
+      it++;
     }
   }
 
diff --git a/PyTorchSimBackend/src/Tile.cc b/PyTorchSimBackend/src/Tile.cc
index bb166ca0..2e05cb08 100644
--- a/PyTorchSimBackend/src/Tile.cc
+++ b/PyTorchSimBackend/src/Tile.cc
@@ -21,6 +21,7 @@ void Tile::append_instuction(std::shared_ptr<Instruction>& inst) {
   /* Move instructions */
   _nr_insts++;
   inst->set_owner(this);
+  inst->set_owner_ready_queue(&_ready_queue);
   _instructions.push_back(inst);
 }
 

From 5a7a39d836ca0bc2ffe39dec6738ba42ec013c9d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 21 Jun 2025 16:51:29 +0000
Subject: [PATCH 347/432] [TOGSim] Add core dram bw stat

---
 PyTorchSimBackend/include/Core.h | 2 ++
 PyTorchSimBackend/src/Core.cc    | 9 +++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/include/Core.h b/PyTorchSimBackend/include/Core.h
index 6585e222..a3d55fa2 100644
--- a/PyTorchSimBackend/include/Core.h
+++ b/PyTorchSimBackend/include/Core.h
@@ -68,6 +68,7 @@ class Core {
   std::vector<cycle_type> _stat_tot_sa_compute_idle_cycle;
   std::vector<uint64_t> _stat_inst_count;
   std::vector<uint64_t> _stat_tot_skipped_inst;
+  uint64_t _stat_tot_mem_response = 0;
   uint64_t _stat_gemm_inst = 0;
   uint64_t _stat_skip_dma = 0;
   uint64_t _stat_numa_hit = 0;
@@ -79,6 +80,7 @@ class Core {
   cycle_type _stat_tma_idle_cycle = 0;
   cycle_type _stat_vu_compute_idle_cycle = 0;
   std::vector<cycle_type> _stat_sa_compute_idle_cycle;
+  uint64_t _stat_mem_response = 0;
 
   std::vector<std::shared_ptr<Tile>> _tiles;
   std::queue<std::shared_ptr<Tile>> _finished_tiles;
diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 317908a3..4be41a70 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -407,6 +407,7 @@ void Core::push_memory_response(mem_fetch* response) {
       assert(true || "Can't happend...!");
     }
   }
+  _stat_mem_response++;
   delete response;
 }
 
@@ -430,7 +431,8 @@ void Core::print_stats() {
   for (int i=0; i<_num_systolic_array_per_core; i++)
     spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i),
       _stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i));
-  spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle);
+  float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq / (_core_cycle * 1000); // GB/s as labelled in the log below; assumes core_freq is in MHz -- TODO confirm
+  spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle, dram_bw, _stat_tot_mem_response);
   spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id,
     static_cast<float>(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle);
   spdlog::info("Core [{}] : Numa hit count : {}, Numa miss count : {}", _id, _stat_numa_hit, _stat_numa_miss);
@@ -441,6 +443,7 @@ void Core::print_current_stats() {
   std::vector<float> sa_utilization;
   for (int i=0; i<_num_systolic_array_per_core; i++)
     sa_utilization.push_back(static_cast<float>(_stat_sa_compute_cycle.at(i) * 100) / _config.core_print_interval);
+  float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq / (_config.core_print_interval * 1000); // GB/s over the print interval, as labelled in the log; assumes core_freq is in MHz -- TODO confirm
   auto level = spdlog::level::info;
   if(_id != 0)
     level = spdlog::level::debug;
@@ -449,7 +452,7 @@ void Core::print_current_stats() {
   for (int i=0; i<_num_systolic_array_per_core; i++)
     spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i),
       _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i));
-  spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {}", _id, _stat_tma_cycle, _stat_tma_idle_cycle);
+  spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tma_cycle, _stat_tma_idle_cycle, dram_bw, _stat_mem_response);
   spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id,
     static_cast<float>(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle);
   spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle);
@@ -467,9 +470,11 @@ void Core::update_stats() {
   _stat_tot_vu_compute_cycle += _stat_vu_compute_cycle;
   _stat_tot_tma_cycle += _stat_tma_cycle;
   _stat_tot_tma_idle_cycle += _stat_tma_idle_cycle;
+  _stat_tot_mem_response += +_stat_mem_response;
 
   _stat_vu_compute_cycle = 0;
   _stat_tma_cycle = 0;
   _stat_tma_idle_cycle = 0;
   _stat_vu_compute_idle_cycle = 0;
+  _stat_mem_response = 0;
 }
\ No newline at end of file

From 97dc265ddaab48301ed89f400926daf91afef8e0 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 18 Jun 2025 04:34:37 +0000
Subject: [PATCH 348/432] [fix] prevent infinite loop in auto-tune

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index a038cf64..9a3c4148 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1388,12 +1388,16 @@ def make_choices(self, nodes, kernel_name):
         choices = []
         initial_tile_size = self.kernel_group.tile_desc.get_tile_size()
         previous_ranges = self.ranges
+        prevent_infinite_loop = 0
+        if len(initial_tile_size) < 2:
+            return choices # Can't autotune for 1-D tile size
         for vlane_stride in [2, 4, 8]:
             os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = str(vlane_stride)
             previous_tile_size = initial_tile_size
             increase_dim = 0 # increase the first dimension
-            while previous_tile_size[increase_dim] * 2 <= previous_ranges[increase_dim] and previous_tile_size[increase_dim] <= 2 ** 13:
+            while previous_tile_size[increase_dim] * 2 <= previous_ranges[increase_dim] and previous_tile_size[increase_dim] <= 2 ** 13 and prevent_infinite_loop < 10:
                 incrase_dim = -1 # only increase the last dimension
+                prevent_infinite_loop += 1
                 while previous_tile_size[incrase_dim] * 2 <= previous_ranges[incrase_dim] and previous_tile_size[incrase_dim] <= 2 ** 13:
                     src_code = super().codegen_nodes(nodes, kernel_name)
                     if self.stop_autotune:

From 510cad4f8a98fae98b8eaf84541c82a5e42a6064 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 17 Jun 2025 14:17:56 +0000
Subject: [PATCH 349/432] [scripts] breakdown script

---
 scripts/end2end.sh | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/scripts/end2end.sh b/scripts/end2end.sh
index 13755867..7ca5c93d 100755
--- a/scripts/end2end.sh
+++ b/scripts/end2end.sh
@@ -6,6 +6,7 @@ BASE_PATH=$1 # Input as the first argument
 # Initialize the total cycle sum
 total_sum=0
 total_core=0
+total_vector=0
 # Find all backendsim_result folders
 mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result")
 
@@ -29,6 +30,13 @@ for backend_folder in "${backend_folders[@]}"; do
     else
         echo "Error: cannot find core active cycle"
     fi
+    if [[ "$num_cycles" -ge 1 ]]; then
+        # Extract the last active cycle (the vector core's)
+        vector_core_cycle=${active_cycles[$((num_cycles-1))]}
+    else
+        echo "Error: cannot find vector core active cycle"
+    fi
+    echo "file: $file total_cycle: $total_cycle SA core_cycle: $core_cycle vector_core_cycle: $vector_core_cycle"
 
     if [[ -n "$total_cycle" ]]; then
       # Add the total cycle to the total sum
@@ -40,9 +48,15 @@ for backend_folder in "${backend_folders[@]}"; do
       # echo "Adding $total_cycle to total_sum"
       total_core=$((total_core + core_cycle))
     fi
+    if [[ -n "$vector_core_cycle" ]]; then
+      # Add the vector core cycle to the vector core sum
+      # echo "Adding $vector_core_cycle to total_vector"
+      total_vector=$((total_vector + vector_core_cycle))
+    fi
   done
 done
 
 # Print the total cycle sum
 echo "total end2end cycle: $total_sum"
-echo "total core cycle: $total_core"
\ No newline at end of file
+echo "total core cycle: $total_core"
+echo "total vector core cycle: $total_vector"
\ No newline at end of file

From 04dde98d8a6bf787aa1f73eaf7fba9389f2204b6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 21 Jun 2025 16:54:05 +0000
Subject: [PATCH 350/432] [Script] Update chiplet script

---
 scripts/chiplet.sh      |  2 +-
 scripts/chiplet_prep.py |  2 +-
 scripts/chiplet_prep.sh |  1 +
 scripts/chiplet_run.py  | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100644 scripts/chiplet_run.py

diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh
index d6f27853..3dfba3d9 100755
--- a/scripts/chiplet.sh
+++ b/scripts/chiplet.sh
@@ -63,7 +63,7 @@ for CONFIG in "${CONFIG_LIST2[@]}"; do
     OUTPUT_FILE="$RESULTS_DIR/${CONFIG_NAME}_result.txt"
 
     # Run Simulator
-    echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME"
+    # echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME"
     "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" &
     echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\""
 done
diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py
index 9b82ea39..168532f1 100644
--- a/scripts/chiplet_prep.py
+++ b/scripts/chiplet_prep.py
@@ -34,7 +34,7 @@ def custom_matmul(a, b):
     opt_fn = torch.compile(dynamic=False)(custom_matmul)
     res = opt_fn(x1, w1)
     y = custom_matmul(x2, w2)
-    test_result("Matmul Forward", res, y)
+    #test_result("Matmul Forward", res, y)
 
 def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None):
     file_path = os.path.join(dump_path, 'runtime_0000', 'attribute', '0')
diff --git a/scripts/chiplet_prep.sh b/scripts/chiplet_prep.sh
index 6976b198..99fc9b30 100755
--- a/scripts/chiplet_prep.sh
+++ b/scripts/chiplet_prep.sh
@@ -11,4 +11,5 @@ for size in "${sizes[@]}"; do
     export TORCHSIM_FORCE_TIME_N=$((size / 2))
     export TORCHSIM_DUMP_PATH=$(pwd)/chiplet_result/$size
     python3 chiplet_prep.py $size
+    #python3 chiplet_run.py $(pwd)/chiplet_result
 done
\ No newline at end of file
diff --git a/scripts/chiplet_run.py b/scripts/chiplet_run.py
new file mode 100644
index 00000000..e53352e6
--- /dev/null
+++ b/scripts/chiplet_run.py
@@ -0,0 +1,37 @@
+import argparse
+from pathlib import Path
+import os
+
+def list_nested_folders(root_path):
+    root = Path(root_path)
+
+    if not root.exists() or not root.is_dir():
+        print(f"[Error] '{root}' is not a valid directory.")
+        return []
+
+    folders = set()
+    for p in root.rglob('*'):
+        if p.is_dir():
+            rel_depth = len(p.relative_to(root).parts)
+            if rel_depth == 3:
+                folders.add(p)
+
+    return sorted(folders)
+
+def main():
+    parser = argparse.ArgumentParser(description="List folders up to depth 3 and parse arguments.")
+
+    parser.add_argument("path", type=str, help="Root directory to start scanning")
+    parser.add_argument("--index", type=int, default=0, help="Index value (default: 0)")
+    parser.add_argument("--attr", nargs='*', default=["best", "worst"],
+                        help='List of attr (default: ["best", "worst"])')
+
+    args = parser.parse_args()
+    folders = list_nested_folders(args.path)
+    for folder in folders:
+        cmd = f"./chiplet.sh {folder} {args.index} {' '.join(args.attr)}"
+        print(cmd)
+        os.system(cmd)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From b61e369a58edee92d121ed30c71294ade5b2be26 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 30 Jun 2025 01:54:33 +0000
Subject: [PATCH 351/432] [Script] Force tile size for chiplet setting

---
 scripts/stonne_experiment/run.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/stonne_experiment/run.sh b/scripts/stonne_experiment/run.sh
index b856b492..1825817f 100755
--- a/scripts/stonne_experiment/run.sh
+++ b/scripts/stonne_experiment/run.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+export TORCHSIM_FORCE_TIME_M=1024
+export TORCHSIM_FORCE_TIME_K=1024
+export TORCHSIM_FORCE_TIME_N=1024
 python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.json --mode 0 > hetero/big_sparse.log
 python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv3_half.json --mode 1 > hetero/big.log
 python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.json --mode 2 > hetero/hetero.log

From e65cae910f312989fd1b5a7d6295b31776187aad Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 18 Jun 2025 19:17:03 +0000
Subject: [PATCH 352/432] [Example] BERT fusion

---
 tests/Fusion/test_transformer_fusion.py | 212 ++++++++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 tests/Fusion/test_transformer_fusion.py

diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py
new file mode 100644
index 00000000..15bacb39
--- /dev/null
+++ b/tests/Fusion/test_transformer_fusion.py
@@ -0,0 +1,212 @@
+import math
+import copy
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def clones(module, N):
+    "Produce N identical layers."
+    return torch.nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
+
+class my_MultiheadAttention_origin(torch.nn.Module):
+    def __init__(self, h, d_model, dropout=0.1):
+        super(my_MultiheadAttention_origin, self).__init__()
+        assert d_model % h == 0
+        # We assume d_v always equals d_k
+        self.d_k = d_model // h
+        self.h = h
+        self.linears = clones(torch.nn.Linear(d_model, d_model), 4)
+        self.attn = None
+
+    def forward(self, query, key, value):
+        # 1) Do all the linear projections in batch from d_model => h x d_k
+        query, key, value = [
+            lin(x).view(-1, self.h, self.d_k).transpose(0, 1)
+            for lin, x in zip(self.linears, (query, key, value))
+        ]
+
+        # 2) Apply attention on all the projected vectors in batch.
+        scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(self.d_k)
+        p_attn = scores.softmax(dim=-2)
+        x = torch.matmul(value.transpose(-1, -2), p_attn)
+        # 3) "Concat" using a view and apply a final linear.
+        x = (
+            x.view(-1, self.h * self.d_k)
+        )
+        del query
+        del key
+        del value
+        return self.linears[-1](x)
+
+class DecoderBlock_origin(torch.nn.Module):
+    def __init__(self, embed_dim, num_heads):
+        super(DecoderBlock_origin, self).__init__()
+        self.multihead_attn = my_MultiheadAttention_origin(num_heads, embed_dim)
+        self.layer_norm = torch.nn.LayerNorm(embed_dim)
+        self.ffn1 = torch.nn.Linear(embed_dim, embed_dim*4)
+        self.act = torch.nn.ReLU()
+        self.ffn2 = torch.nn.Linear(embed_dim*4, embed_dim)
+
+    def forward(self, x):
+        result = self.multihead_attn(x, x, x).reshape(x.shape)
+        result = self.layer_norm(result+x)
+
+        ffn1_result = self.ffn1(result)
+        act_result = self.act(ffn1_result)
+        ffn2_result = self.ffn2(act_result)
+        return self.layer_norm(ffn2_result + result)
+
+class my_MultiheadAttention(torch.nn.Module):
+    def __init__(self, h, d_model, dropout=0.1):
+        super(my_MultiheadAttention, self).__init__()
+        assert d_model % h == 0
+        # We assume d_v always equals d_k
+        self.d_k = d_model // h
+        self.h = h
+        self.linears = clones(torch.nn.Linear(d_model, d_model), 3)
+        self.attn = None
+
+    def forward(self, query, key, value):
+        # 1) Do all the linear projections in batch from d_model => h x d_k
+        query, key, value = [
+            lin(x).view(-1, self.h, self.d_k).transpose(0, 1)
+            for lin, x in zip(self.linears, (query, key, value))
+        ]
+
+        # 2) Apply attention on all the projected vectors in batch.
+        scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(self.d_k)
+        p_attn = scores.softmax(dim=-2)
+        x = torch.matmul(value.transpose(-1, -2), p_attn)
+        # 3) "Concat" using a view and apply a final linear.
+        x = (
+            x.view(-1, self.h * self.d_k)
+        )
+        del query
+        del key
+        del value
+        return x
+
+class custom_MatmulLayerNorm(torch.nn.Module):
+    def __init__(self, hidden_size, output_size):    # (512, 3072, 768)
+        super(custom_MatmulLayerNorm, self).__init__()
+        self.weight = torch.nn.Parameter(torch.randn(output_size, hidden_size))  # (768, 3072)
+        self.bias = torch.nn.Parameter(torch.randn(output_size))    # (768)
+        self.layer_norm = torch.nn.LayerNorm(output_size)   # 768
+    def forward(self, x, residual):
+        out = torch.matmul(self.weight, x.transpose(-1, -2)) + self.bias[:, None] # (1, 768, 512)
+        return self.layer_norm(out.transpose(-1, -2) + residual)
+
+class DecoderBlock(torch.nn.Module):
+    def __init__(self, embed_dim, num_heads):
+        super(DecoderBlock, self).__init__()
+        self.multihead_attn = my_MultiheadAttention(num_heads, embed_dim)
+        self.layer_norm = torch.nn.LayerNorm(embed_dim)
+        self.ffn1 = torch.nn.Linear(embed_dim, embed_dim*4)
+        self.act = torch.nn.ReLU()
+        self.ffn2 = torch.nn.Linear(embed_dim*4, embed_dim)
+        self.matmulln1 = custom_MatmulLayerNorm(embed_dim, embed_dim)
+        self.matmulln2 = custom_MatmulLayerNorm(embed_dim*4, embed_dim)
+
+    def forward(self, x):
+        result = self.multihead_attn(x, x, x)
+        result = self.matmulln1(result, x)
+
+        ffn1_result = self.ffn1(result)
+        act_result = self.act(ffn1_result)
+        return self.matmulln2(act_result, result)
+
+def test_DecoderBlock(device, head=12, embed_dim=768, input_seq=512):
+    cpu_query = torch.randn(input_seq, embed_dim)
+    decoder_block = DecoderBlock(embed_dim, head)
+    cpu_res = decoder_block(cpu_query)
+
+    query = cpu_query.clone().to(device=device)
+    decoder_block.to(device=device)
+    with torch.no_grad():
+        opt_fn = torch.compile(dynamic=False)(decoder_block)
+        res = opt_fn(query)
+
+    test_result("Decoder Block Forwrad", res, cpu_res)
+
+def test_Attention(device, head=16, seq=512, d_k=64):
+    def attention(query, key, value):
+        import math
+        d_k = query.size(-1)
+        scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(d_k)
+        p_attn = scores.softmax(dim=-2)
+        return torch.matmul(value.transpose(-1, -2), p_attn)
+
+    torch.manual_seed(0)
+    query = torch.randn(head, seq, d_k).to(device=device)
+    key = torch.randn(head, seq, d_k).to(device=device)
+    value = torch.randn(head, seq, d_k).to(device=device)
+
+    opt_fn = torch.compile(dynamic=False)(attention)
+    res = opt_fn(query, key, value)
+
+    cpu_res = attention(query.cpu(), key.cpu(), value.cpu())
+    test_result("Attention Forward", res, cpu_res)
+
+def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
+    MHA = my_MultiheadAttention(num_heads, embed_dim)
+    cpu_query = torch.randn(input_seq, embed_dim)
+    cpu_res = MHA(cpu_query, cpu_query, cpu_query)
+
+    query = cpu_query.clone().to(device=device)
+    MHA.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(MHA)
+    res = opt_fn(query, query, query)
+
+    test_result("MHA Forward", res, cpu_res)
+
+def test_DecoderBlock_validation(head=12, embed_dim=768, input_seq=512):
+    bert_origin = DecoderBlock_origin(embed_dim, head)
+    bert = DecoderBlock(embed_dim, head)
+
+    bert.multihead_attn.linears[0].weight = bert_origin.multihead_attn.linears[0].weight
+    bert.multihead_attn.linears[0].bias = bert_origin.multihead_attn.linears[0].bias
+    bert.multihead_attn.linears[1].weight = bert_origin.multihead_attn.linears[1].weight
+    bert.multihead_attn.linears[1].bias = bert_origin.multihead_attn.linears[1].bias
+    bert.multihead_attn.linears[2].weight = bert_origin.multihead_attn.linears[2].weight
+    bert.multihead_attn.linears[2].bias = bert_origin.multihead_attn.linears[2].bias
+    bert.ffn1.weight = bert_origin.ffn1.weight
+    bert.ffn1.bias = bert_origin.ffn1.bias
+    bert.matmulln1.weight = torch.nn.Parameter(bert_origin.multihead_attn.linears[-1].weight)
+    bert.matmulln1.bias = torch.nn.Parameter(bert_origin.multihead_attn.linears[-1].bias)
+    bert.matmulln2.weight = torch.nn.Parameter(bert_origin.ffn2.weight)
+    bert.matmulln2.bias = torch.nn.Parameter(bert_origin.ffn2.bias)
+
+    origin_query = torch.randn(input_seq, embed_dim)
+    query = origin_query.clone()
+    origin_res = bert_origin(origin_query)
+    res = bert(query)
+
+    test_result("Decoder Block Validation", res, origin_res)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_DecoderBlock(device)
+    # test_DecoderBlock_validation()
+    # test_Attention(device, head=16, seq=512, d_k=64)
+    # test_MHA(device, num_heads=12, embed_dim=768)

From 29e2a42ca71483ccc4f20c537bc96de46c8671f4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 21 Jun 2025 07:37:55 +0000
Subject: [PATCH 353/432] [Gem5] Boost vector computation

---
 gem5_script/vpu_config.py | 55 ++++++++++++++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 10 deletions(-)

diff --git a/gem5_script/vpu_config.py b/gem5_script/vpu_config.py
index 01437381..eeeaefab 100644
--- a/gem5_script/vpu_config.py
+++ b/gem5_script/vpu_config.py
@@ -160,6 +160,16 @@ class MinorCustomFUPool(MinorFUPool):
         MinorCustomMemFU(),
         MinorCustomMiscFU(),
 
+        # Scalar unit
+        MinorFPUnit(),
+        MinorCustomIntFU(),
+        MinorCustomIntFU(),
+        MinorCustomIntMulFU(),
+        MinorCustomIntDivFU(),
+        MinorCustomPredFU(),
+        MinorCustomMemFU(),
+        MinorCustomMiscFU(),
+
         # Matmul unit
         SystolicArray(), # 0
  
@@ -183,22 +193,47 @@ class MinorCustomFUPool(MinorFUPool):
         MinorVecDivider(),
         MinorVecReduce(),
 
+        # Vector
+        MinorVecConfig(), # 1 for vector config
+        MinorVecConfig(),
+        MinorVecMisc(),
+        MinorVecMisc(),
+        MinorVecLdStore(),
+        MinorVecLdStore(),
+
+        # Vector ALU0
+        MinorVecAdder(), # 6
+        MinorVecMultiplier(), # 7
+        MinorVecDivider(), # 8
+        MinorVecReduce(),
+
+        # Vector ALU1
+        MinorVecAdder(), # 18 ~ 29
+        MinorVecMultiplier(),
+        MinorVecDivider(),
+        MinorVecReduce(),
+
         # SFU
         SpecialFunctionUnit(),
     ]
 
 class RiscvVPU(RiscvMinorCPU):
-    fetch2InputBufferSize = 2
-    decodeInputBufferSize = 1
-    decodeInputWidth = 1
-    executeInputWidth = 8
-    executeIssueLimit = 8
-    executeCommitLimit = 8
+    fetch1FetchLimit = 8
+    decodeInputWidth = 8
+    fetch1ToFetch2BackwardDelay = 0
+    fetch2InputBufferSize = 8
+    decodeInputBufferSize = 8
+    decodeInputWidth = 8
+    executeInputBufferSize = 128
+    executeInputWidth = 12
+    executeIssueLimit = 12
+    executeCommitLimit = 12
+
     # Memory
-    executeMemoryIssueLimit = 2
-    executeMemoryCommitLimit = 2
-    executeMaxAccessesInMemory = 2
-    executeLSQMaxStoreBufferStoresPerCycle = 2
+    executeMemoryIssueLimit = 8
+    executeMemoryCommitLimit = 8
+    executeMaxAccessesInMemory = 8
+    executeLSQMaxStoreBufferStoresPerCycle = 8
     executeLSQTransfersQueueSize = 8
     executeLSQStoreBufferSize = 8
 

From 1b27b4c82ce050eb367f019770c2a075f58ca342 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Sun, 15 Jun 2025 09:28:09 +0000
Subject: [PATCH 354/432] [Frontend/Fusion] Prologue fusion implementation
 working

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 106 +++++++-
 .../mlir/mlir_codegen_backend.py              |   1 -
 PyTorchSimFrontend/mlir/mlir_common.py        |   3 +
 PyTorchSimFrontend/mlir/mlir_conv_template.py |   2 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  33 ++-
 .../mlir/mlir_maxpool_template.py             |   2 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  86 ++++++-
 PyTorchSimFrontend/mlir/mlir_template.py      | 238 +++++++++++++++---
 tests/Fusion/test_prologue_fusion.py          |  81 ++++++
 9 files changed, 503 insertions(+), 49 deletions(-)
 create mode 100644 tests/Fusion/test_prologue_fusion.py

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index d6917cad..85631adb 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -83,8 +83,77 @@
 }
 """
 
+BMM_PROLOGUE_TEMPLATE = r"""
+// BMM Prologue kernel
+// BATCH = {{ B }}
+// M = {{ M }}
+// N = {{ N }}
+// K = {{ K }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// SUB_TILE_M = {{ SUB_TILE_M }}
+// SUB_TILE_N = {{ SUB_TILE_N }}
+#map0 = affine_map<(d0, d1, d2) -> ({{ X_map }})>
+#map1 = affine_map<(d0, d1, d2) -> ({{ W_map }})>
+#map2 = affine_map<(d0, d1, d2) -> (d0 * {{ M * N }} + d1 * {{ N }} + d2)>
+memref.global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+memref.global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+memref.global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
+  %c_mvin = arith.constant 2 : index
+  %c_mvin2 = arith.constant 1 : index{% if Bias %}
+  %c_mvin3 = arith.constant 14 : index{% endif %}
+  %c_mvout = arith.constant 3 : index
+  %vstride = arith.constant 1 : index
+  %axis = arith.constant 2 : index
+  %X_buffer = memref.get_global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+  %W_buffer = memref.get_global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+  %Y_buffer = memref.get_global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+  %tag = memref.alloc() : memref<1xi32>
+  %tag0 = memref.alloc() : memref<1xi32>
+  %tag1 = memref.alloc() : memref<1xi32>
+  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+  %c0 = arith.constant 0 : index
+{{ kernel.def_local_vars() }}
+  affine.for %b=0 to {{ B }} {
+    affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
+      affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
+        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+
+        %index2 = affine.apply #map2(%b, %t_m, %t_n)
+        {% if Bias -%}
+        memref.dma_start %Bias[
+        {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
+          ], %Y_buffer2D[0, 0], %c_mvin3, %tag0[%c0], %
+        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
+          , %vstride : memref<
+        {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
+          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1 , {{ TILE_M }}] }
+        {%- else -%}
+        affine.vector_store %v0, %Y_buffer2D[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+        affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
+          %index0 = affine.apply #map0(%b, %t_m, %t_k)
+          %index1 = affine.apply #map1(%b, %t_k, %t_n)
+          {{kernel.prepare_input(indent_size=10)}}
+          linalg.matmul ins(%X_buffer2D, %W_buffer2D : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
+                  outs(%Y_buffer2D : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
+        } { accumulation_loop=true }
+        memref.dma_start %Y_buffer[%c0, %c0, %c0], %Y[%index2], %c_mvout, %tag[%c0], %axis, %vstride : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ B * M * N }}xf32>, memref<1xi32> { padding=0, sram_stride=[1, 1, {{ TILE_M }}] }
+      } { outer_loop=true }
+    } { outer_loop=true }
+  } { outer_loop=true }
+  return
+}
+"""
+
 BMM_REDUCTION_TEMPLATE = r"""
-// BMM kernel
+// BMM Reduction kernel
 // BATCH = {{ B }}
 // M = {{ M }}
 // N = {{ N }}
@@ -166,6 +235,7 @@ def render(self,
                kernel: MLIRTemplateKernel,
                template_buffer_node = None,
                epilogue_nodes: Optional[List[IRNode]] = None,
+               prologue_nodes: Optional[List[IRNode]] = None,
                **kwargs):
         if template_buffer_node is not None:
             self.output_node = template_buffer_node
@@ -192,13 +262,17 @@ def render(self,
         TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node)
         TOG_latency = M if TILE_M > M else TILE_M
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
-        SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_K = TILE_K if TILE_K < kernel.vector_lane else kernel.vector_lane
+        TILE_K = TILE_K // 2 if prologue_nodes else TILE_K
+        SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
+        SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
+        SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
 
         if n_extra_node==1 and epilogue_nodes[0].is_reduction():
           template = BMM_REDUCTION_TEMPLATE
           nr_rdim = 1
+        elif prologue_nodes:
+          template = BMM_PROLOGUE_TEMPLATE
+          nr_rdim = 0
         else:
           template = BMM_TEMPLATE
           nr_rdim = 0
@@ -229,7 +303,29 @@ def render(self,
             input_reorder = self.input_reorder
         )
 
-        kernel.store_info = dict(
+        kernel.prologue_info = dict (
+            input_sram_var = "X_buffer2D",
+            input_dram_var = "X",
+            input_index_var = "index0",
+            input_tag_var = "tag1",
+            input_numel = B * M * K,
+            input_tile_size = (TILE_M, TILE_K),
+            input_sram_stride = [1, TILE_M],
+            input_subtile_size = (SUB_TILE_M, SUB_TILE_K),
+            weight_sram_var = "W_buffer2D",
+            weight_dram_var = "W",
+            weight_index_var = "index1",
+            weight_tag_var = "tag2",
+            weight_numel = B * K * N,
+            weight_tile_size = (TILE_K, TILE_N),
+            weight_sram_stride = [1, TILE_K],
+            weight_subtile_size = (SUB_TILE_K, SUB_TILE_N),
+            tile_size = (TILE_M, TILE_K),
+            vlane_split_axis = 1,
+            vlane_stride = 1,
+            is_bmm = True,
+        )
+        kernel.epilogue_info = dict(
             output_node = self.output_node.name,
             dependent_buf = [],
             sram_var = "Y_buffer",
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 9a3c4148..1272a46e 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -845,7 +845,6 @@ def __init__(self, kernel_group, reason=None):
         self.reduction_prefix = IndentedBuffer()
         self.reduction_suffix = IndentedBuffer()
         self.applys = IndentedBuffer()
-        self.body = IndentedBuffer()
         self.dma_loads = IndentedBuffer()
         self.dma_stores = IndentedBuffer()
         self.indexed_buffer = IndentedBuffer()
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 8ab94049..c3dc0c51 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -306,6 +306,9 @@ def __init__(self):
     def set_tile_info(self, tile_desc : MLIRMultiDimTile):
         self.tile_desc = tile_desc
 
+    def set_prologue_tile_info(self, tile_desc : MLIRMultiDimTile):
+        self.prologue_tile_desc = tile_desc
+
 class BaseMLIRHardwareInfo():
     def __init__(self):
         # Default HW setting
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 7a3b4b19..0b6d13ef 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -736,7 +736,7 @@ def render(self,
             input_reorder=self.input_reorder
         )
 
-        kernel.store_info = dict(
+        kernel.epilogue_info = dict(
             output_node = self.output_node.name,
             dependent_buf = [],
             sram_var = "output_buffer",
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index a6b3423b..ec1dd9a8 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -64,10 +64,15 @@
       affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
         %index0 = affine.apply #map0(%t_m, %t_k)
         %index1 = affine.apply #map1(%t_k, %t_n)
+        {% if prologue_nodes -%}
+        // prologue nodes
+        {{kernel.prepare_input(indent_size=8)}}
+        {%- else -%}
         memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
            : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
         memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
            : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+        {%- endif %}
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
@@ -160,6 +165,7 @@ def render(self,
                kernel: MLIRTemplateKernel,
                template_buffer_node = None,
                epilogue_nodes: Optional[List[IRNode]] = None,
+               prologue_nodes: Optional[List[IRNode]] = None,
                **kwargs):
         if template_buffer_node is not None:
             self.output_node = template_buffer_node
@@ -236,10 +242,33 @@ def render(self,
             W_map = W_map,
             Y_numel = M * N,
             epilogue_nodes = epilogue_nodes,
+            prologue_nodes = prologue_nodes,
             input_reorder = self.input_reorder
         )
-
-        kernel.store_info = dict(
+        kernel.prologue_info = dict (
+            input_sram_var = "X_buffer",
+            input_dram_var = "X",
+            input_index_var = "index0",
+            input_tag_var = "tag1",
+            input_numel = M * K,
+            input_tile_size = (TILE_M, TILE_K),
+            input_sram_stride = [1, TILE_M],
+            vector_sram_stride = [TILE_M, 1],
+            input_subtile_size = (SUB_TILE_M, SUB_TILE_K),
+            weight_sram_var = "W_buffer",
+            weight_dram_var = "W",
+            weight_index_var = "index1",
+            weight_tag_var = "tag2",
+            weight_numel = K * N,
+            weight_tile_size = (TILE_K, TILE_N),
+            weight_sram_stride = [1, TILE_K],
+            weight_subtile_size = (SUB_TILE_K, SUB_TILE_N),
+            tile_size = (TILE_M, TILE_K),
+            vlane_split_axis = 1,
+            vlane_stride = 1,
+            is_bmm = False,
+        )
+        kernel.epilogue_info = dict(
             output_node = self.output_node.name,
             dependent_buf = [],
             sram_var = "Y_buffer",
diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
index 6a5aafa0..ff617eb4 100644
--- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
@@ -75,7 +75,7 @@ def render(self,
             out_tile=out_tile,
             DATA_STYPE="f32",
         )
-        kernel.store_info = dict(
+        kernel.epilogue_info = dict(
             output_node = self.output_node.name,
             dependent_buf = [],
             sram_var = "Y_buffer",
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index ec8de5a1..a1f39543 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -1,5 +1,7 @@
 import os
 import math
+from functools import reduce
+import operator
 from sympy import symbols, sympify
 from PyTorchSimFrontend import extension_config
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
@@ -41,6 +43,22 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 # We can't fuse dim=-1
                 possible = int(sympify(stride).coeff(target_symbol)) != 1
                 return size_match and possible
+
+            # For prologue fusion case
+            if not node1.is_template() and len(node1.get_nodes())==1 and node2.is_template():
+                # Return false if node2 is Convolution template
+                if node2.get_nodes()[0].node.origin_node.target._name == 'aten::mm' or \
+                    node2.get_nodes()[0].node.origin_node.target._name == 'aten::addmm':
+                    return False
+                if node2.get_nodes()[0].node.origin_node is not None and hasattr(node2.get_nodes()[0].node.origin_node.target, "_name") and node2.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
+                    return False
+                if node1.is_reduction():
+                    return False
+                if len(node1.read_writes.writes) != 1:
+                    return False
+                if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
+                    return True
+
         return self.scheduler.can_fuse_origin(node1, node2)
 
     def _set_flush_status(self, status: bool):
@@ -167,13 +185,51 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size
             wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False)
         return kernel_name
 
-    def codegen_template_code(self, kernel, render, template_node, epilogue_nodes):
+    def codegen_template_code(self, kernel, render, template_node, prologue_nodes, epilogue_nodes):
         with kernel:
-            for node in [template_node, *epilogue_nodes]:
+            for node in [template_node, *prologue_nodes, *epilogue_nodes]:
                 node.mark_run()
             partial_code = render()
-            tile_desc = kernel.set_tile_size(kernel.store_info)
+            tile_desc = kernel.set_tile_size(kernel.epilogue_info)
             kernel.kernel_group.set_tile_info(tile_desc)
+            if prologue_nodes:
+                _, (group, reduction_group) = max(
+                    prologue_nodes, key=lambda x: int(x.is_reduction())
+                ).group
+                tile_desc = kernel.set_tile_size(kernel.prologue_info)
+                kernel.kernel_group.set_prologue_tile_info(tile_desc)
+                vars, reduction_vars = kernel.set_ranges(group, reduction_group)
+            # Flush created varaibles, since template fusion doen't share variable
+            kernel.cse.cache.clear()
+            kernel.prologue_buffer_group.set_buffers()
+            kernel.call_ranges = None
+            kernel.load = kernel.load_prologue
+            kernel.store = kernel.store_prologue
+            for node in prologue_nodes:
+                # Reuse created spad
+                read_list = sorted(list(node.read_writes.reads))
+                if reduce(operator.mul, read_list[-1].size, 1) == template_node.node.get_numel():
+                    prologue_input_arg = read_list[-1].name
+                else:
+                    prologue_input_arg = read_list[0].name
+                prologue_output_arg = list(node.read_writes.writes)[0].name
+                template_buf = self.kernel_group.args.input_buffers[prologue_output_arg]
+                if template_node.get_nodes()[0].node.origin_node.target._name == 'aten::bmm':
+                    target_buf = f"{template_buf}_buffer2D"
+                else:
+                    target_buf = f"{template_buf}_buffer"
+
+                # To skip the dma code gen
+                kernel.buffer_names[prologue_input_arg] = target_buf
+                kernel.buffer_names[prologue_output_arg] = target_buf
+
+                # Edge delete
+                kernel.kernel_group.args.input_buffers = {
+                    (arg if buf != template_buf else prologue_input_arg): buf
+                    for arg, buf in kernel.kernel_group.args.input_buffers.items()
+                }
+                node.codegen((vars, reduction_vars))
+
             if epilogue_nodes:
                 _, (group, reduction_group) = max(
                     epilogue_nodes, key=lambda x: int(x.is_reduction())
@@ -181,9 +237,12 @@ def codegen_template_code(self, kernel, render, template_node, epilogue_nodes):
                 vars, reduction_vars = kernel.set_ranges(group, reduction_group)
             # Flush created varaibles, since template fusion doen't share variable
             kernel.cse.cache.clear()
+            kernel.epilogue_buffer_group.set_buffers()
+            kernel.load = kernel.load_epilogue
+            kernel.store = kernel.store_epilogue
             for node in epilogue_nodes:
                 if template_node.node.name in [dep[0] for dep in list(node.read_writes.reads)]:
-                    kernel.store_info['dependent_buf'].append(node.node.name)
+                    kernel.epilogue_info['dependent_buf'].append(node.node.name)
                 node.codegen((vars, reduction_vars))
         with V.set_kernel_handler(kernel):
             src_code = (
@@ -194,18 +253,29 @@ def codegen_template_code(self, kernel, render, template_node, epilogue_nodes):
         return src_code
 
     def codegen_template(self, template_node, epilogue_nodes):
+        # Handle prologue pattern
+        prologue_nodes = []
+        if not template_node.is_template():
+            epilogue_nodes = [template_node] + epilogue_nodes
+            for i, node in enumerate(epilogue_nodes):
+                if node.is_template():
+                    template_node = node
+                    prologue_nodes = epilogue_nodes[:i]
+                    epilogue_nodes = epilogue_nodes[i+1:]
+                    break
+
         _, (numel, rnumel) = template_node.group
         template_buffer = template_node.node
-        kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group)
+        kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group)
         _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
 
-        src_code = self.codegen_template_code(kernel, render, template_node, epilogue_nodes)
+        src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes)
         wrapper = V.graph.wrapper_code
 
         if src_code in wrapper.src_to_kernel: # [CONV] check inner function is already defined
             kernel_name = wrapper.src_to_kernel[src_code]
-            kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes, kernel_name=kernel_name) # update kernel name
-            src_code = self.codegen_template_code(kernel, render, template_node, epilogue_nodes)
+            kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_name=kernel_name) # update kernel name
+            src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes)
 
         with V.set_kernel_handler(kernel):
             spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index a0537201..6cd06a23 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -25,6 +25,30 @@
 
 from . import mlir_common
 
+class IndentedBufferGroup:  # bundle of IndentedBuffers owned by one MLIRTemplateKernel
+    def __init__(self, kernel: 'MLIRTemplateKernel') -> None:
+        self.kernel = kernel  # kernel whose buffer attributes set_buffers() rebinds
+        self.body = IndentedBuffer()  # not rebound onto the kernel by set_buffers()
+        self.loads = IndentedBuffer()
+        self.compute = IndentedBuffer()
+        self.stores = IndentedBuffer()
+        self.applys = IndentedBuffer()  # not rebound onto the kernel by set_buffers()
+        self.dma_loads = IndentedBuffer()
+        self.dma_stores = IndentedBuffer()
+        self.spad_buffer = IndentedBuffer()
+
+    def set_buffers(self) -> None:  # point the kernel's six working buffers at this group's objects
+        self.kernel.loads = self.loads
+        self.kernel.compute = self.compute
+        self.kernel.stores = self.stores
+        self.kernel.dma_loads = self.dma_loads
+        self.kernel.dma_stores = self.dma_stores
+        self.kernel.spad_buffer = self.spad_buffer
+
+    @contextlib.contextmanager
+    def as_local(self):  # context manager that only yields this group; nothing is swapped or restored
+        yield self
+
 class MLIRTemplateKernel(MLIRKernel, BaseMLIRHardwareInfo):
     def __init__(self,
                  kernel_name,
@@ -54,6 +78,9 @@ def __init__(self,
         self.map_cse = CSE("#", self.suffix, name_prefix="template_map")
         self.const_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_const")
         self.alloc_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_alloc")
+        self.prologue_buffer_group = IndentedBufferGroup(self)
+        self.epilogue_buffer_group = IndentedBufferGroup(self)
+        self.global_vars = IndentedBuffer()
         self.reduction_epilogue_suffix = IndentedBuffer()
         self.reduction_fusion = False
         self.reduction_idx = None
@@ -321,27 +348,48 @@ def call_kernel(self, kernel_name):
             kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}",
             call_args, cuda=False)
 
-    def codegen_body(self):
+    def codegen_prologue_body(self) -> None:  # assemble prologue_buffer_group's pieces into its body
+        with self.prologue_buffer_group.as_local() as buf:
+            buf.body.splice(buf.spad_buffer)  # body order: spad_buffer, applys, dma_loads, then the compute loop
+            buf.body.splice(buf.applys)
+            buf.body.splice(buf.dma_loads)
+
+            if (buf.loads.getvalue() != '' or buf.compute.getvalue() != '' or buf.stores.getvalue() != ''):  # emit the loop only if there is load/compute/store text
+                buf.body.writelines(self.compute_body_loop.lines())
+                compute_body = mlir_common.ParallelLoopBuffer()
+                with contextlib.ExitStack() as stack:
+                    stack.enter_context(compute_body.indent(attribute="{inner_loop=false}"))
+                    compute_body.splice(buf.loads)
+                    compute_body.splice(buf.compute)
+                    compute_body.splice(buf.stores)
+                buf.body.splice(compute_body)
+
+        # Clear buffers. NOTE(review): these are the kernel's currently bound loads/compute/stores (whichever group last ran set_buffers()), not necessarily buf's - confirm the target
+        self.loads.clear()
+        self.compute.clear()
+        self.stores.clear()
+
+    def codegen_epilogue_body(self):
         def template_store():
             zero_cse = self.get_const_cse(0)
-            sram_var = self.store_info["sram_var"]
-            dram_var = self.store_info["dram_var"]
-            index_var = self.store_info["index_var"]
-            tag_var = self.store_info["tag_var"]
-            mlir_dtype = self.store_info["mlir_dtype"]
-            dram_shape = self.store_info["dram_shape"]
+            sram_var = self.epilogue_info["sram_var"]
+            dram_var = self.epilogue_info["dram_var"]
+            index_var = self.epilogue_info["index_var"]
+            tag_var = self.epilogue_info["tag_var"]
+            mlir_dtype = self.epilogue_info["mlir_dtype"]
+            dram_shape = self.epilogue_info["dram_shape"]
             vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
             vlane_stride = self.kernel_group.tile_desc.get_vlane_stride()
-            tile_stride = self.store_info["tile_stride"]
+            tile_stride = self.epilogue_info["tile_stride"]
             tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
             sram_index_var = ",".join([f"%{zero_cse}"] *  self.kernel_group.tile_desc.get_nr_dim())
             code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  tag_var, dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.dma_stores, code, assignment = False)
-        self.body.splice(self.spad_buffer)
-        self.body.splice(self.applys)
-        self.body.splice(self.dma_loads)
-        self.body.writelines(self.compute_body_loop.lines())
+        self.epilogue_buffer_group.body.splice(self.spad_buffer)
+        self.epilogue_buffer_group.body.splice(self.applys)
+        self.epilogue_buffer_group.body.splice(self.dma_loads)
+        self.epilogue_buffer_group.body.writelines(self.compute_body_loop.lines())
         compute_body = mlir_common.ParallelLoopBuffer()
         with contextlib.ExitStack() as stack:
             stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
@@ -349,10 +397,11 @@ def template_store():
             compute_body.splice(self.compute)
             if len(self.stores._lines) == 0:
                 template_store()
-            compute_body.splice(self.stores)
-        self.body.splice(compute_body)
-        self.body.splice(self.dma_stores)
-        self.body.splice(self.reduction_epilogue_suffix)
+            compute_body.splice(self.epilogue_buffer_group.stores)
+        if (compute_body.getvalue()):
+            self.epilogue_buffer_group.body.splice(compute_body)
+        self.epilogue_buffer_group.body.splice(self.dma_stores)
+        self.epilogue_buffer_group.body.splice(self.reduction_epilogue_suffix)
 
         # Clear buffers
         self.loads.clear()
@@ -394,7 +443,7 @@ def def_kernel(
                     extra_node[node.get_name()] = node.node
                 else:
                     extra_node[node.get_name()] = node
-                self.buffer_names[node.get_name()] = self.store_info['sram_var']
+                self.buffer_names[node.get_name()] = self.epilogue_info['sram_var']
 
         def hook():
             arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node)
@@ -439,7 +488,7 @@ def def_conv_kernel(
                 self.kernel_group.args.output_buffers[node.get_name()] = name
                 self.store_buffer_names.add(node.get_name())    #TODO: Is this enough not calling store() in mlir_common.py?
                 self.extra_node[node.get_name()] = node
-                self.buffer_names[node.get_name()] = self.store_info['sram_var']   #TODO: Buffer name fixed
+                self.buffer_names[node.get_name()] = self.epilogue_info['sram_var']   #TODO: Buffer name fixed
 
         def kernel_hook():
             arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=self.extra_node)
@@ -467,6 +516,50 @@ def get_conv_inputs(self):
     def get_conv_outputs(self):
         return {k: v for k, v in self.kernel_group.args.output_buffers.items() if v != 'REMOVED'}
 
+    def prepare_input(self, indent_size: int = 0) -> str:  # registers the "<PREPARE_INPUT>" render hook and returns that placeholder
+        def emit_dma_start(buffer_name, index_var, tag_var, size, tile_size, subtile_size=None, async_flag=True, label="X"):  # format one memref.dma_start line
+            base = f"memref.dma_start %{label}[%{index_var}], %{buffer_name}[%c0, %c0], %c_mvin"
+            if label == "W":  # the "W" operand uses %c_mvin2 instead of %c_mvin
+                base = base.replace("mvin", "mvin2")
+
+            suffix = f"%{tag_var}[%c0], %axis, %vstride"
+            memref_shape = f"memref<{size}xf32>"  # element type is fixed to f32 here
+            tile_shape = "x".join([str(x) for x in tile_size])
+            tile_memref = f"memref<{tile_shape}xf32, 1>"
+            tag_memref = f"memref<1xi32>"
+            attrs = f"sram_stride=[1, {tile_size[0]}]"  # NOTE(review): rebuilt from tile_size[0], not read from prologue_info's *_sram_stride - confirm they always agree
+            async_flag = "true" if async_flag else "false"  # render the bool as lowercase attribute text
+            if subtile_size:
+                subtile_shape = ", ".join([str(x) for x in subtile_size])
+                attrs = f"subtile_size=[{subtile_shape}], async={async_flag}, {attrs}"
+            else:  # no subtile given: use the full tile as the subtile
+                subtile_shape = ", ".join([str(x) for x in tile_size])
+                attrs = f"subtile_size=[{subtile_shape}], async={async_flag}, {attrs}"
+            attr_memref = f"{{ {attrs} }}"
+            return f"{base}, {suffix}: {memref_shape}, {tile_memref}, {tag_memref} {attr_memref}"
+
+        def hook():  # stored in render_hooks below; builds the text for the placeholder
+            code = IndentedBuffer()
+            self.codegen_prologue_body()
+            prologue_code = self.prologue_buffer_group.body
+            if prologue_code.getvalue():
+                code.writeline(emit_dma_start(self.prologue_info["input_sram_var"], self.prologue_info["input_index_var"], self.prologue_info["input_tag_var"],
+                                              self.prologue_info["input_numel"], self.prologue_info["input_tile_size"], subtile_size=self.prologue_info["input_subtile_size"], label="X"))
+                code.writeline(emit_dma_start(self.prologue_info["weight_sram_var"], self.prologue_info["weight_index_var"], self.prologue_info["weight_tag_var"],
+                                              self.prologue_info["weight_numel"], self.prologue_info["weight_tile_size"], subtile_size=self.prologue_info["weight_subtile_size"], label="W"))
+                code.splice(prologue_code)
+            else:  # NOTE(review): emits the same two DMA starts as the branch above (async_flag already defaults to True); only the splice differs
+                code.writeline(emit_dma_start(self.prologue_info["input_sram_var"], self.prologue_info["input_index_var"], self.prologue_info["input_tag_var"],
+                                              self.prologue_info["input_numel"], self.prologue_info["input_tile_size"], self.prologue_info["input_subtile_size"], async_flag=True, label="X"))
+                code.writeline(emit_dma_start(self.prologue_info["weight_sram_var"], self.prologue_info["weight_index_var"], self.prologue_info["weight_tag_var"],
+                                              self.prologue_info["weight_numel"], self.prologue_info["weight_tile_size"], self.prologue_info["weight_subtile_size"], async_flag=True, label="W"))
+            code = textwrap.indent(code.getvalue(), " "*indent_size).strip()
+            return code
+
+        assert "<PREPARE_INPUT>" not in self.render_hooks  # the placeholder may be registered only once per kernel
+        self.render_hooks["<PREPARE_INPUT>"] = hook
+        return "<PREPARE_INPUT>"
+
     def output_name(self):
         # Cannot know the output name from the template, so we need to hook it
         def hook():
@@ -481,8 +574,8 @@ def hook():
 
     def store_output(self, indent_size: int = 0):
         def hook():
-            self.codegen_body()
-            return textwrap.indent(self.body.getvalue(), " "*indent_size).strip()
+            self.codegen_epilogue_body()
+            return textwrap.indent(self.epilogue_buffer_group.body.getvalue(), " "*indent_size).strip()
 
         assert "<STORE_OUTPUT>" not in self.render_hooks
         self.render_hooks["<STORE_OUTPUT>"] = hook
@@ -569,11 +662,92 @@ def get_spad_size_per_lane(self, tile_m, tile_n):
         size = tile_m * ((tile_n + self.vector_lane - 1) // self.vector_lane)
         return max(size, 2) # vector load/store
 
+    def load_prologue(self, name: str, index: sympy.Expr):
+        """Emit the prologue load of `name` (MVIN into a scratchpad first unless one is already mapped) and return the loaded CSE variable."""
+        load_dim = V.graph.graph_inputs[name].layout.size if (not isinstance(V.graph, NullHandler) and name in V.graph.graph_inputs) else []
+        # Bind the split axis on every path; the non-broadcast branch used to leave it unbound for the MVIN below.
+        vlane_split_axis = self.kernel_group.prologue_tile_desc.vlane_split_axis if len(load_dim) != 1 else 0    # FIXME: Fixed split axis for 1d load dim
+        if self.kernel_group.prologue_tile_desc.get_numel() == self.buffer_types[name][1]:
+            index_var = self.prologue_info['input_index_var'] if len(load_dim) != 1 else 'tile_n'
+        else:
+            # Broadcast pattern
+            zero_index = self.const_cse.generate(self.const_buffer, "arith.constant 0 : index")
+            if self.prologue_info['is_bmm']: # FIXME: hardcoded
+                idx = f"%b, %t_m, %{zero_index}"
+                map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0, d1, d2) -> (d0 * 512 + d1)>")
+                vlane_split_axis = 2
+            else:
+                idx = f"%t_m, %{zero_index}"
+                map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0, d1) -> (d0)>")
+            index_var = self.apply_cse.generate(self.dma_loads, f"affine.apply #{map_var}({idx})")
+        index = self.rename_indexing(index)
+        dram_var = self.kernel_group.args.input(name)
+        dtype = V.graph.get_dtype(name)
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        vlane_stride = self.kernel_group.prologue_tile_desc.vlane_stride if len(load_dim) != 1 else 1    # FIXME: Fixed stride for 1d load dim
+        tile_numel_per_lane = self.kernel_group.prologue_tile_desc.get_numel_per_lane()
+        tile_shape = self.kernel_group.prologue_tile_desc.get_mlir_shape(mlir_dtype)
+        tile_stride = self.prologue_info['input_sram_stride']
+
+        # Compute vector unit size
+        vshape = self.kernel_group.prologue_tile_desc.get_mlir_vshape(mlir_dtype)
+        compute_vec_size = self.kernel_group.prologue_tile_desc.get_compute_vec_size()
+
+        if name not in self.buffer_names:
+            # Allocate sram buffer
+            dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
+            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index, self.alloc_buffer)
+            self.buffer_names[name] = sram_var
+            code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                     f"{name}_tag", dram_shape, tile_shape, tile_stride)
+            self.cse.generate(self.dma_loads, code, assignment = False)
+
+        # Load vector from sram
+        sram_var = self.buffer_names[name]
+        zero_var = self.get_const_cse(0)
+        compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.prologue_tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
+
+        if compute_vec_size > 1:
+            operation = "affine.vector_load"
+            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+        else:
+            operation = "affine.load"
+            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
+
+        out = self.cse.generate(self.loads, line)
+        self.register_var_info(out, [compute_vec_size, mlir_dtype])
+        return out
+
+    def store_prologue(self, name: str, index: sympy.Expr, value, *args, **kwargs) -> None:  # `index`, *args and **kwargs are not used in this body
+        dtype = V.graph.get_dtype(name)
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        tile_shape = self.kernel_group.prologue_tile_desc.get_mlir_shape(mlir_dtype)
+
+        # Compute vector unit size
+        vshape = self.kernel_group.prologue_tile_desc.get_mlir_vshape(mlir_dtype)
+        compute_vec_size = self.kernel_group.prologue_tile_desc.get_compute_vec_size()
+
+        sram_var = self.buffer_names[name]  # KeyError unless a buffer is already mapped for `name`
+        zero_var = self.get_const_cse(0)
+
+        _, operand_type = self.var_info[value]
+        if mlir_dtype != operand_type:  # convert the value when its recorded type differs from the buffer's dtype
+            value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
+        compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.prologue_tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
+        # Generate the store instruction (vector form when compute_vec_size > 1, scalar otherwise)
+        if compute_vec_size > 1:
+            operation = "affine.vector_store"
+            line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+        else:
+            operation = "affine.store"
+            line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}"
+        self.stores.writeline(line)  # appended to the kernel's currently bound stores buffer; nothing is returned
+
     def load_epilogue(self, name: str, index: sympy.Expr):
         load_dim = []
         if not isinstance(V.graph, NullHandler) and name in V.graph.graph_inputs:
             load_dim = V.graph.graph_inputs[name].layout.size
-        index_var = self.store_info['index_var'] if len(load_dim) != 1 else 'tile_n'
+        index_var = self.epilogue_info['index_var'] if len(load_dim) != 1 else 'tile_n'
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
@@ -582,7 +756,7 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) != 1 else 1    # FIXME: Fixed stride for 1d load dim
         tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-        tile_stride = self.store_info['tile_stride']
+        tile_stride = self.epilogue_info['tile_stride']
 
         # Compute vector unit size
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
@@ -636,7 +810,7 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
-        index_var = self.store_info['index_var']
+        index_var = self.epilogue_info['index_var']
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -646,7 +820,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-        tile_stride = self.store_info['tile_stride']
+        tile_stride = self.epilogue_info['tile_stride']
 
         # Compute vector unit size
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
@@ -816,13 +990,13 @@ def store_reduction_epilogue(self, name, index, value):
     def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, buffer=None):
         return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, True, buffer=buffer)
 
-    def set_tile_size(self, template_store_info):
-        tile_desc = mlir_common.MLIRMultiDimTile(template_store_info['tile_size'],
+    def set_tile_size(self, template_epilogue_info):
+        tile_desc = mlir_common.MLIRMultiDimTile(template_epilogue_info['tile_size'],
             self.vector_lane,
-            vlane_split_axis=template_store_info['vlane_split_axis'],
-            vlane_stride=template_store_info['vlane_stride'])
+            vlane_split_axis=template_epilogue_info['vlane_split_axis'],
+            vlane_stride=template_epilogue_info['vlane_stride'])
 
-        if 'nr_rdim' in template_store_info and template_store_info['nr_rdim']==1:
+        if 'nr_rdim' in template_epilogue_info and template_epilogue_info['nr_rdim']==1:
             tile_desc.nr_rdim = 1
             numel_per_lane = tile_desc.get_numel_per_lane()
             reduction_axis_size = tile_desc.get_tile_size()[-2]
@@ -832,7 +1006,7 @@ def set_tile_size(self, template_store_info):
             self.reduction_fusion = True
             self.reduction_axis_size =  tile_desc.get_tile_size()[-2]
             self.reduction_nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
-            self.reduction_idx = template_store_info["reduction_idx"]
+            self.reduction_idx = template_epilogue_info["reduction_idx"]
             self.compute_body_loop.size = reduction_axis_size
             self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop
         else:
@@ -890,6 +1064,7 @@ def generate(self, **kwargs) -> ChoiceCaller:
 
         def make_kernel_render(
             template_node: TemplateBuffer,
+            prologue_nodes: Optional[List[IRNode]] = None,
             epilogue_nodes: Optional[List[IRNode]] = None,
             kernel_name: str = kernel_hash_name,
             kernel_group: Optional[mlir_common.MLIRWrapperKenrelGroup] = None
@@ -910,7 +1085,8 @@ def make_kernel_render(
             kwargs = {
                 'kernel': kernel,
                 'template_buffer_node': template_node,
-                'epilogue_nodes': epilogue_nodes
+                'epilogue_nodes': epilogue_nodes,
+                'prologue_nodes': prologue_nodes,
             }
             render = functools.partial(
                 kernel.render,
diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py
new file mode 100644
index 00000000..12098b24
--- /dev/null
+++ b/tests/Fusion/test_prologue_fusion.py
@@ -0,0 +1,81 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_elem_broadcast_fusion(device):
+    def matmul_fused(a, b, c):
+        return torch.matmul(c * a, b)
+    torch.manual_seed(0)
+    input = torch.randn(128, 128)
+    weight = torch.randn(128, 128)
+    c = torch.randn(128, 1, dtype=torch.float32)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    c1 = c.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    c2 = c.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(matmul_fused)
+    res = opt_fn(x1, w1, c1)
+    y = matmul_fused(x2, w2, c2)
+    test_result("Matmul Scalar Fusion Forward", res, y)
+
+def test_elem_fusion(device):
+    def matmul_fused(a, b, c):
+        return torch.matmul(c * a, b)
+    torch.manual_seed(0)
+    input = torch.randn(128, 128)
+    weight = torch.randn(128, 128)
+    c = torch.randn(128, 128, dtype=torch.float32)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    c1 = c.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    c2 = c.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(matmul_fused)
+    res = opt_fn(x1, w1, c1)
+    y = matmul_fused(x2, w2, c2)
+    test_result("Matmul Element-wise Fusion Forward", res, y)
+
+def test_elem_bmm_fusion(device, batch_size=1, m=512, n=512, k=64):
+    def bmm(a, b, c, d):
+        return torch.bmm((a - b)/c , d)
+    torch.manual_seed(0)
+    a = torch.randn(batch_size, m, k).to(device=device)
+    b = torch.randn(batch_size, m, 1).to(device=device)
+    c = torch.randn(batch_size, m, 1) * 1000
+    c = c.to(device=device)
+    d = torch.randn(batch_size, k, n).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(bmm)
+    res = opt_fn(a, b, c, d)
+    out = bmm(a.cpu(), b.cpu(), c.cpu(), d.cpu())
+    print(torch.max(torch.abs(res.cpu() - out)))
+    test_result("BMM Element-wise Fusion Forward", res, out)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_elem_broadcast_fusion(device)
+    test_elem_fusion(device)
+    test_elem_bmm_fusion(device, batch_size=12, m=512, n=64, k=512)
\ No newline at end of file

From 018078ec5ae1a8427a330886e856c4f925356d0d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 17 Jun 2025 21:13:12 +0000
Subject: [PATCH 355/432] [Frontend/Fusion] Optimize BMM+Reduction fusion

---
 PyTorchSimFrontend/mlir/mlir_template.py | 218 ++++++++++-------------
 1 file changed, 99 insertions(+), 119 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 6cd06a23..a9da6e9d 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -83,6 +83,7 @@ def __init__(self,
         self.global_vars = IndentedBuffer()
         self.reduction_epilogue_suffix = IndentedBuffer()
         self.reduction_fusion = False
+        self.reduction_body_loop = None
         self.reduction_idx = None
 
         # Overwrite ops
@@ -386,6 +387,12 @@ def template_store():
             code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  tag_var, dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.dma_stores, code, assignment = False)
+        # Do dma store first to overlap epilogue nodes
+        if self.reduction_fusion:
+            if len(self.stores._lines) == 0:
+                template_store()
+                self.epilogue_buffer_group.body.splice(self.dma_stores)
+                self.dma_stores.clear()
         self.epilogue_buffer_group.body.splice(self.spad_buffer)
         self.epilogue_buffer_group.body.splice(self.applys)
         self.epilogue_buffer_group.body.splice(self.dma_loads)
@@ -393,10 +400,18 @@ def template_store():
         compute_body = mlir_common.ParallelLoopBuffer()
         with contextlib.ExitStack() as stack:
             stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
-            compute_body.splice(self.loads)
-            compute_body.splice(self.compute)
-            if len(self.stores._lines) == 0:
-                template_store()
+            if self.reduction_fusion:
+                #if len(self.stores._lines) == 0:
+                #    template_store()
+                compute_body.writelines(self.reduction_body_loop.lines())
+                stack.enter_context(compute_body.indent(attribute="{inner_loop=false}"))
+                compute_body.splice(self.loads)
+                compute_body.splice(self.compute)
+            else:
+                compute_body.splice(self.loads)
+                compute_body.splice(self.compute)
+                if len(self.stores._lines) == 0:
+                    template_store()
             compute_body.splice(self.epilogue_buffer_group.stores)
         if (compute_body.getvalue()):
             self.epilogue_buffer_group.body.splice(compute_body)
@@ -783,30 +798,22 @@ def load_epilogue(self, name: str, index: sympy.Expr):
                 operation = "affine.load"
                 line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
             out = self.cse.generate(self.loads, line)
+            self.register_var_info(out, [compute_vec_size, mlir_dtype])
         else: # For reduction case
             reduce_size = self.reduction_nr_outer_loop
             vsize = compute_vec_size//reduce_size
             vshape = f"vector<{vsize}x{mlir_dtype}>"
-            flatten_tshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
 
-            init = self.cse.generate(self.loads, f"arith.constant 0.0 : {mlir_dtype}")
-            init_vec = self.cse.generate(self.loads, f"vector.broadcast %{init} : {mlir_dtype} to {flatten_tshape}")
             if compute_vec_size > 1:
-                out_list = []
-                for i in range(reduce_size):
-                    offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0) -> (d0 + {i*(self.reduction_axis_size)})>(%{self.compute_idx})")
-                    compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"])
-                    operation = "affine.vector_load"
-                    line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-                    out = self.cse.generate(self.loads, line)
-                    out_list.append(out)
-                for idx, partial_out in enumerate(out_list):
-                    init_vec = self.cse.generate(self.loads, f"vector.insert_strided_slice %{partial_out}, %{init_vec} {{offsets=[{vsize*idx}],strides=[1]}} : {vshape} into {flatten_tshape}")
-                out = init_vec
+                offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.reduction_axis_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})")
+                compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"])
+                operation = "affine.vector_load"
+                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+                out = self.cse.generate(self.loads, line)
             else:
                 line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
                 out = self.cse.generate(self.loads, line)
-        self.register_var_info(out, [compute_vec_size, mlir_dtype])
+            self.register_var_info(out, [self.compute_body_loop.step, mlir_dtype])
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
@@ -859,91 +866,39 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         if argmax_or_argmin or is_welford_reduction(reduction_type):
             raise NotImplementedError() #TODO: argmin, argmax
 
-        # Prepare reduction loop
-        reduction_key = src_dtype, reduction_type, value
-        acc = self.reduction_cse.generate(
-            self.loads, f"reduction {reduction_key}", write=False
-        )
-        iterator = self.iterator_cse.generate(
-            self.loads, f"reduction {reduction_key}", write=False
-        )
-        init = self.init_cse.generate(
-            self.loads, f"reduction {reduction_key}", write=False
-        )
-        init_vec = self.init_vec_cse.generate(
-            self.loads, f"reduction {reduction_key}", write=False
-        )
+        # Reduction fusion codegen part
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
-        init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
-        vec_len = self.kernel_group.tile_desc.get_compute_vec_size()
-        reduced_shape = self.kernel_group.tile_desc.get_mlir_vshape(type_name)
-
-        # Set accumulation var
-        if vec_len == 1: # 1-D vector to scalar
-            # Edge case for scalar
-            init_vec = init
-        else:
-            # Adjust shape and inital value
-            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
-        acc_var = init_vec
-
-        # Reduction body prepare
-        body_acc = self.reduction_cse.generate(
-            self.compute, f"reduction {reduction_key}body_acc", write=False
-        )
-        body_iter_arg = self.iterator_cse.generate(
-            self.compute, f"reduction {reduction_key}body_iter_arg", write=False
-        )
-        self.register_var_info(body_iter_arg, [vec_len, type_name])
+        vec_size = self.compute_body_loop.step
+        vshape = f"vector<{vec_size}x{type_name}>"
+
+        tile_shape = f"memref<{self.reduction_body_loop.size * self.vector_lane}x{vec_size}x{type_name}, 1>"
+        name = f"{reduction_type}_buffer"
+        index = "dummy_index" # Not used
+        tile_numel_per_lane = self.compute_body_loop.step * self.reduction_body_loop.size
+        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, None, index, self.const_buffer)
+        zero_var = self.get_const_cse(0)
 
-        self.reduction_vars[acc] = (reduction_type, iterator, acc_var, reduced_shape)
-        self.affine_yield[body_acc] = reduced_shape
-        self.reduction_cse.reduction_cache[reduction_key] = acc
-        self.iterator_cse.reduction_cache[reduction_key] = iterator
-        self.init_cse.reduction_cache[reduction_key] = init_vec
+        # Load partial result
+        operation = "affine.vector_load"
+        compute_index_var = ",".join([f"%{self.reduction_loop_idx}"] + [f"%{zero_var}"])
+        line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+        out = self.cse.generate(self.loads, line)
+        self.register_var_info(out, [self.compute_body_loop.step, type_name])
 
         # Reduction body codegen
-        result = reduction_partial_combine_vec(reduction_type, value, body_iter_arg)
-        self.compute_body_loop.reduction_vars[body_acc] = (reduction_type, body_iter_arg, iterator, reduced_shape)
-        self.compute_body_loop.affine_yield[result] = reduced_shape
-
-        # Final reduction
-        reduction_size = self.reduction_nr_outer_loop
-        if vec_len > reduction_size:
-            init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
-            if reduction_size == 1:
-                final_reduced_shape = f"{type_name}"
-                out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, acc, init, axis=0, shape=reduced_shape, reduced_shape=final_reduced_shape))
-            else:
-                final_reduced_shape = f"vector<{reduction_size}x{type_name}>"
-                init_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{init} : {type_name} to {final_reduced_shape}")
-                new_vshape= f"vector<{reduction_size}x{vec_len//reduction_size}x{type_name}>"
-                partial_vshape= f"vector<{vec_len//reduction_size}x{type_name}>"
-                value = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{acc} : {reduced_shape} to {new_vshape}")
-                # FIXME. I want to use N-Rank multi-reduciton, but we can't use it. It lowerd to scalar operations now...
-                for i in range(reduction_size):
-                    partial_value = self.cse.generate(self.reductions_suffix, f"vector.extract %{value}[{i}] : {partial_vshape} from {new_vshape}")
-                    out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, partial_value, init, axis=0, shape=partial_vshape, reduced_shape=type_name))
-                    init_vec = self.cse.generate(self.reductions_suffix, f"vector.insert %{out}, %{init_vec}[{i}] : {type_name} into {final_reduced_shape}")
-                out = init_vec
-            acc = out
-
-        # reigster reduction output
-        var_info = [reduction_size, mlir_common.DTYPE_TO_MLIR[dtype]]
-        self.register_var_info(acc, var_info)
-
-        # Specail handling for fusion
-        self.reduction_epilogue_suffix.writeline(f"affine.yield %{body_acc} : {self.affine_yield[body_acc]}")
-        return acc
+        result = reduction_partial_combine_vec(reduction_type, value, out)
 
-    def store_reduction_epilogue(self, name, index, value):
-        index = self.reduction_idx
-        tmp_cse = self.cse
-        self.cse = self.reduction_cse
+        # Store partial result
+        operation = "affine.vector_store"
+        line = f"{operation} %{result}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+        self.compute.writeline(line) # Need to be placed after partial reduction
+        self.reduction_info = {sram_var : reduction_type}
+        return sram_var
 
+    def store_reduction_epilogue(self, name, index, value):
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
-        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
         index = self.rename_indexing(index)
 
         # Tile is always reuduced in inner loop
@@ -953,40 +908,63 @@ def store_reduction_epilogue(self, name, index, value):
 
         vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - 1
         vlane_stride = self.kernel_group.tile_desc.vlane_stride
-        tile_numel_per_lane = vlane_stride * nr_outer_loop
+        tile_numel_per_lane = vlane_stride * nr_outer_loop * 2
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-        tile_shape = f"memref<{self.kernel_group.tile_desc.get_tile_size()[1]}x{mlir_dtype}, 1>"
+        tile_shape = f"memref<{self.kernel_group.tile_desc.get_tile_size()[1]*2}x{type_name}, 1>"
         tile_stride = [1]
-        compute_vec_size = self.var_info[value][0]
-        if compute_vec_size == 1:
-            vshape = f"{mlir_dtype}"
-        else:
-            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index,
                                                                          index, buffer=self.const_buffer)
+        for i in range(self.reduction_body_loop.size):
+            vec_size = self.compute_body_loop.step
+            vshape = f"vector<{vec_size}x{type_name}>"
+
+            partial_tile_shape = f"memref<{self.reduction_body_loop.size * self.vector_lane}x{vec_size}x{type_name}, 1>"
+            # Load partial result
+            init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(self.reduction_info[value], dtype)} : {type_name}")
+            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {vshape}")
+            zero_var = self.const_cse.generate(self.const_buffer, f"arith.constant {0} : index")
+            index_var = self.const_cse.generate(self.const_buffer, f"arith.constant {i} : index")
+            compute_index_var = ",".join([f"%{index_var}"] + [f"%{zero_var}"])
 
-        if self.welford_reduce_out is not None:
-            raise NotImplementedError()
+            operation = "affine.vector_load"
+            line = f"{operation} %{value}[{compute_index_var}] : {partial_tile_shape}, {vshape}"
+            out = self.cse.generate(self.reductions_suffix, line)
+            operation = "affine.vector_store"
+            line = f"{operation} %{init_vec}, %{value}[{compute_index_var}] : {partial_tile_shape}, {vshape}"
+            self.reductions_suffix.writeline(line)
+
+            # 2 step reduction
+            new_vec_size = 2
+            new_vshape = f"vector<{vec_size//new_vec_size}x{new_vec_size}x{type_name}>"
+            new_reduced_shape = f"vector<{new_vec_size}x{type_name}>"
+            out = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{out} : {vshape} to {new_vshape}")
+            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {new_reduced_shape}")
+            out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(self.reduction_info[value], out, init_vec, axis=0, shape=new_vshape, reduced_shape=new_reduced_shape))
+            out2 = self.cse.generate(self.reductions_suffix, f"vector.shuffle %{out}, %{out} [1, 0] : {new_reduced_shape}, {new_reduced_shape}")
+
+            self.compute, self.reductions_suffix = self.reductions_suffix, self.compute
+            self.register_var_info(out, [new_vec_size, type_name])
+            self.register_var_info(out2, [new_vec_size, type_name])
+            out = reduction_partial_combine_vec(self.reduction_info[value], out, out2)
+            self.compute, self.reductions_suffix = self.reductions_suffix, self.compute
+
+            # Final reduction
+            #final_reduced_shape = type_name
+            #init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(self.reduction_info[value], dtype)} : {type_name}")
+            #out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(self.reduction_info[value], out, init, axis=0, shape=vshape, reduced_shape=final_reduced_shape))
 
-        # Select src type
-        if compute_vec_size == 1:
-            operation = "affine.store"
-            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}"
-        else:
-            operation =  "affine.vector_store"
-            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
-        self.reductions_suffix.writeline(DeferredLine(name, line))
+            operation = "affine.vector_store"
+            line = f"{operation} %{out}, %{sram_var}[%{index_var}] : {tile_shape}, {new_reduced_shape}"
+            self.reductions_suffix.writeline(DeferredLine(name, line))
 
         # MVOUT Encoding
         # Generate DMA instruction
-        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, tile_stride)
+        index_var = "red_idx"
+        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, type_name, dram_var, index_var, sram_var, sram_index_var,
+                                f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.reductions_suffix.writeline(DeferredLine(name, code))
 
-        # Restore origin cse
-        self.cse = tmp_cse
-
     def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, buffer=None):
         return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, True, buffer=buffer)
 
@@ -1001,14 +979,16 @@ def set_tile_size(self, template_epilogue_info):
             numel_per_lane = tile_desc.get_numel_per_lane()
             reduction_axis_size = tile_desc.get_tile_size()[-2]
             nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
-            tile_desc.vec_size = nr_outer_loop * 2 # Why? Emprically selected, other option failed to functionality...
+            tile_desc.vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality...
 
             self.reduction_fusion = True
             self.reduction_axis_size =  tile_desc.get_tile_size()[-2]
             self.reduction_nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
             self.reduction_idx = template_epilogue_info["reduction_idx"]
+            self.reduction_loop_idx = "reduce_loop_idx"
             self.compute_body_loop.size = reduction_axis_size
             self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop
+            self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop)
         else:
             tile_desc.vec_size=64
             self.compute_body_loop.size = tile_desc.get_numel_per_lane()

From 9ce931025a362563e28a06d1b1499d5e16068ba9 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 18 Jun 2025 04:40:43 +0000
Subject: [PATCH 356/432] [Frontend] Optimize attention kernel

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py |  4 ++--
 PyTorchSimFrontend/mlir/mlir_scheduling.py   |  6 +++---
 experiments/BERT.py                          |  5 +++--
 experiments/attention.py                     |  6 +++---
 tests/Fusion/test_attention_fusion.py        |  3 +--
 tests/test_matmul.py                         | 21 ++++++++++++++++++++
 tests/test_transformer.py                    | 10 ++++------
 7 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 85631adb..25858222 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -262,7 +262,7 @@ def render(self,
         TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node)
         TOG_latency = M if TILE_M > M else TILE_M
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
-        TILE_K = TILE_K // 2 if prologue_nodes else TILE_K
+        TILE_K = TILE_K // 4 if prologue_nodes else TILE_K
         SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
         SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
         SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
@@ -320,7 +320,7 @@ def render(self,
             weight_tile_size = (TILE_K, TILE_N),
             weight_sram_stride = [1, TILE_K],
             weight_subtile_size = (SUB_TILE_K, SUB_TILE_N),
-            tile_size = (TILE_M, TILE_K),
+            tile_size = (TILE_K, TILE_N),
             vlane_split_axis = 1,
             vlane_stride = 1,
             is_bmm = True,
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index a1f39543..d41a9128 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -47,9 +47,9 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
             # For prologue fusion case
             if not node1.is_template() and len(node1.get_nodes())==1 and node2.is_template():
                 # Return false if node2 is Convolution template
-                if node2.get_nodes()[0].node.origin_node.target._name == 'aten::mm' or \
-                    node2.get_nodes()[0].node.origin_node.target._name == 'aten::addmm':
-                    return False
+                # if node2.get_nodes()[0].node.origin_node.target._name == 'aten::mm' or \
+                #     node2.get_nodes()[0].node.origin_node.target._name == 'aten::addmm':
+                #     return False
                 if node2.get_nodes()[0].node.origin_node is not None and hasattr(node2.get_nodes()[0].node.origin_node.target, "_name") and node2.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
                     return False
                 if node1.is_reduction():
diff --git a/experiments/BERT.py b/experiments/BERT.py
index e111908e..7086ad9a 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -7,7 +7,8 @@
 
 def run_BERT(size, input_seq, config):
     from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    from tests.test_transformer import DecoderBlock
+    # from tests.test_transformer import DecoderBlock
+    from tests.Fusion.test_transformer_fusion import DecoderBlock
     scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
     device = scheduler.execution_engine.module.custom_device()
 
@@ -35,7 +36,7 @@ def run_BERT(size, input_seq, config):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
diff --git a/experiments/attention.py b/experiments/attention.py
index acfed848..e8f89dac 100644
--- a/experiments/attention.py
+++ b/experiments/attention.py
@@ -10,9 +10,9 @@ def run_attention(size, config):
     def attention(query, key, value):
         import math
         d_k = query.size(-1)
-        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
-        p_attn = scores.softmax(dim=-1)
-        return torch.matmul(p_attn, value)
+        scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(d_k)
+        p_attn = scores.softmax(dim=-2)
+        return torch.matmul(value.transpose(-1, -2), p_attn)
     from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
     scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
     device = scheduler.execution_engine.module.custom_device()
diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py
index a513b0bb..95bdf165 100644
--- a/tests/Fusion/test_attention_fusion.py
+++ b/tests/Fusion/test_attention_fusion.py
@@ -47,8 +47,7 @@ def forward(self, query, key, value):
         x = torch.matmul(value.transpose(-1, -2), p_attn)
         # 3) "Concat" using a view and apply a final linear.
         x = (
-            x.contiguous()
-            .view(-1, self.h * self.d_k)
+            x.view(-1, self.h * self.d_k)
         )
         del query
         del key
diff --git a/tests/test_matmul.py b/tests/test_matmul.py
index 44f70b69..bd219051 100644
--- a/tests/test_matmul.py
+++ b/tests/test_matmul.py
@@ -50,6 +50,27 @@ def custom_matmul(bias, a, b):
     y = custom_matmul(b2, x2, w2)
     test_result("Addmm Forward", res, y)
 
+def test_linear(device, input_size=128, hidden_size=128, output_size=128):
+    def custom_linear(a, b, bias):
+        linear = torch.nn.Linear(hidden_size, output_size)
+        linear.weight = torch.nn.Parameter(b)
+        linear.bias = torch.nn.Parameter(bias)
+        return linear(a)
+    torch.manual_seed(0)
+    input = torch.randn(input_size, hidden_size)
+    weight = torch.randn(output_size, hidden_size)
+    bias = torch.randn(output_size)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    b1 = bias.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    b2 = bias.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(custom_linear)
+    res = opt_fn(x1, w1, b1)
+    y = custom_linear(x2, w2, b2)
+    test_result("Linear Forward", res, y)
+
 if __name__ == "__main__":
     import os
     import sys
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index 83ed5850..82773da2 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -41,14 +41,12 @@ def forward(self, query, key, value):
         ]
 
         # 2) Apply attention on all the projected vectors in batch.
-        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
-        p_attn = scores.softmax(dim=-1)
-        x = torch.matmul(p_attn, value)
+        scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(self.d_k)
+        p_attn = scores.softmax(dim=-2)
+        x = torch.matmul(value.transpose(-1, -2), p_attn)
         # 3) "Concat" using a view and apply a final linear.
         x = (
-            x.transpose(0, 1)
-            .contiguous()
-            .view(-1, self.h * self.d_k)
+            x.view(-1, self.h * self.d_k)
         )
         del query
         del key

From 831dddf9f32c2ae068a0b5db261211875cd424da Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 18 Jun 2025 07:23:46 +0000
Subject: [PATCH 357/432] [Fix] BMM weight fused

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py | 51 +++++++++++---------
 PyTorchSimFrontend/mlir/mlir_template.py     |  4 +-
 tests/Fusion/test_prologue_fusion.py         |  8 +--
 tests/test_transformer.py                    | 10 ++--
 4 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 25858222..41f90864 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -303,28 +303,35 @@ def render(self,
             input_reorder = self.input_reorder
         )
 
-        kernel.prologue_info = dict (
-            input_sram_var = "X_buffer2D",
-            input_dram_var = "X",
-            input_index_var = "index0",
-            input_tag_var = "tag1",
-            input_numel = B * M * K,
-            input_tile_size = (TILE_M, TILE_K),
-            input_sram_stride = [1, TILE_M],
-            input_subtile_size = (SUB_TILE_M, SUB_TILE_K),
-            weight_sram_var = "W_buffer2D",
-            weight_dram_var = "W",
-            weight_index_var = "index1",
-            weight_tag_var = "tag2",
-            weight_numel = B * K * N,
-            weight_tile_size = (TILE_K, TILE_N),
-            weight_sram_stride = [1, TILE_K],
-            weight_subtile_size = (SUB_TILE_K, SUB_TILE_N),
-            tile_size = (TILE_K, TILE_N),
-            vlane_split_axis = 1,
-            vlane_stride = 1,
-            is_bmm = True,
-        )
+        if prologue_nodes:
+          # if Input fused:
+          #   tile_size = (TILE_M, TILE_K)
+          #   input_sram_stride = [1, TILE_M]
+          # elif Weight fused:
+          tile_size = (TILE_K, TILE_N)
+          input_sram_stride = [1, TILE_K]
+          kernel.prologue_info = dict (
+              input_sram_var = "X_buffer2D",
+              input_dram_var = "X",
+              input_index_var = "index0",
+              input_tag_var = "tag1",
+              input_numel = B * M * K,
+              input_tile_size = (TILE_M, TILE_K),
+              input_sram_stride = input_sram_stride,
+              input_subtile_size = (SUB_TILE_M, SUB_TILE_K),
+              weight_sram_var = "W_buffer2D",
+              weight_dram_var = "W",
+              weight_index_var = "index1",
+              weight_tag_var = "tag2",
+              weight_numel = B * K * N,
+              weight_tile_size = (TILE_K, TILE_N),
+              weight_sram_stride = [1, TILE_K],
+              weight_subtile_size = (SUB_TILE_K, SUB_TILE_N),
+              tile_size = tile_size,
+              vlane_split_axis = 1,
+              vlane_stride = 1,
+              is_bmm = True,
+          )
         kernel.epilogue_info = dict(
             output_node = self.output_node.name,
             dependent_buf = [],
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index a9da6e9d..c6893e73 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -687,8 +687,8 @@ def load_prologue(self, name: str, index: sympy.Expr):
             # Broadcast pattern
             zero_index = self.const_cse.generate(self.const_buffer, "arith.constant 0 : index")
             if self.prologue_info['is_bmm']: # FIXME: hardcoded
-                idx = f"%b, %t_m, %{zero_index}"
-                map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0, d1, d2) -> (d0 * 512 + d1)>")
+                idx = f"%b, %t_k, %t_n"
+                map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0, d1, d2) -> (d0 * 512 + d2)>")
                 vlane_split_axis = 2
             else:
                 idx = f"%t_m, %{zero_index}"
diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py
index 12098b24..926782be 100644
--- a/tests/Fusion/test_prologue_fusion.py
+++ b/tests/Fusion/test_prologue_fusion.py
@@ -55,11 +55,11 @@ def matmul_fused(a, b, c):
 
 def test_elem_bmm_fusion(device, batch_size=1, m=512, n=512, k=64):
     def bmm(a, b, c, d):
-        return torch.bmm((a - b)/c , d)
+        return torch.bmm(a , (d - b)/c)
     torch.manual_seed(0)
     a = torch.randn(batch_size, m, k).to(device=device)
-    b = torch.randn(batch_size, m, 1).to(device=device)
-    c = torch.randn(batch_size, m, 1) * 1000
+    b = torch.randn(batch_size, 1, n).to(device=device)
+    c = torch.randn(batch_size, 1, n) * 1000
     c = c.to(device=device)
     d = torch.randn(batch_size, k, n).to(device=device)
     opt_fn = torch.compile(dynamic=False)(bmm)
@@ -78,4 +78,4 @@ def bmm(a, b, c, d):
     device = module.custom_device()
     test_elem_broadcast_fusion(device)
     test_elem_fusion(device)
-    test_elem_bmm_fusion(device, batch_size=12, m=512, n=64, k=512)
\ No newline at end of file
+    test_elem_bmm_fusion(device, batch_size=12, m=64, n=512, k=512)
\ No newline at end of file
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index 82773da2..cfa2a622 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -87,9 +87,9 @@ def test_Attention(device, head=16, seq=512, d_k=64):
     def attention(query, key, value):
         import math
         d_k = query.size(-1)
-        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
-        p_attn = scores.softmax(dim=-1)
-        return torch.matmul(p_attn, value), p_attn
+        scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(d_k)
+        p_attn = scores.softmax(dim=-2)
+        return torch.matmul(value.transpose(-1, -2), p_attn)
 
     torch.manual_seed(0)
     query = torch.randn(head, seq, d_k).to(device=device)
@@ -97,9 +97,9 @@ def attention(query, key, value):
     value = torch.randn(head, seq, d_k).to(device=device)
 
     opt_fn = torch.compile(dynamic=False)(attention)
-    res, p_attn = opt_fn(query, key, value)
+    res = opt_fn(query, key, value)
 
-    cpu_res, cpu_p_attn = attention(query.cpu(), key.cpu(), value.cpu())
+    cpu_res = attention(query.cpu(), key.cpu(), value.cpu())
     test_result("Attention Forward", res, cpu_res)
 
 def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):

From d0108fd891c31021a67b5e84118abb7da3c11588 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 18 Jun 2025 11:14:16 +0000
Subject: [PATCH 358/432] [Frontend/Fusion] Implement matmul+var_mean fusion
 for LayerNorm

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  2 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  7 +--
 PyTorchSimFrontend/mlir/mlir_template.py      | 33 +++++++++---
 tests/Fusion/test_matmul_reduction.py         | 53 ++++++++++++++++++-
 4 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index ec1dd9a8..35132739 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -199,7 +199,7 @@ def render(self,
         if (M == 0) or (N == 0) or (K == 0):
             TILE_M, TILE_N, TILE_K = 1, 1, 1
             template = EMPTY_TEMPLATE
-        elif n_extra_node==1 and epilogue_nodes[0].is_reduction():
+        elif n_extra_node>=1 and epilogue_nodes[0].is_reduction():
             TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node, min_tile=True)
             template = GEMM_REDUCTION_TEMPLATE
             nr_rdim = 1
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index d41a9128..bc0e8560 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -31,13 +31,10 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
         if node1.get_device() == node2.get_device():
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
-            if (node1.is_template() and len(node1.get_nodes())==1 and \
-                (isinstance(node1.node.template, MLIRGemmTemplate) or isinstance(node1.node.template, MLIRBMMTemplate)) and \
+            if (node1.is_template() and (isinstance(node1.get_nodes()[0].node.template, MLIRGemmTemplate) or isinstance(node1.node.template, MLIRBMMTemplate)) and \
                 node2.is_reduction() and len(node2.get_nodes())==1):
                 # For matmul/bmm+reduction case
-                size_match = node1.node.get_size() == node2.node.get_size() + node2.node.get_reduction_size()
-                if len(node1.node.get_size()) == len(node2.node.get_size()):
-                    size_match = node1.node.get_size() == [dim for dim in node2.node.get_size() if dim!=1] + node2.node.get_reduction_size()
+                size_match = reduce(operator.mul, node1.get_nodes()[0].node.get_size(), 1) == reduce(operator.mul, node2.node.get_size(), 1) * reduce(operator.mul, node2.node.get_reduction_size(), 1)
                 stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.node).split("\n") if "r0" in i][1]
                 target_symbol = symbols("r0")
                 # We can't fuse dim=-1
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index c6893e73..935510b6 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -81,10 +81,14 @@ def __init__(self,
         self.prologue_buffer_group = IndentedBufferGroup(self)
         self.epilogue_buffer_group = IndentedBufferGroup(self)
         self.global_vars = IndentedBuffer()
+        # Reduction data structure
         self.reduction_epilogue_suffix = IndentedBuffer()
         self.reduction_fusion = False
         self.reduction_body_loop = None
         self.reduction_idx = None
+        self.reduction_buffer_idx = 0
+        self.reduction_info = {}
+        self.reduction_epilogue_result = {}
 
         # Overwrite ops
         self.load = self.load_epilogue
@@ -863,8 +867,23 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
     def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         argmax_or_argmin = reduction_type in {"argmax", "argmin"}
-        if argmax_or_argmin or is_welford_reduction(reduction_type):
+        if argmax_or_argmin:
             raise NotImplementedError() #TODO: argmin, argmax
+        if is_welford_reduction(reduction_type):
+            if reduction_type == "welford_combine":
+                raise NotImplementedError("welford_combine")
+            else:
+                assert reduction_type == "welford_reduce"
+                type_name = mlir_common.DTYPE_TO_MLIR[dtype]
+                reduction_key = src_dtype, reduction_type, value
+                sum = self.reduction_epilogue(dtype, src_dtype, "sum", value)
+                sqr_sum = self.reduction_epilogue(dtype, src_dtype, "sum", ops.mul(value, value))
+                self.welford_reduce_out = (sum, sqr_sum, None)
+                return sum, sqr_sum, None
+        # Check duplicated reductions
+        reduction_key = src_dtype, reduction_type, value
+        if reduction_key in self.reduction_epilogue_result:
+            return self.reduction_epilogue_result[reduction_key]
 
         # Reduction fusion codegen part
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -872,13 +891,15 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         vshape = f"vector<{vec_size}x{type_name}>"
 
         tile_shape = f"memref<{self.reduction_body_loop.size * self.vector_lane}x{vec_size}x{type_name}, 1>"
-        name = f"{reduction_type}_buffer"
+        name = f"{reduction_type}_buffer{self.reduction_buffer_idx}"
+        self.reduction_buffer_idx += 1
         index = "dummy_index" # Not used
         tile_numel_per_lane = self.compute_body_loop.step * self.reduction_body_loop.size
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, None, index, self.const_buffer)
-        zero_var = self.get_const_cse(0)
+        self.reduction_epilogue_result[reduction_key] = sram_var
 
         # Load partial result
+        zero_var = self.get_const_cse(0)
         operation = "affine.vector_load"
         compute_index_var = ",".join([f"%{self.reduction_loop_idx}"] + [f"%{zero_var}"])
         line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
@@ -892,7 +913,7 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         operation = "affine.vector_store"
         line = f"{operation} %{result}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
         self.compute.writeline(line) # Need to be placed after partial reduction
-        self.reduction_info = {sram_var : reduction_type}
+        self.reduction_info[sram_var] = reduction_type
         return sram_var
 
     def store_reduction_epilogue(self, name, index, value):
@@ -911,7 +932,7 @@ def store_reduction_epilogue(self, name, index, value):
         tile_numel_per_lane = vlane_stride * nr_outer_loop * 2
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-        tile_shape = f"memref<{self.kernel_group.tile_desc.get_tile_size()[1]*2}x{type_name}, 1>"
+        tile_shape = f"memref<{self.kernel_group.tile_desc.get_tile_size()[1]}x{type_name}, 1>"
         tile_stride = [1]
         sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index,
                                                                          index, buffer=self.const_buffer)
@@ -960,7 +981,7 @@ def store_reduction_epilogue(self, name, index, value):
 
         # MVOUT Encoding
         # Generate DMA instruction
-        index_var = "red_idx"
+        index_var = self.reduction_idx
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, type_name, dram_var, index_var, sram_var, sram_index_var,
                                 f"{name}_tag", dram_shape, tile_shape, tile_stride)
         self.reductions_suffix.writeline(DeferredLine(name, code))
diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py
index 9f2cc7f3..07dd914d 100644
--- a/tests/Fusion/test_matmul_reduction.py
+++ b/tests/Fusion/test_matmul_reduction.py
@@ -38,6 +38,55 @@ def matmul_fused(a, b, c):
     test_result("Matmul Reduction Fusion activation", res[0], y[0])
     test_result("Matmul Reduction Fusion reduction", res[1], y[1])
 
+def test_matmul_var_mean(device, size=512):
+    def matmul_fused(a, b, c):
+        result = torch.matmul(a, b.T)
+        var, mean = torch.var_mean(result, dim=-2)
+        return result, var, mean
+    torch.manual_seed(0)
+    N = size
+    input = torch.randn(3072, 768)
+    weight = torch.randn(512, 768)
+    #input = torch.arange(1, N * N + 1, dtype=torch.float32).reshape(N, N).to(dtype=torch.float32)
+    #weight = torch.eye(N, dtype=torch.float32)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    c = 7
+    opt_fn = torch.compile(dynamic=False)(matmul_fused)
+    res = opt_fn(x1, w1, c)
+    y = matmul_fused(x2, w2, c)
+    test_result("Matmul var_mean Fusion activation", res[0], y[0])
+    test_result("Matmul var_mean Fusion reduction", res[1], y[1])
+    test_result("Matmul var_mean Fusion reduction", res[2], y[2])
+
+def test_matmul_add_var_mean(device, size=512):
+    def matmul_fused(a, b, c, d):
+        result = torch.matmul(a, b.T) + c.T
+        var, mean = torch.var_mean(result + d, dim=-2)
+        return result, var, mean
+    torch.manual_seed(0)
+    N = size
+    input = torch.randn(768, 3072)
+    weight = torch.randn(512, 3072)
+    bias = torch.randn(768, 512)
+    residual = torch.randn(768,512)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    b1 = bias.to(device=device)
+    r1 = residual.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    b2 = bias.to("cpu")
+    r2 = residual.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(matmul_fused)
+    res = opt_fn(x1, w1, b1, r1)
+    y = matmul_fused(x2, w2, b2, r2)
+    test_result("Matmul+residual+var_mean Fusion activation", res[0], y[0])
+    test_result("Matmul+residual+var_mean Fusion reduction", res[1], y[1])
+    test_result("Matmul+residual+var_mean Fusion reduction", res[2], y[2])
+
 if __name__ == "__main__":
     import os
     import sys
@@ -46,4 +95,6 @@ def matmul_fused(a, b, c):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_matmul_reduce(device)
+    #test_matmul_reduce(device)
+    test_matmul_var_mean(device)
+    #test_matmul_add_var_mean(device)

From bb2a083e2f15b29902a27ee7acfe349af7b90c9e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 18 Jun 2025 16:41:02 +0000
Subject: [PATCH 359/432] [Temporary] Force it to compile

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 34 ++++++++++++-------
 PyTorchSimFrontend/mlir/mlir_template.py      |  6 ++--
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 35132739..ed1361ff 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -24,6 +24,7 @@
 #map0 = affine_map<(d0, d1) -> ({{ X_map }})>
 #map1 = affine_map<(d0, d1) -> ({{ W_map }})>
 #map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
+#map3 = affine_map<(d0, d1) -> (d0 * {{ N }})>
 memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
@@ -50,14 +51,11 @@
   affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
       %index2 = affine.apply #map2(%t_m, %t_n)
+      %index3 = affine.apply #map2(%t_m, %t_n)
       {%- if Bias %}
-      memref.dma_start %Bias[
-        {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-        ], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
+      memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
-        , %vstride : memref<
-        {%- if Bias_rank == 2 -%}  {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-        xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
+        , %vstride : memref<{{ Bias.data.get_numel() }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
       {%- else %}
       affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
       {%- endif %}
@@ -102,6 +100,7 @@
 #map0 = affine_map<(d0, d1) -> ({{ X_map }})>
 #map1 = affine_map<(d0, d1) -> ({{ W_map }})>
 #map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
+#map3 = affine_map<(d0, d1) -> (d0 * {{ N }})>
 memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
@@ -128,14 +127,11 @@
   affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
     {{kernel.reduction_acc()}} affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {{kernel.reduction_iter_arg()}} {
       %index2 = affine.apply #map2(%t_m, %t_n)
+      %index3 = affine.apply #map2(%t_m, %t_n)
       {%- if Bias %}
-      memref.dma_start %Bias[
-        {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-        ], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
+      memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
-        , %vstride : memref<
-        {%- if Bias_rank == 2 -%}  {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-        xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
+        , %vstride : memref<{{ Bias.data.get_numel() }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
       {%- else %}
       affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
       {%- endif %}
@@ -174,7 +170,6 @@ def render(self,
 
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
-        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
         W_tensor =  empty_strided(W.layout.size, W.layout.stride)
         X_tensor =  empty_strided(X.layout.size, X.layout.stride)
@@ -219,6 +214,18 @@ def render(self,
         TOG_latency = M if SUB_TILE_M > M else SUB_TILE_M
         kernel.loop_size =[TOG_latency, SUB_TILE_N, SUB_TILE_K]
 
+        # Extract Bias info
+        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
+        if Bias is not None:
+          if Bias.data.get_numel() == M*N:
+            Bias_idx = "%index2"
+          elif Bias.data.get_numel() == M:
+            Bias_idx = "%index3"
+          else:
+            Bias_idx = "%t_n"
+        else:
+          Bias_idx = None
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -237,6 +244,7 @@ def render(self,
             W = W,
             Y = Y,
             Bias = Bias,
+            Bias_idx = Bias_idx,
             Bias_rank = len(Bias.data.get_size()) if Bias is not None else 0,
             X_map = X_map,
             W_map = W_map,
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 935510b6..8017a3a5 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -766,13 +766,13 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         load_dim = []
         if not isinstance(V.graph, NullHandler) and name in V.graph.graph_inputs:
             load_dim = V.graph.graph_inputs[name].layout.size
-        index_var = self.epilogue_info['index_var'] if len(load_dim) != 1 else 'tile_n'
+        index_var = self.epilogue_info['index_var'] if len(load_dim) <= 1 else 'tile_n'
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if len(load_dim) != 1 else 0    # FIXME: Fixed split axis for 1d load dim
-        vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) != 1 else 1    # FIXME: Fixed stride for 1d load dim
+        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if len(load_dim) <= 1 else 0    # FIXME: Fixed split axis for 1d load dim
+        vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) <= 1 else 1    # FIXME: Fixed stride for 1d load dim
         tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = self.epilogue_info['tile_stride']

From 519088521059fba16a98ff9b883872cdb7b364c8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 18 Jun 2025 18:21:37 +0000
Subject: [PATCH 360/432] [Frontend/Fusion] Fix&cleanup fusion policy

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py |  3 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py   | 47 +++++++++++---------
 2 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 41f90864..043cd46b 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -267,7 +267,8 @@ def render(self,
         SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
         SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
 
-        if n_extra_node==1 and epilogue_nodes[0].is_reduction():
+        nr_reduction_nodes = [node for node in epilogue_nodes if node.is_reduction()] if epilogue_nodes is not None else []
+        if nr_reduction_nodes:
           template = BMM_REDUCTION_TEMPLATE
           nr_rdim = 1
         elif prologue_nodes:
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index bc0e8560..ffca0d8c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -28,33 +28,40 @@ def __init__(self, scheduler):
         self.max_fusion_size = 5
 
     def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool:
-        if node1.get_device() == node2.get_device():
+        # Extract base template node
+        base_template_node1 = [node for node in node1.get_nodes() if node.is_template()]
+        base_template_node2 = [node for node in node2.get_nodes() if node.is_template()]
+        if node1.get_device() != node2.get_device():
+            return False
+
+        if len(base_template_node1) == 1 and len(base_template_node2) == 0:
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
-            if (node1.is_template() and (isinstance(node1.get_nodes()[0].node.template, MLIRGemmTemplate) or isinstance(node1.node.template, MLIRBMMTemplate)) and \
-                node2.is_reduction() and len(node2.get_nodes())==1):
+            if (isinstance(base_template_node1[0].node.template, MLIRGemmTemplate) or isinstance(base_template_node1[0].node.template, MLIRBMMTemplate)) and node2.is_reduction() and len(node2.get_nodes())==1:
                 # For matmul/bmm+reduction case
                 size_match = reduce(operator.mul, node1.get_nodes()[0].node.get_size(), 1) == reduce(operator.mul, node2.node.get_size(), 1) * reduce(operator.mul, node2.node.get_reduction_size(), 1)
                 stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.node).split("\n") if "r0" in i][1]
                 target_symbol = symbols("r0")
                 # We can't fuse dim=-1
-                possible = int(sympify(stride).coeff(target_symbol)) != 1
-                return size_match and possible
-
-            # For prologue fusion case
-            if not node1.is_template() and len(node1.get_nodes())==1 and node2.is_template():
-                # Return false if node2 is Convolution template
-                # if node2.get_nodes()[0].node.origin_node.target._name == 'aten::mm' or \
-                #     node2.get_nodes()[0].node.origin_node.target._name == 'aten::addmm':
-                #     return False
-                if node2.get_nodes()[0].node.origin_node is not None and hasattr(node2.get_nodes()[0].node.origin_node.target, "_name") and node2.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
-                    return False
-                if node1.is_reduction():
-                    return False
-                if len(node1.read_writes.writes) != 1:
-                    return False
-                if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
-                    return True
+                layout_possible = int(sympify(stride).coeff(target_symbol)) != 1
+                dependecy_check = base_template_node1[0].node.name in node2.node.get_read_names() and len(node2.node.get_read_names()) == 1
+                return size_match and layout_possible and dependecy_check
+
+        # For prologue fusion case
+        if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(base_template_node2) == 1:
+            # Return false if node2 is Convolution template
+            # if node2.get_nodes()[0].node.origin_node.target._name == 'aten::mm' or \
+            #     node2.get_nodes()[0].node.origin_node.target._name == 'aten::addmm':
+            #     return False
+            target_node = base_template_node2[0].node
+            if target_node.origin_node is not None and hasattr(target_node.origin_node.target, "_name") and target_node.origin_node.target._name == 'aten::convolution':
+                return False
+            if node1.is_reduction():
+                return False
+            if len(node1.read_writes.writes) != 1:
+                return False
+            if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
+                return True
 
         return self.scheduler.can_fuse_origin(node1, node2)
 

From e555ab87a0c09f922f4b96a01de3a6ffaa0085b5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 19 Jun 2025 03:02:52 +0000
Subject: [PATCH 361/432] [Frontend/Fusion] Fix prologue target buf selecting
 logic

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index ffca0d8c..4f3c159e 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -212,10 +212,18 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
             for node in prologue_nodes:
                 # Reuse created spad
                 read_list = sorted(list(node.read_writes.reads))
-                if reduce(operator.mul, read_list[-1].size, 1) == template_node.node.get_numel():
-                    prologue_input_arg = read_list[-1].name
-                else:
-                    prologue_input_arg = read_list[0].name
+                candidate_found = False
+                # Why? There is a case that memdep.get_size() != data.get_size()
+                buf_dict = {}
+                buf_dict.update({val.get_name() : val for val in V.graph.graph_inputs.values()})
+                buf_dict.update({val.name : val for val in V.graph.buffers})
+                for candidate_read in read_list:
+                    if reduce(operator.mul, buf_dict[candidate_read.name].get_size(), 1) == node.node.get_numel():
+                        prologue_input_arg = candidate_read.name
+                        candidate_found = True
+                        break
+                assert(candidate_found)
+                assert(len(node.read_writes.writes)==1)
                 prologue_output_arg = list(node.read_writes.writes)[0].name
                 template_buf = self.kernel_group.args.input_buffers[prologue_output_arg]
                 if template_node.get_nodes()[0].node.origin_node.target._name == 'aten::bmm':

From 3fc33e192f2833cd85caee9f6edd17702aa6a988 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Thu, 19 Jun 2025 06:29:44 +0000
Subject: [PATCH 362/432] [Frontend] Optimize fusion tile size

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 2 +-
 PyTorchSimFrontend/mlir/mlir_common.py        | 2 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 9 +++++----
 PyTorchSimFrontend/mlir/mlir_template.py      | 9 +++++----
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 043cd46b..91ba9ba1 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -262,7 +262,7 @@ def render(self,
         TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node)
         TOG_latency = M if TILE_M > M else TILE_M
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
-        TILE_K = TILE_K // 4 if prologue_nodes else TILE_K
+        TILE_K = TILE_K // 2 if prologue_nodes else TILE_K
         SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
         SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
         SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index c3dc0c51..4409ee8e 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -482,7 +482,7 @@ def dummy_tile_size():
                 tile_size[0] = 2 * vlane_stride * self.vector_lane
             elif len(tile_size) == 3:
                 tile_size[-1] = self.vector_lane
-                tile_size[-2] = 2 * self.vector_lane
+                tile_size[-2] = 4 * self.vector_lane
                 tile_size[-3] = 2
             else:
                 raise NotImplementedError("dummy tile size fail!")
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index ed1361ff..050624db 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -12,7 +12,7 @@
 from PyTorchSimFrontend import extension_config
 
 GEMM_TEMPLATE = r"""
-// GEMM kernel
+// GEMM {% if prologue_nodes -%}prologue fused{%- endif %} {% if epilogue_nodes -%}eilogue fused{%- endif %} kernel
 // M = {{ M }}
 // N = {{ N }}
 // K = {{ K }}
@@ -88,7 +88,7 @@
 """
 
 GEMM_REDUCTION_TEMPLATE = r"""
-// GEMM kernel
+// GEMM reduction kernel
 // M = {{ M }}
 // N = {{ N }}
 // K = {{ K }}
@@ -190,16 +190,17 @@ def render(self,
           if self.output_node.name in n_extra_read:
             n_extra_read.remove(self.output_node.name)
 
+        n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0
         nr_rdim = 0
         if (M == 0) or (N == 0) or (K == 0):
             TILE_M, TILE_N, TILE_K = 1, 1, 1
             template = EMPTY_TEMPLATE
         elif n_extra_node>=1 and epilogue_nodes[0].is_reduction():
-            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node, min_tile=True)
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, len(n_extra_read), n_prologue_node, min_tile=True)
             template = GEMM_REDUCTION_TEMPLATE
             nr_rdim = 1
         else:
-            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, len(n_extra_read), min_tile=True)
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, len(n_extra_read), n_prologue_node, min_tile=True)
             template = GEMM_TEMPLATE
         TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)
         TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 8017a3a5..5357979b 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -155,7 +155,7 @@ def gemmini_gemm_mapping(self, M, N, K):
 
         return inner_I, inner_J, inner_K
 
-    def gemm_combination_mapping(self, M, N, K, n_extra_node=0, pad_k=True, min_tile=False):
+    def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False):
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
@@ -183,13 +183,14 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, pad_k=True, min_tile
                 tile_M = i * self.vector_lane if M > self.vector_lane else M_padded
                 for j in tile_N_range:
                     tile_N = j * self.vector_lane if N > self.vector_lane else N_padded
-                    used_spad_size = (tile_M * tile_K + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * self.precision
+                    used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * self.precision
                     weight_size_per_lane = self.get_spad_size_per_lane(tile_K, tile_N)
-                    input_size_per_lane = self.get_spad_size_per_lane(tile_M, tile_K)
+                    input_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_prologue_node), tile_K)
                     output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N)
                     used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
                     n_tile = math.ceil(M / tile_M) * math.ceil(N / tile_N)
-                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and maximize_i_j <= tile_M * tile_N and n_tile >= minimum_n_tile:
+                    check_spad_size = (used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane)
+                    if check_spad_size and maximize_i_j <= tile_M * tile_N and n_tile >= minimum_n_tile and tile_N // tile_M < 10:
                         max_used_spad_size = used_spad_size
                         maximize_i_j = tile_M * tile_N
                         mapping = (tile_M, tile_N, tile_K)

From 66a4c41b6b24002ffd611de5eaf04721324a3ec4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 19 Jun 2025 07:13:12 +0000
Subject: [PATCH 363/432] [Frontend/Fusion] Update 1D load epilogue

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py |  8 +++++---
 PyTorchSimFrontend/mlir/mlir_template.py   | 10 ++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 4f3c159e..14c36dc2 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -39,13 +39,15 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
             if (isinstance(base_template_node1[0].node.template, MLIRGemmTemplate) or isinstance(base_template_node1[0].node.template, MLIRBMMTemplate)) and node2.is_reduction() and len(node2.get_nodes())==1:
                 # For matmul/bmm+reduction case
-                size_match = reduce(operator.mul, node1.get_nodes()[0].node.get_size(), 1) == reduce(operator.mul, node2.node.get_size(), 1) * reduce(operator.mul, node2.node.get_reduction_size(), 1)
+                size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.node.get_size(), 1) * reduce(operator.mul, node2.node.get_reduction_size(), 1)
                 stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.node).split("\n") if "r0" in i][1]
                 target_symbol = symbols("r0")
                 # We can't fuse dim=-1
                 layout_possible = int(sympify(stride).coeff(target_symbol)) != 1
-                dependecy_check = base_template_node1[0].node.name in node2.node.get_read_names() and len(node2.node.get_read_names()) == 1
-                return size_match and layout_possible and dependecy_check
+                # Is node2 a direct user of the template node?
+                dependency_check = node2 in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1
+                dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads])
+                return size_match and layout_possible and dependency_check & dependency_size
 
         # For prologue fusion case
         if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(base_template_node2) == 1:
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 5357979b..201e046b 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -764,16 +764,14 @@ def store_prologue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.stores.writeline(line)
 
     def load_epilogue(self, name: str, index: sympy.Expr):
-        load_dim = []
-        if not isinstance(V.graph, NullHandler) and name in V.graph.graph_inputs:
-            load_dim = V.graph.graph_inputs[name].layout.size
-        index_var = self.epilogue_info['index_var'] if len(load_dim) <= 1 else 'tile_n'
+        is_1d_source = len(index.free_symbols) == 1
+        index_var = self.epilogue_info['index_var'] if not is_1d_source else 'tile_n'
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if len(load_dim) <= 1 else 0    # FIXME: Fixed split axis for 1d load dim
-        vlane_stride = self.kernel_group.tile_desc.vlane_stride if len(load_dim) <= 1 else 1    # FIXME: Fixed stride for 1d load dim
+        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if not is_1d_source else 0    # FIXME: Fixed split axis for 1d load dim
+        vlane_stride = self.kernel_group.tile_desc.vlane_stride if not is_1d_source else 1    # FIXME: Fixed stride for 1d load dim
         tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = self.epilogue_info['tile_stride']

From ee5c1a9a2c57da24b7d28d55293c7cd485ba8688 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 19 Jun 2025 14:19:46 +0000
Subject: [PATCH 364/432] [Frontend] Welford reduction fusion debug

---
 PyTorchSimFrontend/mlir/mlir_template.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 201e046b..d6cdaf06 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -89,6 +89,7 @@ def __init__(self,
         self.reduction_buffer_idx = 0
         self.reduction_info = {}
         self.reduction_epilogue_result = {}
+        self.reduction_mean = []
 
         # Overwrite ops
         self.load = self.load_epilogue
@@ -974,6 +975,26 @@ def store_reduction_epilogue(self, name, index, value):
             #init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(self.reduction_info[value], dtype)} : {type_name}")
             #out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(self.reduction_info[value], out, init, axis=0, shape=vshape, reduced_shape=final_reduced_shape))
 
+            if self.welford_reduce_out is not None:
+                # mean
+                divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(768)} : f32")
+                if self.buffer_types[name][1] > 1:
+                    divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to {new_reduced_shape}")
+                else:
+                    divider_vec = divider
+
+                if self.current_node.node.origin_node: # FIXME: This is a temporary solution
+                    # mean = E(X) / N
+                    self.reduction_mean.append(self.cse.generate(self.reductions_suffix, f"arith.divf %{out}, %{divider_vec} : {new_reduced_shape}"))
+                    out = self.reduction_mean[i]
+                else:
+                    # m2 = (E(X^2) - E(X)^2) * N
+                    sqr_mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{out}, %{divider_vec} : {new_reduced_shape}")
+                    mean_sqr = self.cse.generate(self.reductions_suffix, f"arith.mulf %{self.reduction_mean[i]}, %{self.reduction_mean[i]} : {new_reduced_shape}")
+                    variance = self.cse.generate(self.reductions_suffix, f"arith.subf %{sqr_mean}, %{mean_sqr} : {new_reduced_shape}")
+                    m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec} : {new_reduced_shape}")
+                    out = m2
+
             operation = "affine.vector_store"
             line = f"{operation} %{out}, %{sram_var}[%{index_var}] : {tile_shape}, {new_reduced_shape}"
             self.reductions_suffix.writeline(DeferredLine(name, line))

From 5e70202bc5d63399af665b9f58ff5ded7b16a1cd Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 19 Jun 2025 17:15:12 +0000
Subject: [PATCH 365/432] [Fix] Matmul epilogue fusion

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  5 +++++
 PyTorchSimFrontend/mlir/mlir_template.py      | 10 +++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 050624db..310b92dd 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -25,6 +25,7 @@
 #map1 = affine_map<(d0, d1) -> ({{ W_map }})>
 #map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
 #map3 = affine_map<(d0, d1) -> (d0 * {{ N }})>
+#map4 = affine_map<(d0, d1) -> (d0 + d1 * {{ M }})>
 memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
@@ -52,6 +53,7 @@
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
       %index2 = affine.apply #map2(%t_m, %t_n)
       %index3 = affine.apply #map2(%t_m, %t_n)
+      %index4 = affine.apply #map4(%t_m, %t_n)
       {%- if Bias %}
       memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
@@ -101,6 +103,7 @@
 #map1 = affine_map<(d0, d1) -> ({{ W_map }})>
 #map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
 #map3 = affine_map<(d0, d1) -> (d0 * {{ N }})>
+#map4 = affine_map<(d0, d1) -> (d0 + d1 * {{ M }})>
 memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
 memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
@@ -128,6 +131,7 @@
     {{kernel.reduction_acc()}} affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {{kernel.reduction_iter_arg()}} {
       %index2 = affine.apply #map2(%t_m, %t_n)
       %index3 = affine.apply #map2(%t_m, %t_n)
+      %index4 = affine.apply #map4(%t_m, %t_n)
       {%- if Bias %}
       memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
         {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
@@ -283,6 +287,7 @@ def render(self,
             sram_var = "Y_buffer",
             dram_var = "Y",
             index_var = "index2",
+            t_index_var = "index4", # FIXME: for epilogue transposed input
             tag_var = "tag",
             vlane_split_axis = 1,
             vlane_stride = 1,
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index d6cdaf06..503bc874 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -766,7 +766,15 @@ def store_prologue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
     def load_epilogue(self, name: str, index: sympy.Expr):
         is_1d_source = len(index.free_symbols) == 1
-        index_var = self.epilogue_info['index_var'] if not is_1d_source else 'tile_n'
+        is_transpose = False    # FIXME: Only works for 2d input
+        if len(index.args) == 2:
+            for expr in index.args:
+                if len(expr.args):
+                    if expr.args[1].name == "index0" and expr.args[0] > 1:
+                        is_transpose = True
+                        break
+        key = 't_index_var' if is_transpose else 'index_var'
+        index_var = self.epilogue_info[key] if not is_1d_source else 'tile_n'
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
         dtype = V.graph.get_dtype(name)

From 2c67e9be80e7478bbb0d76097986a5160468e4d2 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 19 Jun 2025 10:00:13 +0000
Subject: [PATCH 366/432] [Frontend] Add a spad reuse feature in the fusion
 kernel

---
 PyTorchSimFrontend/mlir/mlir_template.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 503bc874..166bfc3c 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -90,6 +90,7 @@ def __init__(self,
         self.reduction_info = {}
         self.reduction_epilogue_result = {}
         self.reduction_mean = []
+        self.reuse_buffer_names = {}
 
         # Overwrite ops
         self.load = self.load_epilogue
@@ -797,9 +798,15 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                      f"{name}_tag", dram_shape, tile_shape, tile_stride)
             self.cse.generate(self.dma_loads, code, assignment = False)
+        elif name in self.reuse_buffer_names:
+            sram_var = self.reuse_buffer_names[name]
+            code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                     f"{name}_tag", dram_shape, tile_shape, tile_stride)
+            self.cse.generate(self.dma_loads, code, assignment = False)
+        else:
+            sram_var = self.buffer_names[name]
 
         # Load vector from sram
-        sram_var = self.buffer_names[name]
         zero_var = self.get_const_cse(0)
         if not self.reduction_fusion:
             compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
@@ -1023,6 +1030,9 @@ def set_tile_size(self, template_epilogue_info):
             vlane_split_axis=template_epilogue_info['vlane_split_axis'],
             vlane_stride=template_epilogue_info['vlane_stride'])
 
+        if "reuse_buffer_names" in template_epilogue_info:
+            self.reuse_buffer_names.update(template_epilogue_info["reuse_buffer_names"])
+
         if 'nr_rdim' in template_epilogue_info and template_epilogue_info['nr_rdim']==1:
             tile_desc.nr_rdim = 1
             numel_per_lane = tile_desc.get_numel_per_lane()

From c559bdceb0d92f900c3406c5e2ba7e1105c84cb6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 20 Jun 2025 02:22:18 +0000
Subject: [PATCH 367/432] [Frontend] Fix transposed 1D bias

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 310b92dd..b1d597a0 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -24,7 +24,7 @@
 #map0 = affine_map<(d0, d1) -> ({{ X_map }})>
 #map1 = affine_map<(d0, d1) -> ({{ W_map }})>
 #map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
-#map3 = affine_map<(d0, d1) -> (d0 * {{ N }})>
+#map3 = affine_map<(d0, d1) -> (d0)>
 #map4 = affine_map<(d0, d1) -> (d0 + d1 * {{ M }})>
 memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
@@ -52,12 +52,10 @@
   affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
       %index2 = affine.apply #map2(%t_m, %t_n)
-      %index3 = affine.apply #map2(%t_m, %t_n)
+      %index3 = affine.apply #map3(%t_m, %c0)
       %index4 = affine.apply #map4(%t_m, %t_n)
       {%- if Bias %}
-      memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
-        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
-        , %vstride : memref<{{ Bias.data.get_numel() }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
+      memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], {{ Bias_axis }}, %vstride : memref<{{ Bias.data.get_numel() }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
       {%- else %}
       affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
       {%- endif %}
@@ -102,7 +100,7 @@
 #map0 = affine_map<(d0, d1) -> ({{ X_map }})>
 #map1 = affine_map<(d0, d1) -> ({{ W_map }})>
 #map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
-#map3 = affine_map<(d0, d1) -> (d0 * {{ N }})>
+#map3 = affine_map<(d0, d1) -> (d0)>
 #map4 = affine_map<(d0, d1) -> (d0 + d1 * {{ M }})>
 memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
 memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
@@ -130,12 +128,10 @@
   affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
     {{kernel.reduction_acc()}} affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {{kernel.reduction_iter_arg()}} {
       %index2 = affine.apply #map2(%t_m, %t_n)
-      %index3 = affine.apply #map2(%t_m, %t_n)
+      %index3 = affine.apply #map3(%t_m, %c0)
       %index4 = affine.apply #map4(%t_m, %t_n)
       {%- if Bias %}
-      memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], %
-        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
-        , %vstride : memref<{{ Bias.data.get_numel() }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
+      memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], {{ Bias_axis }}, %vstride : memref<{{ Bias.data.get_numel() }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
       {%- else %}
       affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
       {%- endif %}
@@ -224,12 +220,16 @@ def render(self,
         if Bias is not None:
           if Bias.data.get_numel() == M*N:
             Bias_idx = "%index2"
+            Bias_axis = "%axis"
           elif Bias.data.get_numel() == M:
             Bias_idx = "%index3"
+            Bias_axis = "%axis"
           else:
             Bias_idx = "%t_n"
+            Bias_axis = "%c0"
         else:
           Bias_idx = None
+          Bias_axis = None
 
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
@@ -250,7 +250,7 @@ def render(self,
             Y = Y,
             Bias = Bias,
             Bias_idx = Bias_idx,
-            Bias_rank = len(Bias.data.get_size()) if Bias is not None else 0,
+            Bias_axis = Bias_axis,
             X_map = X_map,
             W_map = W_map,
             Y_numel = M * N,

From d2aa73d8763e1b3bd26e88c1f6b063aec45a2f2a Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 20 Jun 2025 05:03:12 +0000
Subject: [PATCH 368/432] [fix] prologue fusion args & shape

---
 PyTorchSimFrontend/mlir/mlir_common.py     |  1 +
 PyTorchSimFrontend/mlir/mlir_scheduling.py |  9 ++++-----
 PyTorchSimFrontend/mlir/mlir_template.py   | 19 ++++++++++++-------
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 4409ee8e..29ef65c9 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -342,6 +342,7 @@ def __init__(self, kernel_group, reason=None):
         self.buffer_types : dict = None # format: dtype, numel, size, stride
         self.compute_idx = "compute_idx"
         self.compute_body_loop = LoopLevel(self.compute_idx, 1)
+        self.prologue_compute_body_loop = LoopLevel(self.compute_idx, 1)
         self.recodegen = reason # spad overflow, tile size, vlane stride
         self.stop_autotune = False
 
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 14c36dc2..307d5afe 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -200,10 +200,10 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
             kernel.kernel_group.set_tile_info(tile_desc)
             if prologue_nodes:
                 _, (group, reduction_group) = max(
-                    prologue_nodes, key=lambda x: int(x.is_reduction())
+                    [prologue_nodes[-1]], key=lambda x: int(x.is_reduction())
                 ).group
-                tile_desc = kernel.set_tile_size(kernel.prologue_info)
-                kernel.kernel_group.set_prologue_tile_info(tile_desc)
+                prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True)
+                kernel.kernel_group.set_prologue_tile_info(prologue_tile_desc)
                 vars, reduction_vars = kernel.set_ranges(group, reduction_group)
             # Flush created varaibles, since template fusion doen't share variable
             kernel.cse.cache.clear()
@@ -217,10 +217,9 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
                 candidate_found = False
                 # Why? There is a case that memdep.get_size() != data.get_size()
                 buf_dict = {}
-                buf_dict.update({val.get_name() : val for val in V.graph.graph_inputs.values()})
                 buf_dict.update({val.name : val for val in V.graph.buffers})
                 for candidate_read in read_list:
-                    if reduce(operator.mul, buf_dict[candidate_read.name].get_size(), 1) == node.node.get_numel():
+                    if candidate_read.name in buf_dict and reduce(operator.mul, buf_dict[candidate_read.name].get_size(), 1) == node.node.get_numel():
                         prologue_input_arg = candidate_read.name
                         candidate_found = True
                         break
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 166bfc3c..3334d991 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -363,7 +363,7 @@ def codegen_prologue_body(self):
             buf.body.splice(buf.dma_loads)
 
             if (buf.loads.getvalue() != '' or buf.compute.getvalue() != '' or buf.stores.getvalue() != ''):
-                buf.body.writelines(self.compute_body_loop.lines())
+                buf.body.writelines(self.prologue_compute_body_loop.lines())
                 compute_body = mlir_common.ParallelLoopBuffer()
                 with contextlib.ExitStack() as stack:
                     stack.enter_context(compute_body.indent(attribute="{inner_loop=false}"))
@@ -688,19 +688,20 @@ def load_prologue(self, name: str, index: sympy.Expr):
         load_dim = []
         if not isinstance(V.graph, NullHandler) and name in V.graph.graph_inputs:
             load_dim = V.graph.graph_inputs[name].layout.size
-        if self.kernel_group.prologue_tile_desc.get_numel() == self.buffer_types[name][1]:
+        if self.ranges == self.buffer_types[name][2]:
             index_var = self.prologue_info['input_index_var'] if len(load_dim) != 1 else 'tile_n'
+            vlane_split_axis = self.kernel_group.prologue_tile_desc.vlane_split_axis if len(load_dim) != 1 else 0    # FIXME: Fixed split axis for 1d load dim
         else:
             # Broadcast pattern
             zero_index = self.const_cse.generate(self.const_buffer, "arith.constant 0 : index")
             if self.prologue_info['is_bmm']: # FIXME: hardcoded
                 idx = f"%b, %t_k, %t_n"
                 map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0, d1, d2) -> (d0 * 512 + d2)>")
-                vlane_split_axis = 2
+                vlane_split_axis = 2 # 3D GEMM prologue should be loaded by axis 2
             else:
                 idx = f"%t_m, %{zero_index}"
                 map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0, d1) -> (d0)>")
-                vlane_split_axis = self.kernel_group.prologue_tile_desc.vlane_split_axis if len(load_dim) != 1 else 0    # FIXME: Fixed split axis for 1d load dim
+                vlane_split_axis = 1 # 2D GEMM prologue should be loaded by axis 1
             index_var = self.apply_cse.generate(self.dma_loads, f"affine.apply #{map_var}({idx})")
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
@@ -1024,7 +1025,7 @@ def store_reduction_epilogue(self, name, index, value):
     def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, buffer=None):
         return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, True, buffer=buffer)
 
-    def set_tile_size(self, template_epilogue_info):
+    def set_tile_size(self, template_epilogue_info, prologue=False):
         tile_desc = mlir_common.MLIRMultiDimTile(template_epilogue_info['tile_size'],
             self.vector_lane,
             vlane_split_axis=template_epilogue_info['vlane_split_axis'],
@@ -1050,8 +1051,12 @@ def set_tile_size(self, template_epilogue_info):
             self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop)
         else:
             tile_desc.vec_size=64
-            self.compute_body_loop.size = tile_desc.get_numel_per_lane()
-            self.compute_body_loop.step = tile_desc.get_compute_vec_size()
+            if prologue:
+                self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane()
+                self.prologue_compute_body_loop.step = tile_desc.get_compute_vec_size()
+            else:
+                self.compute_body_loop.size = tile_desc.get_numel_per_lane()
+                self.compute_body_loop.step = tile_desc.get_compute_vec_size()
         return tile_desc
 
 class MLIRTemplateCaller(CUDATemplateCaller):

From fa7e57a293ac5e197111d24daf2c08e70a98c3b0 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sat, 21 Jun 2025 05:54:28 +0000
Subject: [PATCH 369/432] [fix] prologue prohibit subtile

---
 PyTorchSimFrontend/mlir/mlir_template.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 3334d991..ed2eb504 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -550,7 +550,7 @@ def emit_dma_start(buffer_name, index_var, tag_var, size, tile_size, subtile_siz
             tile_memref = f"memref<{tile_shape}xf32, 1>"
             tag_memref = f"memref<1xi32>"
             attrs = f"sram_stride=[1, {tile_size[0]}]"
-            async_flag = "true" if async_flag else "false"
+            async_flag = "false"
             if subtile_size:
                 subtile_shape = ", ".join([str(x) for x in subtile_size])
                 attrs = f"subtile_size=[{subtile_shape}], async={async_flag}, {attrs}"
@@ -567,9 +567,9 @@ def hook():
             if prologue_code.getvalue():
                 code.writeline(emit_dma_start(self.prologue_info["input_sram_var"], self.prologue_info["input_index_var"], self.prologue_info["input_tag_var"],
                                               self.prologue_info["input_numel"], self.prologue_info["input_tile_size"], subtile_size=self.prologue_info["input_subtile_size"], label="X"))
+                code.splice(prologue_code)
                 code.writeline(emit_dma_start(self.prologue_info["weight_sram_var"], self.prologue_info["weight_index_var"], self.prologue_info["weight_tag_var"],
                                               self.prologue_info["weight_numel"], self.prologue_info["weight_tile_size"], subtile_size=self.prologue_info["weight_subtile_size"], label="W"))
-                code.splice(prologue_code)
             else:
                 code.writeline(emit_dma_start(self.prologue_info["input_sram_var"], self.prologue_info["input_index_var"], self.prologue_info["input_tag_var"],
                                               self.prologue_info["input_numel"], self.prologue_info["input_tile_size"], self.prologue_info["input_subtile_size"], async_flag=True, label="X"))

From 75de3d4939f95f390c7d387c55dd18f56f181eb4 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Sat, 21 Jun 2025 10:17:49 +0000
Subject: [PATCH 370/432] [Validation] manual gemm tile size & fix tiling for
 double buffering

---
 PyTorchSimFrontend/extension_config.py        |  8 +++++
 .../mlir/mlir_codegen_backend.py              |  2 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 31 ++++++++++++++---
 PyTorchSimFrontend/mlir/mlir_template.py      | 34 ++++++++++++++++---
 validation/gemm_tpuv3_cheatsheet.json         | 17 ++++++++++
 5 files changed, 82 insertions(+), 10 deletions(-)
 create mode 100644 validation/gemm_tpuv3_cheatsheet.json

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 17fa74d9..e461cc85 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -57,6 +57,14 @@
 CONFIG_FORCE_TILE_N = int(os.environ.get("TORCHSIM_FORCE_TIME_N", default=sys.maxsize))
 CONFIG_FORCE_TILE_K = int(os.environ.get("TORCHSIM_FORCE_TIME_K", default=sys.maxsize))
 
+# For GEMM tile size
+CONFIG_MANUAL_TILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False))
+CONFIG_TILE_M = int(os.environ.get('TORCHSIM_TILE_M', default=CONFIG_VECTOR_LANE))
+CONFIG_TILE_N = int(os.environ.get('TORCHSIM_TILE_N', default=CONFIG_VECTOR_LANE))
+CONFIG_TILE_K = int(os.environ.get('TORCHSIM_TILE_K', default=CONFIG_VECTOR_LANE))
+CONFIG_GEMM_CHEATSHEET_PATH = os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH',
+                                            default=f"{CONFIG_TORCHSIM_DIR}/validation/gemm_tpuv3_cheatsheet.json")
+
 # SRAM Buffer allocation plan
 def load_plan_from_module(module_path):
     if module_path is None:
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 1272a46e..d091b3eb 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1393,7 +1393,7 @@ def make_choices(self, nodes, kernel_name):
         for vlane_stride in [2, 4, 8]:
             os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = str(vlane_stride)
             previous_tile_size = initial_tile_size
-            increase_dim = 0 # increase the first dimension
+            increase_dim = -2 # increase the first dimension
             while previous_tile_size[increase_dim] * 2 <= previous_ranges[increase_dim] and previous_tile_size[increase_dim] <= 2 ** 13 and prevent_infinite_loop < 10:
                 incrase_dim = -1 # only increase the last dimension
                 prevent_infinite_loop += 1
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index b1d597a0..a70efb21 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -1,4 +1,5 @@
 import os
+import json
 from torch import empty_strided
 from typing import List, Optional, cast
 
@@ -192,20 +193,42 @@ def render(self,
 
         n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0
         nr_rdim = 0
-        if (M == 0) or (N == 0) or (K == 0):
+        # Determine tile size
+        # case 1: use cheat sheet
+        if extension_config.CONFIG_GEMM_CHEATSHEET_PATH is not None:
+            try:
+              with open(extension_config.CONFIG_GEMM_CHEATSHEET_PATH, "r") as f:
+                data = json.load(f)
+            except FileNotFoundError:
+                data = {}
+        gemm_shape = f"{M}_{K}_{N}"
+        if gemm_shape in data:
+            tile_info = data[gemm_shape]
+            TILE_M = tile_info["TILE_M"]
+            TILE_N = tile_info["TILE_N"]
+            TILE_K = tile_info["TILE_K"]
+        else: # case 2: use gemm_combination_mapping
+            min_tile = (n_extra_node + n_prologue_node) == 0
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=min_tile)
+        # case 3: use manual tile size
+        if extension_config.CONFIG_MANUAL_TILE_SIZE:
+            TILE_M = extension_config.CONFIG_TILE_M
+            TILE_N = extension_config.CONFIG_TILE_N
+            TILE_K = extension_config.CONFIG_TILE_K
+
+        if (M == 0) or (N == 0) or (K == 0): # exception for MoE
             TILE_M, TILE_N, TILE_K = 1, 1, 1
             template = EMPTY_TEMPLATE
         elif n_extra_node>=1 and epilogue_nodes[0].is_reduction():
-            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, len(n_extra_read), n_prologue_node, min_tile=True)
             template = GEMM_REDUCTION_TEMPLATE
             nr_rdim = 1
         else:
-            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, len(n_extra_read), n_prologue_node, min_tile=True)
             template = GEMM_TEMPLATE
+
         TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)
         TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
         TILE_K = min(extension_config.CONFIG_FORCE_TILE_K, TILE_K)
-        SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
+        SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane
         if (TILE_M == M and TILE_N == N):
             SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
         else: # Avoid Row Conflict of weights
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index ed2eb504..a72342bc 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -23,6 +23,7 @@
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, reduction_partial_combine_vec, reduction_combine_vec, is_welford_reduction
 from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode
 
+from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR
 from . import mlir_common
 
 class IndentedBufferGroup:
@@ -162,8 +163,7 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
         max_spad_per_lane = spad_size_per_lane // 2 # double buffer
-        force_double_buffer = 2 if n_extra_node > 0 else 1 # In fusion case, double buffer should be forced
-        minimum_n_tile = self.num_cores * force_double_buffer if min_tile else 1
+        minimum_n_tile = self.num_cores * 2 if min_tile else 1
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
         k_pad_factor = self.vector_lane if K > self.vector_lane else (8 if pad_k else 1)
@@ -179,7 +179,31 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p
         tile_N_range = sympy.divisors(indexJ) if N > self.vector_lane else [1]
         tile_K_range = sympy.divisors(indexK) if K > self.vector_lane else [1]
         maximize_i_j = 1 # reuse weight
-        for k in tile_K_range:
+        for k in tile_K_range: # store tile candidates for manual mapping
+            tile_K = k * self.vector_lane if K > self.vector_lane else K_padded
+            for i in tile_M_range:
+                tile_M = i * self.vector_lane if M > self.vector_lane else M_padded
+                for j in tile_N_range:
+                    tile_N = j * self.vector_lane if N > self.vector_lane else N_padded
+                    used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * self.precision
+                    weight_size_per_lane = self.get_spad_size_per_lane(tile_K, tile_N)
+                    input_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_prologue_node), tile_K)
+                    output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N)
+                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                    check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane)
+                    if check_spad_size:
+                        file_path = f"{CONFIG_TORCHSIM_DIR}/validation/gemm_candidates/gemm_{M}_{K}_{N}.txt"
+                        line_to_write = f"{tile_M} {tile_K} {tile_N}\n"
+                        try:
+                            with open(file_path, "r") as f:
+                                lines = f.readlines()
+                        except FileNotFoundError:
+                            lines = []
+                        if line_to_write not in lines:
+                            with open(file_path, "a") as f:
+                                f.write(line_to_write)
+
+        for k in tile_K_range: # heuristic search
             tile_K = k * self.vector_lane if K > self.vector_lane else K_padded
             for i in tile_M_range:
                 tile_M = i * self.vector_lane if M > self.vector_lane else M_padded
@@ -191,8 +215,8 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p
                     output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N)
                     used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
                     n_tile = math.ceil(M / tile_M) * math.ceil(N / tile_N)
-                    check_spad_size = (used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane)
-                    if check_spad_size and maximize_i_j <= tile_M * tile_N and n_tile >= minimum_n_tile and tile_N // tile_M < 10:
+                    check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane)
+                    if check_spad_size and max_used_spad_size < used_spad_size and maximize_i_j <= tile_M * tile_N and n_tile >= minimum_n_tile and tile_N // tile_M < 10:
                         max_used_spad_size = used_spad_size
                         maximize_i_j = tile_M * tile_N
                         mapping = (tile_M, tile_N, tile_K)
diff --git a/validation/gemm_tpuv3_cheatsheet.json b/validation/gemm_tpuv3_cheatsheet.json
new file mode 100644
index 00000000..76a26e1a
--- /dev/null
+++ b/validation/gemm_tpuv3_cheatsheet.json
@@ -0,0 +1,17 @@
+{
+    "512_2048_8192" : {
+        "TILE_M" : 512,
+        "TILE_K" : 512,
+        "TILE_N" : 1024
+    },
+    "512_2048_2048" : {
+        "TILE_M" : 512,
+        "TILE_K" : 512,
+        "TILE_N" : 1024
+    },
+    "2048_2048_512" : {
+        "TILE_M" : 1024,
+        "TILE_K" : 512,
+        "TILE_N" : 512
+    }
+}
\ No newline at end of file

From ed77bbaf0b1fb99973b953b3dbe9638350a03929 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 30 Jun 2025 04:07:40 +0000
Subject: [PATCH 371/432] [experiments] FG DMA experiments

---
 PyTorchSimFrontend/extension_config.py        |  5 ++++
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 26 +++++++++++++----
 scripts/CompilerOpt_experiment/DMAopt.sh      | 28 +++++++++++++++++++
 3 files changed, 53 insertions(+), 6 deletions(-)
 create mode 100644 scripts/CompilerOpt_experiment/DMAopt.sh

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index e461cc85..15413103 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -64,6 +64,11 @@
 CONFIG_TILE_K = int(os.environ.get('TORCHSIM_TILE_K', default=CONFIG_VECTOR_LANE))
 CONFIG_GEMM_CHEATSHEET_PATH = os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH',
                                             default=f"{CONFIG_TORCHSIM_DIR}/validation/gemm_tpuv3_cheatsheet.json")
+CONFIG_SUBTILE = int(os.environ.get('TORCHSIM_SUBTILE', default=True))
+CONFIG_MANUAL_SUBTILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False))
+CONFIG_SUBTILE_M = int(os.environ.get('TORCHSIM_SUBTILE_M', default=CONFIG_VECTOR_LANE))
+CONFIG_SUBTILE_N = int(os.environ.get('TORCHSIM_SUBTILE_N', default=CONFIG_VECTOR_LANE))
+CONFIG_SUBTILE_K = int(os.environ.get('TORCHSIM_SUBTILE_K', default=CONFIG_VECTOR_LANE))
 
 # SRAM Buffer allocation plan
 def load_plan_from_module(module_path):
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index a70efb21..3ac8154a 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -228,13 +228,27 @@ def render(self,
         TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)
         TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
         TILE_K = min(extension_config.CONFIG_FORCE_TILE_K, TILE_K)
-        SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane
-        if (TILE_M == M and TILE_N == N):
-            SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-        else: # Avoid Row Conflict of weights
+
+        # Calculate Sub Tile Size for fine-grained DMA
+        if extension_config.CONFIG_SUBTILE:
+            # Case 1: adjust selective fine-grained DMA (SFG-DMA)
+            SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane
+            if (TILE_M == M and TILE_N == N and TILE_N <= 512):
+                SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+            else: # Avoid Row Conflict of weights
+                SUB_TILE_N = TILE_N
+            SUB_TILE_K = TILE_K if TILE_K > 1024 else kernel.vector_lane
+            # Case 2: use manual sub tile size (FG-DMA)
+            if extension_config.CONFIG_MANUAL_SUBTILE_SIZE:
+                SUB_TILE_M = extension_config.CONFIG_SUBTILE_M
+                SUB_TILE_N = extension_config.CONFIG_SUBTILE_N
+                SUB_TILE_K = extension_config.CONFIG_SUBTILE_K
+        # Case 3: None Subtile
+        else:
+            SUB_TILE_M = TILE_M
             SUB_TILE_N = TILE_N
-        SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N # FIXME: hardcoded & 126 line has same feature
-        SUB_TILE_K = TILE_K
+            SUB_TILE_K = TILE_K
+
         TOG_latency = M if SUB_TILE_M > M else SUB_TILE_M
         kernel.loop_size =[TOG_latency, SUB_TILE_N, SUB_TILE_K]
 
diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh
new file mode 100644
index 00000000..469cf766
--- /dev/null
+++ b/scripts/CompilerOpt_experiment/DMAopt.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json"
+
+# None FG DMA
+export TORCHSIM_SUBTILE=0
+python experiments/gemm.py --size 128 128 128
+python experiments/gemm.py --size 256 256 256
+python experiments/gemm.py --size 512 512 512
+python experiments/gemm.py --size 1024 1024 1024
+python experiments/gemm.py --size 2048 2048 2048
+
+# FG DMA
+export TORCHSIM_SUBTILE=1
+export TORCHSIM_MANUAL_SUBTILE_SIZE=1
+python experiments/gemm.py --size 128 128 128
+python experiments/gemm.py --size 256 256 256
+python experiments/gemm.py --size 512 512 512
+python experiments/gemm.py --size 1024 1024 1024
+python experiments/gemm.py --size 2048 2048 2048
+
+# SFG DMA
+export TORCHSIM_SUBTILE=1
+export TORCHSIM_MANUAL_SUBTILE_SIZE=0
+python experiments/gemm.py --size 128 128 128
+python experiments/gemm.py --size 256 256 256
+python experiments/gemm.py --size 512 512 512
+python experiments/gemm.py --size 1024 1024 1024
+python experiments/gemm.py --size 2048 2048 2048
\ No newline at end of file

From db2d505ef8a0827ff986a669a8e1be4f2370673a Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 30 Jun 2025 04:08:33 +0000
Subject: [PATCH 372/432] [Fix] prohibit multi-thread for CI

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d091b3eb..fee5702a 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1442,7 +1442,7 @@ def get_cycle(choice):
 
         if len(choices) == 0: # can't autotune
             return None
-        with ThreadPoolExecutor(max_workers=5) as executor:
+        with ThreadPoolExecutor(max_workers=1) as executor:
             results = list(executor.map(get_cycle, choices))
         max_idx = results.index(min(results))
         print(f"[Auto-tune] Optimal tile size: {choices[max_idx][2].tile_desc.get_tile_size()}, vlane_stride: {choices[max_idx][2].tile_desc.vlane_stride}, cycles: {results[max_idx]}")

From 3e6daf3e27db24917afbf993f5b394ff34bfd66a Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 30 Jun 2025 04:31:34 +0000
Subject: [PATCH 373/432] [Fix] minimum tile size and subtile K

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_template.py      | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 3ac8154a..bfd0633b 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -237,7 +237,7 @@ def render(self,
                 SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
             else: # Avoid Row Conflict of weights
                 SUB_TILE_N = TILE_N
-            SUB_TILE_K = TILE_K if TILE_K > 1024 else kernel.vector_lane
+            SUB_TILE_K = TILE_K
             # Case 2: use manual sub tile size (FG-DMA)
             if extension_config.CONFIG_MANUAL_SUBTILE_SIZE:
                 SUB_TILE_M = extension_config.CONFIG_SUBTILE_M
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index a72342bc..1db14e27 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -163,7 +163,7 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p
         spad_size = spad_size_per_lane * self.vector_lane
         max_spad_size = spad_size // 2 # double buffer
         max_spad_per_lane = spad_size_per_lane // 2 # double buffer
-        minimum_n_tile = self.num_cores * 2 if min_tile else 1
+        minimum_n_tile = self.num_cores if min_tile else 1
         m_pad_factor = self.vector_lane if M > self.vector_lane else 8
         n_pad_factor = self.vector_lane if N > self.vector_lane else 8
         k_pad_factor = self.vector_lane if K > self.vector_lane else (8 if pad_k else 1)
@@ -214,9 +214,9 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p
                     input_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_prologue_node), tile_K)
                     output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N)
                     used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
-                    n_tile = math.ceil(M / tile_M) * math.ceil(N / tile_N)
+                    n_tile = math.ceil(M / max(tile_M, 128)) * math.ceil(N / max(tile_N, 128))
                     check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane)
-                    if check_spad_size and max_used_spad_size < used_spad_size and maximize_i_j <= tile_M * tile_N and n_tile >= minimum_n_tile and tile_N // tile_M < 10:
+                    if check_spad_size and max_used_spad_size < used_spad_size and maximize_i_j <= tile_M * tile_N and n_tile >= minimum_n_tile and max(tile_N, 128) // max(tile_M, 128) < 10:
                         max_used_spad_size = used_spad_size
                         maximize_i_j = tile_M * tile_N
                         mapping = (tile_M, tile_N, tile_K)

From 29ee378a24999e3d5a4e3adb1857ec2cb22f24b1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 30 Jun 2025 04:55:44 +0000
Subject: [PATCH 374/432] [Frontend] Make fusion optional

---
 PyTorchSimFrontend/extension_config.py     | 4 ++++
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 15413103..b8c2b3b4 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -70,6 +70,10 @@
 CONFIG_SUBTILE_N = int(os.environ.get('TORCHSIM_SUBTILE_N', default=CONFIG_VECTOR_LANE))
 CONFIG_SUBTILE_K = int(os.environ.get('TORCHSIM_SUBTILE_K', default=CONFIG_VECTOR_LANE))
 
+# Advanced fusion options
+CONFIG_FUSION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION', default=False))
+CONFIG_FUSION_PROLOGUE = int(os.environ.get('TORCHSIM_FUSION_PROLOGUE', default=False))
+
 # SRAM Buffer allocation plan
 def load_plan_from_module(module_path):
     if module_path is None:
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 307d5afe..a526d17c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -34,7 +34,7 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
         if node1.get_device() != node2.get_device():
             return False
 
-        if len(base_template_node1) == 1 and len(base_template_node2) == 0:
+        if len(base_template_node1) == 1 and len(base_template_node2) == 0 and extension_config.CONFIG_FUSION_REDUCTION:
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
             if (isinstance(base_template_node1[0].node.template, MLIRGemmTemplate) or isinstance(base_template_node1[0].node.template, MLIRBMMTemplate)) and node2.is_reduction() and len(node2.get_nodes())==1:
@@ -50,7 +50,7 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 return size_match and layout_possible and dependency_check & dependency_size
 
         # For prologue fusion case
-        if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(base_template_node2) == 1:
+        if extension_config.CONFIG_FUSION_PROLOGUE and len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(base_template_node2) == 1:
             # Return false if node2 is Convolution template
             # if node2.get_nodes()[0].node.origin_node.target._name == 'aten::mm' or \
             #     node2.get_nodes()[0].node.origin_node.target._name == 'aten::addmm':

From af6e63d633e9bd418241408173f877fd57f3bf03 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 30 Jun 2025 08:27:22 +0000
Subject: [PATCH 375/432] [Frontend] Use kernel name from define_kernel

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index a526d17c..41264a74 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -138,10 +138,10 @@ def codegen_nodes(self, nodes):
         ex_kernel = self.target_kernel(kernel_group=self.kernel_group)
         ex_kernel.kernel_group = self.kernel_group
 
-        kernel_name = f"extension_kernel_{MLIRScheduling.count}"
+        kernel_name_candidate = f"extension_kernel_{MLIRScheduling.count}"
         MLIRScheduling.count += 1
-        src_code = ex_kernel.codegen_nodes(nodes, kernel_name)
-        self.define_kernel(src_code, kernel_name, ex_kernel.vector_lane,
+        src_code = ex_kernel.codegen_nodes(nodes, kernel_name_candidate)
+        kernel_name = self.define_kernel(src_code, kernel_name_candidate, ex_kernel.vector_lane,
                            ex_kernel.spad_info, origins= {str(i) for i in nodes[0].node.origins})
         ex_kernel.call_kernel(kernel_name)
         _, args, _, _ = ex_kernel.args.mlir_argdefs()

From b82bf94566ebd74763d7d26dab9dd13220239daa Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 30 Jun 2025 08:29:35 +0000
Subject: [PATCH 376/432] [Frontend] Don't use buffer's unique name to reuse
 kernels

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index fee5702a..f6fe6a76 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -871,6 +871,7 @@ def __init__(self, kernel_group, reason=None):
         self.tags = dict()
         self.dma_read_cache = dict()
         self.dma_write_cache = dict()
+        self.spadbuf_counter = 0
         self.dma_read_counter = 1
         self.dma_write_counter = 1
         self.affine_yield = {}
@@ -958,10 +959,11 @@ def load(self, name: str, index: sympy.Expr):
         index = self.convert_indirect_indexing(index)
         padding = self.get_padding_type()
         dram_var = self.kernel_group.args.input(name)
-
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+
         local_tile_desc, index_var = self.get_dma_info(name, index)
+
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
@@ -1305,7 +1307,7 @@ def index_expr(self, index, dtype):
             self.header.writeline(f"{c_type} {new_name}_spad[{compute_vec_size}] __attribute__ ((section(\".spad\")));")
             self.gem5_header.writeline(f"{c_type} {new_name}_spad[{compute_vec_size}] __attribute__((aligned(64)));")
             self.global_vars.writeline(f"memref.global @{new_name}_spad : {tile_shape}")
-            self.global_vars_dict[new_name] = []
+            self.global_vars_dict[new_name] = dict()
         sram_var = self.spad_cse.generate(self.spad_buffer, f"memref.get_global @{new_name}_spad : {tile_shape}")
         # Initialize base vector
         if not self.base_vector_initialized:
@@ -1673,17 +1675,18 @@ def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape
             buffer = self.spad_buffer
 
         if name not in self.global_vars_dict:
-            self.global_vars_dict[name] = list()
+            self.global_vars_dict[name] = dict()
 
         if str(raw_index) not in self.global_vars_dict[name]:
-            new_name = f"{name}_{len(self.global_vars_dict[name])}"
+            new_name = f"buf{self.spadbuf_counter}"
+            self.spadbuf_counter+=1
             # Add definition to header
             self.header.writeline(f"{c_type} {new_name}_spad[{tile_size // self.vector_lane}] __attribute__ ((section(\".spad\")));")
             self.gem5_header.writeline(f"{c_type} {new_name}_spad[{tile_size}] __attribute__((aligned(64)));")
             self.global_vars.writeline(f"memref.global @{new_name}_spad : {dram_tile_shape}")
-            self.global_vars_dict[name].append(str(raw_index))
+            self.global_vars_dict[name][str(raw_index)] = new_name
         else:
-            new_name = f"{name}_{self.global_vars_dict[name].index(str(raw_index))}"
+            new_name = self.global_vars_dict[name][str(raw_index)]
         sram_var = self.spad_cse.generate(buffer, f"memref.get_global @{new_name}_spad : {dram_tile_shape}")
 
         zero_cse = self.get_const_cse(0)

From c915f343473949f781e24d596da2d057f2ec7ab7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 30 Jun 2025 08:32:34 +0000
Subject: [PATCH 377/432] [Frontend] Add manual tile_stride for DimTile

---
 PyTorchSimFrontend/mlir/mlir_common.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 29ef65c9..92af0570 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -180,8 +180,10 @@ def set_info(outer, inner, arg_type):
 class MLIRMultiDimTile():
     def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None, vec_size=None):
         self._tile_size = list(tile_size)
+        self._tile_stride = None
         self.tile_axis_order = list(range(len(tile_size)))
         self.vec_size = vec_size
+        self.update_tile_stride()
 
         # Vector lane mapping config
         self.vector_lane = vector_lane
@@ -196,6 +198,11 @@ def set_tile_size(self, tile_size, tile_axis_order=None):
             self.tile_axis_order = list(range(len(tile_size)))
         else:
             self.tile_axis_order = tile_axis_order
+        self.update_tile_stride()
+
+    def set_tile_size_stride(self, tile_size, tile_stride):
+        self._tile_size = tile_size
+        self._tile_stride = tile_stride
 
     def get_tile_size(self):
         return self._tile_size
@@ -216,7 +223,7 @@ def get_numel_per_lane(self):
             size *= dim_size
         return size
 
-    def get_tile_stride(self):
+    def update_tile_stride(self):
         strides = [1] * len(self._tile_size)
         init = 1
 
@@ -228,7 +235,10 @@ def get_tile_stride(self):
         for _, size, original_indices in sorted_pairs:
             strides[original_indices] = init
             init *= size
-        return strides
+        self._tile_stride = strides
+
+    def get_tile_stride(self):
+        return self._tile_stride
 
     def get_tile_size_per_lane(self):
         tile_size_per_lane = list(self._tile_size)

From 491b91111e0ba987a965140b06724fd5422df1af Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 1 Jul 2025 06:33:24 +0000
Subject: [PATCH 378/432] [Frontend] Add utility method for kernel class

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index f6fe6a76..6f38b08a 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1477,6 +1477,12 @@ def _prepare_simulator_headers(self, src_code):
         write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
         write_atomic(gem5_write_path, self.gem5_header.getvalue())
 
+    def get_arg_info(self, name):
+        arg_info = dict()
+        arg_info.update(V.graph.graph_inputs)
+        arg_info.update({i.get_name(): i for i in V.graph.buffers})
+        return arg_info[name]
+
     def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffer=None): # Need more argument?
         """
         A tile descriptor exists that is configured on a kernel group

From d86dc3adddbe35969f3ac2faa1d8baedddc43043 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 3 Jul 2025 05:43:57 +0000
Subject: [PATCH 379/432] [Frontend/Template] Rework template codegen

There are a lot of changes. The fusion mechanism is refactored.
The major change is keeping the template and fusion nodes consistent.
To do this, I changed the loop order and added a Revert() function to
revert the squeezed size of point/reduction nodes.
---
 PyTorchSimFrontend/common_diff.py             | 1031 -----------------
 PyTorchSimFrontend/extension_config.py        |    6 +-
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  |  372 +++---
 .../mlir/mlir_codegen_backend.py              |  164 ++-
 PyTorchSimFrontend/mlir/mlir_common.py        |   10 +-
 .../mlir/mlir_conv_mt_template.py             |  346 ++++++
 .../mlir/mlir_conv_sb_template.py             |  342 ++++++
 .../mlir/mlir_conv_sbs_template.py            |  343 ++++++
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  669 ++---------
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  422 ++++---
 PyTorchSimFrontend/mlir/mlir_lowering.py      |   17 +-
 .../mlir/mlir_maxpool_template.py             |   25 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  179 +--
 PyTorchSimFrontend/mlir/mlir_template.py      |  602 +++++-----
 14 files changed, 2009 insertions(+), 2519 deletions(-)
 delete mode 100644 PyTorchSimFrontend/common_diff.py
 create mode 100644 PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
 create mode 100644 PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
 create mode 100644 PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py

diff --git a/PyTorchSimFrontend/common_diff.py b/PyTorchSimFrontend/common_diff.py
deleted file mode 100644
index 6c1c875c..00000000
--- a/PyTorchSimFrontend/common_diff.py
+++ /dev/null
@@ -1,1031 +0,0 @@
-import contextlib
-import dataclasses
-import functools
-import itertools
-import logging
-import operator
-import re
-from collections import namedtuple
-from itertools import chain
-from typing import Any, Callable, ClassVar, Dict, List, NamedTuple, Optional, Set, Union
-
-import sympy
-from sympy.printing.printer import Printer
-
-import torch
-import torch.fx
-from torch.utils._sympy.value_ranges import ValueRanges
-
-from .. import metrics
-from ..utils import (
-    DeferredLineBase,
-    free_symbol_startswith,
-    get_sympy_Expr_dtype,
-    IndentedBuffer,
-    sympy_dot,
-    sympy_subs,
-    unique,
-)
-from ..virtualized import ops, OpsValue, V
-
-schedule_log = torch._logging.getArtifactLogger(__name__, "schedule")
-
-
-def data_type_logger(msg):
-    if schedule_log.isEnabledFor(logging.DEBUG):
-        schedule_log.debug("Data type propagation: %s", msg)
-
-
-TensorArg = namedtuple("TensorArg", ["name", "buffer", "dtype"])
-SizeArg = namedtuple("SizeArg", ["name", "expr"])
-
-DeviceCodegen = namedtuple("DeviceCodegen", ["scheduling", "wrapper_codegen"])
-device_codegens: Dict[str, DeviceCodegen] = {}
-
-
-# The code generated by Inductor consists of two main parts: kernel code and wrapper code.
-# For any new backend looking to integrate with Inductor, customization of these two main
-# parts are necessary to generate its specific code.
-#
-# Kernel code generation is determined by different Scheduling. Consequently, a new
-# backend needs to provide a custom Scheduling for its unique kernel code generation. Currently,
-# CppScheduling and TritonScheduling serve the C++/OpenMP and Triton backends, respectively.
-#
-# For the Wrapper, Inductor provides a WrapperCodeGen class to generate the Python wrapper code
-# that bridges kernels. This allows out-of-tree backends to inherit from WrapperCodeGen,
-# and override specific member functions to create backend-specific Python wrapper code.
-#
-# Other classes, such as CppKernel and TritonKernel, used for code generation, typically form part
-# of the logic for either Scheduling or WrapperCodeGen. So the Scheduling and WrapperCodeGen interfaces
-# provide flexibility to the backend. A backend can choose to implement these classes from scratch,
-# or reuse them by extending and overriding as necessary. And Inductor provides the registration API,
-# register_backend_for_device, to equip a new backend at runtime.
-#
-# Intel has developed a new backend on top of Triton to support Intel GPUs, leveraging these interfaces.
-# This backend can be used as a reference:
-# https://github.com/intel/intel-extension-for-pytorch/blob/5dcc9d57e5422cf295e1a1ee97896d6b6a554a85/intel_extension_for_pytorch/_inductor/__init__.py#L9
-def register_backend_for_device(
-    device: str, device_scheduling: type, device_wrapper_codegen: type
-):
-    device_codegens[device] = DeviceCodegen(device_scheduling, device_wrapper_codegen)
-
-
-def get_scheduling_for_device(device: str):
-    return device_codegens[device].scheduling if device in device_codegens else None
-
-
-def get_wrapper_codegen_for_device(device: str):
-    return (
-        device_codegens[device].wrapper_codegen if device in device_codegens else None
-    )
-
-
-def index_prevent_reordering(index: List[sympy.Expr], index_vars, sizes):
-    from ..ir import FlexibleLayout
-
-    # added contiguous index prevents reordering
-    return [*index, sympy_dot(index_vars, FlexibleLayout.contiguous_strides(sizes))]
-
-
-@functools.lru_cache(None)
-def boolean_ops():
-    return (
-        "is_inf",
-        "is_nan",
-        "bitwise_xor",
-        "logical_not",
-        "signbit",
-        "le",
-        "lt",
-        "ge",
-        "gt",
-        "eq",
-        "ne",
-    )
-
-
-DTYPE_TO_COMPUTATION_DTYPE = {
-    torch.bfloat16: torch.float,
-    torch.float16: torch.float,
-    **{
-        dtype: dtype
-        for dtype in [
-            torch.bool,
-            torch.float32,
-            torch.float64,
-            torch.int8,
-            torch.int16,
-            torch.int32,
-            torch.int64,
-            torch.uint8,
-        ]
-    },
-}
-
-
-class DataTypePropagation:
-    def __init__(self, body) -> None:
-        self.body = body
-        self.graphs: Dict[Union[Callable[..., Any], str], Any] = {
-            "root": body.root_block.graph
-        }
-        for k, v in body.subblocks.items():
-            self.graphs[k] = v.graph
-
-    def deduce_node_dtype_by_inputs(self, node: torch.fx.Node):
-        inputs = node.all_input_nodes
-        input_nodes = [
-            n for n in inputs if isinstance(n, torch.fx.Node) and n.op != "placeholder"
-        ]
-        if len(input_nodes) == 0:
-            return None
-
-        all_input_nodes_propogated = all(
-            OptimizationContext.key in n.meta
-            and n.meta[OptimizationContext.key].dtype is not None
-            for n in input_nodes
-        )
-        if not all_input_nodes_propogated:
-            return None
-
-        return functools.reduce(
-            torch.promote_types,
-            [n.meta[OptimizationContext.key].dtype for n in input_nodes],
-        )
-
-    def deduce_node_dtype_by_subgraph(self, node: torch.fx.Node):
-        sub_graph = self.graphs[node.target]
-        dtype = self.propagate_graph(sub_graph)
-        assert dtype
-        return dtype
-
-    def deduce_node_dtype(self, node: torch.fx.Node):
-        if node.target in boolean_ops():
-            return torch.bool
-
-        if node.op == "placeholder":
-            return None
-
-        if node.target == "output":
-            # we can infer output node if it only have 1 arg
-            if len(node.args) != 1:
-                return None
-
-        if node.target in (
-            "to_dtype",
-            "index_expr",
-        ):
-            return node.args[-1]
-
-        if node.target in (
-            "rand",
-            "randn",
-        ):
-            return torch.float
-
-        if node.target in (
-            "get_index",
-            "index_expr",
-        ):
-            return torch.int64
-
-        if node.target in (
-            "load",
-            "store",
-            "store_reduction",
-        ):
-            buf_name = node.args[1]
-            return V.graph.get_dtype(buf_name)
-
-        if node.target == operator.getitem:
-            return self.deduce_node_dtype(node.args[0])
-
-        assert isinstance(node.target, str)
-
-        if node.target == "reduction":
-            return node.args[1]
-
-        if node.target == "constant":
-            return DTYPE_TO_COMPUTATION_DTYPE[node.args[-1]]
-
-        if node.target.startswith("masked_subblock"):
-            return self.deduce_node_dtype_by_subgraph(node)
-
-        return self.deduce_node_dtype_by_inputs(node)
-
-    def propagate_graph(self, graph: torch.fx.Graph):
-        assert graph.nodes
-        graph_dtype = None
-        # For masked_subblock, we use output's dtype to represent
-        # the dtype of this subgraph. For other cases, graph_dtype
-        # might be None
-        for node in graph.nodes:
-            if OptimizationContext.key in node.meta:
-                opt_ctx = node.meta[OptimizationContext.key]
-            else:
-                opt_ctx = OptimizationContext()
-
-            opt_ctx.dtype = self.deduce_node_dtype(node)
-            node.meta[OptimizationContext.key] = opt_ctx
-            if node.target == "output":
-                graph_dtype = opt_ctx.dtype
-        return graph_dtype
-
-    def propagate(self):
-        self.propagate_graph(self.graphs["root"])
-
-    @classmethod
-    def propagate_loopbody(cls, body):
-        return cls(body).propagate()
-
-    @classmethod
-    def propagate_scheduler_node(cls, node):
-        from ..ir import LoopBody
-        from ..scheduler import SchedulerNode
-
-        assert isinstance(node, SchedulerNode)
-        assert isinstance(node._body, LoopBody)
-        DataTypePropagation.propagate_loopbody(node._body)
-
-
-class ExprPrinter(Printer):
-    @staticmethod
-    def paren(string):
-        def all_in_parens(string):
-            if string[0] != "(" or len(string) < 2:
-                return False
-            count = 1
-            for i, char in enumerate(string[1:]):
-                if char == "(":
-                    count += 1
-                elif char == ")":
-                    count -= 1
-                if count == 0 and i != len(string) - 2:
-                    return False
-            assert count == 0
-            return True
-
-        if (
-            isinstance(string, CSEVariable)
-            or re.match(r"^[a-z0-9_.]+$", string, re.I)
-            or re.match(r"^\([^)]*\)$", string, re.I)
-            or string == ""
-        ):
-            return string
-        # don't put extra parens for strings that are already wrapped in parens
-        if all_in_parens(string):
-            return string
-        return f"({string})"
-
-    def _print_Pow(self, expr):
-        # Pow() confuses triton
-        base, exp = expr.args
-        # NB: Remember this is sizevar computation!  You don't typically
-        # expect to have to do floating point computation including exponents
-        # in sizevar compute.  Instead of adding support for floating
-        # point pow, you should make upstream retranslate the Sympy expression
-        # into Tensor expressions earlier and do that instead.
-        if exp == 0.5:
-            return self._helper_sqrt(base)  # type: ignore[attr-defined]
-        elif exp == -0.5:
-            return "1/" + self._helper_sqrt(base)  # type: ignore[attr-defined]
-        base = self._print(base)
-        assert exp == int(exp), exp
-        exp = int(exp)
-        if exp > 0:
-            return "*".join([self.paren(base)] * exp)
-        elif exp < 0:
-            return "1/" + self.paren("*".join([self.paren(base)] * abs(exp)))
-        else:  # exp == 0
-            return "1"
-
-    def _print_Unequality(self, expr):
-        return " != ".join(map(self.paren, map(self._print, expr.args)))
-
-    def _print_Mul(self, expr):
-        return "*".join(map(self.paren, map(self._print, expr.args)))
-
-    def _print_Add(self, expr):
-        return " + ".join(map(self.paren, map(self._print, expr.args)))
-
-    def _print_Mod(self, expr):
-        return " % ".join(map(self.paren, map(self._print, expr.args)))
-
-    def _print_CleanDiv(self, expr):
-        return self._print_FloorDiv(expr)  # type: ignore[attr-defined]
-
-    def _print_GreaterThan(self, expr):
-        # GreaterThan:          >=
-        # StrictlyGreaterThan:  >
-        # Go figure...
-        return " >= ".join(map(self.paren, map(self._print, expr.args)))
-
-
-class PythonPrinter(ExprPrinter):
-    def _print_ModularIndexing(self, expr):
-        x, div, mod = expr.args
-        x = self.paren(self.doprint(x))
-        div = self.paren(self.doprint(div))
-        mod = self.paren(self.doprint(mod))
-        if div != "1":
-            x = f"({x} // {div})"
-        return f"{x} % {mod}"
-
-    def _print_FloorDiv(self, expr):
-        x, div = expr.args
-        x = self.paren(self.doprint(x))
-        div = self.paren(self.doprint(div))
-        return f"({x} // {div})"
-
-    def _helper_sqrt(self, expr):
-        return f"math.sqrt({self._print(expr)})"
-
-    def _print_floor(self, expr):
-        assert len(expr.args) == 1
-        return f"math.floor({self._print(expr.args[0])})"
-
-    def _print_ceiling(self, expr):
-        assert len(expr.args) == 1
-        return f"math.ceil({self._print(expr.args[0])})"
-
-
-class OpOverrides:
-    def __init__(self, parent):
-        super().__init__()
-        self._parent = parent
-
-    def __getattr__(self, item):
-        return getattr(self._parent, item)
-
-    @staticmethod
-    def identity(value):
-        # used to trigger cse
-        return value
-
-    @staticmethod
-    def constant(value, dtype):
-        return repr(value)
-
-    @staticmethod
-    def reciprocal(x):
-        return ops.div("1", x)
-
-    @staticmethod
-    def square(x):
-        return ops.mul(x, x)
-
-    @staticmethod
-    def bitwise_not(x):
-        return f"~{ExprPrinter.paren(x)}"
-
-    @staticmethod
-    def logical_not(a):
-        return f"{ExprPrinter.paren(a)} == 0"
-
-    @staticmethod
-    def bitwise_and(x, y):
-        return f"{ExprPrinter.paren(x)} & {ExprPrinter.paren(y)}"
-
-    @staticmethod
-    def bitwise_or(x, y):
-        return f"{ExprPrinter.paren(x)} | {ExprPrinter.paren(y)}"
-
-    @staticmethod
-    def bitwise_xor(x, y):
-        return f"{ExprPrinter.paren(x)} ^ {ExprPrinter.paren(y)}"
-
-    @staticmethod
-    def bitwise_left_shift(x, y):
-        return f"{ExprPrinter.paren(x)} << {ExprPrinter.paren(y)}"
-
-    # TODO(fdrocha): this is currently not being used anywhere,
-    # pending on moving triton pin past 972b761
-    @staticmethod
-    def bitwise_right_shift(x, y):
-        return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}"
-
-    @staticmethod
-    def remainder(a, b):
-        r = ops.mod(a, b)
-        return ops.where(f"(({r} != 0) & (({r} < 0) != ({b} < 0)))", ops.add(r, b), r)
-
-    @staticmethod
-    def load_seed(name, offset):
-        return ops.load(name, sympy.Integer(offset))
-
-
-class DeferredLine(DeferredLineBase):
-    """A line that can be 'unwritten' by adding name to V.graph.removed_buffers"""
-
-    def __init__(self, name, line):
-        super().__init__(line)
-        self.name = name
-
-    def __call__(self):
-        if (
-            self.name not in V.graph.removed_buffers
-            and self.name not in V.graph.inplaced_to_remove
-        ):
-            return self.line
-        return None
-
-    def _new_line(self, line):
-        return DeferredLine(self.name, line)
-
-
-class BracesBuffer(IndentedBuffer):
-    def indent(self, offset=1):
-        @contextlib.contextmanager
-        def ctx():
-            for _ in range(offset):
-                self.writeline("{")
-                self._indent += 1
-            for _ in range(-offset):
-                self._indent -= 1
-                self.writeline("}")
-            yield
-            for _ in range(-offset):
-                self.writeline("{")
-                self._indent += 1
-            for _ in range(offset):
-                self._indent -= 1
-                self.writeline("}")
-
-        return ctx()
-
-
-class InplacedBuffer(NamedTuple):
-    inner_name: str
-    other_names: List[str]
-
-
-class KernelArgs:
-    @staticmethod
-    def _lookup(prefix, odict, name):
-        assert isinstance(name, (str, sympy.Symbol))
-        if name not in odict:
-            odict[name] = f"{prefix}{len(odict)}"
-        return odict[name]
-
-    def __init__(self, sizevars=None):
-        self.input_buffers = dict()
-        self.output_buffers = dict()
-        self.inplace_buffers = dict()
-        self.sizevars = sizevars or dict()
-
-    def __repr__(self):
-        return "KernelArgs({})".format(
-            ", ".join(
-                map(
-                    repr,
-                    [
-                        self.input_buffers,
-                        self.output_buffers,
-                        self.inplace_buffers,
-                        self.sizevars,
-                    ],
-                )
-            )
-        )
-
-    def _buffer_is_marked_removed(self, name):
-        return isinstance(name, str) and name.startswith("REMOVED")
-
-    def input(self, name):
-        if V.graph.scheduler:
-            name = V.graph.scheduler.mutation_real_name.get(name, name)
-        assert name not in V.graph.removed_buffers, name
-        if name in self.output_buffers:
-            return self.output_buffers[name]
-        if name in self.inplace_buffers:
-            return self.inplace_buffers[name].inner_name
-        if name.startswith("seed"):
-            return self._lookup("seed", self.input_buffers, name)
-        return self._lookup("in_ptr", self.input_buffers, name)
-
-    def output(self, name):
-        if V.graph.scheduler:
-            name = V.graph.scheduler.mutation_real_name.get(name, name)
-        assert name not in V.graph.removed_buffers, name
-        if name in self.inplace_buffers:
-            return self.inplace_buffers[name].inner_name
-        return self._lookup("out_ptr", self.output_buffers, name)
-
-    def make_inplace(self, input_name, output_name):
-        assert output_name not in self.inplace_buffers
-        if input_name in self.inplace_buffers:
-            buf = self.inplace_buffers[input_name]
-            buf.other_names.append(output_name)
-            self.inplace_buffers[output_name] = buf
-        else:
-            buf = InplacedBuffer(
-                f"in_out_ptr{len(unique(self.inplace_buffers.values()))}",
-                [input_name, output_name],
-            )
-            self.inplace_buffers[input_name] = buf
-            self.inplace_buffers[output_name] = buf
-
-    def seed_offset(self, name, value):
-        if value in self.sizevars:
-            return self.sizevars[value]
-        if name in self.sizevars.values():
-            name = (
-                f"{name}{sum(1 for v in self.sizevars.values() if v.startswith(name))}"
-            )
-        self.sizevars[value] = name
-        return name
-
-    def size(self, name):
-        if str(name) == "seed":
-            self.sizevars["seed"] = "seed"
-            return "seed"
-        return self._lookup("ks", self.sizevars, name)
-
-    def call_names(self):
-        return chain(
-            self.input_buffers.keys(), self.output_buffers.keys(), self.sizevars.keys()
-        )
-
-    def wrap_ptr_arg(self, buf, dtype):
-        return f"c_void_p({buf}.data_ptr())"
-
-    def wrap_size_arg(self, size):
-        return f"c_long({size})"
-
-    def cpp_argdefs(self):
-        from .cpp import DTYPE_TO_CPP, INDEX_TYPE
-
-        # TODO(jansel): replace this with data from scheduler
-        buffer_types = {x.get_name(): x.get_dtype() for x in V.graph.buffers}
-        for name, val in V.graph.graph_inputs.items():
-            if isinstance(val, sympy.Expr):
-                buffer_types[name] = get_sympy_Expr_dtype(val)
-            else:
-                buffer_types[name] = val.get_dtype()
-        buffer_types.update(
-            {name: val.dtype for name, val in V.graph.constants.items()}
-        )
-
-        call_args = []
-        arg_defs = []
-        arg_types = []
-        for inplaced in unique(self.inplace_buffers.values()):
-            if self._buffer_is_marked_removed(inplaced):
-                continue
-            outer = inplaced.other_names[-1]
-            inner = inplaced.inner_name
-            dtype = buffer_types[outer]
-            cpp_dtype = DTYPE_TO_CPP[dtype]
-            arg_defs.append(f"{cpp_dtype}* {inner}")
-            call_args.append(self.wrap_ptr_arg(outer, dtype))
-            arg_types.append(f"{cpp_dtype}*")
-        for outer, inner in self.input_buffers.items():
-            if outer in self.inplace_buffers:
-                continue
-            dtype = buffer_types[outer]
-            cpp_dtype = DTYPE_TO_CPP[dtype]
-            arg_defs.append(f"const {cpp_dtype}* {inner}")
-            call_args.append(self.wrap_ptr_arg(outer, dtype))
-            arg_types.append(f"const {cpp_dtype}*")
-        for outer, inner in self.output_buffers.items():
-            if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner):
-                continue
-            dtype = buffer_types[outer]
-            cpp_dtype = DTYPE_TO_CPP[dtype]
-            arg_defs.append(f"{cpp_dtype}* {inner}")
-            call_args.append(self.wrap_ptr_arg(outer, dtype))
-            arg_types.append(f"{cpp_dtype}*")
-        for outer, inner in self.sizevars.items():
-            arg_defs.append(f"const {INDEX_TYPE} {inner}")
-            call_args.append(self.wrap_size_arg(outer))
-            arg_types.append(f"const {INDEX_TYPE}")
-        return arg_defs, call_args, arg_types
-
-    def python_argdefs(self):
-        arg_defs = []
-        call_args = []
-        precompile_args: List[Union[TensorArg, SizeArg]] = []
-        for inplaced in unique(self.inplace_buffers.values()):
-            if self._buffer_is_marked_removed(inplaced):
-                continue
-            arg_defs.append(inplaced.inner_name)
-            call_args.append(inplaced.other_names[-1])
-            precompile_args.append(
-                TensorArg(
-                    inplaced.inner_name,
-                    inplaced.other_names[-1],
-                    V.graph.get_dtype(inplaced.other_names[-1]),
-                )
-            )
-        for outer, inner in chain(
-            self.input_buffers.items(), self.output_buffers.items()
-        ):
-            if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner):
-                continue
-            arg_defs.append(inner)
-            call_args.append(outer)
-            precompile_args.append(TensorArg(inner, outer, V.graph.get_dtype(outer)))
-        for outer, inner in self.sizevars.items():
-            arg_defs.append(inner)
-            call_args.append(outer)
-            precompile_args.append(SizeArg(inner, outer))
-
-        return arg_defs, call_args, precompile_args
-
-    def aliases(self):
-        for inplaced in unique(self.inplace_buffers.values()):
-            if self._buffer_is_marked_removed(inplaced):
-                continue
-            for other in inplaced.other_names:
-                if other in V.graph.inplaced_to_remove:
-                    continue
-                if other in self.input_buffers:
-                    yield self.input_buffers[other], inplaced.inner_name
-                if other in self.output_buffers:
-                    yield self.output_buffers[other], inplaced.inner_name
-
-    def is_removed(self, name):
-        def _is_removed(name, buffers):
-            return name not in buffers or self._buffer_is_marked_removed(buffers[name])
-
-        return _is_removed(name, self.output_buffers) and _is_removed(
-            name, self.inplace_buffers
-        )
-
-    # Includes inplace buffers, excludes removed buffers.  Essentially,
-    # after you do a call into this kernel, which buffers actually contain
-    # updated data?  Modeled off of python_argdefs.
-    def live_output_buffers(self):
-        live_outs = set()
-        for inplaced in unique(self.inplace_buffers.values()):
-            if self._buffer_is_marked_removed(inplaced):
-                continue
-            live_outs.add(inplaced.other_names[-1])
-        for outer, inner in self.output_buffers.items():
-            if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner):
-                continue
-            live_outs.add(outer)
-        return live_outs
-
-
-class CSEVariable:
-    """A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis.
-    To do so, the backends can simply overload `Kernel.create_cse_var`
-    The "CSEVariable.update_on_args" method gives you a hook for annotations
-    See example of TritonCSEVariable in triton.py
-    """
-
-    def __init__(self, name, bounds: ValueRanges):
-        assert isinstance(bounds, ValueRanges)
-        self.name = name
-        self.bounds = bounds
-
-    def __str__(self):
-        return self.name
-
-    def __hash__(self) -> int:
-        return hash(self.name)
-
-    def __eq__(self, other) -> bool:
-        return type(other) == type(self) and other.name == self.name
-
-    def update_on_args(self, name, args, kwargs):
-        pass
-
-
-class CppWrapperKernelArgs(KernelArgs):
-    def wrap_ptr_arg(self, buf, dtype):
-        from .cpp import DTYPE_TO_CPP
-
-        return f"({DTYPE_TO_CPP[dtype]}*)({buf}.data_ptr())"
-
-    def wrap_size_arg(self, size):
-        return f"{size}"
-
-
-class CSE:
-    """Common subexpression elimination"""
-
-    def __init__(
-        self,
-        prefix="",
-        suffix="",
-        name_prefix="tmp",
-        iter_buffers=None,
-        store_cache=None,
-        reduction_cache=None,
-        varname_map=None,
-    ):
-        self.prefix = prefix
-        self.suffix = suffix
-        self.cache = {}
-        self.name_prefix = name_prefix
-        self.store_cache = store_cache or {}
-        self.reduction_cache = reduction_cache or {}
-        self.iter_buffer_ids = iter_buffers or itertools.count()
-        self.invalidated_stores = set()
-        self.varname_map = varname_map or {}
-
-    def invalidate(self, keep_vars: Set[str]):
-        for name, tmp in list(self.store_cache.items()):
-            if tmp not in keep_vars:
-                del self.store_cache[name]
-                self.invalidated_stores.add(name)
-        self.cache = {k: v for k, v in self.cache.items() if v in keep_vars}
-
-    def clone(self):
-        # Note(fdrocha): reduction_cache is not being cloned, not sure if this is intentional
-        return CSE(
-            prefix=self.prefix,
-            suffix=self.suffix,
-            name_prefix=self.name_prefix,
-            iter_buffers=self.iter_buffer_ids,
-            store_cache=self.store_cache,
-            varname_map=self.varname_map,
-        )
-
-    def generate(
-        self,
-        buffer: IndentedBuffer,
-        expr: Union[str, CSEVariable, OpsValue],
-        *,
-        bounds: ValueRanges = ValueRanges.unknown(),
-        write=True,
-        assignment=True,
-    ) -> CSEVariable:
-        if isinstance(expr, OpsValue):
-            expr = expr.value
-
-        assert isinstance(expr, (str, CSEVariable)), type(expr)
-        assert write or assignment
-        if isinstance(expr, CSEVariable):
-            # If the expressions were always created with all the information, we could
-            # assert expr.bounds == bounds, but sometimes the expression is created
-            # with the loose ValueRanges.unknown(), so we need to tighten the bounds
-            expr.bounds = expr.bounds.tighten(bounds)
-            return expr
-        cache_key = expr
-        var = self.cache.get(cache_key, None)
-        if not var:
-            var = self.newvar(bounds) if assignment else None
-            self.cache[cache_key] = var
-            if write:
-                if V.kernel.current_node:
-                    V.kernel.current_node.codegen_originating_info(
-                        buffer, only_once=True
-                    )
-                if assignment:
-                    line = f"{self.prefix}{var} = {expr}{self.suffix}"
-                else:
-                    line = f"{expr}{self.suffix}"
-                buffer.writeline(line)
-        else:
-            var.bounds = var.bounds.tighten(bounds)
-
-        return var
-
-    def newvar(self, bounds: ValueRanges = ValueRanges.unknown()) -> CSEVariable:
-        var_name = f"{self.name_prefix}{next(self.iter_buffer_ids)}"
-        var = V.kernel.create_cse_var(var_name, bounds)
-        self.varname_map[var_name] = var
-        return var
-
-
-class CodeGen:
-    def __init__(self):
-        super().__init__()
-        self.exit_stack = contextlib.ExitStack()
-
-    def __enter__(self):
-        self.exit_stack.__enter__()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.exit_stack.__exit__(exc_type, exc_val, exc_tb)
-
-
-class Kernel(CodeGen):
-    newvar_prefix = ""
-    suffix = ""
-    overrides = None
-    load_format = None
-    store_format = None
-
-    def __init__(self, args=None):
-        super().__init__()
-        metrics.generated_kernel_count += 1
-        self.args = args or KernelArgs()
-        self.loads = IndentedBuffer()
-        self.compute = IndentedBuffer()
-        self.stores = IndentedBuffer()
-        self.cse = CSE(self.newvar_prefix, self.suffix)
-        self.must_keep_buffers = set()
-        self.store_buffer_names = set()
-        # set in set_current_node
-        self.current_node = None
-        self.node_to_bounds: Optional[Dict[torch.fx.Node, ValueRanges]] = None
-
-    @contextlib.contextmanager
-    def set_current_node(self, node):
-        prior = self.current_node
-        self.current_node = node
-        self.node_to_bounds = node._body.bounds().get_bounds()
-        try:
-            yield
-        finally:
-            self.current_node = prior
-
-    @contextlib.contextmanager
-    def swap_buffers(self, lb, cb=None, sb=None):
-        if cb is None:
-            cb = lb
-        loads = self.loads
-        compute = self.compute
-        stores = self.stores
-        cse = self.cse
-        self.loads = lb
-        self.compute = cb
-        self.stores = sb
-        self.cse = cse.clone()
-        try:
-            yield
-        finally:
-            self.loads = loads
-            self.compute = compute
-            self.stores = stores
-            self.cse = cse
-
-    def load(self, name: str, index: sympy.Expr):
-        raise NotImplementedError()
-
-    def indirect_load(self, name: str, index: sympy.Expr):
-        """A load the depends on an index we have read"""
-        prior = self.loads
-        try:
-            # put the load in the compute section as it might have deps
-            self.loads = self.compute
-            return self.load(name, index)
-        finally:
-            self.loads = prior
-
-    def store_reduction(self, name, index, value):
-        raise NotImplementedError()
-
-    def store(self, name, index, value, mode=None):
-        raise NotImplementedError()
-
-    def reduction(self, dtype, src_dtype, reduction_type, value):
-        raise NotImplementedError()
-
-    def bucketize(
-        self,
-        values,
-        offsets_name: str,
-        offsets_size: sympy.Expr,
-        indexing_dtype: torch.dtype,
-        right: bool,
-    ):
-        """
-        See [Note: Inductor bucketize op]
-        """
-        raise NotImplementedError()
-
-    def __enter__(self):
-        class CSEProxy:
-            self.name = "CSEProxy"
-
-            @staticmethod
-            def __getattr__(name: str) -> Callable[..., CSEVariable]:  # type: ignore[misc]
-                def inner(*args, **kwargs):
-                    # TritonTemplateKernel has no current_node
-                    buf_bounds = ValueRanges.unknown()
-                    if hasattr(V.interpreter, "current_node"):
-                        fx_node = V.interpreter.current_node
-                        assert isinstance(self.node_to_bounds, dict)
-                        buf_bounds = self.node_to_bounds.get(
-                            fx_node, ValueRanges.unknown()
-                        )
-
-                    csevar = self.cse.generate(
-                        self.compute,
-                        getattr(parent_handler, name)(*args, **kwargs),  # type: ignore[has-type]
-                        bounds=buf_bounds,
-                    )
-                    csevar.update_on_args(name, args, kwargs)
-                    return csevar
-
-                return inner
-
-            @staticmethod
-            def indirect_indexing(index_var, size, check=True):
-                # Skip CSE since this doesn't return an expression
-                return self.indirect_indexing(index_var, size, check)  # type: ignore[attr-defined]
-
-            @staticmethod
-            def load(name: str, index: sympy.Expr):
-                if name in self.cse.invalidated_stores:
-                    # A load from an invalidated store requires us to
-                    # keep the actual buffer around
-                    V.kernel.must_keep_buffers.add(name)
-                if free_symbol_startswith(index, "tmp"):
-                    return self.indirect_load(name, index)
-                store_cache = self.cse.store_cache
-                if name in store_cache:
-                    return store_cache[name]
-                return self.load(name, index)
-
-            @staticmethod
-            def store(name, index, value, mode=None):
-                self.store_buffer_names.add(name)
-                if mode is None:
-                    self.cse.store_cache[name] = value
-                    if self.current_node:
-                        for other_name in self.current_node.get_mutations():
-                            self.cse.store_cache[other_name] = value
-                if name not in V.graph.removed_buffers:
-                    return self.store(name, index, value, mode=mode)
-
-            @staticmethod
-            def store_reduction(name, index, value):
-                self.store_buffer_names.add(name)
-                self.cse.store_cache[name] = value
-                if self.current_node:
-                    for other_name in self.current_node.get_mutations():
-                        self.cse.store_cache[other_name] = value
-
-                if name not in V.graph.removed_buffers:
-                    return self.store_reduction(name, index, value)
-
-            @staticmethod
-            def reduction(dtype, src_dtype, reduction_type, value):
-                return self.reduction(dtype, src_dtype, reduction_type, value)
-
-            @staticmethod
-            def bucketize(
-                values,
-                offsets_name: str,
-                offsets_size: sympy.Expr,
-                indexing_dtype: torch.dtype,
-                right: bool,
-            ):
-                """
-                [Note: Inductor bucketize op]
-
-                Given values (tensor) and offsets_name (reference to the name of a 1D
-                tensor), calculate the bucket that each value belongs to.
-
-                e.g. for values [-1, 0, 1, 2, 3, 4, 5, 9], offsets [0, 4, 4, 8], right=True
-                return =        [ 0, 1, 1, 1, 1, 3, 3, 4].
-
-                When right == False, bucket i refers to range (offsets[i], offsets[i+1]].
-                When right == True,  bucket i refers to range [offsets[i], offsets[i+1]).
-
-                Offsets must be non-decreasing or the result is undefined.
-                """
-                return self.bucketize(
-                    values, offsets_name, offsets_size, indexing_dtype, right
-                )
-
-        super().__enter__()
-        assert self.overrides
-        parent_handler = self.overrides(V.get_ops_handler())
-        self.exit_stack.enter_context(V.set_ops_handler(CSEProxy()))
-        self.exit_stack.enter_context(V.set_kernel_handler(self))
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if V.graph.scheduler:
-            V.graph.scheduler.remove_kernel_local_buffers()
-        super().__exit__(exc_type, exc_val, exc_tb)
-
-    def rename_indexing(self, index) -> sympy.Expr:
-        # adds the necessary kernel args for index expressions
-        # and renames variables in index expressions to kernel arg names
-        if isinstance(index, (list, tuple)):
-            return [self.rename_indexing(x) for x in index]
-        index = V.graph.sizevars.simplify(index)
-        sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name)
-        replacements = {
-            x: self.args.size(x)
-            for x in sorted_symbols
-            if x.name.startswith("s") or x.name.startswith("ps")
-        }
-        return sympy_subs(index, replacements)
-
-    def create_cse_var(self, *args, **kwargs):
-        return CSEVariable(*args, **kwargs)
-
-
-@dataclasses.dataclass
-class OptimizationContext:
-    key: ClassVar[str] = "opt_ctx"
-
-    # Load value as mask
-    is_load_as_mask: bool = False
-
-    dtype: torch.dtype = None
-    ops_name: str = ""
-    is_most_inner_loop_irrevelant: bool = False
-
-    # Load uint8 value as float32
-    is_load_uint8_as_float: bool = False
\ No newline at end of file
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index b8c2b3b4..1761e05c 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -37,7 +37,7 @@
 # Backendsim config
 CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG',
                                         default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json')
-CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False))
+CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", True))
 CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False))
 CONFIG_BACKENDSIM_DRYRUN = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
 CONFIG_BACKENDSIM_DEBUG_LEVEL = os.environ.get("BACKENDSIM_DEBUG_LEVEL", "")
@@ -71,8 +71,8 @@
 CONFIG_SUBTILE_K = int(os.environ.get('TORCHSIM_SUBTILE_K', default=CONFIG_VECTOR_LANE))
 
 # Advanced fusion options
-CONFIG_FUSION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION', default=False))
-CONFIG_FUSION_PROLOGUE = int(os.environ.get('TORCHSIM_FUSION_PROLOGUE', default=False))
+CONFIG_FUSION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION', default=True))
+CONFIG_FUSION_PROLOGUE = int(os.environ.get('TORCHSIM_FUSION_PROLOGUE', default=True))
 
 # SRAM Buffer allocation plan
 def load_plan_from_module(module_path):
diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 91ba9ba1..b81b3862 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -1,14 +1,14 @@
 import os
 from torch import empty_strided
-from typing import List, Optional, cast
+from typing import List, Optional
+import sympy
 
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
-from torch._inductor.ir import Buffer
 from torch._inductor.ir import IRNode
-from torch._inductor.ir import ReinterpretView
 from torch._inductor.codecache import write_atomic
 import PyTorchSimFrontend.extension_codecache as extension_codecache
+from PyTorchSimFrontend.mlir import mlir_common
 
 BMM_TEMPLATE = r"""
 // BMM kernel
@@ -21,63 +21,38 @@
 // TILE_K = {{ TILE_K }}
 // SUB_TILE_M = {{ SUB_TILE_M }}
 // SUB_TILE_N = {{ SUB_TILE_N }}
-#map0 = affine_map<(d0, d1, d2) -> ({{ X_map }})>
-#map1 = affine_map<(d0, d1, d2) -> ({{ W_map }})>
-#map2 = affine_map<(d0, d1, d2) -> (d0 * {{ M * N }} + d1 * {{ N }} + d2)>
-memref.global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 {{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index{% if Bias %}
-  %c_mvin3 = arith.constant 14 : index{% endif %}
-  %c_mvout = arith.constant 3 : index
-  %vstride = arith.constant 1 : index
-  %axis = arith.constant 2 : index
-  %X_buffer = memref.get_global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %W_buffer = memref.get_global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %tag0 = memref.alloc() : memref<1xi32>
-  %tag1 = memref.alloc() : memref<1xi32>
-  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  {% if not Bias %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+  {% endif %}
   %c0 = arith.constant 0 : index
-{{ kernel.def_local_vars() }}
-  affine.for %b=0 to {{ B }} {
-    affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
-      affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
-        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-
-        %index2 = affine.apply #map2(%b, %t_m, %t_n)
+  {{ kernel.def_local_vars(indent_size=2) }}
+  affine.for %index0 = 0 to {{ B }} {
+    affine.for %index1 = 0 to {{ M }} step {{ TILE_M }} {
+      affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} {
+        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
+        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
+        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
         {% if Bias -%}
-        memref.dma_start %Bias[
-        {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-          ], %Y_buffer2D[0, 0], %c_mvin3, %tag0[%c0], %
-        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
-          , %vstride : memref<
-        {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1 , {{ TILE_M }}] }
+        {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_N], indent_size=8) }}
         {%- else -%}
-        affine.vector_store %v0, %Y_buffer2D[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
-        affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
-          %index0 = affine.apply #map0(%b, %t_m, %t_k)
-          %index1 = affine.apply #map1(%b, %t_k, %t_n)
-          memref.dma_start %X[%index0], %X_buffer2D[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
-             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
-          memref.dma_start %W[%index1], %W_buffer2D[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
-             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+        affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+        {% endif %}
 
+        affine.for %index3 = 0 to {{ K }} step {{ TILE_K }} {
+          {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_K], indent_size=10) }}
+          {{ kernel.def_dma_op("MVIN", "W", W_idx, W_tile_desc, subtile_size=[1, SUB_TILE_K, SUB_TILE_N], indent_size=10) }}
           linalg.matmul ins(%X_buffer2D, %W_buffer2D : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer2D : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-        } { accumulation_loop=true }
+        } { accumulation_loop=true, subtile_loop="k" }
         {{kernel.store_output(indent_size=8)}}
-      } { outer_loop=true }
-    } { outer_loop=true }
+      } { outer_loop=true, subtile_loop="n" }
+    } { outer_loop=true, subtile_loop="m" }
   } { outer_loop=true }
   return
 }
@@ -94,59 +69,36 @@
 // TILE_K = {{ TILE_K }}
 // SUB_TILE_M = {{ SUB_TILE_M }}
 // SUB_TILE_N = {{ SUB_TILE_N }}
-#map0 = affine_map<(d0, d1, d2) -> ({{ X_map }})>
-#map1 = affine_map<(d0, d1, d2) -> ({{ W_map }})>
-#map2 = affine_map<(d0, d1, d2) -> (d0 * {{ M * N }} + d1 * {{ N }} + d2)>
-memref.global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 {{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index{% if Bias %}
-  %c_mvin3 = arith.constant 14 : index{% endif %}
-  %c_mvout = arith.constant 3 : index
-  %vstride = arith.constant 1 : index
-  %axis = arith.constant 2 : index
-  %X_buffer = memref.get_global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %W_buffer = memref.get_global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %tag0 = memref.alloc() : memref<1xi32>
-  %tag1 = memref.alloc() : memref<1xi32>
-  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  {% if not Bias %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+  {% endif %}
   %c0 = arith.constant 0 : index
-{{ kernel.def_local_vars() }}
-  affine.for %b=0 to {{ B }} {
-    affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
-      affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
+  {{ kernel.def_local_vars(indent_size=2) }}
+  affine.for %index0 = 0 to {{ B }} {
+    affine.for %index1 = 0 to {{ M }} step {{ TILE_M }} {
+      affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} {
         %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
         %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
         %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-
-        %index2 = affine.apply #map2(%b, %t_m, %t_n)
         {% if Bias -%}
-        memref.dma_start %Bias[
-        {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-          ], %Y_buffer2D[0, 0], %c_mvin3, %tag0[%c0], %
-        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
-          , %vstride : memref<
-        {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1 , {{ TILE_M }}] }
+        {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_N], indent_size=8) }}
         {%- else -%}
-        affine.vector_store %v0, %Y_buffer2D[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
-        affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
-          %index0 = affine.apply #map0(%b, %t_m, %t_k)
-          %index1 = affine.apply #map1(%b, %t_k, %t_n)
-          {{kernel.prepare_input(indent_size=10)}}
+        affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+        {% endif %}
+        affine.for %index3 = 0 to {{ K }} step {{ TILE_K }} {
+          {{kernel.load_input(indent_size=10)}}
           linalg.matmul ins(%X_buffer2D, %W_buffer2D : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer2D : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-        } { accumulation_loop=true }
-        memref.dma_start %Y_buffer[%c0, %c0, %c0], %Y[%index2], %c_mvout, %tag[%c0], %axis, %vstride : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ B * M * N }}xf32>, memref<1xi32> { padding=0, sram_stride=[1, 1, {{ TILE_M }}] }
-      } { outer_loop=true }
-    } { outer_loop=true }
+        } { accumulation_loop=true, subtile_loop="k" }
+        {{kernel.store_output(indent_size=8)}}
+      } { outer_loop=true, subtile_loop="n" }
+    } { outer_loop=true, subtile_loop="m" }
   } { outer_loop=true }
   return
 }
@@ -163,65 +115,39 @@
 // TILE_K = {{ TILE_K }}
 // SUB_TILE_M = {{ SUB_TILE_M }}
 // SUB_TILE_N = {{ SUB_TILE_N }}
-#map0 = affine_map<(d0, d1, d2) -> ({{ X_map }})>
-#map1 = affine_map<(d0, d1, d2) -> ({{ W_map }})>
-#map2 = affine_map<(d0, d1, d2) -> (d0 * {{ M * N }} + d1 * {{ N }} + d2)>
-memref.global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 {{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index{% if Bias %}
-  %c_mvin3 = arith.constant 14 : index{% endif %}
-  %c_mvout = arith.constant 3 : index
-  %vstride = arith.constant 1 : index
-  %axis = arith.constant 2 : index
-  %X_buffer = memref.get_global @X_spad : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %W_buffer = memref.get_global @W_spad : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %tag0 = memref.alloc() : memref<1xi32>
-  %tag1 = memref.alloc() : memref<1xi32>
-  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  {% if not Bias %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+  {% endif %}
   %c0 = arith.constant 0 : index
-{{ kernel.def_local_vars() }}
-  affine.for %b=0 to {{ B }} {
-    affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
-      %red_idx = affine.apply affine_map<(d0, d1) -> ({{M}}*d0 + d1)>(%b, %t_n)
-      {{kernel.reduction_acc()}} affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {{kernel.reduction_iter_arg()}} {
+  {{ kernel.def_local_vars() }}
+  affine.for %index0=0 to {{ B }} {
+    affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} {
+      affine.for %index1 = 0 to {{ M }} step {{ TILE_M }} {
         %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
         %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_N }}x{{ TILE_M }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 
-        %index2 = affine.apply #map2(%b, %t_m, %t_n)
         {% if Bias -%}
-        memref.dma_start %Bias[
-        {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-          ], %Y_buffer2D[0, 0], %c_mvin3, %tag0[%c0], %
-        {%- if Bias_rank == 2 -%} axis {%- else -%} c0 {%- endif -%}
-          , %vstride : memref<
-        {%- if Bias_rank == 2 -%} {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
-          xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1 , {{ TILE_M }}] }
+        {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_N], indent_size=8) }} // Why not N,M? Currently, dma-fine-grained pass assume M->N order...
         {%- else -%}
-        affine.vector_store %v0, %Y_buffer2D[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
-        affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
-          %index0 = affine.apply #map0(%b, %t_m, %t_k)
-          %index1 = affine.apply #map1(%b, %t_k, %t_n)
-          memref.dma_start %X[%index0], %X_buffer2D[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
-             : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
-          memref.dma_start %W[%index1], %W_buffer2D[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
-             : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
-
+        affine.vector_store %v0, %Y_buffer[0, 0, 0] : memref<1x{{ TILE_N }}x{{ TILE_M }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+        {% endif %}
+        affine.for %index3 = 0 to {{ K }} step {{ TILE_K }} {
+          {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_K], indent_size=10) }}
+          {{ kernel.def_dma_op("MVIN", "W", W_idx, W_tile_desc, subtile_size=[1, SUB_TILE_K, SUB_TILE_N], indent_size=10) }}
           linalg.matmul ins(%X_buffer2D, %W_buffer2D : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer2D : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-        } { accumulation_loop=true, loop_k=true }
+        } { accumulation_loop=true, subtile_loop="k" }
         {{kernel.store_output(indent_size=8)}}
-      } { outer_loop=true, loop_m=true }
+      } { outer_loop=true, subtile_loop="m" }
       {{kernel.reduction_output(indent_size=6)}}
-    } { outer_loop=true, loop_n=true}
+    } { outer_loop=true, subtile_loop="n" }
   } { outer_loop=true }
   return
 }
@@ -239,9 +165,8 @@ def render(self,
                **kwargs):
         if template_buffer_node is not None:
             self.output_node = template_buffer_node
-        #if epilogue_nodes is not None and len(epilogue_nodes) > 0:
-        #    self.output_node = cast(Buffer, epilogue_nodes[-1])
 
+        # Extract input arguments info
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
@@ -252,113 +177,150 @@ def render(self,
           W_tensor = W_tensor.view([-1, W_tensor.shape[-2], W_tensor.shape[-1]])
         if len(X_tensor.size()) > 3:
           X_tensor = X_tensor.view([-1, X_tensor.shape[-2], X_tensor.shape[-1]])
+        B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2]
+
         W_stride = W_tensor.stride()
         X_stride = X_tensor.stride()
-        W_map = " + ".join([f"d{idx}*{s}" for idx, s in enumerate(W_stride)])
-        X_map = " + ".join([f"d{idx}*{s}" for idx, s in enumerate(X_stride)])
 
-        B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2]
+        # Select tile size
         n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
         TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node)
-        TOG_latency = M if TILE_M > M else TILE_M
-        kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
-        TILE_K = TILE_K // 2 if prologue_nodes else TILE_K
         SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
         SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
         SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
 
+        TOG_latency = M if TILE_M > M else TILE_M
+        kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
+        TILE_K = TILE_K // 2 if prologue_nodes else TILE_K
+
+        # Select template code
         nr_reduction_nodes = [node for node in epilogue_nodes if node.is_reduction()] if epilogue_nodes is not None else []
         if nr_reduction_nodes:
           template = BMM_REDUCTION_TEMPLATE
+          epilogue_dim_aliasing = {"index0":"index0", "index1":"index2", "index2": "index1"}
           nr_rdim = 1
         elif prologue_nodes:
           template = BMM_PROLOGUE_TEMPLATE
+          epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"}
           nr_rdim = 0
         else:
           template = BMM_TEMPLATE
+          epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"}
           nr_rdim = 0
 
+        # Prepare tile descriptors
+        vlane_stride = 1
+        vlane_split_axis = 2
+        loop_dim = [sympy.Symbol("index0"), sympy.Symbol("index1"), sympy.Symbol("index2"), sympy.Symbol("index3")]
+        X_tile_size = [1, TILE_M, TILE_K]
+        X_tile_stride = [0, 1, TILE_M]
+        X_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        X_tile_desc.set_tile_size_stride(X_tile_size, X_tile_stride)
+        X_tile_desc.set_name("X_buffer")
+        X_stride = X_tensor.stride()
+        X_idx = [loop_dim[0]*X_stride[0], loop_dim[1]*X_stride[1], loop_dim[3]*X_stride[2]] # To keep index argument order, we used index_list
+
+        W_tile_size = [1, TILE_K, TILE_N]
+        W_tile_stride = [0, 1, TILE_K]
+        W_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        W_tile_desc.set_tile_size_stride(W_tile_size, W_tile_stride)
+        W_tile_desc.set_name("W_buffer")
+        W_stride = W_tensor.stride()
+        W_idx = [loop_dim[0]*W_stride[0], loop_dim[3]*W_stride[1], loop_dim[2]*W_stride[2]]
+
+        vlane_split_axis = vlane_split_axis if nr_rdim==0 else 1
+        Y_tile_size = [1, TILE_M, TILE_N] if nr_rdim == 0 else [1, TILE_N, TILE_M]
+        Y_tile_stride=[0, 1, TILE_M] if nr_rdim == 0 else [0, TILE_M, 1]
+        Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
+        Y_tile_desc.set_name("Y_buffer")
+        Y_stride = Y.get_layout().stride
+        if nr_rdim == 0:
+          Y_idx = [loop_dim[0]*Y_stride[0], loop_dim[1]*Y_stride[1], loop_dim[2]*Y_stride[2]]
+        else:
+          Y_idx = [loop_dim[0]*Y_stride[0], loop_dim[2]*Y_stride[2], loop_dim[1]*Y_stride[1]]
+
+        # Extract Bias info
+        if Bias is not None:
+          Bias_stride = Bias.get_layout().stride
+          if nr_rdim == 0:
+            Bias_idx = [loop_dim[0]*Bias_stride[0], loop_dim[1]*Bias_stride[1], loop_dim[2]*Bias_stride[2]]
+          else:
+            Bias_idx = [loop_dim[0]*Bias_stride[0], loop_dim[2]*Bias_stride[2], loop_dim[1]*Bias_stride[1]]
+        else:
+          Bias_idx = None
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
-            B=B,
-            M=M,
-            N=N,
-            K=K,
-            TILE_M=TILE_M,
-            TILE_N=TILE_N,
-            TILE_K=TILE_K,
+            B=B, M=M, N=N, K=K,
+            TILE_M=TILE_M, TILE_N=TILE_N, TILE_K=TILE_K,
             SUB_TILE_M=SUB_TILE_M,
             SUB_TILE_N=SUB_TILE_N,
             SUB_TILE_K=SUB_TILE_K,
             DATA_STYPE="f32",
-            DATA_SIZE=4,
-            X = X,
-            W = W,
-            Y = Y,
-            Bias = Bias,
-            Bias_rank = len(Bias.data.get_size()) if Bias is not None else 0,
-            X_map = X_map,
-            W_map = W_map,
-            Y_numel = B * M * N,
+            X = X, W = W,Y = Y, Bias = Bias,
+            X_idx = X_idx,
+            W_idx = W_idx,
+            Bias_idx = Bias_idx,
+            X_tile_desc = X_tile_desc,
+            W_tile_desc = W_tile_desc,
+            Y_tile_desc = Y_tile_desc,
             input_reorder = self.input_reorder
         )
 
         if prologue_nodes:
-          # if Input fused:
-          #   tile_size = (TILE_M, TILE_K)
-          #   input_sram_stride = [1, TILE_M]
-          # elif Weight fused:
-          tile_size = (TILE_K, TILE_N)
-          input_sram_stride = [1, TILE_K]
+          prologue_output_name = list(prologue_nodes[0].read_writes.writes)[0].name
+          if prologue_output_name == X.get_name():
+            # Input fusion case
+            prologue_var = "X"
+            prologue_sram_var = "X_buffer"
+            prologue_tile_desc = X_tile_desc
+            prologue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2":"index3"}
+            is_input_fused = True
+          else:
+            # Weight fusion case
+            prologue_var = "W"
+            prologue_sram_var = "W_buffer"
+            prologue_tile_desc = W_tile_desc
+            prologue_dim_aliasing = {"index0":"index0", "index1":"index3", "index2":"index2"}
+            is_input_fused = False
+ 
           kernel.prologue_info = dict (
-              input_sram_var = "X_buffer2D",
               input_dram_var = "X",
-              input_index_var = "index0",
-              input_tag_var = "tag1",
-              input_numel = B * M * K,
-              input_tile_size = (TILE_M, TILE_K),
-              input_sram_stride = input_sram_stride,
-              input_subtile_size = (SUB_TILE_M, SUB_TILE_K),
-              weight_sram_var = "W_buffer2D",
+              input_sram_var = "X_buffer",
+              input_tile_desc = X_tile_desc,
+              input_idx = X_idx,
+              input_subtile_size = [1, TILE_M, TILE_K], # TODO. Currently, subtiling is not supported for the prologue template
+              input_dim_aliasing = {"index0":"index0", "index1":"index1", "index2":"index3"},
+
               weight_dram_var = "W",
-              weight_index_var = "index1",
-              weight_tag_var = "tag2",
-              weight_numel = B * K * N,
-              weight_tile_size = (TILE_K, TILE_N),
-              weight_sram_stride = [1, TILE_K],
-              weight_subtile_size = (SUB_TILE_K, SUB_TILE_N),
-              tile_size = tile_size,
-              vlane_split_axis = 1,
-              vlane_stride = 1,
+              weight_sram_var = "W_buffer",
+              weight_tile_desc = W_tile_desc,
+              weight_idx = W_idx,
+              weight_subtile_size = [1, TILE_K, TILE_N], # TODO. Currently, subtiling is not supported for the prologue template
+              weight_dim_aliasing = {"index0":"index0", "index1":"index3", "index2":"index2"},
+
+              # Descriptor for fusion
+              dram_var = prologue_var,
+              sram_var = prologue_sram_var,
+              dram_tile_desc = prologue_tile_desc,
+              dim_aliasing = prologue_dim_aliasing,
               is_bmm = True,
+              is_input_fused = is_input_fused
           )
+
         kernel.epilogue_info = dict(
             output_node = self.output_node.name,
-            dependent_buf = [],
             sram_var = "Y_buffer",
             dram_var = "Y",
-            index_var = "index2",
-            tag_var = "tag",
-            vlane_split_axis = 2,
-            vlane_stride = 1,
-            mlir_dtype = kernel.render_options['DATA_STYPE'],
-            dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
-            tile_size = (1, TILE_M, TILE_N),
-            tile_stride = [1, 1, TILE_M],
+            dram_idx = Y_idx,
+            dram_tile_desc = Y_tile_desc,
             nr_rdim = nr_rdim,
-            reduction_idx = "red_idx"
+            dim_aliasing = epilogue_dim_aliasing
         )
         code = self._template_from_string(template).render(**kernel.render_options)
         kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
-
-        self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
-
         return code
 
     def codegen_header(self, code, extra_headers):
@@ -368,6 +330,6 @@ def codegen_header(self, code, extra_headers):
         spike_write_path = os.path.join(write_path, "global_var.h")
         gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
         if not os.path.exists(spike_write_path):
-            write_atomic(spike_write_path, self.header+extra_headers[0])
+            write_atomic(spike_write_path, extra_headers[0])
         if not os.path.exists(gem5_write_path):
-            write_atomic(gem5_write_path, self.gem5_header+extra_headers[1])
\ No newline at end of file
+            write_atomic(gem5_write_path, extra_headers[1])
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 6f38b08a..6dbe9047 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -877,7 +877,6 @@ def __init__(self, kernel_group, reason=None):
         self.affine_yield = {}
         self.welford_reduce_out = None
         self.reduce_iterator = {}
-        self.is_template_kernel = False
         self.spad_buffer_dict = dict()
         self.base_vector_initialized = False
 
@@ -919,7 +918,7 @@ def convert_index(self, expr, buffer):
         index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})")
         return index
 
-    def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
+    def parse_indices(self, expr, buffer=None, comments="") -> common.CSEVariable:
         if buffer is None:
             buffer = self.applys
 
@@ -951,6 +950,40 @@ def parse_indices(self, expr, buffer=None) -> common.CSEVariable:
         args = ", ".join(map(str, indices))
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[] -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
+        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[] {comments}")
+        return index
+
+    def parse_index_list(self, expr_list:list, buffer=None) -> common.CSEVariable:
+        if buffer is None:
+            buffer = self.applys
+        expr_list = [arg for arg in expr_list if arg != sympy.Number(0)]
+
+        if len(expr_list) == 1 and expr_list[0].is_number:
+            # Constant case
+            return self.get_const_cse(int(expr_list[0]))
+        elif len(expr_list) == 1 and expr_list[0].is_symbol:
+            # Identity case
+            return expr_list[0]
+
+        indices = []
+        new_expr_list = [0] * len(expr_list)
+        for idx, arg in enumerate(expr_list):
+            if arg.is_Mul and arg.args[0].is_number:
+                new_arg = sympy.Symbol(str(self.convert_index(arg.args[1], buffer)))
+                new_expr_list[idx] = arg.subs(arg.args[1], new_arg)
+                indices.append(str(new_arg))
+            elif not arg.is_number:
+                new_arg = sympy.Symbol(str(self.convert_index(arg, buffer)))
+                new_expr_list[idx] = new_arg
+                indices.append(str(new_arg))
+            else:
+                new_expr_list[idx] = arg
+
+        # Extract index var
+        expr_str = str(sum(new_expr_list))
+        args = ", ".join(map(str, indices))
+        map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[] -> ({expr_str})>")
+        args = ", ".join([f"%{i}" for i in indices])
         index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[]")
         return index
 
@@ -958,16 +991,18 @@ def load(self, name: str, index: sympy.Expr):
         index = self.rename_indexing(index)
         index = self.convert_indirect_indexing(index)
         padding = self.get_padding_type()
+
+        # Extract dram info
         dram_var = self.kernel_group.args.input(name)
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
-        local_tile_desc, index_var = self.get_dma_info(name, index)
-
+        # Extract sram info
+        local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
-        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
 
@@ -976,11 +1011,12 @@ def load(self, name: str, index: sympy.Expr):
         compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
 
         # Define scratch pad buffer
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
+        sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index)
 
         # MVIN Encoding
+        attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding={padding}}}"
         code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, tile_stride, padding)
+                                 f"{name}_tag", dram_shape, tile_shape, attribute)
         self.cse.generate(self.dma_loads, code, assignment = False) # FIXME: assignment = False does not support caching
         compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
         # Generate vector load instruction
@@ -1018,16 +1054,14 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
         # Prepare dma instruction
-        local_tile_desc, index_var = self.get_dma_info(name, index)
+        local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
-        tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
         tile_size = local_tile_desc.get_tile_size()
-
         # Compute vector unit size
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
         compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
@@ -1039,7 +1073,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         if require_store:
             # Define scratch pad buffer
-            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
+            sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index)
             compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
             # Generate vector store instruction
             store_size, operand_type = self.var_info[value]
@@ -1058,8 +1092,9 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             sram_index_var = self.spad_buffer_dict[str(value)][3]
 
         # Generate DMA instruction
+        attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, tile_stride)
+                                 f"{name}_tag", dram_shape, tile_shape, attribute)
         self.dma_stores.writeline(common.DeferredLine(name, code))
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
@@ -1152,7 +1187,9 @@ def store_reduction(self, name, index, value):
         # Store reduction can't share cached value stored in cse,
         # since it is not innermost loop body.
         tmp_cse = self.cse
+        tmp_apply_cse = self.apply_cse
         self.cse = self.reduction_cse
+        self.apply_cse = self.reduction_cse
 
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
@@ -1160,10 +1197,9 @@ def store_reduction(self, name, index, value):
         index = self.rename_indexing(index)
 
         # Tile is always reuduced in inner loop
-        local_tile_desc, index_var = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix)
+        local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix)
         vlane_split_axis = local_tile_desc.vlane_split_axis
         vlane_stride = local_tile_desc.vlane_stride
-        tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
 
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
@@ -1173,7 +1209,7 @@ def store_reduction(self, name, index, value):
             vshape = f"{mlir_dtype}"
         else:
             vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
+        sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
@@ -1206,12 +1242,14 @@ def store_reduction(self, name, index, value):
 
         # MVOUT Encoding
         # Generate DMA instruction
+        attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, tile_stride)
+                                 f"{name}_tag", dram_shape, tile_shape, attribute)
         self.reductions_suffix.writeline(common.DeferredLine(name, code))
 
         # Restore origin cse
         self.cse = tmp_cse
+        self.apply_cse = tmp_apply_cse
 
     def indirect_indexing(self, index_var, size, check=True):
         return str(index_var)
@@ -1496,7 +1534,6 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
 
         # TODO.
         kg_tile_desc = self.kernel_group.tile_desc
-        buffer_info = self.buffer_types[name]
         # Note: index could contain symbols that represent dynamic axies
         # Extract dimension of index(e.g, index0, index1)
         local_dims = [int(str(i)[5:]) for i in index.free_symbols if "index" in str(i)]
@@ -1505,23 +1542,16 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         local_tile_desc = mlir_common.MLIRMultiDimTile([1], self.vector_lane)
         local_dims.sort() # Assume that smaller index is placed in the outer loop
         indirect_dims = [f"{i}" for i in index.free_symbols if "tmp" in str(i)]
-        indirect_arg_dims = [f"%{i}" for i in index.free_symbols if "tmp" in str(i)]
         for indirect_dim in indirect_dims:
             index = index.replace(sympy.Symbol(indirect_dim), 0)
 
         # Reduction can have two type of tile size
         if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)):
-            # We have to create custom apply map to provide dram stride
-            # ex) (d0, d1, ... dn, dn+1, dn+2, dk) -> (s0*d0 + s1*d1 + ... dn*0+ dn+1*0 + ... dk*0 + const)
-            fake_dim = self.get_const_cse(0)
-            input_expr = ",".join(["d"+str(i) for i in total_dims])
-            output_expr = str(index).replace('index', 'd')
-            input_argument = ",".join(["%index" + str(i) if i in local_dims else f"%{fake_dim}" for i in total_dims])
-            map_var = self.map_cse.generate(self.global_vars, f"affine_map<({input_expr})[{','.join(indirect_dims)}] -> ({output_expr})>")
-            index_var = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({input_argument})[{','.join(indirect_arg_dims)}]")
             local_dims = total_dims # Brodatcast tile shape
-        else:
-            index_var = self.parse_indices(index, buffer=buffer)
+
+        index_var = self.parse_indices(index, buffer=buffer)
+        input_argument = [f"index{str(i)}" for i in local_dims]
+        dram_stride = [index.coeff(sympy.Symbol(arg)) for arg in input_argument]
 
         if kg_tile_desc.vlane_split_axis in local_dims:
             local_vlane_split_axis = local_dims.index(kg_tile_desc.vlane_split_axis)
@@ -1533,6 +1563,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
             local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vlane_stride])         # Force it to use vector instruction.
             local_tile_desc.vlane_split_axis = local_vlane_split_axis    # last axis
             local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
+            dram_stride = [0] # Edge case
         # Case 1. Tile is 1-D vector type
         elif len(local_dims) == 1 and len(local_dims) <= self.reduction_depth:
             local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(local_dims[0])])
@@ -1565,6 +1596,14 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
                 local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims])
                 local_tile_desc.vlane_split_axis = local_vlane_split_axis
                 local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
+        # Case 4. Tile is 4-D tile (e.g., Convolution epilogue)
+        elif len(local_dims) == 4:
+            is_reduction = self.reduction_depth < 3 and not store_reduction
+            if is_reduction:
+                raise NotImplementedError("Currently not implemented... ;)")
+            local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims])
+            local_tile_desc.vlane_split_axis = local_vlane_split_axis
+            local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         else:
             raise NotImplementedError("Currently not implemented... ;)")
 
@@ -1580,27 +1619,26 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
             # Update
             local_tile_desc.set_tile_size(new_tile_size)
             local_tile_desc.vlane_split_axis = new_vlane_split_axis
+        return local_tile_desc, index_var, dram_stride
 
-        return local_tile_desc, index_var
-
-    def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,
-                     tag_name, dram_shape, tile_shape, tile_stride, padding_type=0):
-        dma_key = (attribute1, attribute2, mlir_dtype)
+    def get_dma_code(self, dma_type_name, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,
+                     tag_name, dram_shape, tile_shape, attribute):
+        dma_key = (vlane_split_axis, vlane_stride, mlir_dtype)
         if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
-            dma_type, attribute1, attribute2 = self.dma_read_cache[dma_key]
+            dma_type, vlane_split_axis, vlane_stride = self.dma_read_cache[dma_key]
         elif dma_type_name == "MVOUT" and dma_key in self.dma_write_cache:
-            dma_type, attribute1, attribute2 = self.dma_write_cache[dma_key]
+            dma_type, vlane_split_axis, vlane_stride = self.dma_write_cache[dma_key]
         else:
-            attribute1 = self.get_const_cse(attribute1)
-            attribute2 = self.get_const_cse(attribute2)
+            vlane_split_axis = self.get_const_cse(vlane_split_axis)
+            vlane_stride = self.get_const_cse(vlane_stride)
             if dma_type_name == "MVIN":
                 dma_type = self.get_const_cse(DMA_TYPE[f"{dma_type_name}{self.dma_read_counter}"])
                 self.dma_read_counter += 1
-                self.dma_read_cache[dma_key] = [dma_type, attribute1, attribute2]
+                self.dma_read_cache[dma_key] = [dma_type, vlane_split_axis, vlane_stride]
             else:
                 dma_type = self.get_const_cse(DMA_TYPE[f"{dma_type_name}{self.dma_write_counter}"])
                 # self.dma_write_counter += 1 Is it okay?
-                self.dma_write_cache[dma_key] = [dma_type, attribute1, attribute2]
+                self.dma_write_cache[dma_key] = [dma_type, vlane_split_axis, vlane_stride]
         tag = self.get_tag_cse(tag_name)
         zero_cse = self.get_const_cse(0)
 
@@ -1608,7 +1646,7 @@ def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_v
         dram_operand = f"%{dram_var}[%{dram_index_var}]"
         sram_operand = f"%{sram_var}[{sram_index_var}]" # Use string
         tag_var = f"%{tag}[%{zero_cse}]"
-        dma_attribute = f"%{attribute1}, %{attribute2}"
+        dma_attribute = f"%{vlane_split_axis}, %{vlane_stride}"
         sram_shape = tile_shape
         tag_shape = "memref<1xi32>"
 
@@ -1619,9 +1657,7 @@ def get_dma_code(self, dma_type_name, attribute1, attribute2, mlir_dtype, dram_v
             src_operand, dst_operand = sram_operand, dram_operand
             src_shape, dst_shape = sram_shape, dram_shape
 
-        code = f"memref.dma_start {src_operand}, {dst_operand}, %{dma_type}, {tag_var}, {dma_attribute} : {src_shape}, {dst_shape}, {tag_shape}"
-        code = code + f" {{padding={padding_type}, sram_stride={tile_stride}}}"
-        return code
+        return f"memref.dma_start {src_operand}, {dst_operand}, %{dma_type}, {tag_var}, {dma_attribute} : {src_shape}, {dst_shape}, {tag_shape} {attribute}"
 
     def adjust_tile_size(self):
         if self.read_writes is not None:
@@ -1672,34 +1708,44 @@ def adjust_tile_size(self):
         if len(self.itervars) >= 3 and self.reduction_depth < len(self.itervars):
             raise NotImplementedError()
 
-    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, indices, raw_index, is_template=False, buffer=None):
+    def allocate_sram_buffer(self, dtype, dram_name, tile_desc, raw_index, buffer=None, forced_name=None):
         c_type = mlir_common.DTYPE_TO_C[dtype]
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        tile_numel_per_lane = tile_desc.get_numel_per_lane()
+        tile_shape = tile_desc.get_mlir_shape(mlir_dtype)
         # Make sure each lane's buffer has at least two element
-        tile_size = max(tile_size_per_lane, 2) * self.vector_lane
+        tile_size = max(tile_numel_per_lane, 2) * self.vector_lane
 
         if buffer is None:
             buffer = self.spad_buffer
 
-        if name not in self.global_vars_dict:
-            self.global_vars_dict[name] = dict()
+        if dram_name not in self.global_vars_dict:
+            self.global_vars_dict[dram_name] = dict()
 
-        if str(raw_index) not in self.global_vars_dict[name]:
-            new_name = f"buf{self.spadbuf_counter}"
+        if str(raw_index) not in self.global_vars_dict[dram_name]:
+            new_name = f"buf{self.spadbuf_counter}_spad" if forced_name is None else f"{forced_name}_spad"
             self.spadbuf_counter+=1
             # Add definition to header
-            self.header.writeline(f"{c_type} {new_name}_spad[{tile_size // self.vector_lane}] __attribute__ ((section(\".spad\")));")
-            self.gem5_header.writeline(f"{c_type} {new_name}_spad[{tile_size}] __attribute__((aligned(64)));")
-            self.global_vars.writeline(f"memref.global @{new_name}_spad : {dram_tile_shape}")
-            self.global_vars_dict[name][str(raw_index)] = new_name
+            self.header.writeline(f"{c_type} {new_name}[{tile_size // self.vector_lane}] __attribute__ ((section(\".spad\")));")
+            self.gem5_header.writeline(f"{c_type} {new_name}[{tile_size}] __attribute__((aligned(64)));")
+            self.global_vars.writeline(f"memref.global @{new_name} : {tile_shape}")
+            self.global_vars_dict[dram_name][str(raw_index)] = new_name
         else:
-            new_name = self.global_vars_dict[name][str(raw_index)]
-        sram_var = self.spad_cse.generate(buffer, f"memref.get_global @{new_name}_spad : {dram_tile_shape}")
+            new_name = self.global_vars_dict[dram_name][str(raw_index)]
+        return new_name
 
-        zero_cse = self.get_const_cse(0)
-        sram_dims = len(dram_tile_shape.split("x")) - 1
-        sram_index_var = ",".join([f"%{zero_cse}"] * sram_dims)
+    def get_scratchpad_buffer(self, dtype, dram_name, tile_desc, raw_index, buffer=None):
+        if buffer is None:
+            buffer = self.spad_buffer
 
-        return sram_var, indices, sram_index_var
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+        tile_shape = tile_desc.get_mlir_shape(mlir_dtype)
+        new_name = self.allocate_sram_buffer(dtype, dram_name, tile_desc, raw_index, buffer=buffer)
+        sram_var = self.spad_cse.generate(buffer, f"memref.get_global @{new_name} : {tile_shape}")
+
+        zero_cse = self.get_const_cse(0)
+        sram_index_var = ",".join([f"%{zero_cse}"] * tile_desc.get_nr_dim())
+        return sram_var, sram_index_var
 
     def get_const_cse(self, value, dtype="index") -> common.CSEVariable:
         # Type convert
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 92af0570..00bf4169 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -179,6 +179,7 @@ def set_info(outer, inner, arg_type):
 
 class MLIRMultiDimTile():
     def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None, vec_size=None):
+        self.name = ""
         self._tile_size = list(tile_size)
         self._tile_stride = None
         self.tile_axis_order = list(range(len(tile_size)))
@@ -192,6 +193,9 @@ def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=N
         self.implicit_dim_size = None
         self.nr_rdim = 0
 
+    def set_name(self, name: str):
+        self.name = name
+
     def set_tile_size(self, tile_size, tile_axis_order=None):
         self._tile_size = tile_size
         if tile_axis_order is None:
@@ -204,6 +208,9 @@ def set_tile_size_stride(self, tile_size, tile_stride):
         self._tile_size = tile_size
         self._tile_stride = tile_stride
 
+    def get_name(self) -> str:
+        return self.name
+
     def get_tile_size(self):
         return self._tile_size
 
@@ -316,9 +323,6 @@ def __init__(self):
     def set_tile_info(self, tile_desc : MLIRMultiDimTile):
         self.tile_desc = tile_desc
 
-    def set_prologue_tile_info(self, tile_desc : MLIRMultiDimTile):
-        self.prologue_tile_desc = tile_desc
-
 class BaseMLIRHardwareInfo():
     def __init__(self):
         # Default HW setting
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
new file mode 100644
index 00000000..7968f813
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
@@ -0,0 +1,346 @@
+import os
+import math
+from sympy import  Symbol, Number
+from typing import List, Optional
+
+from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
+from torch._inductor.ir import IRNode
+from torch._inductor.codecache import write_atomic
+import PyTorchSimFrontend.extension_codecache as extension_codecache
+from PyTorchSimFrontend.mlir import mlir_common
+from torch._inductor.codecache import get_hash
+from PyTorchSimFrontend import extension_config
+
+CONV_TEMPLATE = r"""
+// Multi Channel Tile Conv2D kernel
+// BATCH = {{ BATCH }}
+// I_C = {{ I_C }}
+// I_H = {{ I_H }}
+// I_W = {{ I_W }}
+// O_C = {{ O_C }}
+// K_H = {{ K_H }}
+// K_W = {{ K_W }}
+// O_H = {{ O_H }}
+// O_W = {{ O_W }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// TILE_I_H={{ TILE_I_H }},
+// TILE_I_W={{ TILE_I_W }},
+// TILE_O_H={{ TILE_O_H }},
+// TILE_O_W={{ TILE_O_W }},
+// TILE_K_H={{ TILE_K_H }},
+// TILE_K_W={{ TILE_K_W }},
+// SUB_TILE_M={{ SUB_TILE_M }},
+// SUB_TILE_N={{ SUB_TILE_N }},
+// SUB_TILE_I_W={{ SUB_TILE_I_W }},
+// SUB_TILE_K_H={{ SUB_TILE_K_H }},
+// SUB_TILE_K_W={{ SUB_TILE_K_W }},
+// PADDING_H = {{ PADDING_H }}
+// PADDING_W = {{ PADDING_W }}
+// STRIDE_H = {{ STRIDE_H }}
+// STRIDE_W = {{ STRIDE_W }}
+// DATA_STYPE = {{ DATA_STYPE }}
+
+#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
+#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(1 * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
+#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
+#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+  %c0 = arith.constant 0 : index
+  {{- kernel.def_local_vars(indent_size=2) }}
+
+  affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
+    affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
+      affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
+        affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
+          // Initialize output
+          {%- if BIAS %}
+          {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N, TILE_O_H, TILE_O_W], indent_size=10) }}
+          {%- else %}
+          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
+          {%- endif %}
+          affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
+            affine.for %tile_k = 0 to {{ I_C * K_W }} step {{ TILE_K }} {
+              %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
+              // Load input matrix
+              {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_M, SUB_TILE_K], indent_size=14) }}
+              {{ kernel.def_dma_op("MVIN", "W", W_idx, W_tile_desc, subtile_size=[SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_K, SUB_TILE_N], indent_size=14) }}
+              // Compute body part
+              affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
+                affine.for %tile_k_w = 0 to 1 {
+                  %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                  affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
+                    affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
+                      %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
+                      %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_o_w)
+                      %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                    } { inner_loop=true }
+                  } { inner_loop=true }
+                } { inner_loop=true }
+              } { inner_loop=true }
+            } { accumulation_loop=true, subtile_loop="k" }
+          } { accumulation_loop=true }
+          // Store output matrix
+          {{kernel.store_output(indent_size=10)}}
+        } { outer_loop=true }
+      } { outer_loop=true }
+    } { outer_loop=true, subtile_loop="n" }
+  } { outer_loop=true, subtile_loop="m" }
+  return
+}
+"""
+
+WRAPPER_TEMPLATE = r"""
+def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
+    # Padding input
+    padded_shape = list(X.shape)
+    padded_shape[2] += 2 * {{ PADDING_H }}
+    padded_shape[3] += 2 * {{ PADDING_W }}
+    X_padding = torch.zeros(padded_shape, device=X.device)
+    X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
+
+    # Tanspose inputs
+    {%- for buf, name in kernel.get_conv_inputs().items() %}
+      {%- if name == "X" %}
+    {{ name }} = {{ name }}_padding.permute(2, 0, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, BATCH, I_W, I_C)
+      {%- elif name == "W" %}
+    {{ name }} = {{ name }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
+      {%- elif name == "Bias" %}
+    {{ name }} = {{ name }}
+      {%- endif %}
+    {%- endfor %}
+
+    # Launch kernel
+    {{ KERNEL_NAME }}<DEF_CONV_WRAPPER>
+    {%- if BACKENDSIM_EAGER_MODE %}
+    yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
+    {%- endif %}
+"""
+
+class MLIRConvMultiTileTemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.stride = kwargs["stride"]
+        self.padding = kwargs["padding"]
+        self.dilation = kwargs["dilation"]
+        self.weight_shape = [str(i) for i in input_nodes[1].layout.size]
+        self.input_shape = [i for i in input_nodes[0].layout.size]
+        self.function_name = "Conv2D_" + "_".join(self.weight_shape)+ "_" \
+            + "_".join([str(i) for i in self.stride]) \
+            + "_" + "_".join([str(i) for i in self.padding]) \
+            + "_" + "_".join([str(i) for i in self.dilation])
+        self.kernel_args = ['X', 'W', 'Bias', 'Y']
+
+    def get_padded_input_size(self, X):
+        input_padded = list(X.layout.size)
+        input_padded[2] += 2 * self.padding[0]
+        input_padded[3] += 2 * self.padding[1]
+        return math.prod(input_padded)
+
+    def render(self,
+               kernel: MLIRTemplateKernel,
+               template_buffer_node = None,
+               epilogue_nodes: Optional[List[IRNode]] = None,
+               **kwargs):  # Renders CONV_TEMPLATE for this convolution and returns the generated source string.
+        # Bind the output node (when one is supplied), the kernel and the epilogue nodes to this template.
+        if template_buffer_node is not None:
+            self.output_node = template_buffer_node
+        self.kernel = kernel
+        self.epilogue_nodes = epilogue_nodes
+
+        X, W = self.input_nodes[0], self.input_nodes[1]
+        Y = self.output_node
+        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]  # a third input node, when present, is the bias
+
+        if epilogue_nodes is not None:  # names of every buffer the epilogue reads or writes, except the conv output
+            extra_node_rw = {
+                item.name for epilogue_node in epilogue_nodes
+                for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes
+                if item.name != Y.name
+            }
+        n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0  # forwarded to select_tile
+
+        BATCH, I_C, I_H, I_W = X.layout.size
+        O_C, _, K_H, K_W = W.layout.size
+        O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2]
+        O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
+        PADDING_H=self.padding[0]
+        PADDING_W=self.padding[1]
+        STRIDE_H=self.stride[0]
+        STRIDE_W=self.stride[1]
+
+        # Select tile size and template
+        conv_template = CONV_TEMPLATE
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)
+        SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N  # use the whole N tile as the sub-tile once TILE_N exceeds 512
+        TOG_latency = 8 if TOG_latency < 8 else TOG_latency  # clamp TOG_latency to at least 8
+        kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
+
+        # Prepare tile descriptors and index expressions for X, W and Y
+        vlane_stride = 1
+        vlane_split_axis = 1
+        X_tile_size = [TILE_I_H, TILE_O_W, TILE_M, TILE_K]
+        X_tile_stride = [TILE_O_W*TILE_M*TILE_K, TILE_M*TILE_K, 1, TILE_M]
+        X_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, 3, vlane_stride)
+        X_tile_desc.set_tile_size_stride(X_tile_size, X_tile_stride)
+        X_tile_desc.set_name("input_buffer")
+        X_dim = [Symbol("index_i_h"), Symbol("o_w"), Symbol("tile_m"), Symbol("tile_k")]
+        X_idx = [X_dim[0]*(I_W+2*PADDING_W)*BATCH*I_C, X_dim[1]*I_C*STRIDE_W, X_dim[2]*I_C*(I_W+2*PADDING_W), X_dim[3]]
+
+        W_tile_size = [TILE_K_H, 1, TILE_K, TILE_N]
+        W_tile_stride = [TILE_K_W * TILE_K * TILE_N, TILE_K * TILE_N, 1, TILE_K]
+        W_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, 3, vlane_stride)  # NOTE(review): built from X_tile_size; size/stride are replaced on the next line — confirm W_tile_size was intended
+        W_tile_desc.set_tile_size_stride(W_tile_size, W_tile_stride)
+        W_tile_desc.set_name("weight_buffer")
+        W_dim = [Symbol("k_h"), Symbol("k_w"), Symbol("tile_k"), Symbol("tile_n")]
+        W_idx = [W_dim[0]*K_W*I_C*O_C , Symbol("c0"), W_dim[2]*O_C, W_dim[3]]
+
+        Y_tile_size = [TILE_M, TILE_N, TILE_O_H, TILE_O_W]
+        Y_tile_stride = [1, TILE_M, TILE_O_W * TILE_M * TILE_N, TILE_M * TILE_N] # N, C, H, W
+        Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
+        Y_tile_desc.set_name("output_buffer")
+        Y_dim = [Symbol("tile_m"), Symbol("tile_n"), Symbol("o_h"), Symbol("o_w")]
+        Y_idx = [Y_dim[0]*O_C*O_H*O_W, Y_dim[1]*O_H*O_W, Y_dim[2]*O_W, Y_dim[3]]
+        
+        # Bias index expression: only the tile_n entry is non-zero
+        Bias_idx = [Number(0), Symbol("tile_n"), Number(0), Number(0)]
+
+        kernel.render_options = dict(
+            KERNEL_NAME=self.name,
+            kernel=kernel,
+            X=X, W=W, Y=Y, BIAS=Bias,
+            PADDED_INPUT_SIZE=self.get_padded_input_size(X),
+            BATCH=BATCH,
+            I_C=I_C,
+            I_H=I_H,
+            I_W=I_W,
+            O_C=O_C,
+            K_H=K_H,
+            K_W=K_W,
+            O_H=O_H,
+            O_W=O_W,
+            TILE_M=TILE_M,
+            TILE_N=TILE_N,
+            TILE_K=TILE_K,
+            TILE_I_H=TILE_I_H,
+            TILE_I_W=TILE_I_W,
+            TILE_O_H=TILE_O_H,
+            TILE_O_W=TILE_O_W,
+            TILE_K_H=TILE_K_H,
+            TILE_K_W=TILE_K_W,
+            SUB_TILE_M=SUB_TILE_M,
+            SUB_TILE_N=SUB_TILE_N,
+            SUB_TILE_K=SUB_TILE_K,
+            SUB_TILE_I_H=SUB_TILE_I_H,
+            SUB_TILE_I_W=SUB_TILE_I_W,
+            SUB_TILE_K_H=SUB_TILE_K_H,
+            SUB_TILE_K_W=SUB_TILE_K_W,
+            PADDING_H=PADDING_H,
+            PADDING_W=PADDING_W,
+            STRIDE_H=STRIDE_H,
+            STRIDE_W=STRIDE_W,
+            X_tile_desc = X_tile_desc,
+            W_tile_desc = W_tile_desc,
+            Y_tile_desc = Y_tile_desc,
+            X_idx = X_idx,
+            W_idx = W_idx,
+            Bias_idx = Bias_idx,
+            DATA_STYPE="f32",
+            input_reorder=self.input_reorder
+        )
+
+        kernel.epilogue_info = dict(
+            output_node = self.output_node.name,
+            sram_var = "output_buffer",
+            dram_var = "Y",
+            dram_idx = Y_idx,
+            dram_tile_desc = Y_tile_desc,
+            dim_aliasing = {"index0":"tile_m", "index1":"tile_n", "index2":"o_h", "index3":"o_w"}
+        )
+        kernel.exception_nodes["X"] = {"numel" : (I_W+2*PADDING_W)*(I_H+2*PADDING_H)*I_C*BATCH}  # element count of X with padding applied
+        code = self._template_from_string(conv_template).render(**kernel.render_options)
+        kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
+        return code
+
+    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W):
+        """Pick tile sizes via the kernel's mapping helpers and return the 17-entry tile tuple render() unpacks."""
+        mapping_args = (BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
+        # NOTE(review): sub-tiles come from conv_combination_mapping, the tiles from conv_multi_tile_mapping — confirm intended.
+        _, _, _, _, coarse_m, coarse_n, _ = kernel.conv_combination_mapping(*mapping_args)
+        SUB_TILE_M, SUB_TILE_N = min(kernel.vector_lane, coarse_m), min(kernel.vector_lane, coarse_n)
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(*mapping_args)
+        # Input extent covered by one tile; the width term has no kernel/dilation contribution here.
+        TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
+        TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
+        TOG_latency = min(TILE_M, O_W)
+        return (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W,
+                1, 1, 1, 1, SUB_TILE_M, SUB_TILE_N, TILE_K, TOG_latency)
+
+    def outer_func_render(self, kernel_name, input_args):
+        """Render WRAPPER_TEMPLATE and return ``(code, function_name)`` for the Python-side wrapper."""
+        data_node, weight_node = self.input_nodes[0], self.input_nodes[1]
+        bias_node = self.input_nodes[2] if len(self.input_nodes) != 2 else None
+        # Eager mode is switched on through the BACKENDSIM_EAGER_MODE environment variable (unset -> 0).
+        eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', False))
+        func_name = f"{self.function_name}_{len(input_args)}"
+        options = {
+            "kernel": self.kernel,
+            "KERNEL_NAME": kernel_name,
+            "FUNC_NAME": func_name,
+            "INPUT": data_node,
+            "WEIGHT": weight_node,
+            "BIAS": bias_node,
+            "OUTPUT": self.output_node,
+            "PADDING_H": self.padding[0],
+            "PADDING_W": self.padding[1],
+            "VALIDATION_MODE": extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
+            "BACKENDSIM_EAGER_MODE": eager_mode,
+            "input_reorder": self.input_reorder,
+        }
+        return self._template_from_string(WRAPPER_TEMPLATE).render(**options), func_name
+
+    def get_arg_attributes(self):
+        """Describe the padded input X as an MLIR input argument.
+
+        Returns a one-element list: ``[[buffer_name, [MLIR_ARGS_IN, dtype, numel, shape, stride]]]``.
+        """
+        X = self.input_nodes[0]
+        size = X.get_size()
+        # Axes are taken in (dim2, dim3, dim0, dim1) order; the two leading entries get the padding added.
+        # NOTE(review): WRAPPER_TEMPLATE permutes X in a different axis order — confirm which layout
+        # the consumers of this shape/stride expect.
+        X_shape = [size[2] + 2 * self.padding[0], size[3] + 2 * self.padding[1], size[0], size[1]]
+        # Dense row-major strides: last axis has stride 1, each earlier axis spans all later ones.
+        X_stride = [1]
+        for extent in reversed(X_shape[1:]):
+            X_stride.insert(0, X_stride[0] * extent)
+
+        attribute = [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]
+        return [[X.data.data.name, attribute]]
+
+    def codegen_header(self, code, extra_headers):
+        """Write the spike/gem5 global-variable headers for `code` (when absent) and store the code hash."""
+        write_path = extension_codecache.get_write_path(code)
+        # exist_ok=True: another compilation may create the directory between a check and the mkdir.
+        os.makedirs(write_path, exist_ok=True)
+        # extra_headers[0] becomes global_var.h (spike), extra_headers[1] gem5_global_var.h; existing files are kept.
+        for index, header_name in enumerate(("global_var.h", "gem5_global_var.h")):
+            header_path = os.path.join(write_path, header_name)
+            if not os.path.exists(header_path):
+                write_atomic(header_path, extra_headers[index])
+        self.hash_value = get_hash(code.strip())
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
new file mode 100644
index 00000000..f2df1e43
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
@@ -0,0 +1,342 @@
+import os
+import math
+from sympy import  Symbol, Number
+from typing import List, Optional
+
+from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
+from torch._inductor.ir import IRNode
+from torch._inductor.codecache import write_atomic
+import PyTorchSimFrontend.extension_codecache as extension_codecache
+from PyTorchSimFrontend.mlir import mlir_common
+from torch._inductor.codecache import get_hash
+from PyTorchSimFrontend import extension_config
+
+CONV_TEMPLATE = r"""
+// Single Batch Conv2D kernel
+// BATCH = {{ BATCH }}
+// I_C = {{ I_C }}
+// I_H = {{ I_H }}
+// I_W = {{ I_W }}
+// O_C = {{ O_C }}
+// K_H = {{ K_H }}
+// K_W = {{ K_W }}
+// O_H = {{ O_H }}
+// O_W = {{ O_W }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// TILE_I_H={{ TILE_I_H }},
+// TILE_I_W={{ TILE_I_W }},
+// TILE_O_H={{ TILE_O_H }},
+// TILE_O_W={{ TILE_O_W }},
+// TILE_K_H={{ TILE_K_H }},
+// TILE_K_W={{ TILE_K_W }},
+// SUB_TILE_M={{ SUB_TILE_M }},
+// SUB_TILE_N={{ SUB_TILE_N }},
+// SUB_TILE_I_W={{ SUB_TILE_I_W }},
+// SUB_TILE_K_H={{ SUB_TILE_K_H }},
+// SUB_TILE_K_W={{ SUB_TILE_K_W }},
+// PADDING_H = {{ PADDING_H }}
+// PADDING_W = {{ PADDING_W }}
+// STRIDE_H = {{ STRIDE_H }}
+// STRIDE_W = {{ STRIDE_W }}
+// DATA_STYPE = {{ DATA_STYPE }}
+
+#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
+#map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
+#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
+#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W, TILE_K) }} + d1)>
+#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+  %c0 = arith.constant 0 : index
+  {{- kernel.def_local_vars(indent_size=2) }}
+  affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
+    affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
+      affine.for %tile_m = 0 to {{ O_W }} step {{ TILE_M }} {
+        // Initialize output
+        {%- if BIAS %}
+        {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_N, TILE_O_H, SUB_TILE_M], indent_size=8) }}
+        {%- else %}
+        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+        {%- endif %}
+        affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
+          affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
+            affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
+              %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
+              %index_i_w = affine.apply #map_I_W(%tile_m, %k_w)
+              // Load input & weight matrix
+              {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[1, SUB_TILE_I_H, SUB_TILE_M, SUB_TILE_K], indent_size=14) }}
+              {{ kernel.def_dma_op("MVIN", "W", W_idx, W_tile_desc, subtile_size=[SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_K, SUB_TILE_N], indent_size=14) }}
+              // Compute body part
+              affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
+                affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                  %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                  affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
+                    affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
+                      %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
+                      %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
+                      %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                    } { inner_loop=true }
+                  } { inner_loop=true }
+                } { inner_loop=true }
+              } { inner_loop=true }
+            } { accumulation_loop=true, subtile_loop="k" }
+          } { accumulation_loop=true }
+        } { accumulation_loop=true }
+        // Store output matrix
+        {{kernel.store_output(indent_size=8)}}
+      } { outer_loop=true, subtile_loop="m" }
+    } { outer_loop=true }
+  } { outer_loop=true, subtile_loop="n" }
+  return
+}
+"""  # End of CONV_TEMPLATE: Jinja-style MLIR source of the single-batch Conv2D kernel, rendered by MLIRConvSingleBatchTemplate.render().
+
+WRAPPER_TEMPLATE = r"""
+def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
+    # Padding input
+    padded_shape = list(X.shape)
+    padded_shape[2] += 2 * {{ PADDING_H }}
+    padded_shape[3] += 2 * {{ PADDING_W }}
+    X_padding = torch.zeros(padded_shape, device=X.device)
+    X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
+
+    # Tanspose inputs
+    {%- for buf, name in kernel.get_conv_inputs().items() %}
+      {%- if name == "X" %}
+    {{ name }} = {{ name }}_padding.permute(0, 2, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (BATCH, I_H, I_W, I_C)
+      {%- elif name == "W" %}
+    {{ name }} = {{ name }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
+      {%- elif name == "Bias" %}
+    {{ name }} = {{ name }}
+      {%- endif %}
+    {%- endfor %}
+
+    # Launch kernel
+    {{ KERNEL_NAME }}<DEF_CONV_WRAPPER>
+    {%- if BACKENDSIM_EAGER_MODE %}
+    yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
+    {%- endif %}
+"""  # End of WRAPPER_TEMPLATE: Python wrapper source (zero-pads and permutes X, permutes W, launches the kernel), rendered by outer_func_render().
+
+class MLIRConvSingleBatchTemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
+        """Store the convolution hyper-parameters and build the wrapper function name."""
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.stride = kwargs["stride"]
+        self.padding = kwargs["padding"]
+        self.dilation = kwargs["dilation"]
+        self.weight_shape = list(map(str, input_nodes[1].layout.size))
+        self.input_shape = list(input_nodes[0].layout.size)
+        # Name layout: Conv2D_<weight dims>_<stride>_<padding>_<dilation>, every group "_"-joined.
+        name_groups = [self.weight_shape] + [list(map(str, p)) for p in (self.stride, self.padding, self.dilation)]
+        self.function_name = "Conv2D_" + "_".join("_".join(group) for group in name_groups)
+        self.kernel_args = ['X', 'W', 'Bias', 'Y']
+
+    def get_padded_input_size(self, X):
+        """Return the element count of X after padding dims 2 and 3 on both sides by self.padding."""
+        dims = list(X.layout.size)
+        dims[2:4] = [dims[2] + 2 * self.padding[0], dims[3] + 2 * self.padding[1]]
+        return math.prod(dims)
+
+    def render(self,
+               kernel: MLIRTemplateKernel,
+               template_buffer_node = None,
+               epilogue_nodes: Optional[List[IRNode]] = None,
+               **kwargs):  # Renders CONV_TEMPLATE for this convolution and returns the generated source string.
+        # Bind the output node (when one is supplied), the kernel and the epilogue nodes to this template.
+        if template_buffer_node is not None:
+            self.output_node = template_buffer_node
+        self.kernel = kernel
+        self.epilogue_nodes = epilogue_nodes
+
+        X, W = self.input_nodes[0], self.input_nodes[1]
+        Y = self.output_node
+        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]  # a third input node, when present, is the bias
+
+        if epilogue_nodes is not None:  # names of every buffer the epilogue reads or writes, except the conv output
+            extra_node_rw = {
+                item.name for epilogue_node in epilogue_nodes
+                for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes
+                if item.name != Y.name
+            }
+        n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0  # forwarded to select_tile
+
+        BATCH, I_C, I_H, I_W = X.layout.size
+        O_C, _, K_H, K_W = W.layout.size
+        O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2]
+        O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
+        PADDING_H=self.padding[0]
+        PADDING_W=self.padding[1]
+        STRIDE_H=self.stride[0]
+        STRIDE_W=self.stride[1]
+
+        # Select tile size and template
+        conv_template = CONV_TEMPLATE
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)
+        SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N  # use the whole N tile as the sub-tile once TILE_N exceeds 512
+        TOG_latency = 8 if TOG_latency < 8 else TOG_latency  # clamp TOG_latency to at least 8
+        kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
+        # Prepare tile descriptors and index expressions for X, W and Y
+        vlane_stride = 1
+        vlane_split_axis = 1
+        X_tile_size = [1, TILE_I_H, TILE_I_W, TILE_K]
+        X_tile_stride = [TILE_I_H * TILE_I_W * TILE_K , TILE_I_W * TILE_K, 1, TILE_I_W]
+        X_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, 3, vlane_stride)
+        X_tile_desc.set_tile_size_stride(X_tile_size, X_tile_stride)
+        X_tile_desc.set_name("input_buffer")
+        X_dim = [Symbol("c0"), Symbol("index_i_h"), Symbol("index_i_w"), Symbol("tile_k")]
+        X_idx = [X_dim[0]*((I_W+2*PADDING_W)*(I_H+2*PADDING_H)*I_C), X_dim[1]*((I_W+2*PADDING_W)*I_C), X_dim[2]*I_C, X_dim[3]]
+
+        W_tile_size = [TILE_K_H, TILE_K_W, TILE_K, TILE_N]
+        W_tile_stride = [TILE_K_W * TILE_K * TILE_N, TILE_K * TILE_N, 1, TILE_K]
+        W_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, 3, vlane_stride)  # NOTE(review): built from X_tile_size; size/stride are replaced on the next line — confirm W_tile_size was intended
+        W_tile_desc.set_tile_size_stride(W_tile_size, W_tile_stride)
+        W_tile_desc.set_name("weight_buffer")
+        W_dim = [Symbol("k_h"), Symbol("k_w"), Symbol("tile_k"), Symbol("tile_n")]
+        W_idx = [W_dim[0]*K_W*I_C*O_C , W_dim[1]*I_C*O_C, W_dim[2]*O_C, W_dim[3]]
+
+        Y_tile_size = [1, TILE_N, TILE_O_H, TILE_M]
+        Y_tile_stride = [TILE_O_W * TILE_M * TILE_N, TILE_M * TILE_N, TILE_M, 1] # N, C, H, W
+        Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
+        Y_tile_desc.set_name("output_buffer")
+        Y_idx = [Number(0), Symbol("tile_n")*O_H*O_W, Symbol("o_h")*O_W, Symbol("tile_m")]
+
+        # Bias index expression: only the tile_n entry is non-zero
+        Bias_idx = [Number(0), Symbol("tile_n"), Number(0), Number(0)]
+
+        kernel.render_options = dict(
+            KERNEL_NAME=self.name,
+            kernel=kernel,
+            X=X, W=W, Y=Y, BIAS=Bias,
+            PADDED_INPUT_SIZE=self.get_padded_input_size(X),
+            BATCH=BATCH,
+            I_C=I_C,
+            I_H=I_H,
+            I_W=I_W,
+            O_C=O_C,
+            K_H=K_H,
+            K_W=K_W,
+            O_H=O_H,
+            O_W=O_W,
+            TILE_M=TILE_M,
+            TILE_N=TILE_N,
+            TILE_K=TILE_K,
+            TILE_I_H=TILE_I_H,
+            TILE_I_W=TILE_I_W,
+            TILE_O_H=TILE_O_H,
+            TILE_O_W=TILE_O_W,
+            TILE_K_H=TILE_K_H,
+            TILE_K_W=TILE_K_W,
+            SUB_TILE_M=SUB_TILE_M,
+            SUB_TILE_N=SUB_TILE_N,
+            SUB_TILE_K=SUB_TILE_K,
+            SUB_TILE_I_H=SUB_TILE_I_H,
+            SUB_TILE_I_W=SUB_TILE_I_W,
+            SUB_TILE_K_H=SUB_TILE_K_H,
+            SUB_TILE_K_W=SUB_TILE_K_W,
+            PADDING_H=PADDING_H,
+            PADDING_W=PADDING_W,
+            STRIDE_H=STRIDE_H,
+            STRIDE_W=STRIDE_W,
+            X_tile_desc = X_tile_desc,
+            W_tile_desc = W_tile_desc,
+            Y_tile_desc = Y_tile_desc,
+            X_idx = X_idx,
+            W_idx = W_idx,
+            Bias_idx = Bias_idx,
+            DATA_STYPE="f32",
+            input_reorder=self.input_reorder
+        )
+
+        kernel.epilogue_info = dict(
+            output_node = self.output_node.name,
+            sram_var = "output_buffer",
+            dram_var = "Y",
+            dram_idx = Y_idx,
+            dram_tile_desc = Y_tile_desc,
+            dim_aliasing = {"index0":"c0", "index1":"tile_n", "index2":"o_h", "index3":"tile_m"}
+        )
+        kernel.exception_nodes["X"] = {"numel" : (I_W+2*PADDING_W)*(I_H+2*PADDING_H)*I_C*BATCH}  # element count of X with padding applied
+        code = self._template_from_string(conv_template).render(**kernel.render_options)
+        kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
+        return code
+
+    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W):
+        """Pick tile sizes via kernel.conv_single_batch_mapping and return the 17-entry tile tuple render() unpacks."""
+        # TODO: implement K_W (the mapping is currently queried with a kernel width of 1).
+        tiles = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node)
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = tiles
+        TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
+        TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
+        SUB_TILE_M, SUB_TILE_N = min(kernel.vector_lane, TILE_I_W), min(kernel.vector_lane, TILE_N)
+        return (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W,
+                1, 1, 1, 1, SUB_TILE_M, SUB_TILE_N, TILE_K, min(TILE_M, O_W))
+
+    def outer_func_render(self, kernel_name, input_args):
+        """Render WRAPPER_TEMPLATE and return ``(code, function_name)`` for the Python-side wrapper."""
+        data_node, weight_node = self.input_nodes[0], self.input_nodes[1]
+        bias_node = self.input_nodes[2] if len(self.input_nodes) != 2 else None
+        # Eager mode is switched on through the BACKENDSIM_EAGER_MODE environment variable (unset -> 0).
+        eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', False))
+        func_name = f"{self.function_name}_{len(input_args)}"
+        options = {
+            "kernel": self.kernel,
+            "KERNEL_NAME": kernel_name,
+            "FUNC_NAME": func_name,
+            "INPUT": data_node,
+            "WEIGHT": weight_node,
+            "BIAS": bias_node,
+            "OUTPUT": self.output_node,
+            "PADDING_H": self.padding[0],
+            "PADDING_W": self.padding[1],
+            "VALIDATION_MODE": extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
+            "BACKENDSIM_EAGER_MODE": eager_mode,
+            "input_reorder": self.input_reorder,
+        }
+        return self._template_from_string(WRAPPER_TEMPLATE).render(**options), func_name
+
+    def get_arg_attributes(self):
+        """Describe the padded input X as an MLIR input argument.
+
+        Returns a one-element list: ``[[buffer_name, [MLIR_ARGS_IN, dtype, numel, shape, stride]]]``.
+        """
+        X = self.input_nodes[0]
+        size = X.get_size()
+        # Axes are taken in (dim2, dim3, dim0, dim1) order; the two leading entries get the padding added.
+        # NOTE(review): WRAPPER_TEMPLATE permutes X in a different axis order — confirm which layout
+        # the consumers of this shape/stride expect.
+        X_shape = [size[2] + 2 * self.padding[0], size[3] + 2 * self.padding[1], size[0], size[1]]
+        # Dense row-major strides: last axis has stride 1, each earlier axis spans all later ones.
+        X_stride = [1]
+        for extent in reversed(X_shape[1:]):
+            X_stride.insert(0, X_stride[0] * extent)
+
+        attribute = [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]
+        return [[X.data.data.name, attribute]]
+
+    def codegen_header(self, code, extra_headers):
+        """Write the spike/gem5 global-variable headers for `code` (when absent) and store the code hash."""
+        write_path = extension_codecache.get_write_path(code)
+        # exist_ok=True: another compilation may create the directory between a check and the mkdir.
+        os.makedirs(write_path, exist_ok=True)
+        # extra_headers[0] becomes global_var.h (spike), extra_headers[1] gem5_global_var.h; existing files are kept.
+        for index, header_name in enumerate(("global_var.h", "gem5_global_var.h")):
+            header_path = os.path.join(write_path, header_name)
+            if not os.path.exists(header_path):
+                write_atomic(header_path, extra_headers[index])
+        self.hash_value = get_hash(code.strip())
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
new file mode 100644
index 00000000..3b60dcbc
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
@@ -0,0 +1,343 @@
+import os
+import math
+from sympy import  Symbol, Number
+from typing import List, Optional
+
+from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
+from torch._inductor.ir import IRNode
+from torch._inductor.codecache import write_atomic
+import PyTorchSimFrontend.extension_codecache as extension_codecache
+from PyTorchSimFrontend.mlir import mlir_common
+from torch._inductor.codecache import get_hash
+from PyTorchSimFrontend import extension_config
+# Jinja template of the MLIR kernel; render() fills it from kernel.render_options (sizes, tile descriptors, DMA indices).
+CONV_TEMPLATE = r"""
+// Single Batch Conv2D (Stride != 1) kernel
+// BATCH = {{ BATCH }}
+// I_C = {{ I_C }}
+// I_H = {{ I_H }}
+// I_W = {{ I_W }}
+// O_C = {{ O_C }}
+// K_H = {{ K_H }}
+// K_W = {{ K_W }}
+// O_H = {{ O_H }}
+// O_W = {{ O_W }}
+// TILE_M = {{ TILE_M }}
+// TILE_N = {{ TILE_N }}
+// TILE_K = {{ TILE_K }}
+// TILE_I_H={{ TILE_I_H }},
+// TILE_I_W={{ TILE_I_W }},
+// TILE_O_H={{ TILE_O_H }},
+// TILE_O_W={{ TILE_O_W }},
+// TILE_K_H={{ TILE_K_H }},
+// TILE_K_W={{ TILE_K_W }},
+// SUB_TILE_M={{ SUB_TILE_M }},
+// SUB_TILE_N={{ SUB_TILE_N }},
+// SUB_TILE_I_W={{ SUB_TILE_I_W }},
+// SUB_TILE_K_H={{ SUB_TILE_K_H }},
+// SUB_TILE_K_W={{ SUB_TILE_K_W }},
+// PADDING_H = {{ PADDING_H }}
+// PADDING_W = {{ PADDING_W }}
+// STRIDE_H = {{ STRIDE_H }}
+// STRIDE_W = {{ STRIDE_W }}
+// DATA_STYPE = {{ DATA_STYPE }}
+
+#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
+#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
+#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M * TILE_K_W, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
+#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
+
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+  %c0 = arith.constant 0 : index
+  {{- kernel.def_local_vars(indent_size=2) }}
+
+  affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
+    affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
+      affine.for %tile_m = 0 to {{ O_W }} step {{ TILE_M }} {
+        // Initialize output
+        {%- if BIAS %}
+        {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_N, TILE_O_H, SUB_TILE_M], indent_size=8) }}
+        {%- else %}
+        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+        {%- endif %}
+        affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
+          affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
+            affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
+              %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
+              // Load input & weight matrix
+              {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[SUB_TILE_I_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_K], indent_size=14) }}
+              {{ kernel.def_dma_op("MVIN", "W", W_idx, W_tile_desc, subtile_size=[SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_K, SUB_TILE_N], indent_size=14) }}
+              // Compute body part
+              affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
+                affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
+                  %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
+                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                  affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
+                    affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
+                      %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
+                      %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
+                      %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                    } { inner_loop=true }
+                  } { inner_loop=true }
+                } { inner_loop=true }
+              } { inner_loop=true }
+            } { accumulation_loop=true, subtile_loop="k" }
+          } { accumulation_loop=true }
+        } { accumulation_loop=true }
+        // Store output matrix
+        {{kernel.store_output(indent_size=8)}}
+      } { outer_loop=true, subtile_loop="m" }
+    } { outer_loop=true }
+  } { outer_loop=true, subtile_loop="n" }
+  return
+}
+"""
+# Jinja template of the Python wrapper: zero-pads X, permutes X and W with .contiguous(), then calls the kernel.
+WRAPPER_TEMPLATE = r"""
+def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
+    # Padding input
+    padded_shape = list(X.shape)
+    padded_shape[2] += 2 * {{ PADDING_H }}
+    padded_shape[3] += 2 * {{ PADDING_W }}
+    X_padding = torch.zeros(padded_shape, device=X.device)
+    X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
+
+    # Tanspose inputs
+    {%- for buf, name in kernel.get_conv_inputs().items() %}
+      {%- if name == "X" %}
+    {{ name }} = {{ name }}_padding.permute(0, 2, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (BATCH, I_H, I_W, I_C)
+      {%- elif name == "W" %}
+    {{ name }} = {{ name }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
+      {%- elif name == "Bias" %}
+    {{ name }} = {{ name }}
+      {%- endif %}
+    {%- endfor %}
+
+    # Launch kernel
+    {{ KERNEL_NAME }}<DEF_CONV_WRAPPER>
+    {%- if BACKENDSIM_EAGER_MODE %}
+    yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
+    {%- endif %}
+"""
+
+class MLIRConvSingleBatchStridedTemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
+        """Store the convolution hyper-parameters and derive the wrapper function name.
+
+        `kwargs` must contain "stride", "padding" and "dilation"."""
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.stride, self.padding, self.dilation = kwargs["stride"], kwargs["padding"], kwargs["dilation"]
+        self.weight_shape = list(map(str, input_nodes[1].layout.size))
+        self.input_shape = list(input_nodes[0].layout.size)
+        # Name layout: Conv2D_<weight dims>_<stride>_<padding>_<dilation>, every value joined with "_".
+        name_parts = (self.weight_shape, self.stride, self.padding, self.dilation)
+        self.function_name = "Conv2D_" + "_".join("_".join(str(i) for i in part) for part in name_parts)
+        self.kernel_args = ['X', 'W', 'Bias', 'Y']
+
+    def get_padded_input_size(self, X):
+        """Return the element count of X once dims 2 and 3 are padded on both sides by self.padding[0] / self.padding[1]."""
+        dims = list(X.layout.size)  # copy, so the node's own size list is left untouched
+        dims[2:4] = [dims[2] + 2 * self.padding[0], dims[3] + 2 * self.padding[1]]
+        return math.prod(dims)
+
+    def render(self,
+               kernel: MLIRTemplateKernel,
+               template_buffer_node = None,
+               epilogue_nodes: Optional[List[IRNode]] = None,
+               **kwargs):
+        """Render the MLIR source of the strided single-batch conv kernel and return it as a string.
+
+        Also stores render_options, epilogue_info, loop info and the padded numel of X on `kernel`; a given `template_buffer_node` replaces self.output_node."""
+        if template_buffer_node is not None:
+            self.output_node = template_buffer_node
+        self.kernel = kernel
+        self.epilogue_nodes = epilogue_nodes
+
+        X, W = self.input_nodes[0], self.input_nodes[1]
+        Y = self.output_node
+        Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
+
+        if epilogue_nodes is not None:
+            extra_node_rw = {  # names of every buffer the epilogue nodes read or write, other than the conv output
+                item.name for epilogue_node in epilogue_nodes
+                for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes
+                if item.name != Y.name
+            }
+        n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0
+
+        BATCH, I_C, I_H, I_W = X.layout.size
+        O_C, _, K_H, K_W = W.layout.size
+        O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2]
+        O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
+        PADDING_H, PADDING_W = self.padding[0], self.padding[1]
+        STRIDE_H, STRIDE_W = self.stride[0], self.stride[1]
+
+        # Select tile size and template
+        conv_template = CONV_TEMPLATE
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)
+        SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N  # N tiles wider than 512 are not sub-tiled
+        TOG_latency = 8 if TOG_latency < 8 else TOG_latency  # clamp to a minimum of 8
+        kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
+
+        # Prepare tile descriptors
+        vlane_stride = 1
+        vlane_split_axis = 1
+        X_tile_size = [TILE_I_H, TILE_K_W, TILE_M, TILE_K]  # dim 1 walks k_w (see X_dim and X_tile_stride), so its extent is TILE_K_W, not TILE_K_H
+        X_tile_stride = [TILE_K_W*TILE_M*TILE_K, TILE_M*TILE_K, 1, TILE_M]
+        X_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, 3, vlane_stride)
+        X_tile_desc.set_tile_size_stride(X_tile_size, X_tile_stride)
+        X_tile_desc.set_name("input_buffer")
+        X_dim = [Symbol("index_i_h"), Symbol("k_w"), Symbol("tile_m"), Symbol("tile_k")]
+        X_idx = [X_dim[0]*((I_W+2*PADDING_W)*I_C), X_dim[1]*I_C, X_dim[2]*(I_C*STRIDE_W), X_dim[3]]
+
+        W_tile_size = [TILE_K_H, TILE_K_W, TILE_K, TILE_N]
+        W_tile_stride = [TILE_K_W * TILE_K * TILE_N, TILE_K * TILE_N, 1, TILE_K]
+        W_tile_desc = mlir_common.MLIRMultiDimTile(W_tile_size, kernel.vector_lane, 3, vlane_stride)  # previously built from X_tile_size (copy-paste slip)
+        W_tile_desc.set_tile_size_stride(W_tile_size, W_tile_stride)
+        W_tile_desc.set_name("weight_buffer")
+        W_dim = [Symbol("k_h"), Symbol("k_w"), Symbol("tile_k"), Symbol("tile_n")]
+        W_idx = [W_dim[0]*K_W*I_C*O_C , W_dim[1]*I_C*O_C, W_dim[2]*O_C, W_dim[3]]
+
+        Y_tile_size = [1, TILE_N, TILE_O_H, TILE_M]
+        Y_tile_stride = [TILE_O_W * TILE_M * TILE_N, TILE_M, TILE_M * TILE_N, 1] # N, C, H, W
+        Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
+        Y_tile_desc.set_name("output_buffer")
+        Y_idx = [Number(0), Symbol("tile_n")*O_H*O_W, Symbol("o_h")*O_W, Symbol("tile_m")]
+
+        # Extract Bias info
+        Bias_idx = [Number(0), Symbol("tile_n"), Number(0), Number(0)]
+
+        kernel.render_options = dict(
+            KERNEL_NAME=self.name,
+            kernel=kernel,
+            X=X, W=W, Y=Y, BIAS=Bias,
+            PADDED_INPUT_SIZE=self.get_padded_input_size(X),
+            BATCH=BATCH,
+            I_C=I_C,
+            I_H=I_H,
+            I_W=I_W,
+            O_C=O_C,
+            K_H=K_H,
+            K_W=K_W,
+            O_H=O_H,
+            O_W=O_W,
+            TILE_M=TILE_M,
+            TILE_N=TILE_N,
+            TILE_K=TILE_K,
+            TILE_I_H=TILE_I_H,
+            TILE_I_W=TILE_I_W,
+            TILE_O_H=TILE_O_H,
+            TILE_O_W=TILE_O_W,
+            TILE_K_H=TILE_K_H,
+            TILE_K_W=TILE_K_W,
+            SUB_TILE_M=SUB_TILE_M,
+            SUB_TILE_N=SUB_TILE_N,
+            SUB_TILE_K=SUB_TILE_K,
+            SUB_TILE_I_H=SUB_TILE_I_H,
+            SUB_TILE_I_W=SUB_TILE_I_W,
+            SUB_TILE_K_H=SUB_TILE_K_H,
+            SUB_TILE_K_W=SUB_TILE_K_W,
+            PADDING_H=PADDING_H,
+            PADDING_W=PADDING_W,
+            STRIDE_H=STRIDE_H,
+            STRIDE_W=STRIDE_W,
+            X_tile_desc = X_tile_desc,
+            W_tile_desc = W_tile_desc,
+            Y_tile_desc = Y_tile_desc,
+            X_idx = X_idx,
+            W_idx = W_idx,
+            Bias_idx = Bias_idx,
+            DATA_STYPE="f32",
+            input_reorder=self.input_reorder
+        )
+
+        kernel.epilogue_info = dict(
+            output_node = self.output_node.name,
+            sram_var = "output_buffer",
+            dram_var = "Y",
+            dram_idx = Y_idx,
+            dram_tile_desc = Y_tile_desc,
+            dim_aliasing = {"index0":"c0", "index1":"tile_n", "index2":"o_h", "index3":"tile_m"}
+        )
+        kernel.exception_nodes["X"] = {"numel" : (I_W+2*PADDING_W)*(I_H+2*PADDING_H)*I_C*BATCH}  # element count of the zero-padded input
+        code = self._template_from_string(conv_template).render(**kernel.render_options)
+        kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
+        return code
+
+    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W):
+        """Return the 17-tuple of tile sizes, input-tile extents, sub-tile sizes and TOG latency, in the order render() unpacks."""
+        mapping = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = mapping
+        # Input extent covered by one output tile: (out - 1) * stride + (kernel - 1) * dilation + 1.
+        TILE_I_H = (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + 1
+        TILE_I_W = (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + 1
+        SUB_TILE_M, SUB_TILE_N = min(kernel.vector_lane, TILE_M), min(kernel.vector_lane, TILE_N)  # capped at the vector-lane count
+        TOG_latency = min(TILE_M, O_W)
+        return (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, 1, 1, 1, 1, SUB_TILE_M, SUB_TILE_N, TILE_K, TOG_latency)
+
+    def outer_func_render(self, kernel_name, input_args):
+        """Render the Python wrapper source from WRAPPER_TEMPLATE.
+
+        Returns a (wrapper source, wrapper function name) pair; the name is function_name suffixed with the argument count."""
+        input_node, weight_node = self.input_nodes[0], self.input_nodes[1]
+        bias_node = self.input_nodes[2] if len(self.input_nodes) != 2 else None
+        wrapper_name = self.function_name + f"_{len(input_args)}"
+        options = {
+            "kernel": self.kernel,
+            "KERNEL_NAME": kernel_name,
+            "FUNC_NAME": wrapper_name,
+            "INPUT": input_node,
+            "WEIGHT": weight_node,
+            "BIAS": bias_node,
+            "OUTPUT": self.output_node,
+            "PADDING_H": self.padding[0],
+            "PADDING_W": self.padding[1],
+            "VALIDATION_MODE": extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
+            "BACKENDSIM_EAGER_MODE": int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)),
+            "input_reorder": self.input_reorder,
+        }
+        return self._template_from_string(WRAPPER_TEMPLATE).render(**options), wrapper_name
+
+    def get_arg_attributes(self):
+        """Describe the padded input buffer that is handed to the kernel.
+
+        Returns a one-element list: [buffer name, [MLIR_ARGS_IN, dtype, numel, shape, stride]].
+        """
+        node = self.input_nodes[0]
+        size = node.get_size()
+        # Dims are taken in the order (2, 3, 0, 1); the first two grow by the padding on both sides.
+        padded_shape = [size[2] + 2 * self.padding[0], size[3] + 2 * self.padding[1], size[0], size[1]]
+
+        # Contiguous strides: the last is 1, each earlier one multiplies in the next extent.
+        strides = [1]
+        for extent in reversed(padded_shape[1:]):
+            strides.insert(0, strides[0] * extent)
+
+        numel = math.prod(padded_shape)
+        attributes = [MLIRKernelArgs.MLIR_ARGS_IN, node.layout.dtype, numel, padded_shape, strides]
+        return [[node.data.data.name, attributes]]
+
+    def codegen_header(self, code, extra_headers):
+        """Write the shared header files into the kernel's cache directory and record the code hash.
+
+        extra_headers[0] is stored as global_var.h and extra_headers[1] as gem5_global_var.h; existing files are kept."""
+        write_path = extension_codecache.get_write_path(code)
+        os.makedirs(write_path, exist_ok=True)  # no exists() pre-check: a concurrent creator must not make this raise
+        for index, file_name in enumerate(("global_var.h", "gem5_global_var.h")):
+            header_path = os.path.join(write_path, file_name)
+            if not os.path.exists(header_path):
+                write_atomic(header_path, extra_headers[index])
+        self.hash_value = get_hash(code.strip())
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 0b6d13ef..cd4ddf82 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -1,16 +1,15 @@
 import os
 import math
-from sympy import divisors, Range
-from typing import List, Optional, cast
+from sympy import  Symbol, Number
+from typing import List, Optional
 
 from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
-from torch._inductor.ir import Buffer
 from torch._inductor.ir import IRNode
-from torch._inductor.ir import ReinterpretView
 from torch._inductor.codecache import write_atomic
 import PyTorchSimFrontend.extension_codecache as extension_codecache
+from PyTorchSimFrontend.mlir import mlir_common
 from torch._inductor.codecache import get_hash
 from PyTorchSimFrontend import extension_config
 
@@ -43,56 +42,30 @@
 // PADDING_W = {{ PADDING_W }}
 // STRIDE_H = {{ STRIDE_H }}
 // STRIDE_W = {{ STRIDE_W }}
-// DILATION_H = {{ DILATION_H }}
-// DILATION_W = {{ DILATION_W }}
 // DATA_STYPE = {{ DATA_STYPE }}
-// DATA_SIZE = {{ DATA_SIZE }}
 
-#map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ BATCH * I_C }} + d2 * {{ I_C }} + d3)> // input (I_H, I_W, BATCH, I_C)
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
 #map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
 #map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
 #offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
 #offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W * TILE_M, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
 #offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
-
-memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 {{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index
-  %c_mvin3 = arith.constant 14 : index
-  %c_mvout = arith.constant 3 : index
-  %vstride = arith.constant 1 : index
-  %input_axis = arith.constant 3 : index
-  %weight_axis = arith.constant 2 : index
-  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %output_buffer = memref.get_global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %tag0 = memref.alloc() : memref<1xi32>
-  %tag1 = memref.alloc() : memref<1xi32>
-  %tag2 = memref.alloc() : memref<1xi32>
-  %tag3 = memref.alloc() : memref<1xi32>
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
   %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  {{- kernel.def_local_vars() }}
+  {{- kernel.def_local_vars(indent_size=2) }}
 
-  affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
-    affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
-      affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
-        affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
-          %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
+  affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
+    affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
+      affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
+        affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
           // Initialize output
           {%- if BIAS %}
-          memref.dma_start %Bias[%tile_n], %output_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride
-              : memref<{{ O_C }}xf32>, memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_O_H }}, {{ TILE_O_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
+          {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N, TILE_O_H, TILE_O_W], indent_size=10) }}
           {%- else %}
           affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
           {%- endif %}
@@ -101,406 +74,37 @@
               affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
                 %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
                 %index_i_w = affine.apply #map_I_W(%o_w, %k_w)
-                %index1 = affine.apply #map1(%index_i_h, %index_i_w, %tile_m, %tile_k) // input index
-                %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
                 // Load input matrix
-                memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                    : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[{{ TILE_I_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
-                // Load kernel matrix
-                memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                    : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
+                {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_M, SUB_TILE_K], indent_size=16) }}
+                {{ kernel.def_dma_op("MVIN", "W", W_idx, W_tile_desc, subtile_size=[SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_K, SUB_TILE_N], indent_size=16) }}
+                // Compute body part
                 affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                   affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
                     %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
-                    %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                    %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                     affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                       affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
                         %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                         %tile_i_w = affine.apply #map_I_W(%tile_o_w, %tile_k_w)
                         %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_i_w)
                         %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                        %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                        %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                        %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                        %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                               outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                       } { inner_loop=true }
                     } { inner_loop=true }
                   } { inner_loop=true }
                 } { inner_loop=true }
-              } { accumulation_loop=true }
+              } { accumulation_loop=true, subtile_loop="k" }
             } { accumulation_loop=true }
           } { accumulation_loop=true }
           // Store output matrix
           {{kernel.store_output(indent_size=10)}}
         } { outer_loop=true }
       } { outer_loop=true }
-    } { outer_loop=true }
-  } { outer_loop=true }
-  return
-}
-"""
-
-MULTI_TILE_CONV_TEMPLATE = r"""
-// Multi Channel Tile Conv2D kernel
-// BATCH = {{ BATCH }}
-// I_C = {{ I_C }}
-// I_H = {{ I_H }}
-// I_W = {{ I_W }}
-// O_C = {{ O_C }}
-// K_H = {{ K_H }}
-// K_W = {{ K_W }}
-// O_H = {{ O_H }}
-// O_W = {{ O_W }}
-// TILE_M = {{ TILE_M }}
-// TILE_N = {{ TILE_N }}
-// TILE_K = {{ TILE_K }}
-// TILE_I_H={{ TILE_I_H }},
-// TILE_I_W={{ TILE_I_W }},
-// TILE_O_H={{ TILE_O_H }},
-// TILE_O_W={{ TILE_O_W }},
-// TILE_K_H={{ TILE_K_H }},
-// TILE_K_W={{ TILE_K_W }},
-// SUB_TILE_M={{ SUB_TILE_M }},
-// SUB_TILE_N={{ SUB_TILE_N }},
-// SUB_TILE_I_W={{ SUB_TILE_I_W }},
-// SUB_TILE_K_H={{ SUB_TILE_K_H }},
-// SUB_TILE_K_W={{ SUB_TILE_K_W }},
-// PADDING_H = {{ PADDING_H }}
-// PADDING_W = {{ PADDING_W }}
-// STRIDE_H = {{ STRIDE_H }}
-// STRIDE_W = {{ STRIDE_W }}
-// DILATION_H = {{ DILATION_H }}
-// DILATION_W = {{ DILATION_W }}
-// DATA_STYPE = {{ DATA_STYPE }}
-// DATA_SIZE = {{ DATA_SIZE }}
-
-#map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * BATCH * O_C }} + d1 * {{ BATCH * O_C }} + d2 * {{ O_C }} + d3)> // output (O_H, O_W, BATCH, O_C)
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * BATCH * I_C }} + d1 * {{ I_C * STRIDE_W }} + d2 * {{ I_C * (I_W + 2 * PADDING_W) }} + d3)> // input (I_H, BATCH, I_W, I_C)
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
-#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
-#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(1 * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
-#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
-#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_O_W * TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
-
-memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-{{kernel.def_global_vars()}}
-
-func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index
-  %c_mvin3 = arith.constant 14 : index
-  %c_mvout = arith.constant 3 : index
-  %vstride = arith.constant 1 : index
-  %input_axis = arith.constant 3 : index
-  %weight_axis = arith.constant 2 : index
-  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %output_buffer = memref.get_global @Y_spad : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %tag0 = memref.alloc() : memref<1xi32>
-  %tag1 = memref.alloc() : memref<1xi32>
-  %tag2 = memref.alloc() : memref<1xi32>
-  %tag3 = memref.alloc() : memref<1xi32>
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  {{- kernel.def_local_vars() }}
-
-  affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
-    affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
-      affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
-        affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
-          %index0 = affine.apply #map0(%o_h, %o_w, %tile_m, %tile_n)
-          // Initialize output
-          {%- if BIAS %}
-          memref.dma_start %Bias[%tile_n], %output_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride
-              : memref<{{ O_C }}xf32>, memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_O_H }}, {{ TILE_O_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_O_W * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
-          {%- else %}
-          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
-          {%- endif %}
-          affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
-            affine.for %tile_k = 0 to {{ I_C * K_W }} step {{ TILE_K }} {
-              %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
-              %index1 = affine.apply #map1(%index_i_h, %o_w, %tile_m, %tile_k) // input index
-              %index2 = affine.apply #map2(%k_h, %c0, %tile_k, %tile_n) // weight index
-              // Load input matrix
-              memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_I_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[{{ TILE_O_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
-              // Load kernel matrix
-              memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
-              affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
-                affine.for %tile_k_w = 0 to 1 {
-                  %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
-                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ 1 }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                  affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
-                    affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
-                      %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
-                      %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_o_w)
-                      %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                    } { inner_loop=true }
-                  } { inner_loop=true }
-                } { inner_loop=true }
-              } { inner_loop=true }
-            } { accumulation_loop=true }
-          } { accumulation_loop=true }
-          // Store output matrix
-          {{kernel.store_output(indent_size=10)}}
-        } { outer_loop=true }
-      } { outer_loop=true }
-    } { outer_loop=true }
-  } { outer_loop=true }
-  return
-}
-"""
-
-SINGLE_BATCH_CONV_TEMPLATE = r"""
-// Single Batch Conv2D kernel
-// BATCH = {{ BATCH }}
-// I_C = {{ I_C }}
-// I_H = {{ I_H }}
-// I_W = {{ I_W }}
-// O_C = {{ O_C }}
-// K_H = {{ K_H }}
-// K_W = {{ K_W }}
-// O_H = {{ O_H }}
-// O_W = {{ O_W }}
-// TILE_M = {{ TILE_M }}
-// TILE_N = {{ TILE_N }}
-// TILE_K = {{ TILE_K }}
-// TILE_I_H={{ TILE_I_H }},
-// TILE_I_W={{ TILE_I_W }},
-// TILE_O_H={{ TILE_O_H }},
-// TILE_O_W={{ TILE_O_W }},
-// TILE_K_H={{ TILE_K_H }},
-// TILE_K_W={{ TILE_K_W }},
-// SUB_TILE_M={{ SUB_TILE_M }},
-// SUB_TILE_N={{ SUB_TILE_N }},
-// SUB_TILE_I_W={{ SUB_TILE_I_W }},
-// SUB_TILE_K_H={{ SUB_TILE_K_H }},
-// SUB_TILE_K_W={{ SUB_TILE_K_W }},
-// PADDING_H = {{ PADDING_H }}
-// PADDING_W = {{ PADDING_W }}
-// STRIDE_H = {{ STRIDE_H }}
-// STRIDE_W = {{ STRIDE_W }}
-// DILATION_H = {{ DILATION_H }}
-// DILATION_W = {{ DILATION_W }}
-// DATA_STYPE = {{ DATA_STYPE }}
-// DATA_SIZE = {{ DATA_SIZE }}
-
-#map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * O_H * O_C }} + d1 * {{ O_W * O_C }} + d2 * {{ O_C }} + d3)> // output (BATCH, O_H, O_W, O_C)
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * (I_H + 2 * PADDING_W) * I_C }} + d1 * {{ (I_W + 2 * PADDING_W) * I_C }} + d2 * {{ I_C }} + d3)> // input (BATCH, I_H, I_W, I_C) Stride should be changed if kernel stride > 1
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
-#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
-#map_I_W = affine_map<(d0, d1) -> (d0 * {{ STRIDE_W }} + d1)>
-#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
-#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_I_W, TILE_K) }} + d1)>
-#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
-memref.global @X_spad : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-{{kernel.def_global_vars()}}
-
-func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index
-  %c_mvin3 = arith.constant 14 : index
-  %c_mvout = arith.constant 3 : index
-  %vstride = arith.constant 1 : index
-  %input_axis = arith.constant 3 : index
-  %weight_axis = arith.constant 2 : index
-  %input_buffer = memref.get_global @X_spad : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>
-  %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %output_buffer = memref.get_global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %tag0 = memref.alloc() : memref<1xi32>
-  %tag1 = memref.alloc() : memref<1xi32>
-  %tag2 = memref.alloc() : memref<1xi32>
-  %tag3 = memref.alloc() : memref<1xi32>
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  {{- kernel.def_local_vars() }}
-  affine.for %o_w = 0 to {{ O_W }} step {{ TILE_O_W }} {
-  affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
-    affine.for %tile_m = 0 to {{ O_W }} step {{ TILE_M }} {
-      affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
-        %index0 = affine.apply #map0(%c0, %o_h, %tile_m, %tile_n)
-        // Initialize output
-        {%- if BIAS %}
-        memref.dma_start %Bias[%tile_n], %output_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride // not implemented yet
-            : memref<{{ O_C }}xf32>, memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ TILE_O_H }}, {{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_O_H * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
-        {%- else %}
-        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
-        {%- endif %}
-        affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
-          affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
-            affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
-              %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
-              %index_i_w = affine.apply #map_I_W(%o_w, %k_w)
-              %index1 = affine.apply #map1(%c0, %index_i_h, %index_i_w, %tile_k) // input index
-              %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
-              // Load input matrix
-              memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ SUB_TILE_I_H }}, {{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[{{ TILE_I_H * TILE_I_W * TILE_K }}, {{ TILE_I_W * TILE_K }}, 1, {{ TILE_I_W }}]}
-              // Load kernel matrix
-              memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
-              affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
-                affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
-                  %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
-                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                  affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
-                    affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
-                      %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
-                      %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
-                      %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ 1 }}x{{ TILE_I_H }}x{{ TILE_I_W }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                    } { inner_loop=true }
-                  } { inner_loop=true }
-                } { inner_loop=true }
-              } { inner_loop=true }
-            } { accumulation_loop=true }
-          } { accumulation_loop=true }
-        } { accumulation_loop=true }
-        // Store output matrix
-        {{kernel.store_output(indent_size=8)}}
-      } { outer_loop=true }
-    } { outer_loop=true }
-  } { outer_loop=true }
-  } { outer_loop=true }
-  return
-}
-"""
-
-SINGLE_BATCH_CONV_STRIDE_TEMPLATE = r"""
-// Single Batch Conv2D (Stride != 1) kernel
-// BATCH = {{ BATCH }}
-// I_C = {{ I_C }}
-// I_H = {{ I_H }}
-// I_W = {{ I_W }}
-// O_C = {{ O_C }}
-// K_H = {{ K_H }}
-// K_W = {{ K_W }}
-// O_H = {{ O_H }}
-// O_W = {{ O_W }}
-// TILE_M = {{ TILE_M }}
-// TILE_N = {{ TILE_N }}
-// TILE_K = {{ TILE_K }}
-// TILE_I_H={{ TILE_I_H }},
-// TILE_I_W={{ TILE_I_W }},
-// TILE_O_H={{ TILE_O_H }},
-// TILE_O_W={{ TILE_O_W }},
-// TILE_K_H={{ TILE_K_H }},
-// TILE_K_W={{ TILE_K_W }},
-// SUB_TILE_M={{ SUB_TILE_M }},
-// SUB_TILE_N={{ SUB_TILE_N }},
-// SUB_TILE_I_W={{ SUB_TILE_I_W }},
-// SUB_TILE_K_H={{ SUB_TILE_K_H }},
-// SUB_TILE_K_W={{ SUB_TILE_K_W }},
-// PADDING_H = {{ PADDING_H }}
-// PADDING_W = {{ PADDING_W }}
-// STRIDE_H = {{ STRIDE_H }}
-// STRIDE_W = {{ STRIDE_W }}
-// DILATION_H = {{ DILATION_H }}
-// DILATION_W = {{ DILATION_W }}
-// DATA_STYPE = {{ DATA_STYPE }}
-// DATA_SIZE = {{ DATA_SIZE }}
-
-#map0 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ O_W * O_H * O_C }} + d1 * {{ O_W * O_C }} + d2 * {{ O_C }} + d3)> // output (BATCH, O_H, O_W, O_C)
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ (I_W + 2 * PADDING_W) * I_C }} + d1 * {{ I_C }} + d2 * {{ I_C * STRIDE_W }} + d3)> // input (I_H, (k_w), I_W, I_C) // duplicate for k_w
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0 * {{ K_W * I_C * O_C }} + d1 * {{ I_C * O_C }} + d2 * {{ O_C }} + d3)> // weight (K_H, K_W, I_C, O_C)
-#map_I_H = affine_map<(d0, d1) -> (d0 * {{ STRIDE_H }} + d1)>
-#offset_w_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_K_W * TILE_K, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_K, TILE_N) }})>
-#offset_x_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M * TILE_K_W, TILE_K) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_K) }})>
-#offset_y_map = affine_map<(d0, d1) -> (d0 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }} + d1 * {{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }})>
-
-memref.global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-{{kernel.def_global_vars()}}
-
-func.func @{{ KERNEL_NAME }}{{kernel.def_conv_kernel(inputs=[X, W, BIAS], outputs=[Y], names_str="X, W, Bias, Y", padded_input_size=PADDED_INPUT_SIZE, input_reorder=input_reorder)}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index
-  %c_mvin3 = arith.constant 14 : index
-  %c_mvout = arith.constant 3 : index
-  %vstride = arith.constant 1 : index
-  %input_axis = arith.constant 3 : index
-  %weight_axis = arith.constant 2 : index
-  %input_buffer = memref.get_global @X_spad : memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %weight_buffer = memref.get_global @W_spad : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %output_buffer = memref.get_global @Y_spad : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %tag0 = memref.alloc() : memref<1xi32>
-  %tag1 = memref.alloc() : memref<1xi32>
-  %tag2 = memref.alloc() : memref<1xi32>
-  %tag3 = memref.alloc() : memref<1xi32>
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  {{- kernel.def_local_vars() }}
-
-  affine.for %o_h = 0 to {{ O_H }} step {{ TILE_O_H }} {
-    affine.for %tile_m = 0 to {{ O_W }} step {{ TILE_M }} {
-      affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
-        %index0 = affine.apply #map0(%c0, %o_h, %tile_m, %tile_n)
-        // Initialize output
-        {%- if BIAS %}
-        memref.dma_start %Bias[%tile_n], %output_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag0[%c0], %c0, %vstride // not implemented yet
-            : memref<{{ O_C }}xf32>, memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ 1 }}, {{ TILE_O_H }}, {{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_O_H * TILE_M * TILE_N }}, {{ TILE_M * TILE_N }}, 1, {{ TILE_M }}]}
-        {%- else %}
-        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
-        {%- endif %}
-        affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
-          affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
-            affine.for %tile_k = 0 to {{ I_C }} step {{ TILE_K }} {
-              %index_i_h = affine.apply #map_I_H(%o_h, %k_h)
-              %index1 = affine.apply #map1(%index_i_h, %k_w, %tile_m, %tile_k) // input index
-              %index2 = affine.apply #map2(%k_h, %k_w, %tile_k, %tile_n) // weight index
-              // Load input matrix
-              memref.dma_start %X[%index1], %input_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag1[%c0], %input_axis, %vstride
-                  : memref<{{ BATCH * I_C * (I_H + 2 * PADDING_H) * (I_W + 2 * PADDING_W) }}xf32>, memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_I_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[{{ TILE_K_W * TILE_M * TILE_K }}, {{ TILE_M * TILE_K }}, 1, {{ TILE_M }}]}
-              // Load kernel matrix
-              memref.dma_start %W[%index2], %weight_buffer[%c0, %c0, %c0, %c0], %c_mvin, %tag2[%c0], %input_axis, %vstride
-                  : memref<{{ O_C * I_C * K_H * K_W }}xf32>, memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K_H }}, {{ SUB_TILE_K_W }}, {{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[{{ TILE_K_W * TILE_K * TILE_N }}, {{ TILE_K * TILE_N }}, 1, {{ TILE_K }}]}
-              affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
-                affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
-                  %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
-                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                  affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
-                    affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
-                      %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
-                      %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
-                      %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<{{ TILE_I_H }}x{{ TILE_K_W }}x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ 1 }}x{{ TILE_O_H }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                    } { inner_loop=true }
-                  } { inner_loop=true }
-                } { inner_loop=true }
-              } { inner_loop=true }
-            } { accumulation_loop=true }
-          } { accumulation_loop=true }
-        } { accumulation_loop=true }
-        // Store output matrix
-        {{kernel.store_output(indent_size=8)}}
-      } { outer_loop=true }
-    } { outer_loop=true }
-  } { outer_loop=true }
+    } { outer_loop=true, subtile_loop="n" }
+  } { outer_loop=true, subtile_loop="m" }
   return
 }
 """
@@ -514,40 +118,14 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
     X_padding = torch.zeros(padded_shape, device=X.device)
     X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
 
-    # Holding original output tensor
-    {%- for buf, name in kernel.get_conv_outputs().items() %}
-    {{ name }}_t = {{ name }}
-    {%- endfor %}
-
     # Tanspose inputs
     {%- for buf, name in kernel.get_conv_inputs().items() %}
       {%- if name == "X" %}
-        {%- if MULTI_TILE %}
-    {{ name }} = {{ name }}_padding.permute(2, 0, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, BATCH, I_W, I_C)
-        {%- elif SINGLE_BATCH %}
-    {{ name }} = {{ name }}_padding.permute(0, 2, 3, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (BATCH, I_H, I_W, I_C)
-        {%- else %}
     {{ name }} = {{ name }}_padding.permute(2, 3, 0, 1).contiguous() # (BATCH, I_C, I_H, I_W) -> (I_H, I_W, BATCH, I_C)
-        {%- endif %}
       {%- elif name == "W" %}
     {{ name }} = {{ name }}.permute(2, 3, 1, 0).contiguous() # (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)
       {%- elif name == "Bias" %}
     {{ name }} = {{ name }}
-      {%- else %}
-        {%- if SINGLE_BATCH %}
-    {{ name }} = {{ name }}.permute(0, 2, 3, 1).contiguous()  if {{ name }}.dim() == 4 else {{ name }} # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
-        {%- else %}
-    {{ name }} = {{ name }}.permute(2, 3, 0, 1).contiguous()  if {{ name }}.dim() == 4 else {{ name }} # (BATCH, O_C, O_H, O_W) -> (O_H, O_W, BATCH, O_C)
-        {%- endif %}
-      {%- endif %}
-    {%- endfor %}
-
-    # Transpose outputs
-    {%- for buf, name in kernel.get_conv_outputs().items() %}
-      {%- if SINGLE_BATCH %}
-    {{ name }} = {{ name }}.permute(0, 2, 3, 1).contiguous() # (BATCH, O_C, O_H, O_W) -> (BATCH, O_H, O_W, O_C)
-      {%- else %}
-    {{ name }} = {{ name }}.permute(2, 3, 0, 1).contiguous() # (BATCH, O_C, O_H, O_W) ->  (O_H, O_W, BATCH, O_C)
       {%- endif %}
     {%- endfor %}
 
@@ -556,15 +134,6 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
     {%- if BACKENDSIM_EAGER_MODE %}
     yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
     {%- endif %}
-
-    # Transpose back outputs
-    {%- for buf, name in kernel.get_conv_outputs().items() %}
-      {%- if SINGLE_BATCH %}
-    {{ name }}_t.copy_({{ name }}.permute(0, 3, 1, 2).contiguous()) # (BATCH, O_H, O_W, O_C) -> (BATCH, O_C, O_H, O_W)
-      {%- else %}
-    {{ name }}_t.copy_({{ name }}.permute(2, 3, 0, 1).contiguous()) # (O_H, O_W, BATCH, O_C) -> (BATCH, O_C, O_H, O_W)
-      {%- endif %}
-    {%- endfor %}
 """
 
 class MLIRConvTemplate(MLIRTemplate):
@@ -581,21 +150,6 @@ def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
             + "_" + "_".join([str(i) for i in self.dilation])
         self.kernel_args = ['X', 'W', 'Bias', 'Y']
 
-    def is_transposed(self, node):
-        if isinstance(node, ReinterpretView):
-            if node.layout.stride != node.data.layout.stride:
-                if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]:
-                    return True
-                else:
-                  raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.")
-        return False
-
-    def is_multi_tile(self, I_C):
-        return I_C < (self.kernel.vector_lane // 8) # 8 is hard-coded for now. This should be changed to a better heuristic.
-
-    def is_single_batch(self, BATCH):
-        return BATCH == 1
-
     def get_padded_input_size(self, X):
         input_padded = list(X.layout.size)
         input_padded[2] += 2 * self.padding[0]
@@ -607,6 +161,7 @@ def render(self,
                template_buffer_node = None,
                epilogue_nodes: Optional[List[IRNode]] = None,
                **kwargs):
+        # Extract input arguments info
         if template_buffer_node is not None:
             self.output_node = template_buffer_node
         self.kernel = kernel
@@ -617,93 +172,68 @@ def render(self,
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
         if epilogue_nodes is not None:
-          extra_node_rw = {
-            item.name for epilogue_node in epilogue_nodes
-            for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes
-            if item.name != Y.name
-          }
+            extra_node_rw = {
+                item.name for epilogue_node in epilogue_nodes
+                for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes
+                if item.name != Y.name
+            }
         n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0
 
-        BATCH = X.layout.size[0]
-        I_C = X.layout.size[1]
-        O_C = W.layout.size[0]
-        K_H = W.layout.size[2]
-        K_W = W.layout.size[3]
+        BATCH, I_C, I_H, I_W = X.layout.size
+        O_C, _, K_H, K_W = W.layout.size
         O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2]
         O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3]
+        PADDING_H=self.padding[0]
+        PADDING_W=self.padding[1]
+        STRIDE_H=self.stride[0]
+        STRIDE_W=self.stride[1]
 
-        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
-        SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-        SUB_TILE_K = TILE_K
-        TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-        TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
-        SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1
-        x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
-        w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_K_H * TILE_K, TILE_N)
-        y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
-        x_spad_size = TILE_I_W * TILE_I_H * TILE_M * TILE_K
-        w_spad_size = TILE_K_W * TILE_K_H * TILE_K * TILE_N
-        y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
+        # Select tile size and template
         conv_template = CONV_TEMPLATE
-        TOG_latency = BATCH if TILE_M > BATCH else TILE_M
-        if self.is_single_batch(BATCH) and self.stride[0] != 1:
-          conv_template = SINGLE_BATCH_CONV_STRIDE_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
-          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_I_H * TILE_M, TILE_K)
-          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_K_H * TILE_K, TILE_N)
-          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N)
-          x_spad_size = TILE_K_W * TILE_I_H * TILE_M * TILE_K
-          w_spad_size = TILE_K_W * TILE_K_H * TILE_K * TILE_N
-          y_spad_size = TILE_O_H * TILE_M * TILE_N
-          SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane
-          SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-          SUB_TILE_K = TILE_K
-          TOG_latency = O_W if TILE_M > O_W else TILE_M
-        elif self.is_single_batch(BATCH) and self.stride[0] == 1:
-          conv_template = SINGLE_BATCH_CONV_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
-          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
-          SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane
-          SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-          SUB_TILE_K = TILE_K
-          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H, TILE_K)
-          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_W * TILE_K_H * TILE_K, TILE_N)
-          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H  * TILE_M, TILE_N)
-          x_spad_size = TILE_I_W * TILE_I_H * TILE_K
-          w_spad_size = TILE_K_W * TILE_K_H * TILE_K * TILE_N
-          y_spad_size = TILE_O_H * TILE_M * TILE_N
-          TOG_latency = O_W if TILE_M > O_W else TILE_M
-        elif self.is_multi_tile(I_C):
-          conv_template = MULTI_TILE_CONV_TEMPLATE
-          TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
-          TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
-          TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
-          SUB_TILE_K = TILE_K
-          x_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_I_W * TILE_I_H * TILE_M, TILE_K)
-          w_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_K_H * TILE_K, TILE_N)
-          y_spad_size_per_lane = kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N)
-          x_spad_size = TILE_I_W * TILE_I_H * TILE_M * TILE_K
-          w_spad_size = TILE_K_H * TILE_K * TILE_N
-          y_spad_size = TILE_O_H * TILE_O_W * TILE_M * TILE_N
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)
         SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N
         TOG_latency = 8 if TOG_latency < 8 else TOG_latency
         kernel.loop_size = [TOG_latency, TILE_N, TILE_K]
 
+        # Prepare tile descriptors
+        vlane_stride = 1
+        vlane_split_axis = 1
+        X_tile_size = [TILE_I_H, TILE_I_W, TILE_M, TILE_K ]
+        X_tile_stride = [TILE_I_W*TILE_M*TILE_K, TILE_M*TILE_K, 1, TILE_M]
+        X_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, 3, vlane_stride)
+        X_tile_desc.set_tile_size_stride(X_tile_size, X_tile_stride)
+        X_tile_desc.set_name("input_buffer")
+        X_dim = [Symbol("index_i_h"), Symbol("index_i_w"), Symbol("tile_m"), Symbol("tile_k")]
+        X_idx = [X_dim[0]*(I_W+2*PADDING_W)*BATCH*I_C, X_dim[1]*I_C*BATCH, X_dim[2]*I_C, X_dim[3]]
+
+        W_tile_size = [TILE_K_H, TILE_K_W, TILE_K, TILE_N]
+        W_tile_stride = [TILE_K_W * TILE_K * TILE_N, TILE_K * TILE_N, 1, TILE_K]
+        W_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, 3, vlane_stride)
+        W_tile_desc.set_tile_size_stride(W_tile_size, W_tile_stride)
+        W_tile_desc.set_name("weight_buffer")
+        W_dim = [Symbol("k_h"), Symbol("k_w"), Symbol("tile_k"), Symbol("tile_n")]
+        W_idx = [W_dim[0]*K_W*I_C*O_C , W_dim[1]*I_C*O_C, W_dim[2]*O_C, W_dim[3]]
+
+        Y_tile_size = [TILE_M, TILE_N, TILE_O_H, TILE_O_W]
+        Y_tile_stride = [1, TILE_M, TILE_O_W * TILE_M * TILE_N, TILE_M * TILE_N] # N, C, H, W
+        Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
+        Y_tile_desc.set_name("output_buffer")
+        Y_dim = [Symbol("tile_m"), Symbol("tile_n"), Symbol("o_h"), Symbol("o_w")]
+        Y_idx = [Y_dim[0]*O_C*O_H*O_W, Y_dim[1]*O_H*O_W, Y_dim[2]*O_W, Y_dim[3]]
+        
+        # Extract Bias info
+        Bias_idx = [Number(0), Symbol("tile_n"), Number(0), Number(0)]
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
-            X=X,
-            W=W,
-            BIAS=Bias,
-            Y=Y,
+            X=X, W=W, Y=Y, BIAS=Bias,
             PADDED_INPUT_SIZE=self.get_padded_input_size(X),
-            BATCH=X.layout.size[0],
-            I_C=X.layout.size[1],
-            I_H=X.layout.size[2],
-            I_W=X.layout.size[3],
+            BATCH=BATCH,
+            I_C=I_C,
+            I_H=I_H,
+            I_W=I_W,
             O_C=O_C,
             K_H=K_H,
             K_W=K_W,
@@ -725,43 +255,46 @@ def render(self,
             SUB_TILE_I_W=SUB_TILE_I_W,
             SUB_TILE_K_H=SUB_TILE_K_H,
             SUB_TILE_K_W=SUB_TILE_K_W,
-            PADDING_H=self.padding[0],
-            PADDING_W=self.padding[1],
-            STRIDE_H=self.stride[0],
-            STRIDE_W=self.stride[1],
-            DILATION_H=self.dilation[0],
-            DILATION_W=self.dilation[1],
+            PADDING_H=PADDING_H,
+            PADDING_W=PADDING_W,
+            STRIDE_H=STRIDE_H,
+            STRIDE_W=STRIDE_W,
+            X_tile_desc = X_tile_desc,
+            W_tile_desc = W_tile_desc,
+            Y_tile_desc = Y_tile_desc,
+            X_idx = X_idx,
+            W_idx = W_idx,
+            Bias_idx = Bias_idx,
             DATA_STYPE="f32",
-            DATA_SIZE=4,
             input_reorder=self.input_reorder
         )
 
         kernel.epilogue_info = dict(
             output_node = self.output_node.name,
-            dependent_buf = [],
             sram_var = "output_buffer",
             dram_var = "Y",
-            index_var = "index0",
-            tag_var = "tag",
-            vlane_split_axis = 3,
-            vlane_stride = 1,
-            mlir_dtype = kernel.render_options['DATA_STYPE'],
-            dram_shape = f"memref<{BATCH * O_C * O_H * O_W}x{kernel.render_options['DATA_STYPE']}>",
-            tile_size = (TILE_O_H, TILE_O_W, TILE_M, TILE_N) if conv_template in (CONV_TEMPLATE, MULTI_TILE_CONV_TEMPLATE) else (1, TILE_O_H, TILE_M, TILE_N),
-            tile_stride = [TILE_O_W * TILE_M * TILE_N, TILE_M * TILE_N, 1, TILE_M]
+            dram_idx = Y_idx,
+            dram_tile_desc = Y_tile_desc,
+            dim_aliasing = {"index0":"c0", "index1":"tile_n", "index2":"o_h", "index3":"tile_m"}
         )
+        kernel.exception_nodes["X"] = {"numel" : (I_W+2*PADDING_W)*(I_H+2*PADDING_H)*I_C*BATCH}
         code = self._template_from_string(conv_template).render(**kernel.render_options)
-        self.header = f"float X_spad[{x_spad_size_per_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{w_spad_size_per_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{y_spad_size_per_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header = f"float X_spad[{x_spad_size}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float W_spad[{w_spad_size}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float Y_spad[{y_spad_size}] __attribute__ ((section(\".spad\")));\n"
-
         kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
-
         return code
 
+    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W):  # Pick conv tile sizes plus derived sub-tile sizes and TOG latency; returns a 17-tuple in the order shown in the return statement.
+        TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
+        SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane  # i.e. min(TILE_M, kernel.vector_lane)
+        SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane  # i.e. min(TILE_N, kernel.vector_lane)
+        SUB_TILE_K = TILE_K  # K sub-tile is the full K tile
+        TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]  # input rows spanned by TILE_O_H outputs and TILE_K_H kernel rows
+        TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]  # input columns spanned by TILE_O_W outputs and TILE_K_W kernel columns
+        SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1  # these sub-tile extents are fixed at 1
+        SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N  # when TILE_N exceeds 512, use the whole N tile as the sub-tile
+        TOG_latency = BATCH if TILE_M > BATCH else TILE_M  # i.e. min(TILE_M, BATCH)
+        TOG_latency = 8 if TOG_latency < 8 else TOG_latency  # clamp to a minimum of 8
+        return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency
+
     def outer_func_render(self, kernel_name, input_args):
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
@@ -778,8 +311,6 @@ def outer_func_render(self, kernel_name, input_args):
             OUTPUT=Y,
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
-            MULTI_TILE=self.is_multi_tile(self.input_shape[1]),
-            SINGLE_BATCH=self.is_single_batch(self.input_shape[0]),
             VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE,
             BACKENDSIM_EAGER_MODE=eager_mode,
             input_reorder=self.input_reorder
@@ -813,7 +344,7 @@ def codegen_header(self, code, extra_headers):
         spike_write_path = os.path.join(write_path, "global_var.h")
         gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
         if not os.path.exists(spike_write_path):
-            write_atomic(spike_write_path, self.header+extra_headers[0])
+            write_atomic(spike_write_path, extra_headers[0])
         if not os.path.exists(gem5_write_path):
-            write_atomic(gem5_write_path, self.gem5_header+extra_headers[1])
+            write_atomic(gem5_write_path, extra_headers[1])
         self.hash_value = get_hash(code.strip())
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index bfd0633b..ace6ea9d 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -1,16 +1,17 @@
 import os
 import json
+from pathlib import Path
 from torch import empty_strided
-from typing import List, Optional, cast
+from typing import List, Optional
+import sympy
 
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
-from torch._inductor.ir import Buffer
 from torch._inductor.ir import IRNode
-from torch._inductor.ir import ReinterpretView
 from torch._inductor.codecache import write_atomic
 import PyTorchSimFrontend.extension_codecache as extension_codecache
 from PyTorchSimFrontend import extension_config
+from PyTorchSimFrontend.mlir import mlir_common
 
 GEMM_TEMPLATE = r"""
 // GEMM {% if prologue_nodes -%}prologue fused{%- endif %} {% if epilogue_nodes -%}eilogue fused{%- endif %} kernel
@@ -22,62 +23,36 @@
 // TILE_K = {{ TILE_K }}
 // SUB_TILE_M = {{ SUB_TILE_M }}
 // SUB_TILE_N = {{ SUB_TILE_N }}
-#map0 = affine_map<(d0, d1) -> ({{ X_map }})>
-#map1 = affine_map<(d0, d1) -> ({{ W_map }})>
-#map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
-#map3 = affine_map<(d0, d1) -> (d0)>
-#map4 = affine_map<(d0, d1) -> (d0 + d1 * {{ M }})>
-memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 {{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index{% if Bias %}
-  %c_mvin3 = arith.constant 14 : index{% endif %}
-  %c_mvout = arith.constant 3 : index
-  %vstride = arith.constant 1 : index
-  %axis = arith.constant 1 : index
-  %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %tag0 = memref.alloc() : memref<1xi32>
-  %tag1 = memref.alloc() : memref<1xi32>
-  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  {% if not Bias %}
   %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
-  %c0 = arith.constant 0 : index
-{{ kernel.def_local_vars() }}
-
-  affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
-    affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
-      %index2 = affine.apply #map2(%t_m, %t_n)
-      %index3 = affine.apply #map3(%t_m, %c0)
-      %index4 = affine.apply #map4(%t_m, %t_n)
+  {{ kernel.def_local_vars(indent_size=2) }}
+  affine.for %index0 = 0 to {{ M }} step {{ TILE_M }} {
+    affine.for %index1 = 0 to {{ N }} step {{ TILE_N }} {
       {%- if Bias %}
-      memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], {{ Bias_axis }}, %vstride : memref<{{ Bias.data.get_numel() }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
+      {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N], indent_size=6) }}
       {%- else %}
-      affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+      affine.vector_store %v0, %Y_buffer[0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
       {%- endif %}
-      affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
-        %index0 = affine.apply #map0(%t_m, %t_k)
-        %index1 = affine.apply #map1(%t_k, %t_n)
+      affine.for %index2 = 0 to {{ K }} step {{ TILE_K }} {
         {% if prologue_nodes -%}
         // prologue nodes
-        {{kernel.prepare_input(indent_size=8)}}
+        {{kernel.load_input(indent_size=8)}}
         {%- else -%}
-        memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
-           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
-        memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
-           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
+        {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_K], indent_size=8) }}
+        {{ kernel.def_dma_op("MVIN", "W", W_idx, W_tile_desc, subtile_size=[SUB_TILE_K, SUB_TILE_N], indent_size=8) }}
         {%- endif %}
-        linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-                outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-      } { accumulation_loop=true }
+        linalg.matmul ins(%X_buffer, %W_buffer : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }}, {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }})
+                outs(%Y_buffer : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }})
+      } { accumulation_loop=true, subtile_loop="k" }
       {{kernel.store_output(indent_size=6)}}
-    } { outer_loop=true }
-  } { outer_loop=true }
+    } { outer_loop=true, subtile_loop="n"  }
+  } { outer_loop=true, subtile_loop="m" }
   return
 }
 """
@@ -98,58 +73,34 @@
 // TILE_K = {{ TILE_K }}
 // SUB_TILE_M = {{ SUB_TILE_M }}
 // SUB_TILE_N = {{ SUB_TILE_N }}
-#map0 = affine_map<(d0, d1) -> ({{ X_map }})>
-#map1 = affine_map<(d0, d1) -> ({{ W_map }})>
-#map2 = affine_map<(d0, d1) -> (d0 * {{ N }} + d1)>
-#map3 = affine_map<(d0, d1) -> (d0)>
-#map4 = affine_map<(d0, d1) -> (d0 + d1 * {{ M }})>
-memref.global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-memref.global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-memref.global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
 {{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvin2 = arith.constant 1 : index{% if Bias %}
-  %c_mvin3 = arith.constant 14 : index{% endif %}
-  %c_mvout = arith.constant 3 : index
-  %vstride = arith.constant 1 : index
-  %axis = arith.constant 1 : index
-  %X_buffer = memref.get_global @X_spad : memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-  %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %tag0 = memref.alloc() : memref<1xi32>
-  %tag1 = memref.alloc() : memref<1xi32>
-  %tag2 = memref.alloc() : memref<1xi32>{% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
-  %c0 = arith.constant 0 : index
-{{ kernel.def_local_vars() }}
-
-  affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
-    {{kernel.reduction_acc()}} affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {{kernel.reduction_iter_arg()}} {
-      %index2 = affine.apply #map2(%t_m, %t_n)
-      %index3 = affine.apply #map3(%t_m, %c0)
-      %index4 = affine.apply #map4(%t_m, %t_n)
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  {% if not Bias %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+  {% endif %}
+  {{ kernel.def_local_vars(indent_size=2) }}
+  affine.for %index1 = 0 to {{ N }} step {{ TILE_N }} {
+    affine.for %index0 = 0 to {{ M }} step {{ TILE_M }} {
+      %Y_bufferT = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
       {%- if Bias %}
-      memref.dma_start %Bias[{{ Bias_idx }}], %Y_buffer[%c0, %c0], %c_mvin3, %tag0[%c0], {{ Bias_axis }}, %vstride : memref<{{ Bias.data.get_numel() }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32>  { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_M }}] }
+      {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N], indent_size=6) }}
       {%- else %}
-      affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+      affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_N }}x{{ TILE_M }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
       {%- endif %}
-      affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
-        %index0 = affine.apply #map0(%t_m, %t_k)
-        %index1 = affine.apply #map1(%t_k, %t_n)
-        memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag1[%c0], %axis, %vstride
-           : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_M }}, {{ SUB_TILE_K }}], async=1, sram_stride=[1, {{ TILE_M }}]}
-        memref.dma_start %W[%index1], %W_buffer[%c0, %c0], %c_mvin2, %tag2[%c0], %axis, %vstride
-           : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ SUB_TILE_K }}, {{ SUB_TILE_N }}], async=1, sram_stride=[1, {{ TILE_K }}]}
-        linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-                outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
-      } { accumulation_loop=true, loop_k=true }
+      affine.for %index2 = 0 to {{ K }} step {{ TILE_K }} {
+        {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_K], indent_size=8) }}
+        {{ kernel.def_dma_op("MVIN", "W", W_idx, W_tile_desc, subtile_size=[SUB_TILE_K, SUB_TILE_N], indent_size=8) }}
+        linalg.matmul ins(%X_buffer, %W_buffer : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }}, {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }})
+                outs(%Y_bufferT : memref<{{TILE_M}}x{{TILE_N}}x{{DATA_STYPE}}, 1>)
+      } { accumulation_loop=true, subtile_loop="k" }
       {{kernel.store_output(indent_size=6)}}
-    } { outer_loop=true, loop_m=true}
+    } { outer_loop=true, subtile_loop="m" }
     {{kernel.reduction_output(indent_size=4)}}
-  } { outer_loop=true, loop_n=true }
+  } { outer_loop=true, subtile_loop="n" }
   return
 }
 """
@@ -166,114 +117,90 @@ def render(self,
                **kwargs):
         if template_buffer_node is not None:
             self.output_node = template_buffer_node
-        # if epilogue_nodes is not None and len(epilogue_nodes) > 0:
-        #     self.output_node = cast(Buffer, epilogue_nodes[-1]) #FIXME: Temperary solution
 
-        X, W = self.input_nodes[0], self.input_nodes[1]
-        Y = self.output_node
-
-        W_tensor =  empty_strided(W.layout.size, W.layout.stride)
-        X_tensor =  empty_strided(X.layout.size, X.layout.stride)
+        # Extract input arguments info
+        X, W, Y = self.input_nodes[0], self.input_nodes[1], self.output_node
+        X_tensor = empty_strided(X.layout.size, X.layout.stride)
+        W_tensor = empty_strided(W.layout.size, W.layout.stride)
         if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2:
             raise NotImplementedError("Please report this case to us...")
-        W_stride = W_tensor.stride()
-        X_stride = X_tensor.stride()
-        W_map = " + ".join([f"d{idx}*{s}" for idx, s in enumerate(W_stride)])
-        X_map = " + ".join([f"d{idx}*{s}" for idx, s in enumerate(X_stride)])
 
-        M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1]
-        n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
-        # Caculate extra reads
+        # Extract fusion info
+        n_epilogue_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
+        n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0
         n_extra_read = set()
         if epilogue_nodes is not None:
-          for enode in epilogue_nodes:
-            n_extra_read.update(enode.node.get_read_names())
-          if self.output_node.name in n_extra_read:
-            n_extra_read.remove(self.output_node.name)
+            for enode in epilogue_nodes:
+                n_extra_read.update(enode.node.get_read_names())
+            if self.output_node.name in n_extra_read:
+                n_extra_read.remove(self.output_node.name)
 
-        n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0
-        nr_rdim = 0
-        # Determine tile size
-        # case 1: use cheat sheet
-        if extension_config.CONFIG_GEMM_CHEATSHEET_PATH is not None:
-            try:
-              with open(extension_config.CONFIG_GEMM_CHEATSHEET_PATH, "r") as f:
-                data = json.load(f)
-            except FileNotFoundError:
-                data = {}
-        gemm_shape = f"{M}_{K}_{N}"
-        if gemm_shape in data:
-            tile_info = data[gemm_shape]
-            TILE_M = tile_info["TILE_M"]
-            TILE_N = tile_info["TILE_N"]
-            TILE_K = tile_info["TILE_K"]
-        else: # case 2: use gemm_combination_mapping
-            min_tile = (n_extra_node + n_prologue_node) == 0
-            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=min_tile)
-        # case 3: use manual tile size
-        if extension_config.CONFIG_MANUAL_TILE_SIZE:
-            TILE_M = extension_config.CONFIG_TILE_M
-            TILE_N = extension_config.CONFIG_TILE_N
-            TILE_K = extension_config.CONFIG_TILE_K
+        # Select tile size
+        M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1]
+        TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node)
 
+        # Select template code
         if (M == 0) or (N == 0) or (K == 0): # exception for MoE
-            TILE_M, TILE_N, TILE_K = 1, 1, 1
             template = EMPTY_TEMPLATE
-        elif n_extra_node>=1 and epilogue_nodes[0].is_reduction():
+            nr_rdim = 0
+        elif n_epilogue_node>=1 and epilogue_nodes[0].is_reduction():
             template = GEMM_REDUCTION_TEMPLATE
+            epilogue_dim_aliasing = {"index0":"index1", "index1":"index0"}
             nr_rdim = 1
         else:
             template = GEMM_TEMPLATE
-
-        TILE_M = min(extension_config.CONFIG_FORCE_TILE_M, TILE_M)
-        TILE_N = min(extension_config.CONFIG_FORCE_TILE_N, TILE_N)
-        TILE_K = min(extension_config.CONFIG_FORCE_TILE_K, TILE_K)
-
-        # Calculate Sub Tile Size for fine-grained DMA
-        if extension_config.CONFIG_SUBTILE:
-            # Case 1: adjust selective fine-grained DMA (SFG-DMA)
-            SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane
-            if (TILE_M == M and TILE_N == N and TILE_N <= 512):
-                SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
-            else: # Avoid Row Conflict of weights
-                SUB_TILE_N = TILE_N
-            SUB_TILE_K = TILE_K
-            # Case 2: use manual sub tile size (FG-DMA)
-            if extension_config.CONFIG_MANUAL_SUBTILE_SIZE:
-                SUB_TILE_M = extension_config.CONFIG_SUBTILE_M
-                SUB_TILE_N = extension_config.CONFIG_SUBTILE_N
-                SUB_TILE_K = extension_config.CONFIG_SUBTILE_K
-        # Case 3: None Subtile
-        else:
-            SUB_TILE_M = TILE_M
-            SUB_TILE_N = TILE_N
-            SUB_TILE_K = TILE_K
+            epilogue_dim_aliasing = {"index0":"index0", "index1":"index1"}
+            nr_rdim = 0
 
         TOG_latency = M if SUB_TILE_M > M else SUB_TILE_M
         kernel.loop_size =[TOG_latency, SUB_TILE_N, SUB_TILE_K]
 
+        # Prepare tile descriptors
+        vlane_stride = 1
+        vlane_split_axis = 1
+        X_tile_size = [TILE_M, TILE_K]
+        X_tile_stride = [1, TILE_M]
+        X_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        X_tile_desc.set_tile_size_stride(X_tile_size, X_tile_stride)
+        X_tile_desc.set_name("X_buffer")
+        X_stride = X.get_layout().stride
+        X_idx = [sympy.Symbol("index0") * X_stride[0], sympy.Symbol("index2") * X_stride[1]] # To keep index arguemnt order, we used index_list
+
+        W_tile_size = [TILE_K, TILE_N]
+        W_tile_stride = [1, TILE_K]
+        W_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        W_tile_desc.set_tile_size_stride(W_tile_size, W_tile_stride)
+        W_tile_desc.set_name("W_buffer")
+        W_stride = W.get_layout().stride
+        W_idx = [sympy.Symbol("index2") * W_stride[0], sympy.Symbol("index1") * W_stride[1]]
+
+        vlane_split_axis = vlane_split_axis if nr_rdim==0 else 0
+        Y_tile_size = [TILE_M, TILE_N] if nr_rdim == 0 else [TILE_N, TILE_M]
+        Y_tile_stride=[1, TILE_M] if nr_rdim == 0 else [TILE_M, 1]
+        Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
+        Y_tile_desc.set_name("Y_buffer")
+        Y_stride = Y.get_layout().stride
+        if nr_rdim == 0:
+            Y_idx = [sympy.Symbol("index0") * Y_stride[0], sympy.Symbol("index1") * Y_stride[1]]
+        else:
+            Y_idx = [sympy.Symbol("index1") * Y_stride[1], sympy.Symbol("index0") * Y_stride[0]]
+
         # Extract Bias info
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
         if Bias is not None:
-          if Bias.data.get_numel() == M*N:
-            Bias_idx = "%index2"
-            Bias_axis = "%axis"
-          elif Bias.data.get_numel() == M:
-            Bias_idx = "%index3"
-            Bias_axis = "%axis"
+          Bias_stride = Bias.get_layout().stride
+          if nr_rdim == 0:
+            Bias_idx = [sympy.Symbol("index0") * Bias_stride[0], sympy.Symbol("index1") * Bias_stride[1]]
           else:
-            Bias_idx = "%t_n"
-            Bias_axis = "%c0"
+            Bias_idx = [sympy.Symbol("index1") * Bias_stride[1], sympy.Symbol("index0") * Bias_stride[0]]
         else:
           Bias_idx = None
-          Bias_axis = None
 
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
-            M=M,
-            N=N,
-            K=K,
+            M=M, N=N, K=K,
             TILE_M=TILE_M,
             TILE_N=TILE_N,
             TILE_K=TILE_K,
@@ -281,73 +208,120 @@ def render(self,
             SUB_TILE_N=SUB_TILE_N,
             SUB_TILE_K=SUB_TILE_K,
             DATA_STYPE="f32",
-            DATA_SIZE=4,
-            X = X,
-            W = W,
-            Y = Y,
+            X = X, W = W, Y = Y,
             Bias = Bias,
+            X_idx = X_idx,
+            W_idx = W_idx,
             Bias_idx = Bias_idx,
-            Bias_axis = Bias_axis,
-            X_map = X_map,
-            W_map = W_map,
-            Y_numel = M * N,
+            X_tile_desc = X_tile_desc,
+            W_tile_desc = W_tile_desc,
+            Y_tile_desc = Y_tile_desc,
             epilogue_nodes = epilogue_nodes,
             prologue_nodes = prologue_nodes,
             input_reorder = self.input_reorder
         )
-        kernel.prologue_info = dict (
-            input_sram_var = "X_buffer",
-            input_dram_var = "X",
-            input_index_var = "index0",
-            input_tag_var = "tag1",
-            input_numel = M * K,
-            input_tile_size = (TILE_M, TILE_K),
-            input_sram_stride = [1, TILE_M],
-            vector_sram_stride = [TILE_M, 1],
-            input_subtile_size = (SUB_TILE_M, SUB_TILE_K),
-            weight_sram_var = "W_buffer",
-            weight_dram_var = "W",
-            weight_index_var = "index1",
-            weight_tag_var = "tag2",
-            weight_numel = K * N,
-            weight_tile_size = (TILE_K, TILE_N),
-            weight_sram_stride = [1, TILE_K],
-            weight_subtile_size = (SUB_TILE_K, SUB_TILE_N),
-            tile_size = (TILE_M, TILE_K),
-            vlane_split_axis = 1,
-            vlane_stride = 1,
-            is_bmm = False,
-        )
+        if prologue_nodes:
+            prologue_output_name = list(prologue_nodes[0].read_writes.writes)[0].name
+            if prologue_output_name == X.get_name():
+                # Input fusion case
+                prologue_var = "X"
+                prologue_sram_var = "X_buffer"
+                prologue_tile_desc = X_tile_desc
+                prologue_dim_aliasing = {"index0":"index0", "index1":"index2"}
+                is_input_fused = True
+            else:
+                # Weight fusion case
+                prologue_var = "W"
+                prologue_sram_var = "W_buffer"
+                prologue_tile_desc = W_tile_desc
+                prologue_dim_aliasing = {"index0":"index2", "index1":"index1"}
+                is_input_fused = False
+
+            kernel.prologue_info = dict (
+                input_dram_var = "X",
+                input_sram_var = "X_buffer",
+                input_tile_desc = X_tile_desc,
+                input_idx = X_idx,
+                input_subtile_size = [TILE_M, TILE_K],
+                input_dim_aliasing = {"index0":"index0", "index1":"index2"},
+
+                weight_dram_var = "W",
+                weight_sram_var = "W_buffer",
+                weight_tile_desc = W_tile_desc,
+                weight_idx = W_idx,
+                weight_subtile_size = [TILE_K, TILE_N],
+                weight_dim_aliasing = {"index0":"index2", "index1":"index1"},
+
+                # Descriptor for fusion
+                dram_var = prologue_var,
+                sram_var = prologue_sram_var,
+                dram_tile_desc = prologue_tile_desc,
+                dim_aliasing = prologue_dim_aliasing,
+                is_bmm = False,
+                is_input_fused = is_input_fused
+            )
         kernel.epilogue_info = dict(
             output_node = self.output_node.name,
-            dependent_buf = [],
-            sram_var = "Y_buffer",
             dram_var = "Y",
-            index_var = "index2",
-            t_index_var = "index4", # FIXME: for epilogue transposed input
-            tag_var = "tag",
-            vlane_split_axis = 1,
-            vlane_stride = 1,
-            mlir_dtype = kernel.render_options['DATA_STYPE'],
-            dram_shape = f"memref<{kernel.render_options['Y_numel']}x{kernel.render_options['DATA_STYPE']}>",
-            tile_size = (TILE_M, TILE_N),
-            tile_stride = [1, TILE_M],
+            sram_var = "Y_buffer",
+            dram_idx = Y_idx,
+            dram_tile_desc = Y_tile_desc,
             nr_rdim = nr_rdim,
-            reduction_idx = "t_n"
+            dim_aliasing = epilogue_dim_aliasing
         )
         code = self._template_from_string(template).render(**kernel.render_options)
         kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
+        return code
 
-        self.header = f"float X_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_K)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float W_spad[{kernel.get_spad_size_per_lane(TILE_K, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{kernel.get_spad_size_per_lane(TILE_M, TILE_N)}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header = f"float X_spad[{TILE_M * TILE_K}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float W_spad[{TILE_K * TILE_N}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float Y_spad[{TILE_M * TILE_N}] __attribute__ ((section(\".spad\")));\n"
+    def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node):
+        # Check cheat sheet
+        cheatsheet_path = extension_config.CONFIG_GEMM_CHEATSHEET_PATH
+        data = {}
+        if extension_config.CONFIG_GEMM_CHEATSHEET_PATH is not None:
+            path = Path(cheatsheet_path)
+            if path.is_file():
+                with path.open("r") as f:
+                    data = json.load(f)
 
-        kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
+        gemm_shape = f"{M}_{K}_{N}"
+        if gemm_shape in data:
+            tile_info = data[gemm_shape]
+            TILE_M = tile_info["TILE_M"]
+            TILE_N = tile_info["TILE_N"]
+            TILE_K = tile_info["TILE_K"]
+        else: # case 2: use gemm_combination_mapping
+            min_tile = (n_extra_node + n_prologue_node) == 0
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=min_tile)
+        # case 3: use manual tile size
+        if extension_config.CONFIG_MANUAL_TILE_SIZE:
+            TILE_M = extension_config.CONFIG_TILE_M
+            TILE_N = extension_config.CONFIG_TILE_N
+            TILE_K = extension_config.CONFIG_TILE_K
 
-        return code
+        # Edge case
+        if (M == 0) or (N == 0) or (K == 0):
+            TILE_M, TILE_N, TILE_K = 1, 1, 1
+
+        # Calculate Sub Tile Size for fine-grained DMA
+        if extension_config.CONFIG_SUBTILE:
+            # Case 1: adjust selective fine-grained DMA (SFG-DMA)
+            SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane
+            if (TILE_M == M and TILE_N == N and TILE_N <= 512):
+                SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
+            else: # Avoid Row Conflict of weights
+                SUB_TILE_N = TILE_N
+            SUB_TILE_K = TILE_K
+            # Case 2: use manual sub tile size (FG-DMA)
+            if extension_config.CONFIG_MANUAL_SUBTILE_SIZE:
+                SUB_TILE_M = extension_config.CONFIG_SUBTILE_M
+                SUB_TILE_N = extension_config.CONFIG_SUBTILE_N
+                SUB_TILE_K = extension_config.CONFIG_SUBTILE_K
+        # Case 3: None Subtile
+        else:
+            SUB_TILE_M = TILE_M
+            SUB_TILE_N = TILE_N
+            SUB_TILE_K = TILE_K
+        return TILE_M,TILE_N,TILE_K, SUB_TILE_M,SUB_TILE_N,SUB_TILE_K
 
     def codegen_header(self, code, extra_headers):
         write_path = extension_codecache.get_write_path(code)
@@ -356,6 +330,6 @@ def codegen_header(self, code, extra_headers):
         spike_write_path = os.path.join(write_path, "global_var.h")
         gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
         if not os.path.exists(spike_write_path):
-            write_atomic(spike_write_path, self.header+extra_headers[0])
+            write_atomic(spike_write_path, extra_headers[0])
         if not os.path.exists(gem5_write_path):
-            write_atomic(gem5_write_path, self.gem5_header+extra_headers[1])
+            write_atomic(gem5_write_path, extra_headers[1])
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index b1e1ba0e..aa3cf16e 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -11,7 +11,11 @@
 from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
 from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
 from PyTorchSimFrontend.mlir.mlir_conv_template import MLIRConvTemplate
+from PyTorchSimFrontend.mlir.mlir_conv_mt_template import MLIRConvMultiTileTemplate
+from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate
+from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
+from PyTorchSimFrontend.extension_config import CONFIG_VECTOR_LANE
 
 aten = torch.ops.aten
 aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm")
@@ -96,9 +100,20 @@ def convolution(
     x.realize()
     weight.realize()
     x = ir.ExternKernel.require_channels_last(x)
+    BATCH = x.layout.size[0]
+    I_C = x.layout.size[1]
     weight = ir.ExternKernel.require_channels_last(weight)
     layout = conv_layout(x, weight, None, **kwargs)
-    mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs)
+
+    # Select conv kernel
+    if BATCH == 1 and stride[0] == 1:
+        mlir_template = MLIRConvSingleBatchTemplate([x, weight, bias], layout, **kwargs)
+    elif BATCH == 1 and stride[0] != 1:
+        mlir_template = MLIRConvSingleBatchStridedTemplate([x, weight, bias], layout, **kwargs)
+    elif I_C < CONFIG_VECTOR_LANE // 8: # 8 is hard-coded for now. This should be changed to a better heuristic.
+        mlir_template = MLIRConvMultiTileTemplate([x, weight, bias], layout, **kwargs)
+    else:
+        mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs)
     return mlir_template.generate().output_node()
 
 def maxpool_layout(
diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
index ff617eb4..5395efb2 100644
--- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
@@ -26,8 +26,8 @@
   affine.for %i = 0 to {{ BCH }} step {{ out_tile }} {
     affine.for %j = 0 to {{ W }} step {{ out_tile }} {
       %index0 = affine.apply #map0(%i, %j)
-      memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride : memref<{{ IN }}xf32>, memref<{{ in_tile }}x{{ in_tile }}xf32, 1>, memref<1xi32>
-      memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag[%c0], %axis, %vstride : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>, memref<{{ OUT }}xf32>, memref<1xi32>
+      memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride : memref<{{ IN }}xf32>, memref<{{ in_tile }}x{{ in_tile }}xf32, 1>, memref<1xi32> {dram_stride=[{{W}}, 1]}
+      memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag[%c0], %axis, %vstride : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>, memref<{{ OUT }}xf32>, memref<1xi32> {dram_stride=[{{W}}, 1]}
     } { outer_loop=true }
   } { outer_loop=true }
   return
@@ -62,6 +62,7 @@ def render(self,
         W = Y.get_size()[3]
         BCH = B * C * H
         kernel.loop_size = None
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -77,26 +78,10 @@ def render(self,
         )
         kernel.epilogue_info = dict(
             output_node = self.output_node.name,
-            dependent_buf = [],
             sram_var = "Y_buffer",
             dram_var = "Y",
-            index_var = "index0",
-            tag_var = "tag",
-            vlane_split_axis = 1,
-            vlane_stride = 1,
-            mlir_dtype = kernel.render_options['DATA_STYPE'],
-            tile_nr_dim = 2,
-            dram_shape = f"memref<{kernel.render_options['OUT']}x{kernel.render_options['DATA_STYPE']}>",
-            tile_shape = f"memref<{out_tile}x{out_tile}x{kernel.render_options['DATA_STYPE']}, 1>",
-            tile_size = (out_tile, out_tile),
-            tile_stride = [1, out_tile]
         )
         code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
-        self.header = f"float X_spad[{in_tile * in_tile // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.header += f"float Y_spad[{out_tile * out_tile // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header = f"float X_spad[{in_tile * in_tile // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-        self.gem5_header += f"float Y_spad[{out_tile * out_tile // kernel.vector_lane}] __attribute__ ((section(\".spad\")));\n"
-
         kernel.add_loop_info([kernel.render_options["IN"]], [kernel.vector_lane, kernel.vector_lane])
         return code
 
@@ -107,6 +92,6 @@ def codegen_header(self, code, extra_headers):
         spike_write_path = os.path.join(write_path, "global_var.h")
         gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
         if not os.path.exists(spike_write_path):
-            write_atomic(spike_write_path, self.header+extra_headers[0])
+            write_atomic(spike_write_path, extra_headers[0])
         if not os.path.exists(gem5_write_path):
-            write_atomic(gem5_write_path, self.gem5_header+extra_headers[1])
\ No newline at end of file
+            write_atomic(gem5_write_path, extra_headers[1])
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 41264a74..84418ec7 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -2,7 +2,7 @@
 import math
 from functools import reduce
 import operator
-from sympy import symbols, sympify
+from sympy import symbols, sympify, Symbol
 from PyTorchSimFrontend import extension_config
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
@@ -10,6 +10,8 @@
 from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode, BaseSchedulerNode
 from torch._inductor.utils import IndentedBuffer
 from torch._inductor.virtualized import V
+from torch._inductor.ir import LoopBody
+from torch._inductor import dependencies
 
 from . import mlir_common
 from . import mlir_lowering
@@ -47,7 +49,7 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 # Directed linked?
                 dependency_check = node2 in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1
                 dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads])
-                return size_match and layout_possible and dependency_check & dependency_size
+                return size_match and layout_possible and dependency_check and dependency_size
 
         # For prologue fusion case
         if extension_config.CONFIG_FUSION_PROLOGUE and len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(base_template_node2) == 1:
@@ -63,6 +65,7 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
             if len(node1.read_writes.writes) != 1:
                 return False
             if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
+                node1 = self.revert_group(node1)
                 return True
 
         return self.scheduler.can_fuse_origin(node1, node2)
@@ -84,12 +87,7 @@ def can_fuse_horizontal(self, node1, node2):
             return False
 
         # Can't fuse two template node
-        nr_template = 0
-        for node in node1.get_nodes() + node2.get_nodes():
-            if node.is_template():
-                nr_template += 1
-
-        if nr_template > 1:
+        if node1.is_template() and node2.is_template():
             return False
 
         # Check template node fusion
@@ -100,34 +98,48 @@ def can_fuse_horizontal(self, node1, node2):
                 node2.is_template() and len(node1.get_nodes())==1 and isinstance(node2.node.template, MLIRMaxPoolTemplate):
                 return False
 
-            # Different layout is not supported
-            if node1.get_nodes()[0].node.layout.dtype != node2.get_nodes()[0].node.layout.dtype:
-                return False
-
-            # Convolution is currently not supported
-            # if node1.is_template() and node1.get_nodes()[0].node.origin_node is not None and hasattr(node1.get_nodes()[0].node.origin_node.target, "_name") and node1.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
-            #     return False
-
-            # if node2.is_template() and node2.get_nodes()[0].node.origin_node is not None and hasattr(node2.get_nodes()[0].node.origin_node.target, "_name") and node2.get_nodes()[0].node.origin_node.target._name == 'aten::convolution':
-            #     return False
-
+            # Pointwise check
             v1_total = math.prod(vars1) if len(vars1) else 0
             v2_total = math.prod(vars2) if len(vars2) else 0
             if v1_total != v2_total:
                 return False
 
-            has_depedency = False
-            template_node = node1 if node1.is_template() else node2
-            act_node = node2 if node1.is_template() else node1
-            for write_buf in template_node.read_writes.writes:
-                has_depedency = has_depedency or (write_buf in act_node.read_writes.reads)
-            return has_depedency
+            # Pattern check
+            template_node, act_node = (node1, node2) if node1.is_template() else (node2, node1)
+            has_depedency = set(act_node.inverse_users) <= set(template_node.get_nodes())
+            if not has_depedency:
+                return False
+
+            # Revert act_node.group : simplify_and_reorder() modified _body, _size, group
+            if template_node.group != act_node.group:
+                self.revert_group(act_node)
+            return True
 
         # Check elementwise fusion
         if vars1 == vars2 and reduce1 == reduce2:
             return True
         return False
 
+    def revert_group(self, act_node):
+        args, var_ranges = dependencies.index_vars_no_squeeze(
+                act_node.node.data.get_size(), act_node.node.data.get_reduction_size(), prefix="q"
+        )
+        body = LoopBody(
+            act_node.node.get_store_function(),
+            (args if act_node.node.get_reduction_type() else args[:1]),
+            var_ranges,
+        )
+        index_size = []
+        reduce_size = []
+        for v, s in var_ranges.items():
+            if v in args[0]:
+                index_size.append(s)
+            else:
+                reduce_size.append(s)
+        node_device = act_node.get_device()
+        ranges = (index_size, reduce_size)
+        act_node._sizes, act_node._body, act_node.group = (ranges), body, (node_device, self.group_fn(ranges))
+
     def group_fn(self, sizes):
         return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes)
 
@@ -196,74 +208,73 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
             for node in [template_node, *prologue_nodes, *epilogue_nodes]:
                 node.mark_run()
             partial_code = render()
-            tile_desc = kernel.set_tile_size(kernel.epilogue_info)
-            kernel.kernel_group.set_tile_info(tile_desc)
             if prologue_nodes:
-                _, (group, reduction_group) = max(
-                    [prologue_nodes[-1]], key=lambda x: int(x.is_reduction())
-                ).group
-                prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True)
-                kernel.kernel_group.set_prologue_tile_info(prologue_tile_desc)
-                vars, reduction_vars = kernel.set_ranges(group, reduction_group)
-            # Flush created varaibles, since template fusion doen't share variable
-            kernel.cse.cache.clear()
-            kernel.prologue_buffer_group.set_buffers()
-            kernel.call_ranges = None
-            kernel.load = kernel.load_prologue
-            kernel.store = kernel.store_prologue
-            for node in prologue_nodes:
-                # Reuse created spad
-                read_list = sorted(list(node.read_writes.reads))
-                candidate_found = False
-                # Why? There is a case that memdep.get_size() != data.get_size()
-                buf_dict = {}
-                buf_dict.update({val.name : val for val in V.graph.buffers})
-                for candidate_read in read_list:
-                    if candidate_read.name in buf_dict and reduce(operator.mul, buf_dict[candidate_read.name].get_size(), 1) == node.node.get_numel():
-                        prologue_input_arg = candidate_read.name
-                        candidate_found = True
-                        break
-                assert(candidate_found)
-                assert(len(node.read_writes.writes)==1)
-                prologue_output_arg = list(node.read_writes.writes)[0].name
-                template_buf = self.kernel_group.args.input_buffers[prologue_output_arg]
-                if template_node.get_nodes()[0].node.origin_node.target._name == 'aten::bmm':
-                    target_buf = f"{template_buf}_buffer2D"
-                else:
-                    target_buf = f"{template_buf}_buffer"
-
-                # To skip the dma code gen
-                kernel.buffer_names[prologue_input_arg] = target_buf
-                kernel.buffer_names[prologue_output_arg] = target_buf
-
-                # Edge delete
-                kernel.kernel_group.args.input_buffers = {
-                    (arg if buf != template_buf else prologue_input_arg): buf
-                    for arg, buf in kernel.kernel_group.args.input_buffers.items()
-                }
-                node.codegen((vars, reduction_vars))
+                # Flush created varaibles, since template fusion doen't share variable
+                with kernel.prologue_buffer_group.as_local():
+                    kernel.load = kernel.load_epilogue
+                    kernel.store = kernel.store_prologue
+                    _, (group, reduction_group) = max(
+                        [prologue_nodes[-1]], key=lambda x: int(x.is_reduction())
+                    ).group
+                    prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True)
+                    kernel.kernel_group.set_tile_info(prologue_tile_desc)
+                    vars, reduction_vars = kernel.set_ranges(group, reduction_group)
+                    for node in prologue_nodes:
+                        # Reuse created spad
+                        read_list = sorted(list(node.read_writes.reads))
+                        candidate_found = False
+                        # Why? There is a case that memdep.get_size() != data.get_size()
+                        buf_dict = {}
+                        buf_dict.update({val.name : val for val in V.graph.buffers})
+                        buf_dict.update(V.graph.graph_inputs)
+                        for candidate_read in read_list:
+                            if candidate_read.name in buf_dict and reduce(operator.mul, buf_dict[candidate_read.name].get_size(), 1) == node.node.get_numel():
+                                prologue_input_arg = candidate_read.name
+                                candidate_found = True
+                                break
+                        assert(candidate_found)
+                        assert(len(node.read_writes.writes)==1)
+                        prologue_output_arg = list(node.read_writes.writes)[0].name
+                        template_buf = self.kernel_group.args.input_buffers[prologue_output_arg]
+                        target_buf = f"{template_buf}_buffer" # FIXME. How to pass spad buffer name?
+
+                        # To skip the dma code gen
+                        kernel.buffer_names[prologue_input_arg] = target_buf
+                        kernel.buffer_names[prologue_output_arg] = target_buf
+
+                        # Edge delete
+                        kernel.kernel_group.args.input_buffers = {
+                            (arg if buf != template_buf else prologue_input_arg): buf
+                            for arg, buf in kernel.kernel_group.args.input_buffers.items()
+                        }
+                        node.codegen((vars, reduction_vars))
 
+            tile_desc = kernel.set_tile_size(kernel.epilogue_info)
+            kernel.kernel_group.set_tile_info(tile_desc)
             if epilogue_nodes:
-                _, (group, reduction_group) = max(
-                    epilogue_nodes, key=lambda x: int(x.is_reduction())
-                ).group
-                vars, reduction_vars = kernel.set_ranges(group, reduction_group)
-            # Flush created varaibles, since template fusion doen't share variable
-            kernel.cse.cache.clear()
-            kernel.epilogue_buffer_group.set_buffers()
-            kernel.load = kernel.load_epilogue
-            kernel.store = kernel.store_epilogue
-            for node in epilogue_nodes:
-                if template_node.node.name in [dep[0] for dep in list(node.read_writes.reads)]:
-                    kernel.epilogue_info['dependent_buf'].append(node.node.name)
-                node.codegen((vars, reduction_vars))
+                with kernel.epilogue_buffer_group.as_local():
+                    kernel.load = kernel.load_epilogue
+                    kernel.store = kernel.store_epilogue
+                    kernel.store_reduction = kernel.store_reduction_epilogue
+                    kernel.reduction = kernel.reduction_epilogue
+
+                    _, (group, reduction_group) = max(
+                        epilogue_nodes, key=lambda x: int(x.is_reduction())
+                    ).group
+                    vars, reduction_vars = kernel.set_ranges(group, reduction_group)
+                    for node in epilogue_nodes:
+                        node.codegen((vars, reduction_vars))
+
         with V.set_kernel_handler(kernel):
             src_code = (
                 partial_code
                 if isinstance(partial_code, str)
                 else partial_code.finalize()
             )
-        return src_code
+        # For consistency, white space could make wrong write_path
+        buffer = IndentedBuffer()
+        buffer.splice(src_code)
+        return buffer.getvalue()
 
     def codegen_template(self, template_node, epilogue_nodes):
         # Handle prologue pattern
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 1db14e27..ccb9b0d1 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -11,7 +11,7 @@
 from unittest.mock import patch
 
 from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE, DeferredLine
-from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, Pointwise
+from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, View
 from torch._inductor.select_algorithm import PartialRender
 from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
 from torch._inductor.autotune_process import TensorMeta
@@ -22,12 +22,13 @@
 from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, reduction_partial_combine_vec, reduction_combine_vec, is_welford_reduction
 from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode
+from torch._inductor.codegen import common
 
 from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR
 from . import mlir_common
 
 class IndentedBufferGroup:
-    def __init__(self, kernel: 'MLIRTemplateKernel'):
+    def __init__(self, kernel: 'MLIRTemplateKernel', prefix=""):
         self.kernel = kernel
         self.body = IndentedBuffer()
         self.loads = IndentedBuffer()
@@ -37,18 +38,51 @@ def __init__(self, kernel: 'MLIRTemplateKernel'):
         self.dma_loads = IndentedBuffer()
         self.dma_stores = IndentedBuffer()
         self.spad_buffer = IndentedBuffer()
+        self.cse = common.CSE("%", "", name_prefix=f"{prefix}")
+        self.apply_cse = common.CSE("%", "", name_prefix=f"{prefix}apply")
+        # Original buffers will be saved later in the 'with' block
+        self.original_buffers = {}
 
     def set_buffers(self):
         self.kernel.loads = self.loads
         self.kernel.compute = self.compute
         self.kernel.stores = self.stores
+        self.kernel.applys = self.applys
         self.kernel.dma_loads = self.dma_loads
         self.kernel.dma_stores = self.dma_stores
         self.kernel.spad_buffer = self.spad_buffer
+        self.kernel.cse = self.cse
+        self.kernel.apply_cse = self.apply_cse
+
+    def restore_buffers(self):
+        self.kernel.loads = self.original_buffers['loads']
+        self.kernel.compute = self.original_buffers['compute']
+        self.kernel.stores = self.original_buffers['stores']
+        self.kernel.applys = self.original_buffers['applys']
+        self.kernel.dma_loads = self.original_buffers['dma_loads']
+        self.kernel.dma_stores = self.original_buffers['dma_stores']
+        self.kernel.spad_buffer = self.original_buffers['spad_buffer']
+        self.kernel.cse = self.original_buffers['cse']
+        self.kernel.apply_cse = self.original_buffers['apply_cse']
 
     @contextlib.contextmanager
     def as_local(self):
-        yield self
+        self.original_buffers = {
+            'loads': self.kernel.loads,
+            'compute': self.kernel.compute,
+            'stores': self.kernel.stores,
+            'applys': self.kernel.applys,
+            'dma_loads': self.kernel.dma_loads,
+            'dma_stores': self.kernel.dma_stores,
+            'spad_buffer': self.kernel.spad_buffer,
+            'cse': self.kernel.cse,
+            'apply_cse': self.kernel.apply_cse,
+        }
+        try:
+            self.set_buffers()
+            yield self
+        finally:
+            self.restore_buffers()
 
 class MLIRTemplateKernel(MLIRKernel, BaseMLIRHardwareInfo):
     def __init__(self,
@@ -65,8 +99,6 @@ def __init__(self,
         self.call_size = call_size
         self.named_nodes = {}
         self.loop_info = {}
-        self.load_desc = {}
-        self.store_desc = {}
         self.outer_func_name = outer_func_name
         self.outer_func_render = outer_func_render
         self.kernel_arg_attributes = kernel_arg_attributes
@@ -75,29 +107,23 @@ def __init__(self,
         self.render_options = dict()
         self.tile_size = []
         self.loop_size = None
-        self.is_template_kernel = True
-        self.map_cse = CSE("#", self.suffix, name_prefix="template_map")
-        self.const_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_const")
-        self.alloc_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="template_alloc")
-        self.prologue_buffer_group = IndentedBufferGroup(self)
-        self.epilogue_buffer_group = IndentedBufferGroup(self)
+        self.map_cse = CSE("#", self.suffix, name_prefix="t_map")
+        self.const_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="t_const")
+        self.alloc_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="t_alloc")
+        self.prologue_buffer_group = IndentedBufferGroup(self, prefix="prologue_")
+        self.epilogue_buffer_group = IndentedBufferGroup(self, prefix="epilogue_")
         self.global_vars = IndentedBuffer()
+        self.exception_nodes = {}
         # Reduction data structure
         self.reduction_epilogue_suffix = IndentedBuffer()
         self.reduction_fusion = False
         self.reduction_body_loop = None
-        self.reduction_idx = None
         self.reduction_buffer_idx = 0
         self.reduction_info = {}
         self.reduction_epilogue_result = {}
         self.reduction_mean = []
-        self.reuse_buffer_names = {}
-
-        # Overwrite ops
-        self.load = self.load_epilogue
-        self.store = self.store_epilogue
-        self.store_reduction = self.store_reduction_epilogue
-        self.reduction = self.reduction_epilogue
+        # Dim info
+        self.dim_aliasing = {}
 
     def add_loop_info(self, mat_size, tile_size):
         for idx, (loop_size, stride) in enumerate(zip(mat_size, tile_size)):
@@ -368,8 +394,6 @@ def meta_kernel(self):
         wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')')
         # Dump loop and load/store information
         wrapper.add_import_once(f"loop_info = {self.loop_info}")
-        wrapper.add_import_once(f"load_tile_info = {self.load_desc}")
-        wrapper.add_import_once(f"store_tile_info = {self.store_desc}")
         wrapper.add_import_once(f"arg_attributes = {arg_attributes}")
 
     def call_kernel(self, kernel_name):
@@ -381,78 +405,62 @@ def call_kernel(self, kernel_name):
             call_args, cuda=False)
 
     def codegen_prologue_body(self):
-        with self.prologue_buffer_group.as_local() as buf:
-            buf.body.splice(buf.spad_buffer)
-            buf.body.splice(buf.applys)
-            buf.body.splice(buf.dma_loads)
-
-            if (buf.loads.getvalue() != '' or buf.compute.getvalue() != '' or buf.stores.getvalue() != ''):
-                buf.body.writelines(self.prologue_compute_body_loop.lines())
+        body = IndentedBuffer()
+        with self.prologue_buffer_group.as_local():
+            body.splice(self.spad_buffer)
+            body.splice(self.applys)
+            body.splice(self.dma_loads)
+
+            if (self.loads.getvalue() != '' or self.compute.getvalue() != '' or self.stores.getvalue() != ''):
+                body.writelines(self.prologue_compute_body_loop.lines())
                 compute_body = mlir_common.ParallelLoopBuffer()
                 with contextlib.ExitStack() as stack:
                     stack.enter_context(compute_body.indent(attribute="{inner_loop=false}"))
-                    compute_body.splice(buf.loads)
-                    compute_body.splice(buf.compute)
-                    compute_body.splice(buf.stores)
-                buf.body.splice(compute_body)
-
-        # Clear buffers
-        self.loads.clear()
-        self.compute.clear()
-        self.stores.clear()
+                    compute_body.splice(self.loads)
+                    compute_body.splice(self.compute)
+                    compute_body.splice(self.stores)
+                body.splice(compute_body)
+        return body
 
     def codegen_epilogue_body(self):
         def template_store():
-            zero_cse = self.get_const_cse(0)
-            sram_var = self.epilogue_info["sram_var"]
             dram_var = self.epilogue_info["dram_var"]
-            index_var = self.epilogue_info["index_var"]
-            tag_var = self.epilogue_info["tag_var"]
-            mlir_dtype = self.epilogue_info["mlir_dtype"]
-            dram_shape = self.epilogue_info["dram_shape"]
-            vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
-            vlane_stride = self.kernel_group.tile_desc.get_vlane_stride()
-            tile_stride = self.epilogue_info["tile_stride"]
-            tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-            sram_index_var = ",".join([f"%{zero_cse}"] *  self.kernel_group.tile_desc.get_nr_dim())
-            code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 tag_var, dram_shape, tile_shape, tile_stride)
+            index_list = self.epilogue_info["dram_idx"]
+            tile_desc = self.epilogue_info["dram_tile_desc"]
+            code = self.def_dma_op("MVOUT", dram_var, index_list, tile_desc)
             self.cse.generate(self.dma_stores, code, assignment = False)
-        # Do dma store first to overlap epilogue nodes
-        if self.reduction_fusion:
-            if len(self.stores._lines) == 0:
-                template_store()
-                self.epilogue_buffer_group.body.splice(self.dma_stores)
-                self.dma_stores.clear()
-        self.epilogue_buffer_group.body.splice(self.spad_buffer)
-        self.epilogue_buffer_group.body.splice(self.applys)
-        self.epilogue_buffer_group.body.splice(self.dma_loads)
-        self.epilogue_buffer_group.body.writelines(self.compute_body_loop.lines())
-        compute_body = mlir_common.ParallelLoopBuffer()
-        with contextlib.ExitStack() as stack:
-            stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
+
+        body = IndentedBuffer()
+        with self.epilogue_buffer_group.as_local():
+            # Do dma store first to overlap epilogue nodes
             if self.reduction_fusion:
-                #if len(self.stores._lines) == 0:
-                #    template_store()
-                compute_body.writelines(self.reduction_body_loop.lines())
-                stack.enter_context(compute_body.indent(attribute="{inner_loop=false}"))
-                compute_body.splice(self.loads)
-                compute_body.splice(self.compute)
-            else:
-                compute_body.splice(self.loads)
-                compute_body.splice(self.compute)
                 if len(self.stores._lines) == 0:
                     template_store()
-            compute_body.splice(self.epilogue_buffer_group.stores)
-        if (compute_body.getvalue()):
-            self.epilogue_buffer_group.body.splice(compute_body)
-        self.epilogue_buffer_group.body.splice(self.dma_stores)
-        self.epilogue_buffer_group.body.splice(self.reduction_epilogue_suffix)
-
-        # Clear buffers
-        self.loads.clear()
-        self.compute.clear()
-        self.stores.clear()
+                    body.splice(self.dma_stores)
+                    self.dma_stores.clear()
+            body.splice(self.spad_buffer)
+            body.splice(self.applys)
+            body.splice(self.dma_loads)
+            body.writelines(self.compute_body_loop.lines())
+            compute_body = mlir_common.ParallelLoopBuffer()
+            with contextlib.ExitStack() as stack:
+                stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
+                if self.reduction_fusion:
+                    compute_body.writelines(self.reduction_body_loop.lines())
+                    stack.enter_context(compute_body.indent(attribute="{inner_loop=false}"))
+                    compute_body.splice(self.loads)
+                    compute_body.splice(self.compute)
+                else:
+                    compute_body.splice(self.loads)
+                    compute_body.splice(self.compute)
+                    if len(self.stores._lines) == 0:
+                        template_store()
+                compute_body.splice(self.stores)
+            if (compute_body.getvalue()):
+                body.splice(compute_body)
+            body.splice(self.dma_stores)
+            body.splice(self.reduction_epilogue_suffix)
+        return body
 
     def def_kernel(
         self,
@@ -562,66 +570,42 @@ def get_conv_inputs(self):
     def get_conv_outputs(self):
         return {k: v for k, v in self.kernel_group.args.output_buffers.items() if v != 'REMOVED'}
 
-    def prepare_input(self, indent_size: int = 0):
-        def emit_dma_start(buffer_name, index_var, tag_var, size, tile_size, subtile_size=None, async_flag=True, label="X"):
-            base = f"memref.dma_start %{label}[%{index_var}], %{buffer_name}[%c0, %c0], %c_mvin"
-            if label == "W":
-                base = base.replace("mvin", "mvin2")
-
-            suffix = f"%{tag_var}[%c0], %axis, %vstride"
-            memref_shape = f"memref<{size}xf32>"
-            tile_shape = "x".join([str(x) for x in tile_size])
-            tile_memref = f"memref<{tile_shape}xf32, 1>"
-            tag_memref = f"memref<1xi32>"
-            attrs = f"sram_stride=[1, {tile_size[0]}]"
-            async_flag = "false"
-            if subtile_size:
-                subtile_shape = ", ".join([str(x) for x in subtile_size])
-                attrs = f"subtile_size=[{subtile_shape}], async={async_flag}, {attrs}"
-            else:
-                subtile_shape = ", ".join([str(x) for x in tile_size])
-                attrs = f"subtile_size=[{subtile_shape}], async={async_flag}, {attrs}"
-            attr_memref = f"{{ {attrs} }}"
-            return f"{base}, {suffix}: {memref_shape}, {tile_memref}, {tag_memref} {attr_memref}"
-
+    def load_input(self, indent_size: int = 0):
         def hook():
             code = IndentedBuffer()
-            self.codegen_prologue_body()
-            prologue_code = self.prologue_buffer_group.body
+            prologue_code = self.codegen_prologue_body()
             if prologue_code.getvalue():
-                code.writeline(emit_dma_start(self.prologue_info["input_sram_var"], self.prologue_info["input_index_var"], self.prologue_info["input_tag_var"],
-                                              self.prologue_info["input_numel"], self.prologue_info["input_tile_size"], subtile_size=self.prologue_info["input_subtile_size"], label="X"))
-                code.splice(prologue_code)
-                code.writeline(emit_dma_start(self.prologue_info["weight_sram_var"], self.prologue_info["weight_index_var"], self.prologue_info["weight_tag_var"],
-                                              self.prologue_info["weight_numel"], self.prologue_info["weight_tile_size"], subtile_size=self.prologue_info["weight_subtile_size"], label="W"))
+                input_dma_code = self.def_dma_op("MVIN", self.prologue_info["input_dram_var"], self.prologue_info["input_idx"],
+                                self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False)
+                weight_dma_code = self.def_dma_op("MVIN", self.prologue_info["weight_dram_var"], self.prologue_info["weight_idx"],
+                                self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False)
+                if (self.prologue_info["is_input_fused"]):
+                    code.splice(input_dma_code)
+                    code.splice(prologue_code)
+                    code.splice(weight_dma_code)
+                else:
+                    code.splice(weight_dma_code)
+                    code.splice(prologue_code)
+                    code.splice(input_dma_code)
             else:
-                code.writeline(emit_dma_start(self.prologue_info["input_sram_var"], self.prologue_info["input_index_var"], self.prologue_info["input_tag_var"],
-                                              self.prologue_info["input_numel"], self.prologue_info["input_tile_size"], self.prologue_info["input_subtile_size"], async_flag=True, label="X"))
-                code.writeline(emit_dma_start(self.prologue_info["weight_sram_var"], self.prologue_info["weight_index_var"], self.prologue_info["weight_tag_var"],
-                                              self.prologue_info["weight_numel"], self.prologue_info["weight_tile_size"], self.prologue_info["weight_subtile_size"], async_flag=True, label="W"))
+                dma_code = self.def_dma_op("MVIN", self.prologue_info["input_dram_var"], self.prologue_info["input_idx"],
+                                self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False)
+                code.splice(dma_code)
+                dma_code = self.def_dma_op("MVIN", self.prologue_info["weight_dram_var"], self.prologue_info["weight_idx"],
+                                self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False)
+                code.splice(dma_code)
             code = textwrap.indent(code.getvalue(), " "*indent_size).strip()
             return code
 
         assert "<PREPARE_INPUT>" not in self.render_hooks
         self.render_hooks["<PREPARE_INPUT>"] = hook
+        self.render_hooks.move_to_end("<PREPARE_INPUT>", last=False) # Force order to be triggered first
         return "<PREPARE_INPUT>"
 
-    def output_name(self):
-        # Cannot know the output name from the template, so we need to hook it
-        def hook():
-            arg_defs, *_ = self.kernel_group.args.mlir_argdefs()
-            output = arg_defs[3]    #FIXME: Constant index used
-            pattern = r"%(\w+):"
-            output = re.search(pattern, output).group(1)
-            return output
-        assert "<OUPUT>" not in self.render_hooks
-        self.render_hooks["<OUPUT>"] = hook
-        return "<OUPUT>"
-
     def store_output(self, indent_size: int = 0):
         def hook():
-            self.codegen_epilogue_body()
-            return textwrap.indent(self.epilogue_buffer_group.body.getvalue(), " "*indent_size).strip()
+            epilogue_code = self.codegen_epilogue_body()
+            return textwrap.indent(epilogue_code.getvalue(), " "*indent_size).strip()
 
         assert "<STORE_OUTPUT>" not in self.render_hooks
         self.render_hooks["<STORE_OUTPUT>"] = hook
@@ -636,29 +620,6 @@ def hook():
         self.render_hooks["<REDUCTION_OUTPUT>"] = hook
         return "<REDUCTION_OUTPUT>"
 
-    def reduction_iter_arg(self):
-        def hook():
-            if len(self.reduction_vars):
-                args = ', '.join([f"%{iter.name} = %{init.name}" for (_, iter, init, _) in self.reduction_vars.values()])
-                dtype = ', '.join([f"{dtype}" for (_, _, _, dtype) in self.reduction_vars.values()])
-                return f"iter_args({args}) -> ({dtype})"
-            return ""
-
-        assert "<REDUCTION_ITER_ARG>" not in self.render_hooks
-        self.render_hooks["<REDUCTION_ITER_ARG>"] = hook
-        return "<REDUCTION_ITER_ARG>"
-
-    def reduction_acc(self):
-        def hook():
-            if len(self.reduction_vars):
-                acc = ', '.join([f"%{acc.name}" for acc in self.reduction_vars.keys()])
-                return f"{acc} ="
-            return ""
-
-        assert "<REDUCTION_ACC>" not in self.render_hooks
-        self.render_hooks["<REDUCTION_ACC>"] = hook
-        return "<REDUCTION_ACC>"
-
     def def_function(self):
         _, call_args, _ = self.kernel_group.args.python_argdefs()
         if self.outer_func_render is not None:
@@ -679,26 +640,76 @@ def hook():
         self.render_hooks[key] = hook
         return key
 
-    def def_local_vars(self):
+    def def_local_vars(self, indent_size=0):
         key = "<LOCAL_VARS>"
         def hook():
             code = IndentedBuffer()
-            code.tabwidth = 2
-            code.splice("\n")
-            with code.indent():
-                code.splice(self.const_buffer)
-                code.splice(self.alloc_buffer)
-            return code.getvalue()
+            code.tabwidth = 1
+            code.splice(self.const_buffer)
+            code.splice(self.alloc_buffer)
+            return textwrap.indent(code.getvalue(), " "*indent_size).strip()
 
         assert key not in self.render_hooks
         self.render_hooks[key] = hook
         return key
 
+    def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_common.MLIRMultiDimTile,
+                   subtile_size:list=[], async_type=None, indent_size=0):
+        # Prepare code block
+        local_code = IndentedBuffer()
+        with V.set_kernel_handler(self):
+            tag = f"mvint_{self.dma_read_counter}" if dma_type == "MVIN" else f"mvoutt_{self.dma_write_counter}"
+            index_var = self.parse_index_list(index_list, local_code)
+            node_layout = self.named_nodes[dram_var].get_layout()
+            numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel()
+            if dram_var in self.exception_nodes:
+                numel = self.exception_nodes[dram_var]["numel"]
+            mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype]
+            dram_shape = f"memref<{numel}x{mlir_dtype}>"
+            dram_stride = []
+            for idx in index_list:
+                if idx.is_Mul:
+                    dram_stride.append(int(idx.args[0]))
+                elif idx == sympy.Symbol("c0"):
+                    dram_stride.append(0)
+                elif not idx.is_Number:
+                    dram_stride.append(1)
+                else:
+                    dram_stride.append(0)
+
+            sram_var = tile_desc.get_name()
+            tile_shape = tile_desc.get_mlir_shape(mlir_dtype)
+            tile_stride = tile_desc.get_tile_stride()
+            vlane_split_axis = tile_desc.vlane_split_axis
+            vlane_stride = tile_desc.vlane_stride
+
+            zero_cse = self.get_const_cse(0, "index")
+            sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim())
+
+            attribute_parts = [f"dram_stride={dram_stride}", f"sram_stride={tile_stride}", "padding=0"]
+            if subtile_size:
+                attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}")
+            attribute = "  {" + ", ".join(attribute_parts) + "}"
+            code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                    tag, dram_shape, tile_shape, "")
+            local_code.writeline(code)
+            local_code.writeline(attribute)
+        return textwrap.indent(local_code.getvalue(), " "*indent_size).strip()
+
+    def def_sram_buffer(self, dram_name, tile_desc, id=0, indent_size=0):
+        # Prepare code block
+        with V.set_kernel_handler(self):
+            dtype = self.named_nodes[dram_name].get_layout().dtype
+            tile_shape = tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[dtype])
+            buffer_name = self.allocate_sram_buffer(dtype, dram_name, tile_desc, id, forced_name=dram_name)
+            code = f"%{tile_desc.name} = memref.get_global @{buffer_name} : {tile_shape}"
+        return textwrap.indent(code, " "*indent_size).strip()
+
     def render(self, template, kwargs, define_function=None):
-        # self.render_hooks = {}
         code = template.render(**kwargs)
         if define_function is not None:
             define_function(self)
+
         return PartialRender(
             code,
             self.render_hooks,
@@ -708,71 +719,14 @@ def get_spad_size_per_lane(self, tile_m, tile_n):
         size = tile_m * ((tile_n + self.vector_lane - 1) // self.vector_lane)
         return max(size, 2) # vector load/store
 
-    def load_prologue(self, name: str, index: sympy.Expr):
-        load_dim = []
-        if not isinstance(V.graph, NullHandler) and name in V.graph.graph_inputs:
-            load_dim = V.graph.graph_inputs[name].layout.size
-        if self.ranges == self.buffer_types[name][2]:
-            index_var = self.prologue_info['input_index_var'] if len(load_dim) != 1 else 'tile_n'
-            vlane_split_axis = self.kernel_group.prologue_tile_desc.vlane_split_axis if len(load_dim) != 1 else 0    # FIXME: Fixed split axis for 1d load dim
-        else:
-            # Broadcast pattern
-            zero_index = self.const_cse.generate(self.const_buffer, "arith.constant 0 : index")
-            if self.prologue_info['is_bmm']: # FIXME: hardcoded
-                idx = f"%b, %t_k, %t_n"
-                map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0, d1, d2) -> (d0 * 512 + d2)>")
-                vlane_split_axis = 2 # 3D GEMM prologue should be loaded by axis 2
-            else:
-                idx = f"%t_m, %{zero_index}"
-                map_var = self.map_cse.generate(self.global_vars, f"affine_map<(d0, d1) -> (d0)>")
-                vlane_split_axis = 1 # 2D GEMM prologue should be loaded by axis 1
-            index_var = self.apply_cse.generate(self.dma_loads, f"affine.apply #{map_var}({idx})")
-        index = self.rename_indexing(index)
-        dram_var = self.kernel_group.args.input(name)
-        dtype = V.graph.get_dtype(name)
-        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        vlane_stride = self.kernel_group.prologue_tile_desc.vlane_stride if len(load_dim) != 1 else 1    # FIXME: Fixed stride for 1d load dim
-        tile_numel_per_lane = self.kernel_group.prologue_tile_desc.get_numel_per_lane()
-        tile_shape = self.kernel_group.prologue_tile_desc.get_mlir_shape(mlir_dtype)
-        tile_stride = self.prologue_info['input_sram_stride']
-
-        # Compute vector unit size
-        vshape = self.kernel_group.prologue_tile_desc.get_mlir_vshape(mlir_dtype)
-        compute_vec_size = self.kernel_group.prologue_tile_desc.get_compute_vec_size()
-
-        if name not in self.buffer_names:
-            # Allocate sram buffer
-            dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index, self.alloc_buffer)
-            self.buffer_names[name] = sram_var
-            code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                     f"{name}_tag", dram_shape, tile_shape, tile_stride)
-            self.cse.generate(self.dma_loads, code, assignment = False)
-
-        # Load vector from sram
-        sram_var = self.buffer_names[name]
-        zero_var = self.get_const_cse(0)
-        compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.prologue_tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
-
-        if compute_vec_size > 1:
-            operation = "affine.vector_load"
-            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-        else:
-            operation = "affine.load"
-            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
-
-        out = self.cse.generate(self.loads, line)
-        self.register_var_info(out, [compute_vec_size, mlir_dtype])
-        return out
-
     def store_prologue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        tile_shape = self.kernel_group.prologue_tile_desc.get_mlir_shape(mlir_dtype)
+        tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
 
         # Compute vector unit size
-        vshape = self.kernel_group.prologue_tile_desc.get_mlir_vshape(mlir_dtype)
-        compute_vec_size = self.kernel_group.prologue_tile_desc.get_compute_vec_size()
+        vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
+        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
 
         sram_var = self.buffer_names[name]
         zero_var = self.get_const_cse(0)
@@ -780,7 +734,7 @@ def store_prologue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         _, operand_type = self.var_info[value]
         if mlir_dtype != operand_type:
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
-        compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.prologue_tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
+        compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
         # Generate vector load instruction
         if compute_vec_size > 1:
             operation = "affine.vector_store"
@@ -791,25 +745,19 @@ def store_prologue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.stores.writeline(line)
 
     def load_epilogue(self, name: str, index: sympy.Expr):
-        is_1d_source = len(index.free_symbols) == 1
-        is_transpose = False    # FIXME: Only works for 2d input
-        if len(index.args) == 2:
-            for expr in index.args:
-                if len(expr.args):
-                    if expr.args[1].name == "index0" and expr.args[0] > 1:
-                        is_transpose = True
-                        break
-        key = 't_index_var' if is_transpose else 'index_var'
-        index_var = self.epilogue_info[key] if not is_1d_source else 'tile_n'
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis if not is_1d_source else 0    # FIXME: Fixed split axis for 1d load dim
-        vlane_stride = self.kernel_group.tile_desc.vlane_stride if not is_1d_source else 1    # FIXME: Fixed stride for 1d load dim
-        tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
+
+        # Want to use tile_desc from epilogue_info
+        index_var = self.parse_indices(index)
+        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()]
+        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
+        vlane_stride = self.kernel_group.tile_desc.vlane_stride
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-        tile_stride = self.epilogue_info['tile_stride']
+        tile_stride = self.kernel_group.tile_desc.get_tile_stride()
 
         # Compute vector unit size
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
@@ -818,16 +766,12 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         if name not in self.buffer_names:
             # Allocate sram buffer
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
-            self.buffer_names[name] = sram_var
+            sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index)
+            attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
             code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                     f"{name}_tag", dram_shape, tile_shape, tile_stride)
-            self.cse.generate(self.dma_loads, code, assignment = False)
-        elif name in self.reuse_buffer_names:
-            sram_var = self.reuse_buffer_names[name]
-            code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                     f"{name}_tag", dram_shape, tile_shape, tile_stride)
+                                     f"{name}_tag", dram_shape, tile_shape, attribute)
             self.cse.generate(self.dma_loads, code, assignment = False)
+            self.buffer_names[name] = sram_var
         else:
             sram_var = self.buffer_names[name]
 
@@ -861,24 +805,25 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
-        index_var = self.epilogue_info['index_var']
+        index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.output(name)
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
+
+        index_var = self.parse_indices(index)
+        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()]
         vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
         vlane_stride = self.kernel_group.tile_desc.vlane_stride
-        tile_numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
-
-        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-        tile_stride = self.epilogue_info['tile_stride']
+        tile_stride = self.kernel_group.tile_desc.get_tile_stride()
 
         # Compute vector unit size
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
         compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
 
         if name not in self.buffer_names:
-            sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index_var, index)
+            sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index)
             self.buffer_names[name] = sram_var
         else:
             zero_cse = self.get_const_cse(0)
@@ -901,8 +846,9 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.stores.writeline(DeferredLine(name, line))
 
         # Generate DMA instruction
+        attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, tile_stride)
+                                 f"{name}_tag", dram_shape, tile_shape, attribute)
         self.dma_stores.writeline(DeferredLine(name, code))
 
     def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
@@ -920,28 +866,35 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
                 sqr_sum = self.reduction_epilogue(dtype, src_dtype, "sum", ops.mul(value, value))
                 self.welford_reduce_out = (sum, sqr_sum, None)
                 return sum, sqr_sum, None
+
         # Check duplicated reductions
         reduction_key = src_dtype, reduction_type, value
         if reduction_key in self.reduction_epilogue_result:
             return self.reduction_epilogue_result[reduction_key]
 
         # Reduction fusion codegen part
-        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
         vec_size = self.compute_body_loop.step
-        vshape = f"vector<{vec_size}x{type_name}>"
+        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
+        new_tile_size = self.kernel_group.tile_desc.get_tile_size()[:-1] + [vec_size]
+        new_vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
+        new_vlane_stride = self.kernel_group.tile_desc.vlane_stride
+        local_tile_desc = mlir_common.MLIRMultiDimTile(new_tile_size, self.vector_lane, new_vlane_split_axis, new_vlane_stride, vec_size)
+
+        tile_shape = local_tile_desc.get_mlir_shape(type_name)
+        vshape = local_tile_desc.get_mlir_vshape(type_name)
 
-        tile_shape = f"memref<{self.reduction_body_loop.size * self.vector_lane}x{vec_size}x{type_name}, 1>"
         name = f"{reduction_type}_buffer{self.reduction_buffer_idx}"
         self.reduction_buffer_idx += 1
         index = "dummy_index" # Not used
-        tile_numel_per_lane = self.compute_body_loop.step * self.reduction_body_loop.size
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, None, index, self.const_buffer)
+        tile_numel_per_lane = self.compute_body_loop.step * self.reduction_body_loop.size # ???
+        sram_var, _ = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index, self.const_buffer)
         self.reduction_epilogue_result[reduction_key] = sram_var
 
         # Load partial result
-        zero_var = self.get_const_cse(0)
+        zero_var_list = [f"%{self.get_const_cse(0)}"] * local_tile_desc.get_nr_dim()
+        zero_var_list[-2] = f"%{self.reduction_loop_idx}"
+        compute_index_var = ", ".join(zero_var_list)
         operation = "affine.vector_load"
-        compute_index_var = ",".join([f"%{self.reduction_loop_idx}"] + [f"%{zero_var}"])
         line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
         out = self.cse.generate(self.loads, line)
         self.register_var_info(out, [self.compute_body_loop.step, type_name])
@@ -953,78 +906,85 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         operation = "affine.vector_store"
         line = f"{operation} %{result}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
         self.compute.writeline(line) # Need to be placed after partial reduction
-        self.reduction_info[sram_var] = reduction_type
+        self.reduction_info[sram_var] = [reduction_type, local_tile_desc]
         return sram_var
 
     def store_reduction_epilogue(self, name, index, value):
+        index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.output(name)
+        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         dtype = V.graph.get_dtype(name)
-        type_name = mlir_common.DTYPE_TO_MLIR[dtype]
-        index = self.rename_indexing(index)
-
-        # Tile is always reuduced in inner loop
-        numel_per_lane = self.kernel_group.tile_desc.get_numel_per_lane()
-        reduction_axis_size = self.kernel_group.tile_desc.get_tile_size()[-2]
-        nr_outer_loop = numel_per_lane // reduction_axis_size
+        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
-        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - 1
+        index_var = self.parse_indices(index, self.reductions_suffix, comments="// Store reduction")
+        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()][:-1] # Assume that there is only one reduction axis
+        vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis
         vlane_stride = self.kernel_group.tile_desc.vlane_stride
-        tile_numel_per_lane = vlane_stride * nr_outer_loop * 2
 
-        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-        tile_shape = f"memref<{self.kernel_group.tile_desc.get_tile_size()[1]}x{type_name}, 1>"
-        tile_stride = [1]
-        sram_var, index_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, tile_numel_per_lane, tile_shape, index,
-                                                                         index, buffer=self.const_buffer)
+        # Create final buffer descriptor
+        nr_outer_loop = self.reduction_nr_outer_loop
+        tile_size = self.kernel_group.tile_desc.get_tile_size()[:-1]
+        final_tile_desc = mlir_common.MLIRMultiDimTile(tile_size, self.vector_lane, vlane_split_axis, vlane_stride*nr_outer_loop*2)
+        final_tile_shape = final_tile_desc.get_mlir_shape(mlir_dtype)
+        final_tile_stride = final_tile_desc.get_tile_stride()
+        sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, final_tile_desc, index, buffer=self.const_buffer)
+
+        # Set partial buffer descriptor
+        partial_tile_desc = self.reduction_info[value][1]
+        partial_vec_size = partial_tile_desc.get_compute_vec_size()
+        partial_vshape = partial_tile_desc.get_mlir_vshape(mlir_dtype)
+        partial_tile_shape = partial_tile_desc.get_mlir_shape(mlir_dtype)
+
+        # Prepare constant
+        init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(self.reduction_info[value][0], dtype)} : {mlir_dtype}")
+        partial_zero_var_list = [f"%{self.get_const_cse(0)}"] * partial_tile_desc.get_nr_dim()
+        final_zero_var_list = [f"%{self.get_const_cse(0)}"] * final_tile_desc.get_nr_dim()
         for i in range(self.reduction_body_loop.size):
-            vec_size = self.compute_body_loop.step
-            vshape = f"vector<{vec_size}x{type_name}>"
-
-            partial_tile_shape = f"memref<{self.reduction_body_loop.size * self.vector_lane}x{vec_size}x{type_name}, 1>"
             # Load partial result
-            init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(self.reduction_info[value], dtype)} : {type_name}")
-            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {vshape}")
-            zero_var = self.const_cse.generate(self.const_buffer, f"arith.constant {0} : index")
-            index_var = self.const_cse.generate(self.const_buffer, f"arith.constant {i} : index")
-            compute_index_var = ",".join([f"%{index_var}"] + [f"%{zero_var}"])
+            body_index_var = self.const_cse.generate(self.const_buffer, f"arith.constant {i} : index")
+            partial_zero_var_list[-2] = f"%{body_index_var}"
+            compute_index_var = ",".join(partial_zero_var_list)
 
             operation = "affine.vector_load"
-            line = f"{operation} %{value}[{compute_index_var}] : {partial_tile_shape}, {vshape}"
+            line = f"{operation} %{value}[{compute_index_var}] : {partial_tile_shape}, {partial_vshape}"
             out = self.cse.generate(self.reductions_suffix, line)
             operation = "affine.vector_store"
-            line = f"{operation} %{init_vec}, %{value}[{compute_index_var}] : {partial_tile_shape}, {vshape}"
+            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {mlir_dtype} to {partial_vshape}")
+            line = f"{operation} %{init_vec}, %{value}[{compute_index_var}] : {partial_tile_shape}, {partial_vshape}"
             self.reductions_suffix.writeline(line)
 
             # 2 step reduction
             new_vec_size = 2
-            new_vshape = f"vector<{vec_size//new_vec_size}x{new_vec_size}x{type_name}>"
-            new_reduced_shape = f"vector<{new_vec_size}x{type_name}>"
-            out = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{out} : {vshape} to {new_vshape}")
-            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {new_reduced_shape}")
-            out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(self.reduction_info[value], out, init_vec, axis=0, shape=new_vshape, reduced_shape=new_reduced_shape))
+            new_vshape = f"vector<{partial_vec_size//new_vec_size}x{new_vec_size}x{mlir_dtype}>"
+            new_reduced_shape = f"vector<{new_vec_size}x{mlir_dtype}>"
+            out = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{out} : {partial_vshape} to {new_vshape}")
+            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {mlir_dtype} to {new_reduced_shape}")
+            out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(self.reduction_info[value][0], out, init_vec, axis=0, shape=new_vshape, reduced_shape=new_reduced_shape))
             out2 = self.cse.generate(self.reductions_suffix, f"vector.shuffle %{out}, %{out} [1, 0] : {new_reduced_shape}, {new_reduced_shape}")
 
             self.compute, self.reductions_suffix = self.reductions_suffix, self.compute
-            self.register_var_info(out, [new_vec_size, type_name])
-            self.register_var_info(out2, [new_vec_size, type_name])
-            out = reduction_partial_combine_vec(self.reduction_info[value], out, out2)
+            self.register_var_info(out, [new_vec_size, mlir_dtype])
+            self.register_var_info(out2, [new_vec_size, mlir_dtype])
+            out = reduction_partial_combine_vec(self.reduction_info[value][0], out, out2)
             self.compute, self.reductions_suffix = self.reductions_suffix, self.compute
 
-            # Final reduction
-            #final_reduced_shape = type_name
-            #init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(self.reduction_info[value], dtype)} : {type_name}")
-            #out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(self.reduction_info[value], out, init, axis=0, shape=vshape, reduced_shape=final_reduced_shape))
-
             if self.welford_reduce_out is not None:
-                # mean
-                divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(768)} : f32")
+                # NOTE: It not a real welford algorithm... We just used E(X^2) - E(X)^2
+                divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.reduction_axis_size)} : f32")
+                if self.reduction_axis_size - 1 > 0:
+                    divider2 = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.reduction_axis_size-1)} : f32")
+                else:
+                    divider2 = divider
+
                 if self.buffer_types[name][1] > 1:
                     divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to {new_reduced_shape}")
+                    divider_vec2 = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider2} : f32 to {new_reduced_shape}")
                 else:
                     divider_vec = divider
+                    divider_vec2 = divider2
 
                 if self.current_node.node.origin_node: # FIXME: This is a temporary solution
-                    # mean = E(X) / N
+                    # mean = SUM(X) / N
                     self.reduction_mean.append(self.cse.generate(self.reductions_suffix, f"arith.divf %{out}, %{divider_vec} : {new_reduced_shape}"))
                     out = self.reduction_mean[i]
                 else:
@@ -1032,43 +992,37 @@ def store_reduction_epilogue(self, name, index, value):
                     sqr_mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{out}, %{divider_vec} : {new_reduced_shape}")
                     mean_sqr = self.cse.generate(self.reductions_suffix, f"arith.mulf %{self.reduction_mean[i]}, %{self.reduction_mean[i]} : {new_reduced_shape}")
                     variance = self.cse.generate(self.reductions_suffix, f"arith.subf %{sqr_mean}, %{mean_sqr} : {new_reduced_shape}")
-                    m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec} : {new_reduced_shape}")
+                    m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec2} : {new_reduced_shape}")
                     out = m2
 
+            final_zero_var_list[-1] = f"%{body_index_var}"
+            final_compute_index_var = ",".join(final_zero_var_list)
             operation = "affine.vector_store"
-            line = f"{operation} %{out}, %{sram_var}[%{index_var}] : {tile_shape}, {new_reduced_shape}"
+            line = f"{operation} %{out}, %{sram_var}[{final_compute_index_var}] : {final_tile_shape}, {new_reduced_shape}"
             self.reductions_suffix.writeline(DeferredLine(name, line))
 
         # MVOUT Encoding
         # Generate DMA instruction
-        index_var = self.reduction_idx
-        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, type_name, dram_var, index_var, sram_var, sram_index_var,
-                                f"{name}_tag", dram_shape, tile_shape, tile_stride)
+        attribute = f"{{dram_stride={dram_stride}, sram_stride={final_tile_stride}, padding=0}}"
+        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                f"{name}_tag", dram_shape, final_tile_shape, attribute)
         self.reductions_suffix.writeline(DeferredLine(name, code))
 
-    def get_scratchpad_buffer(self, dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, buffer=None):
-        return super().get_scratchpad_buffer(dtype, name, tile_size_per_lane, dram_tile_shape, index_var, raw_index, True, buffer=buffer)
-
-    def set_tile_size(self, template_epilogue_info, prologue=False):
-        tile_desc = mlir_common.MLIRMultiDimTile(template_epilogue_info['tile_size'],
-            self.vector_lane,
-            vlane_split_axis=template_epilogue_info['vlane_split_axis'],
-            vlane_stride=template_epilogue_info['vlane_stride'])
+    def set_tile_size(self, template_fusion_info, prologue=False):
+        tile_desc = template_fusion_info["dram_tile_desc"]
+        if "dim_aliasing" in template_fusion_info:
+            self.dim_aliasing = template_fusion_info["dim_aliasing"]
 
-        if "reuse_buffer_names" in template_epilogue_info:
-            self.reuse_buffer_names.update(template_epilogue_info["reuse_buffer_names"])
-
-        if 'nr_rdim' in template_epilogue_info and template_epilogue_info['nr_rdim']==1:
+        if 'nr_rdim' in template_fusion_info and template_fusion_info['nr_rdim']==1:
             tile_desc.nr_rdim = 1
             numel_per_lane = tile_desc.get_numel_per_lane()
-            reduction_axis_size = tile_desc.get_tile_size()[-2]
+            reduction_axis_size = tile_desc.get_tile_size()[-1]
             nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
             tile_desc.vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality...
 
             self.reduction_fusion = True
-            self.reduction_axis_size =  tile_desc.get_tile_size()[-2]
-            self.reduction_nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size
-            self.reduction_idx = template_epilogue_info["reduction_idx"]
+            self.reduction_axis_size =  tile_desc.get_tile_size()[-1]
+            self.reduction_nr_outer_loop = nr_outer_loop
             self.reduction_loop_idx = "reduce_loop_idx"
             self.compute_body_loop.size = reduction_axis_size
             self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop
@@ -1083,6 +1037,14 @@ def set_tile_size(self, template_epilogue_info, prologue=False):
                 self.compute_body_loop.step = tile_desc.get_compute_vec_size()
         return tile_desc
 
+    def rename_indexing(self, index) -> sympy.Expr:
+        for dim_name, dim_aliased_name in self.dim_aliasing.items():
+            index = index.subs(sympy.Symbol(dim_name), sympy.Symbol("tmp_"+dim_aliased_name))
+        # To avoid this case ({"index0":"index1", "index1":"index0"})
+        for dim_aliased_name in self.dim_aliasing.values():
+            index = index.subs(sympy.Symbol("tmp_"+dim_aliased_name), sympy.Symbol(dim_aliased_name))
+        return index
+
 class MLIRTemplateCaller(CUDATemplateCaller):
     def __str__(self):
         return f"MLIRTemplateCaller(source_file={self.bmreq.source_file})"

From 4fd7b6949f302ad995c539369c5f1b96d2cea9ad Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 04:42:20 +0000
Subject: [PATCH 380/432] [CI+Test] Add fusion test + update test case

---
 .github/workflows/pull-request.yml        | 71 +++++++++++++++++------
 .github/workflows/pull-request_mobile.yml | 71 +++++++++++++++++------
 tests/Fusion/test_matmul_reduction.py     | 35 +++++------
 tests/Fusion/test_prologue_fusion.py      | 28 +++++++--
 tests/test_conv2d.py                      | 10 +++-
 tests/test_matmul.py                      | 23 +++++++-
 tests/test_reduce.py                      | 11 ++--
 7 files changed, 178 insertions(+), 71 deletions(-)

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 3dbb3e36..9d440df6 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -493,12 +493,7 @@ jobs:
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+
       - name: Run test_matmul_activation.py
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
@@ -508,12 +503,7 @@ jobs:
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+
       - name: Run test_matmul_scalar.py
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
@@ -523,12 +513,57 @@ jobs:
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+
+      - name: Run test_matmul_reduction.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_matmul_reduction.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py
+
+      - name: Run test_matmul_layernorm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_matmul_layernorm.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_layernorm.py
+
+      - name: Run test_bmm_reduction.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_bmm_reduction.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_bmm_reduction.py
+
+      - name: Run test_prologue_fusion.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_prologue_fusion.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_prologue_fusion.py
+
+      - name: Run test_transformer_fusion.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_transformer_fusion.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_transformer_fusion.py
+
       - name: Run test_conv_fusion.py
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
diff --git a/.github/workflows/pull-request_mobile.yml b/.github/workflows/pull-request_mobile.yml
index 945bac3b..45d73fa8 100644
--- a/.github/workflows/pull-request_mobile.yml
+++ b/.github/workflows/pull-request_mobile.yml
@@ -493,12 +493,7 @@ jobs:
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+
       - name: Run test_matmul_activation.py
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
@@ -508,12 +503,7 @@ jobs:
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+
       - name: Run test_matmul_scalar.py
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
@@ -523,12 +513,7 @@ jobs:
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
             -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GIT_ACCESS_TOKEN }}
+
       - name: Run test_conv_fusion.py
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
@@ -539,6 +524,56 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py
 
+      - name: Run test_matmul_reduction.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_matmul_reduction.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py
+
+      - name: Run test_matmul_layernorm.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_matmul_layernorm.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_layernorm.py
+
+      - name: Run test_bmm_reduction.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_bmm_reduction.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_bmm_reduction.py
+
+      - name: Run test_prologue_fusion.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_prologue_fusion.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_prologue_fusion.py
+
+      - name: Run test_transformer_fusion.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_transformer_fusion.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_transformer_fusion.py
+
   test_moe:
     name: Run test_moe
     runs-on: self-hosted
diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py
index 07dd914d..31ea1b0d 100644
--- a/tests/Fusion/test_matmul_reduction.py
+++ b/tests/Fusion/test_matmul_reduction.py
@@ -17,24 +17,22 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-def test_matmul_reduce(device, size=512):
-    def matmul_fused(a, b, c):
+def test_matmul_reduce(device, M=512, N=512, K=512):
+    def matmul_fused(a, b):
         result = torch.matmul(a, b)
         return result, result.max(dim=-2).values
     torch.manual_seed(0)
-    N = size
-    input = torch.randn(N, N)
-    weight = torch.randn(N, N)
-    #input = torch.arange(1, N * N + 1, dtype=torch.float32).reshape(N, N).to(dtype=torch.float32)
-    #weight = torch.eye(N, dtype=torch.float32)
+    input = torch.randn(M, K)
+    weight = torch.randn(K, N)
+    #input = torch.arange(1, M * K + 1, dtype=torch.float32).reshape(M, K).to(dtype=torch.float32)
+    #weight = torch.eye(K, dtype=torch.float32)
     x1 = input.to(device=device)
     w1 = weight.to(device=device)
     x2 = input.to("cpu")
     w2 = weight.to("cpu")
-    c = 7
     opt_fn = torch.compile(dynamic=False)(matmul_fused)
-    res = opt_fn(x1, w1, c)
-    y = matmul_fused(x2, w2, c)
+    res = opt_fn(x1, w1)
+    y = matmul_fused(x2, w2)
     test_result("Matmul Reduction Fusion activation", res[0], y[0])
     test_result("Matmul Reduction Fusion reduction", res[1], y[1])
 
@@ -45,7 +43,7 @@ def matmul_fused(a, b, c):
         return result, var, mean
     torch.manual_seed(0)
     N = size
-    input = torch.randn(3072, 768)
+    input = torch.randn(1024, 768)
     weight = torch.randn(512, 768)
     #input = torch.arange(1, N * N + 1, dtype=torch.float32).reshape(N, N).to(dtype=torch.float32)
     #weight = torch.eye(N, dtype=torch.float32)
@@ -61,17 +59,16 @@ def matmul_fused(a, b, c):
     test_result("Matmul var_mean Fusion reduction", res[1], y[1])
     test_result("Matmul var_mean Fusion reduction", res[2], y[2])
 
-def test_matmul_add_var_mean(device, size=512):
+def test_matmul_add_var_mean(device, M=768, N=512, K=3072):
     def matmul_fused(a, b, c, d):
         result = torch.matmul(a, b.T) + c.T
         var, mean = torch.var_mean(result + d, dim=-2)
         return result, var, mean
     torch.manual_seed(0)
-    N = size
-    input = torch.randn(768, 3072)
-    weight = torch.randn(512, 3072)
-    bias = torch.randn(768, 512)
-    residual = torch.randn(768,512)
+    input = torch.randn(M, K)
+    weight = torch.randn(N, K)
+    bias = torch.zeros(N, M)
+    residual = torch.randn(M,N)
     x1 = input.to(device=device)
     w1 = weight.to(device=device)
     b1 = bias.to(device=device)
@@ -95,6 +92,6 @@ def matmul_fused(a, b, c, d):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    #test_matmul_reduce(device)
+    test_matmul_reduce(device, 3072, 512, 768)
     test_matmul_var_mean(device)
-    #test_matmul_add_var_mean(device)
+    test_matmul_add_var_mean(device)
diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py
index 926782be..d5d1cdb1 100644
--- a/tests/Fusion/test_prologue_fusion.py
+++ b/tests/Fusion/test_prologue_fusion.py
@@ -53,13 +53,28 @@ def matmul_fused(a, b, c):
     y = matmul_fused(x2, w2, c2)
     test_result("Matmul Element-wise Fusion Forward", res, y)
 
-def test_elem_bmm_fusion(device, batch_size=1, m=512, n=512, k=64):
+def test_elem_bmm_weight_fusion(device, batch_size=1, m=512, n=512, k=64):
     def bmm(a, b, c, d):
-        return torch.bmm(a , (d - b)/c)
+        return torch.bmm(a , (d+b)*c)
     torch.manual_seed(0)
     a = torch.randn(batch_size, m, k).to(device=device)
     b = torch.randn(batch_size, 1, n).to(device=device)
-    c = torch.randn(batch_size, 1, n) * 1000
+    c = torch.randn(batch_size, 1, n)
+    c = c.to(device=device)
+    d = torch.randn(batch_size, k, n).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(bmm)
+    res = opt_fn(a, b, c, d)
+    out = bmm(a.cpu(), b.cpu(), c.cpu(), d.cpu())
+    print(torch.max(torch.abs(res.cpu() - out)))
+    test_result("BMM Element-wise Fusion Forward", res, out)
+
+def test_elem_bmm_input_fusion(device, batch_size=1, m=512, n=512, k=64):
+    def bmm(a, b, c, d):
+        return torch.bmm((a+b)*c , d)
+    torch.manual_seed(0)
+    a = torch.randn(batch_size, m, k).to(device=device)
+    b = torch.randn(batch_size, 1, k).to(device=device)
+    c = torch.randn(batch_size, 1, k)
     c = c.to(device=device)
     d = torch.randn(batch_size, k, n).to(device=device)
     opt_fn = torch.compile(dynamic=False)(bmm)
@@ -76,6 +91,7 @@ def bmm(a, b, c, d):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_elem_broadcast_fusion(device)
-    test_elem_fusion(device)
-    test_elem_bmm_fusion(device, batch_size=12, m=64, n=512, k=512)
\ No newline at end of file
+    #test_elem_broadcast_fusion(device)
+    #test_elem_fusion(device)
+    #test_elem_bmm_input_fusion(device, batch_size=4, m=512, n=512, k=64)
+    test_elem_bmm_weight_fusion(device, batch_size=12, m=512, n=512, k=64)
\ No newline at end of file
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 9d8b855a..8667792a 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -43,6 +43,10 @@ def custom_conv2d(a, b, bias):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_conv2d(device, batch_size=1, in_channels=128, out_channels=128, input_size=28, kernel_size=3, stride=1, padding=1)
-    test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64, kernel_size=7, stride=2, padding=3)
-    test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64, kernel_size=7, stride=1, padding=3)
+    test_conv2d(device, batch_size=1, in_channels=3, out_channels=32, input_size=32, kernel_size=3, stride=1, padding=1)
+    test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=2, padding=3)
+    test_conv2d(device, batch_size=2, in_channels=3, out_channels=64, input_size=32//2, kernel_size=7, stride=1, padding=3)
+    test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3)
+    test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3)
+    test_conv2d(device, batch_size=2, in_channels=128, out_channels=256, input_size=13, kernel_size=5, stride=1, padding=2)
+    test_conv2d(device, batch_size=2, in_channels=128, out_channels=512, input_size=14, kernel_size=7, stride=1, padding=3)
diff --git a/tests/test_matmul.py b/tests/test_matmul.py
index bd219051..6f41468b 100644
--- a/tests/test_matmul.py
+++ b/tests/test_matmul.py
@@ -50,6 +50,24 @@ def custom_matmul(bias, a, b):
     y = custom_matmul(b2, x2, w2)
     test_result("Addmm Forward", res, y)
 
+def test_addmm2(device, input_size=128, hidden_size=128, output_size=128):
+    def custom_matmul(bias, a, b):
+        return torch.matmul(a, b) #+ bias
+    torch.manual_seed(0)
+    input = torch.randn(input_size, hidden_size)
+    weight = torch.randn(hidden_size, output_size)
+    bias = torch.randn(input_size, 1, dtype=torch.float32)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    b1 = bias.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    b2 = bias.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(custom_matmul)
+    res = opt_fn(b1, x1, w1)
+    y = custom_matmul(b2, x2, w2)
+    test_result("Addmm2 Forward", res, y)
+
 def test_linear(device, input_size=128, hidden_size=128, output_size=128):
     def custom_linear(a, b, bias):
         linear = torch.nn.Linear(hidden_size, output_size)
@@ -83,7 +101,10 @@ def custom_linear(a, b, bias):
     test_matmul(device, 128, 128, 128)
     test_matmul(device, 256, 256, 256)
     test_matmul(device, 128, 256, 256)
-    test_matmul(device, 129, 61, 56)
+    test_matmul(device, 128, 63, 56)
     test_addmm(device, 128, 256, 512)
     test_addmm(device, 128, 256, 512, bias_rank=2)
     test_addmm(device, 129, 61, 56)
+    test_addmm2(device, 129, 61, 56)
+    test_addmm(device, 129*4, 61*4, 56*4)
+    test_addmm2(device, 129*4, 61*4, 56*4)
diff --git a/tests/test_reduce.py b/tests/test_reduce.py
index c1556787..e1a84b7f 100644
--- a/tests/test_reduce.py
+++ b/tests/test_reduce.py
@@ -50,9 +50,8 @@ def reduce_sum(a, dim, keepdim):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    #test_reduce_sum(device, (29, 47), 1, keepdim=True)
-    #test_reduce_sum(device, (17, 68), 0, keepdim=True)
-    #test_reduce_sum(device, (327, 447), 1, keepdim=True)
-    #test_reduce_sum(device, (327, 447), 0, keepdim=True)
-    test_reduce_sum2(device, shape)
-
+    test_reduce_sum(device, (29, 47), 1, keepdim=True)
+    test_reduce_sum(device, (17, 68), 0, keepdim=True)
+    test_reduce_sum(device, (327, 447), 1, keepdim=True)
+    test_reduce_sum(device, (327, 447), 0, keepdim=True)
+    test_reduce_sum2(device, shape)
\ No newline at end of file

From 9ae7a0849526f1585408c1b1124b4e716a5493f4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 05:32:09 +0000
Subject: [PATCH 381/432] [Fix] Fix var_mean codegen + cheatsheet folder issue

---
 PyTorchSimFrontend/mlir/mlir_template.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index ccb9b0d1..b5aa2593 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -2,6 +2,7 @@
 import itertools
 import textwrap
 import re
+import os
 import contextlib
 import math
 import sympy
@@ -218,7 +219,9 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p
                     used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
                     check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane)
                     if check_spad_size:
-                        file_path = f"{CONFIG_TORCHSIM_DIR}/validation/gemm_candidates/gemm_{M}_{K}_{N}.txt"
+                        dir_path = f"{CONFIG_TORCHSIM_DIR}/validation/gemm_candidates"
+                        os.makedirs(dir_path, exist_ok=True)
+                        file_path = f"{dir_path}/gemm_{M}_{K}_{N}.txt"
                         line_to_write = f"{tile_M} {tile_K} {tile_N}\n"
                         try:
                             with open(file_path, "r") as f:
@@ -978,10 +981,8 @@ def store_reduction_epilogue(self, name, index, value):
 
                 if self.buffer_types[name][1] > 1:
                     divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to {new_reduced_shape}")
-                    divider_vec2 = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider2} : f32 to {new_reduced_shape}")
                 else:
                     divider_vec = divider
-                    divider_vec2 = divider2
 
                 if self.current_node.node.origin_node: # FIXME: This is a temporary solution
                     # mean = SUM(X) / N
@@ -992,7 +993,7 @@ def store_reduction_epilogue(self, name, index, value):
                     sqr_mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{out}, %{divider_vec} : {new_reduced_shape}")
                     mean_sqr = self.cse.generate(self.reductions_suffix, f"arith.mulf %{self.reduction_mean[i]}, %{self.reduction_mean[i]} : {new_reduced_shape}")
                     variance = self.cse.generate(self.reductions_suffix, f"arith.subf %{sqr_mean}, %{mean_sqr} : {new_reduced_shape}")
-                    m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec2} : {new_reduced_shape}")
+                    m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec} : {new_reduced_shape}")
                     out = m2
 
             final_zero_var_list[-1] = f"%{body_index_var}"

From 9e0e2d46bf8130caad01f5fe5ba9a20cd082d7e3 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 06:12:38 +0000
Subject: [PATCH 382/432] [Frontend] Add exception handling for reduction loop
 only kernel

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 84418ec7..615925d1 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -147,6 +147,15 @@ def codegen_nodes(self, nodes):
         _, (group, reduction_group) = max(
             nodes, key=lambda x: int(x.is_reduction())
         ).group
+
+        # There is no normal loop, then revert simplified group
+        if len(group) == 0:
+            for idx, node in enumerate(nodes):
+                self.revert_group(node)
+            _, (group, reduction_group) = max(
+                nodes, key=lambda x: int(x.is_reduction())
+            ).group
+
         ex_kernel = self.target_kernel(kernel_group=self.kernel_group)
         ex_kernel.kernel_group = self.kernel_group
 

From 2109244aca1591ef1b5bde9df40d51d6daaa7d32 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 06:15:00 +0000
Subject: [PATCH 383/432] [Template] Fix a minor bug in GEMM template

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index ace6ea9d..f706c2e5 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -143,6 +143,7 @@ def render(self,
         if (M == 0) or (N == 0) or (K == 0): # exception for MoE
             template = EMPTY_TEMPLATE
             nr_rdim = 0
+            epilogue_dim_aliasing = {}
         elif n_epilogue_node>=1 and epilogue_nodes[0].is_reduction():
             template = GEMM_REDUCTION_TEMPLATE
             epilogue_dim_aliasing = {"index0":"index1", "index1":"index0"}

From 3a8e0f8a0cb485f18fa6a6a14ca90c46b1221893 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 07:31:31 +0000
Subject: [PATCH 384/432] [Frontend] Fix dram stride calculate logic

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 6dbe9047..68aa1b11 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1550,8 +1550,6 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
             local_dims = total_dims # Brodatcast tile shape
 
         index_var = self.parse_indices(index, buffer=buffer)
-        input_argument = [f"index{str(i)}" for i in local_dims]
-        dram_stride = [index.coeff(sympy.Symbol(arg)) for arg in input_argument]
 
         if kg_tile_desc.vlane_split_axis in local_dims:
             local_vlane_split_axis = local_dims.index(kg_tile_desc.vlane_split_axis)
@@ -1619,6 +1617,16 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
             # Update
             local_tile_desc.set_tile_size(new_tile_size)
             local_tile_desc.vlane_split_axis = new_vlane_split_axis
+
+        # Calculate dram stride
+        if index.is_Symbol:
+            dram_stride = [0] * local_tile_desc.get_nr_dim()
+            dim_idx = int(str(index)[5:])
+            dram_stride[dim_idx] = 1
+        elif index.is_Number:
+            dram_stride = [0] * local_tile_desc.get_nr_dim()
+        else:
+            dram_stride = [arg.as_coeff_mul()[0] for arg in index.as_ordered_terms()]
         return local_tile_desc, index_var, dram_stride
 
     def get_dma_code(self, dma_type_name, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,

From ae09ef2066e7187f3473ae11320881baa1a51a40 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 08:31:43 +0000
Subject: [PATCH 385/432] [Frontend] Fix dram_stride

---
 .../mlir/mlir_codegen_backend.py              | 20 ++++++++++++++++---
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  1 +
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 68aa1b11..6dd4f66a 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -4,6 +4,7 @@
 import os
 import math
 import torch
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from torch._dynamo.utils import dynamo_timed
 from torch._inductor.codegen import cpp, wrapper, common, memory_planning
@@ -1619,14 +1620,27 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
             local_tile_desc.vlane_split_axis = new_vlane_split_axis
 
         # Calculate dram stride
+        dram_stride = [0] * local_tile_desc.get_nr_dim()
         if index.is_Symbol:
-            dram_stride = [0] * local_tile_desc.get_nr_dim()
             dim_idx = int(str(index)[5:])
             dram_stride[dim_idx] = 1
         elif index.is_Number:
-            dram_stride = [0] * local_tile_desc.get_nr_dim()
+            pass
         else:
-            dram_stride = [arg.as_coeff_mul()[0] for arg in index.as_ordered_terms()]
+            dram_dict = defaultdict(list)
+            # Assume that div will have high priority than mod
+            for arg in index.as_ordered_terms():
+                coeff, dim = arg.as_coeff_mul()
+                real_dim = list(dim[0].free_symbols)[0]
+                dram_dict[str(real_dim)].append(coeff)
+            # Add missing dims if not added
+            max_dim = len(self.ranges) if not store_reduction else len(self.ranges) - 1
+            for i in range(max_dim):
+                target_dim = f"index{i}"
+                if target_dim not in str(index):
+                    dram_dict[target_dim] = [0]
+            sorted_keys = sorted(dram_dict.keys())
+            dram_stride = sum((dram_dict[key] for key in sorted_keys), [])
         return local_tile_desc, index_var, dram_stride
 
     def get_dma_code(self, dma_type_name, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 615925d1..20528d80 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -260,6 +260,7 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
 
             tile_desc = kernel.set_tile_size(kernel.epilogue_info)
             kernel.kernel_group.set_tile_info(tile_desc)
+            kernel.call_ranges = None
             if epilogue_nodes:
                 with kernel.epilogue_buffer_group.as_local():
                     kernel.load = kernel.load_epilogue

From 39405e7f843c467cc3f5c7e8db7f7a1ffe29309c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 11:09:54 +0000
Subject: [PATCH 386/432] [Frontend] Fix 1

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 6dd4f66a..ab04d3e6 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1631,6 +1631,8 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
             # Assume that div will have high priority than mod
             for arg in index.as_ordered_terms():
                 coeff, dim = arg.as_coeff_mul()
+                if len(dim) == 0:
+                    continue
                 real_dim = list(dim[0].free_symbols)[0]
                 dram_dict[str(real_dim)].append(coeff)
             # Add missing dims if not added

From 5776d03eab30a9e70afa2ffeee85a38df1c3a0b9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 11:35:02 +0000
Subject: [PATCH 387/432] [Frontend] Fix wip

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 20528d80..67a8b026 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -113,6 +113,8 @@ def can_fuse_horizontal(self, node1, node2):
             # Revert act_node.group : simplify_and_reorder() modified _body, _size, group
             if template_node.group != act_node.group:
                 self.revert_group(act_node)
+                if template_node.group != act_node.group:
+                    return False
             return True
 
         # Check elementwise fusion
@@ -214,6 +216,7 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size
 
     def codegen_template_code(self, kernel, render, template_node, prologue_nodes, epilogue_nodes):
         with kernel:
+            _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
             for node in [template_node, *prologue_nodes, *epilogue_nodes]:
                 node.mark_run()
             partial_code = render()
@@ -275,12 +278,12 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
                     for node in epilogue_nodes:
                         node.codegen((vars, reduction_vars))
 
-        with V.set_kernel_handler(kernel):
-            src_code = (
-                partial_code
-                if isinstance(partial_code, str)
-                else partial_code.finalize()
-            )
+            with V.set_kernel_handler(kernel):
+                src_code = (
+                    partial_code
+                    if isinstance(partial_code, str)
+                    else partial_code.finalize()
+                )
         # For consistency, white space could make wrong write_path
         buffer = IndentedBuffer()
         buffer.splice(src_code)
@@ -301,7 +304,6 @@ def codegen_template(self, template_node, epilogue_nodes):
         _, (numel, rnumel) = template_node.group
         template_buffer = template_node.node
         kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group)
-        _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
 
         src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes)
         wrapper = V.graph.wrapper_code

From 7699887f87cf65298f851ac2bc55dc5037ea7b44 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 13:37:54 +0000
Subject: [PATCH 388/432] Revert final render position

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 67a8b026..e5d28779 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -278,12 +278,12 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
                     for node in epilogue_nodes:
                         node.codegen((vars, reduction_vars))
 
-            with V.set_kernel_handler(kernel):
-                src_code = (
-                    partial_code
-                    if isinstance(partial_code, str)
-                    else partial_code.finalize()
-                )
+        with V.set_kernel_handler(kernel):
+            src_code = (
+                partial_code
+                if isinstance(partial_code, str)
+                else partial_code.finalize()
+            )
         # For consistency, white space could make wrong write_path
         buffer = IndentedBuffer()
         buffer.splice(src_code)
@@ -304,6 +304,7 @@ def codegen_template(self, template_node, epilogue_nodes):
         _, (numel, rnumel) = template_node.group
         template_buffer = template_node.node
         kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group)
+        _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
 
         src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes)
         wrapper = V.graph.wrapper_code

From b8868671d0f9513ac151ac8d82e85b92f925e49f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 14:47:41 +0000
Subject: [PATCH 389/432] [Frontend] Do not fuse for edge case

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 3 +++
 PyTorchSimFrontend/mlir/mlir_template.py   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index e5d28779..a5d8bd3d 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -115,6 +115,9 @@ def can_fuse_horizontal(self, node1, node2):
                 self.revert_group(act_node)
                 if template_node.group != act_node.group:
                     return False
+                # We don't fuse this case...
+                if template_node.group[1][0][0] == 1:
+                    return False
             return True
 
         # Check elementwise fusion
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index b5aa2593..9e7a104e 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -1030,6 +1030,9 @@ def set_tile_size(self, template_fusion_info, prologue=False):
             self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop)
         else:
             tile_desc.vec_size=64
+            if tile_desc.get_numel_per_lane() < tile_desc.vec_size:
+                tile_desc.vec_size = tile_desc.get_numel_per_lane()
+
             if prologue:
                 self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane()
                 self.prologue_compute_body_loop.step = tile_desc.get_compute_vec_size()

From 0bded100531682bf61e3c7210a7ab1a308ac5f45 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 11 Jul 2025 15:24:44 +0000
Subject: [PATCH 390/432] [Frontend] Fusion condition change

---
 PyTorchSimFrontend/mlir/mlir_common.py     |  5 +++++
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 11 +++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 00bf4169..9151ac0b 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -499,6 +499,11 @@ def dummy_tile_size():
                 tile_size[-1] = self.vector_lane
                 tile_size[-2] = 4 * self.vector_lane
                 tile_size[-3] = 2
+            elif len(tile_size) == 4:
+                tile_size[-1] = self.vector_lane
+                tile_size[-2] = 4 * self.vector_lane
+                tile_size[-3] = 2
+                tile_size[-4] = 1
             else:
                 raise NotImplementedError("dummy tile size fail!")
             return tile_size
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index a5d8bd3d..f1a2513e 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -94,6 +94,8 @@ def can_fuse_horizontal(self, node1, node2):
         if node1.is_template() or node2.is_template():
             # Don't fuse maxpool template code
             from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
+            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
+            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             if node1.is_template() and len(node1.get_nodes())==1 and isinstance(node1.node.template, MLIRMaxPoolTemplate) or \
                 node2.is_template() and len(node1.get_nodes())==1 and isinstance(node2.node.template, MLIRMaxPoolTemplate):
                 return False
@@ -112,12 +114,13 @@ def can_fuse_horizontal(self, node1, node2):
 
             # Revert act_node.group : simplify_and_reorder() modified _body, _size, group
             if template_node.group != act_node.group:
-                self.revert_group(act_node)
-                if template_node.group != act_node.group:
-                    return False
                 # We don't fuse this case...
-                if template_node.group[1][0][0] == 1:
+                if (isinstance(template_node, MLIRBMMTemplate) or isinstance(template_node, MLIRGemmTemplate)) and template_node.group[1][0][0] == 1:
                     return False
+
+                if template_node.group[1][0] != act_node.get_nodes()[0].node.data.get_size():
+                    return False
+                self.revert_group(act_node)
             return True
 
         # Check elementwise fusion

From b3c5d9ce87456bb984eb58abe5b185b1b4838b7f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 12 Jul 2025 06:54:47 +0000
Subject: [PATCH 391/432] [Frontend/Fusion] Add prologue fusion condition

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index f1a2513e..9b07d3c7 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -64,6 +64,8 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 return False
             if len(node1.read_writes.writes) != 1:
                 return False
+            if len(node1.users) != 1:
+                return False
             if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
                 node1 = self.revert_group(node1)
                 return True

From b2e7110946af3c0b7e64ff13257e11ae10eef517 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 12 Jul 2025 10:09:53 +0000
Subject: [PATCH 392/432] [Frontend] Fix dram_stride + tile_size for reduction
 only case

---
 .../mlir/mlir_codegen_backend.py              | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index ab04d3e6..377eec7a 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1212,7 +1212,6 @@ def store_reduction(self, name, index, value):
             vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
         sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index)
         if self.welford_reduce_out is not None:
-            # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
             # mean
             divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.ranges[self.reduction_depth])} : f32")
@@ -1559,9 +1558,14 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
 
         # Case 0. Tile is 0-D scalar
         if len(local_dims) == 0:
-            local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vlane_stride])         # Force it to use vector instruction.
-            local_tile_desc.vlane_split_axis = local_vlane_split_axis    # last axis
-            local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
+            if not store_reduction:
+                local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vlane_stride])         # Force it to use vector instruction.
+                local_tile_desc.vlane_split_axis = local_vlane_split_axis    # last axis
+                local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
+            else:
+                local_tile_desc.set_tile_size([1])
+                local_tile_desc.vlane_split_axis = 0
+                local_tile_desc.vlane_stride = 1
             dram_stride = [0] # Edge case
         # Case 1. Tile is 1-D vector type
         elif len(local_dims) == 1 and len(local_dims) <= self.reduction_depth:
@@ -1571,7 +1575,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         # Case 2. Tile is 1-D vector type with reduction
         elif len(local_dims) == 1 and len(local_dims) == self.reduction_depth + 1:
             local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(local_dims[0])])
-            local_tile_desc.vlane_split_axis = local_vlane_split_axis
+            local_tile_desc.vlane_split_axis = local_vlane_split_axis + 1
             local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride
         # Case 3. Tile is 2-D tile
         elif len(local_dims) == 2:
@@ -1643,6 +1647,11 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
                     dram_dict[target_dim] = [0]
             sorted_keys = sorted(dram_dict.keys())
             dram_stride = sum((dram_dict[key] for key in sorted_keys), [])
+
+        # FIXME. It will be nice to modify node instead of this exception handling...
+        if len(self.itervars) == 1 and self.reduction_depth == 0:
+            # In case of reduction loop only case, we will add dummy loop so shift it once
+            dram_stride = [0] + dram_stride[:-1]
         return local_tile_desc, index_var, dram_stride
 
     def get_dma_code(self, dma_type_name, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,

From dfd7809a8c1b62687b773c6501b62024d39ff2f9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 12 Jul 2025 12:00:29 +0000
Subject: [PATCH 393/432] [Frontend/Fusion] Add nop op fusion condition

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 9b07d3c7..3b354b44 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -35,6 +35,8 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
         base_template_node2 = [node for node in node2.get_nodes() if node.is_template()]
         if node1.get_device() != node2.get_device():
             return False
+        if not (isinstance(node1, (SchedulerNode, FusedSchedulerNode)) and isinstance(node2, (SchedulerNode, FusedSchedulerNode))):
+            return False
 
         if len(base_template_node1) == 1 and len(base_template_node2) == 0 and extension_config.CONFIG_FUSION_REDUCTION:
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate

From 237905f0c6597d60cde8fb2f8aca6dc217421e7f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 12 Jul 2025 12:14:28 +0000
Subject: [PATCH 394/432] [Frontend] Handle edge case of parse_index_list

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 377eec7a..4c20fced 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -957,7 +957,8 @@ def parse_indices(self, expr, buffer=None, comments="") -> common.CSEVariable:
     def parse_index_list(self, expr_list:list, buffer=None) -> common.CSEVariable:
         if buffer is None:
             buffer = self.applys
-        expr_list = [arg for arg in expr_list if arg != sympy.Number(0)]
+        zero_var = self.get_const_cse(0)
+        expr_list = [arg if arg != sympy.Number(0) else sympy.Symbol(str(zero_var)) for arg in expr_list]
 
         if len(expr_list) == 1 and expr_list[0].is_number:
             # Constant case

From 0b9c0c39fc921a6ccb865c048d7ba803952c2d32 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 12 Jul 2025 12:34:31 +0000
Subject: [PATCH 395/432] [Frontend] Use positional dims in parse_index_list; return 0 when .spad is missing

---
 PyTorchSimFrontend/llvm/llvm_caller_codegen.py  | 2 +-
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
index 835d9b80..3690f533 100644
--- a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
+++ b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py
@@ -231,6 +231,6 @@ def get_spad_size(self, binary_path):
                 spad_end = int(parts[1], 16)
 
         if spad_start is None or spad_end is None:
-            raise ValueError("Could not find .spad addresses")
+            return 0
         spad_size = spad_end - spad_start
         return spad_size
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 4c20fced..5c344d1d 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -959,6 +959,7 @@ def parse_index_list(self, expr_list:list, buffer=None) -> common.CSEVariable:
             buffer = self.applys
         zero_var = self.get_const_cse(0)
         expr_list = [arg if arg != sympy.Number(0) else sympy.Symbol(str(zero_var)) for arg in expr_list]
+        dim_list = [f"d{i}" for i in range(len(expr_list))]
 
         if len(expr_list) == 1 and expr_list[0].is_number:
             # Constant case
@@ -972,18 +973,18 @@ def parse_index_list(self, expr_list:list, buffer=None) -> common.CSEVariable:
         for idx, arg in enumerate(expr_list):
             if arg.is_Mul and arg.args[0].is_number:
                 new_arg = sympy.Symbol(str(self.convert_index(arg.args[1], buffer)))
-                new_expr_list[idx] = arg.subs(arg.args[1], new_arg)
+                new_expr_list[idx] = arg.subs(arg.args[1], dim_list[idx])
                 indices.append(str(new_arg))
             elif not arg.is_number:
                 new_arg = sympy.Symbol(str(self.convert_index(arg, buffer)))
-                new_expr_list[idx] = new_arg
+                new_expr_list[idx] = new_arg.subs(new_arg, dim_list[idx])
                 indices.append(str(new_arg))
             else:
                 new_expr_list[idx] = arg
 
         # Extract index var
         expr_str = str(sum(new_expr_list))
-        args = ", ".join(map(str, indices))
+        args = ", ".join(map(str, dim_list))
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[] -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
         index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[]")

From 5bcc9693d8ea6cae042cf6340569e2f05a444eba Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 14 Jul 2025 05:40:13 +0000
Subject: [PATCH 396/432] [Frontend] Fix apply gen code

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 5 ++++-
 PyTorchSimFrontend/mlir/mlir_conv_template.py   | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 5c344d1d..99a48fb6 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -958,7 +958,7 @@ def parse_index_list(self, expr_list:list, buffer=None) -> common.CSEVariable:
         if buffer is None:
             buffer = self.applys
         zero_var = self.get_const_cse(0)
-        expr_list = [arg if arg != sympy.Number(0) else sympy.Symbol(str(zero_var)) for arg in expr_list]
+        expr_list = [arg for arg in expr_list]
         dim_list = [f"d{i}" for i in range(len(expr_list))]
 
         if len(expr_list) == 1 and expr_list[0].is_number:
@@ -980,7 +980,10 @@ def parse_index_list(self, expr_list:list, buffer=None) -> common.CSEVariable:
                 new_expr_list[idx] = new_arg.subs(new_arg, dim_list[idx])
                 indices.append(str(new_arg))
             else:
+                const_var = self.get_const_cse(int(arg))
+                new_arg = sympy.Symbol(f"{const_var}")
                 new_expr_list[idx] = arg
+                indices.append(str(new_arg))
 
         # Extract index var
         expr_str = str(sum(new_expr_list))
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index cd4ddf82..4792c6ac 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -57,7 +57,7 @@
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
   %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
   %c0 = arith.constant 0 : index
-  {{- kernel.def_local_vars(indent_size=2) }}
+  {{ kernel.def_local_vars(indent_size=2) }}
 
   affine.for %tile_m = 0 to {{ BATCH }} step {{ TILE_M }} {
     affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {

From 831fa9f8e2e4a8f1a297d837d6e4bd52e5ccb09b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 14 Jul 2025 12:42:12 +0000
Subject: [PATCH 397/432] [Frontend] Indirect access fix

---
 .../mlir/mlir_codegen_backend.py               | 18 +++++++++++-------
 PyTorchSimFrontend/mlir/mlir_scheduling.py     |  2 +-
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 99a48fb6..51a79ebd 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -919,19 +919,22 @@ def convert_index(self, expr, buffer):
         index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})")
         return index
 
-    def parse_indices(self, expr, buffer=None, comments="") -> common.CSEVariable:
+    def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> common.CSEVariable:
         if buffer is None:
             buffer = self.applys
 
         # Constant case
-        if expr.is_number:
+        if expr.is_number and len(indirect_dims) == 0:
             return self.get_const_cse(int(expr))
 
         # Identity case
-        if len(expr.args) == 0:
+        if len(expr.args) == 0 and len(indirect_dims) == 0:
             return expr
 
-        args = list(expr.args)
+        if len(expr.args) == 0:
+            args = [expr]
+        else:
+            args = list(expr.args)
         # Sort index variable.. ex) (%index1, %index0)
         args_dict = {term: list(term.free_symbols)[0] for term in args if term.free_symbols}
         sorted_args = sorted(args_dict.keys(), key=lambda term: str(args_dict[term]))
@@ -947,11 +950,12 @@ def parse_indices(self, expr, buffer=None, comments="") -> common.CSEVariable:
                 indices.append(str(new_arg))
 
         # Extract index var
+        indirect_args = [f"%{i}" for i in indirect_dims]
         expr_str = str(expr)
         args = ", ".join(map(str, indices))
-        map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[] -> ({expr_str})>")
+        map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[{','.join(indirect_dims)}] -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
-        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[] {comments}")
+        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[{','.join(indirect_args)}] {comments}")
         return index
 
     def parse_index_list(self, expr_list:list, buffer=None) -> common.CSEVariable:
@@ -1554,7 +1558,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)):
             local_dims = total_dims # Brodatcast tile shape
 
-        index_var = self.parse_indices(index, buffer=buffer)
+        index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims)
 
         if kg_tile_desc.vlane_split_axis in local_dims:
             local_vlane_split_axis = local_dims.index(kg_tile_desc.vlane_split_axis)
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 3b354b44..c8ed9efc 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -122,7 +122,7 @@ def can_fuse_horizontal(self, node1, node2):
                 if (isinstance(template_node, MLIRBMMTemplate) or isinstance(template_node, MLIRGemmTemplate)) and template_node.group[1][0][0] == 1:
                     return False
 
-                if template_node.group[1][0] != act_node.get_nodes()[0].node.data.get_size():
+                if list(template_node.group[1][0]) != list(act_node.get_nodes()[0].node.data.get_size()):
                     return False
                 self.revert_group(act_node)
             return True

From 7abca4d11127462e3e0d1fd15ff3f1a6b166cfb7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 14 Jul 2025 15:20:38 +0000
Subject: [PATCH 398/432] [Frontend/Fusion] Handle fused nodes in template fusion checks and revert_group

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 50 ++++++++++++----------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index c8ed9efc..e63df4fb 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -100,8 +100,11 @@ def can_fuse_horizontal(self, node1, node2):
             from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
-            if node1.is_template() and len(node1.get_nodes())==1 and isinstance(node1.node.template, MLIRMaxPoolTemplate) or \
-                node2.is_template() and len(node1.get_nodes())==1 and isinstance(node2.node.template, MLIRMaxPoolTemplate):
+            template_node1 = next((n for n in node1.get_nodes() if n.is_template()), None)
+            template_node2 = next((n for n in node2.get_nodes() if n.is_template()), None)
+
+            if template_node1 and len(node1.get_nodes()) == 1 and isinstance(template_node1.node.template, MLIRMaxPoolTemplate) or \
+               template_node2 and len(node2.get_nodes()) == 1 and isinstance(template_node2.node.template, MLIRMaxPoolTemplate):
                 return False
 
             # Pointwise check
@@ -111,7 +114,7 @@ def can_fuse_horizontal(self, node1, node2):
                 return False
 
             # Pattern check
-            template_node, act_node = (node1, node2) if node1.is_template() else (node2, node1)
+            template_node, act_node = (template_node1, node2) if template_node1 else (template_node2, node1)
             has_depedency = set(act_node.inverse_users) <= set(template_node.get_nodes())
             if not has_depedency:
                 return False
@@ -119,7 +122,7 @@ def can_fuse_horizontal(self, node1, node2):
             # Revert act_node.group : simplify_and_reorder() modified _body, _size, group
             if template_node.group != act_node.group:
                 # We don't fuse this case...
-                if (isinstance(template_node, MLIRBMMTemplate) or isinstance(template_node, MLIRGemmTemplate)) and template_node.group[1][0][0] == 1:
+                if (isinstance(template_node.node.template, MLIRBMMTemplate) or isinstance(template_node.node.template, MLIRGemmTemplate)) and template_node.group[1][0][0] == 1:
                     return False
 
                 if list(template_node.group[1][0]) != list(act_node.get_nodes()[0].node.data.get_size()):
@@ -132,25 +135,26 @@ def can_fuse_horizontal(self, node1, node2):
             return True
         return False
 
-    def revert_group(self, act_node):
-        args, var_ranges = dependencies.index_vars_no_squeeze(
-                act_node.node.data.get_size(), act_node.node.data.get_reduction_size(), prefix="q"
-        )
-        body = LoopBody(
-            act_node.node.get_store_function(),
-            (args if act_node.node.get_reduction_type() else args[:1]),
-            var_ranges,
-        )
-        index_size = []
-        reduce_size = []
-        for v, s in var_ranges.items():
-            if v in args[0]:
-                index_size.append(s)
-            else:
-                reduce_size.append(s)
-        node_device = act_node.get_device()
-        ranges = (index_size, reduce_size)
-        act_node._sizes, act_node._body, act_node.group = (ranges), body, (node_device, self.group_fn(ranges))
+    def revert_group(self, act_nodes):
+        for act_node in act_nodes.get_nodes():
+            args, var_ranges = dependencies.index_vars_no_squeeze(
+                    act_node.node.data.get_size(), act_node.node.data.get_reduction_size(), prefix="q"
+            )
+            body = LoopBody(
+                act_node.node.get_store_function(),
+                (args if act_node.node.get_reduction_type() else args[:1]),
+                var_ranges,
+            )
+            index_size = []
+            reduce_size = []
+            for v, s in var_ranges.items():
+                if v in args[0]:
+                    index_size.append(s)
+                else:
+                    reduce_size.append(s)
+            node_device = act_node.get_device()
+            ranges = (index_size, reduce_size)
+            act_node._sizes, act_node._body, act_node.group = (ranges), body, (node_device, self.group_fn(ranges))
 
     def group_fn(self, sizes):
         return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes)

From ed3130742b21c5b3e83fa999329891c228e3b54b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 15 Jul 2025 01:58:20 +0000
Subject: [PATCH 399/432] [Frontend] Fix dim_aliasing for conv_template

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_template.py      | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 4792c6ac..73cf710f 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -275,7 +275,7 @@ def render(self,
             dram_var = "Y",
             dram_idx = Y_idx,
             dram_tile_desc = Y_tile_desc,
-            dim_aliasing = {"index0":"c0", "index1":"tile_n", "index2":"o_h", "index3":"tile_m"}
+            dim_aliasing = {"index0":"tile_m", "index1":"tile_n", "index2":"o_h", "index3":"o_w"}
         )
         kernel.exception_nodes["X"] = {"numel" : (I_W+2*PADDING_W)*(I_H+2*PADDING_H)*I_C*BATCH}
         code = self._template_from_string(conv_template).render(**kernel.render_options)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 9e7a104e..f802f8e8 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -1030,8 +1030,6 @@ def set_tile_size(self, template_fusion_info, prologue=False):
             self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop)
         else:
             tile_desc.vec_size=64
-            if tile_desc.get_numel_per_lane() < tile_desc.vec_size:
-                tile_desc.vec_size = tile_desc.get_numel_per_lane()
 
             if prologue:
                 self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane()

From 80b7a85c233577f32d6b8ad80fe5279cf1dd3104 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 15 Jul 2025 05:19:05 +0000
Subject: [PATCH 400/432] [Frontend/Scheduling] Fix reduction fusion condition

---
 .github/workflows/pull-request.yml         | 10 ----------
 .github/workflows/pull-request_mobile.yml  | 10 ----------
 PyTorchSimFrontend/mlir/mlir_scheduling.py |  2 ++
 3 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 9d440df6..bc5c9dab 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -524,16 +524,6 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py
 
-      - name: Run test_matmul_layernorm.py
-        env:
-          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
-        run: |
-          echo "Running test_matmul_layernorm.py"
-          docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
-            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_layernorm.py
-
       - name: Run test_bmm_reduction.py
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
diff --git a/.github/workflows/pull-request_mobile.yml b/.github/workflows/pull-request_mobile.yml
index 45d73fa8..0043eaf4 100644
--- a/.github/workflows/pull-request_mobile.yml
+++ b/.github/workflows/pull-request_mobile.yml
@@ -534,16 +534,6 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py
 
-      - name: Run test_matmul_layernorm.py
-        env:
-          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
-        run: |
-          echo "Running test_matmul_layernorm.py"
-          docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
-            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_layernorm.py
-
       - name: Run test_bmm_reduction.py
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index e63df4fb..ffc001da 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -87,6 +87,8 @@ def can_fuse_horizontal(self, node1, node2):
         _, (vars2, reduce2) = node2.group
 
         # Reduction is currently not supported
+        if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template():
+            return vars1 == vars2 and reduce1 == reduce2 and node1.inverse_users == node2.inverse_users
         if node1.is_reduction() or node2.is_reduction():
             return False
 

From 06741953f65e99cc24c524eadbb5693037697353 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 17 Jul 2025 01:43:44 +0000
Subject: [PATCH 401/432] [Frontend/template] Fix tile stride in convolution
 templates

Also update the MLIR version (refactored fine-grained DMA pass).
---
 PyTorchSimFrontend/mlir/mlir_conv_mt_template.py  | 2 +-
 PyTorchSimFrontend/mlir/mlir_conv_sb_template.py  | 2 +-
 PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py | 2 +-
 tests/Fusion/test_prologue_fusion.py              | 6 +++---
 tests/Fusion/test_transformer_fusion.py           | 1 +
 tests/test_indirect_access.py                     | 2 +-
 6 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
index 7968f813..8cd57077 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
@@ -202,7 +202,7 @@ def render(self,
         X_idx = [X_dim[0]*(I_W+2*PADDING_W)*BATCH*I_C, X_dim[1]*I_C*STRIDE_W, X_dim[2]*I_C*(I_W+2*PADDING_W), X_dim[3]]
 
         W_tile_size = [TILE_K_H, 1, TILE_K, TILE_N]
-        W_tile_stride = [TILE_K_W * TILE_K * TILE_N, TILE_K * TILE_N, 1, TILE_K]
+        W_tile_stride = [TILE_K * TILE_N, TILE_K * TILE_N, 1, TILE_K]
         W_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, 3, vlane_stride)
         W_tile_desc.set_tile_size_stride(W_tile_size, W_tile_stride)
         W_tile_desc.set_name("weight_buffer")
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
index f2df1e43..6c31776d 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
@@ -210,7 +210,7 @@ def render(self,
         W_idx = [W_dim[0]*K_W*I_C*O_C , W_dim[1]*I_C*O_C, W_dim[2]*O_C, W_dim[3]]
 
         Y_tile_size = [1, TILE_N, TILE_O_H, TILE_M]
-        Y_tile_stride = [TILE_O_W * TILE_M * TILE_N, TILE_M * TILE_N, TILE_M, 1] # N, C, H, W
+        Y_tile_stride = [TILE_O_H * TILE_M * TILE_N, TILE_M, TILE_M * TILE_N, 1] # N, C, H, W
         Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
         Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
         Y_tile_desc.set_name("output_buffer")
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
index 3b60dcbc..a4ea0b20 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
@@ -211,7 +211,7 @@ def render(self,
         W_idx = [W_dim[0]*K_W*I_C*O_C , W_dim[1]*I_C*O_C, W_dim[2]*O_C, W_dim[3]]
 
         Y_tile_size = [1, TILE_N, TILE_O_H, TILE_M]
-        Y_tile_stride = [TILE_O_W * TILE_M * TILE_N, TILE_M, TILE_M * TILE_N, 1] # N, C, H, W
+        Y_tile_stride = [TILE_O_H * TILE_M * TILE_N, TILE_M, TILE_M * TILE_N, 1] # N, C, H, W
         Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
         Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
         Y_tile_desc.set_name("output_buffer")
diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py
index d5d1cdb1..797f9e76 100644
--- a/tests/Fusion/test_prologue_fusion.py
+++ b/tests/Fusion/test_prologue_fusion.py
@@ -91,7 +91,7 @@ def bmm(a, b, c, d):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    #test_elem_broadcast_fusion(device)
-    #test_elem_fusion(device)
-    #test_elem_bmm_input_fusion(device, batch_size=4, m=512, n=512, k=64)
+    test_elem_broadcast_fusion(device)
+    test_elem_fusion(device)
+    test_elem_bmm_input_fusion(device, batch_size=4, m=512, n=512, k=64)
     test_elem_bmm_weight_fusion(device, batch_size=12, m=512, n=512, k=64)
\ No newline at end of file
diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py
index 15bacb39..0f68948e 100644
--- a/tests/Fusion/test_transformer_fusion.py
+++ b/tests/Fusion/test_transformer_fusion.py
@@ -206,6 +206,7 @@ def test_DecoderBlock_validation(head=12, embed_dim=768, input_seq=512):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
+    test_MHA(device)
     test_DecoderBlock(device)
     # test_DecoderBlock_validation()
     # test_Attention(device, head=16, seq=512, d_k=64)
diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py
index 6d16c9d0..b7b20074 100644
--- a/tests/test_indirect_access.py
+++ b/tests/test_indirect_access.py
@@ -27,7 +27,7 @@ def vectoradd(a, idx, b):
     opt_fn = torch.compile(dynamic=False)(vectoradd)
     res = opt_fn(x, idx, y)
     out = vectoradd(x.cpu(), idx.cpu(), y.cpu())
-    test_result("VectorAdd", res, out)
+    test_result("Indirect VectorAdd", res, out)
 
 def test_embedding(device, vocab_size, dim):
     emb = torch.nn.Embedding(vocab_size, dim)

From 6cd7b8be35922e9707a421dfac6a645ddae1e1b4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 17 Jul 2025 07:49:59 +0000
Subject: [PATCH 402/432] [Frontend] Update fusion condition

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index ffc001da..8ea995df 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -14,7 +14,7 @@
 from torch._inductor import dependencies
 
 from . import mlir_common
-from . import mlir_lowering
+from . import mlir_lowering # DO NOT REMOVE THIS LINE, it is used for lowering
 
 class MLIRScheduling(BaseScheduling):
     count = 0
@@ -41,15 +41,15 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
         if len(base_template_node1) == 1 and len(base_template_node2) == 0 and extension_config.CONFIG_FUSION_REDUCTION:
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
-            if (isinstance(base_template_node1[0].node.template, MLIRGemmTemplate) or isinstance(base_template_node1[0].node.template, MLIRBMMTemplate)) and node2.is_reduction() and len(node2.get_nodes())==1:
+            if (isinstance(base_template_node1[0].node.template, MLIRGemmTemplate) or isinstance(base_template_node1[0].node.template, MLIRBMMTemplate)) and node2.is_reduction():
                 # For matmul/bmm+reduction case
-                size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.node.get_size(), 1) * reduce(operator.mul, node2.node.get_reduction_size(), 1)
-                stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.node).split("\n") if "r0" in i][1]
+                size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.get_nodes()[0].node.get_size(), 1) * reduce(operator.mul, node2.get_nodes()[0].node.get_reduction_size(), 1)
+                stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.get_nodes()[0].node).split("\n") if "r0" in i][1]
                 target_symbol = symbols("r0")
                 # We can't fuse dim=-1
                 layout_possible = int(sympify(stride).coeff(target_symbol)) != 1
                 # Directed linked?
-                dependency_check = node2 in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1
+                dependency_check = node2.get_nodes()[0] in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1
                 dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads])
                 return size_match and layout_possible and dependency_check and dependency_size
 
@@ -66,7 +66,7 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 return False
             if len(node1.read_writes.writes) != 1:
                 return False
-            if len(node1.users) != 1:
+            if len([node for node in node1.users if node.get_name() != "OUTPUT"]) != 1: # FIXME. Any good way to check this?
                 return False
             if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
                 node1 = self.revert_group(node1)

From 4771bcbb5f0e40956ce04fb25a24e3e68380486a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 17 Jul 2025 07:51:15 +0000
Subject: [PATCH 403/432] [Test] Add test_bmm_reduction fusion

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py |  2 +-
 tests/Fusion/test_bmm_reduction.py           | 52 ++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 tests/Fusion/test_bmm_reduction.py

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index b81b3862..9a9785e1 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -125,7 +125,7 @@
   %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
   {% endif %}
   %c0 = arith.constant 0 : index
-  {{ kernel.def_local_vars() }}
+  {{ kernel.def_local_vars(indent_size=2) }}
   affine.for %index0=0 to {{ B }} {
     affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} {
       affine.for %index1 = 0 to {{ M }} step {{ TILE_M }} {
diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/Fusion/test_bmm_reduction.py
new file mode 100644
index 00000000..42e38095
--- /dev/null
+++ b/tests/Fusion/test_bmm_reduction.py
@@ -0,0 +1,52 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_bmm_reduce(device, batch=12, size=512):
+    def bmm(a, b):
+        result = torch.bmm(a, b.transpose(1,2))
+        return result, result.max(dim=1).values
+    torch.manual_seed(0)
+    N = size
+    input = torch.randn(batch, N, 64)
+    weight = torch.randn(batch, N, 64)
+    #input = torch.arange(1, N * N + 1, dtype=torch.float32).reshape(N, N).to(dtype=torch.float32)
+    #weight = torch.eye(N, dtype=torch.float32)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(bmm)
+    res = opt_fn(x1, w1)
+    y = bmm(x2, w2)
+    test_result("BMM Reduction Fusion activation", res[0], y[0])
+    test_result("BMM Reduction Fusion reduction", res[1], y[1])
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    #test_bmm_reduce(device)
+    test_bmm_reduce(device, 12, 512)
+    test_bmm_reduce(device, 4, 256)
+    test_bmm_reduce(device, 6, 768)
+    test_bmm_reduce(device, 2, 128)

From 22e167d2d677b186f00d8ca1cd8a84607e5e8db9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 17 Jul 2025 13:57:53 +0000
Subject: [PATCH 404/432] [Frontend/Fusion] Add prologue fusion condition

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 8ea995df..f81c7b05 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -55,10 +55,8 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
 
         # For prologue fusion case
         if extension_config.CONFIG_FUSION_PROLOGUE and len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(base_template_node2) == 1:
-            # Return false if node2 is Convolution template
-            # if node2.get_nodes()[0].node.origin_node.target._name == 'aten::mm' or \
-            #     node2.get_nodes()[0].node.origin_node.target._name == 'aten::addmm':
-            #     return False
+            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
+            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
             target_node = base_template_node2[0].node
             if target_node.origin_node is not None and hasattr(target_node.origin_node.target, "_name") and target_node.origin_node.target._name == 'aten::convolution':
                 return False
@@ -66,8 +64,11 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 return False
             if len(node1.read_writes.writes) != 1:
                 return False
-            if len([node for node in node1.users if node.get_name() != "OUTPUT"]) != 1: # FIXME. Any good way to check this?
+            if len(node1.users) != 1:
                 return False
+            # We don't fuse this case...
+            if (isinstance(target_node.template, MLIRBMMTemplate) or isinstance(target_node.template, MLIRGemmTemplate)) and base_template_node2[0].group[1][0][0] == 1:
+                    return False
             if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
                 node1 = self.revert_group(node1)
                 return True

From 46614427d04cb74ad4e9be5be338aa43c05bf15c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 18 Jul 2025 06:14:57 +0000
Subject: [PATCH 405/432] [Frontend] Fix reverting the group when there is no
 loop

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 26 +++++++++++++++++-----
 tests/Mixtral_8x7B/test_attention.py       | 24 ++++++++++++++++++--
 tests/MoE/test_moe.py                      |  5 -----
 3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index f81c7b05..773414d5 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -1,5 +1,6 @@
 import os
 import math
+import sympy
 from functools import reduce
 import operator
 from sympy import symbols, sympify, Symbol
@@ -138,11 +139,12 @@ def can_fuse_horizontal(self, node1, node2):
             return True
         return False
 
-    def revert_group(self, act_nodes):
+    def revert_group(self, act_nodes, args=None, var_ranges=None):
         for act_node in act_nodes.get_nodes():
-            args, var_ranges = dependencies.index_vars_no_squeeze(
-                    act_node.node.data.get_size(), act_node.node.data.get_reduction_size(), prefix="q"
-            )
+            if args is None or var_ranges is None:
+                args, var_ranges = dependencies.index_vars_no_squeeze(
+                        act_node.node.data.get_size(), act_node.node.data.get_reduction_size(), prefix="q"
+                )
             body = LoopBody(
                 act_node.node.get_store_function(),
                 (args if act_node.node.get_reduction_type() else args[:1]),
@@ -167,10 +169,22 @@ def codegen_nodes(self, nodes):
             nodes, key=lambda x: int(x.is_reduction())
         ).group
 
-        # There is no normal loop, then revert simplified group
+        # Note: We assume that ther is at least one loop in the nodes
+        # But, inductor simplifies the group, there could be no loop
+        # In that case, we add dummy loop(size=1) to the group
         if len(group) == 0:
             for idx, node in enumerate(nodes):
-                self.revert_group(node)
+                if len(node.node.data.get_size()) == 0:
+                    continue
+                if len(reduction_group) != 0:
+                    sym0, sym1 = sympy.Symbol("q0"), sympy.Symbol("q1")
+                    args = [[sym0] + [sympy.Number(0)] * (len(node.node.data.get_size())-1), [sym1]]
+                    var_ranges = {sym0: sympy.Number(1), sym1: reduction_group[0]}
+                else:
+                    sym0 = sympy.Symbol("q0")
+                    args = [[sym0] + [sympy.Number(0)] * (len(node.node.data.get_size())-1), []]
+                    var_ranges = {sym0: sympy.Number(1)}
+                self.revert_group(node, args, var_ranges)
             _, (group, reduction_group) = max(
                 nodes, key=lambda x: int(x.is_reduction())
             ).group
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index cc2adc96..aa1af651 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -2,7 +2,7 @@
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-from model import Transformer, TransformerBlock, ModelArgs, Attention, FeedForward, KVCache, precompute_freqs_cis, sample
+from model import Transformer, TransformerBlock, ModelArgs, Attention, FeedForward, KVCache, RMSNorm, precompute_freqs_cis, sample
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -139,6 +139,25 @@ def concat_tensors(a, b):
 
     test_result("ConcatTensors", res, out)
 
+def test_rmsnorm(device, seq=32):
+    dim = 512
+    eps = 1e-5
+    T = seq
+    rmsnorm = RMSNorm(dim=dim, eps=eps)
+    rmsnorm = rmsnorm.to(device=device)
+
+    x = torch.randn([1, T, dim], dtype=torch.float32)
+    cpu_x = copy.deepcopy(x)
+    x = x.to(device)
+
+    cpu_model = copy.deepcopy(rmsnorm).to("cpu")
+    opt_fn = torch.compile(dynamic=False)(rmsnorm)
+
+    res = opt_fn(x)
+    cpu_res = cpu_model(cpu_x)
+
+    test_result("RMSNorm", res, cpu_res)
+
 if __name__ == "__main__":
     import os
     import sys
@@ -147,7 +166,8 @@ def concat_tensors(a, b):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
+    test_rmsnorm(device, seq=1)
+    test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2)
     test_decode(device, 32, 3)
-    #test_concat(device, size1=(1, 8, 32, 64), size2=(1,8,1,64), dim=2)
     #test_attention(device)
     #test_ffn(device)
diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py
index cf2f37f4..c5ab8107 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/MoE/test_moe.py
@@ -1,12 +1,7 @@
 # Owner(s): ["module: inductor"]
 import os
-import shutil
 import sys
-import time
-import contextlib
-import unittest
 import copy
-import numpy as np
 import matplotlib.pyplot as plt
 
 

From 2bea699200cb0e17b84b452bf28a3aaf7bf1a2f2 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 18 Jul 2025 14:24:05 +0000
Subject: [PATCH 406/432] [Frontend] Add mask in the reduction if needed

---
 .../mlir/mlir_codegen_backend.py              | 92 +++++--------------
 PyTorchSimFrontend/mlir/mlir_template.py      |  8 +-
 2 files changed, 32 insertions(+), 68 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 51a79ebd..79d735a3 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -846,6 +846,7 @@ def __init__(self, kernel_group, reason=None):
         self.reduction_prefix = IndentedBuffer()
         self.reduction_suffix = IndentedBuffer()
         self.applys = IndentedBuffer()
+        self.masks = IndentedBuffer()
         self.dma_loads = IndentedBuffer()
         self.dma_stores = IndentedBuffer()
         self.indexed_buffer = IndentedBuffer()
@@ -859,6 +860,7 @@ def __init__(self, kernel_group, reason=None):
         self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc")
         self.spad_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="spad")
         self.apply_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="apply")
+        self.mask_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="mask")
         self.iterator_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="iter")
         self.init_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="init")
         self.init_vec_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="init_vec")
@@ -1030,25 +1032,9 @@ def load(self, name: str, index: sympy.Expr):
         self.cse.generate(self.dma_loads, code, assignment = False) # FIXME: assignment = False does not support caching
         compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
         # Generate vector load instruction
-        needs_mask = self.compute_body_loop.size % self.compute_body_loop.step != 0 and len(index.free_symbols) == len(self.ranges)
         if compute_vec_size > 1:
-            if needs_mask:
-                index_shape = f"vector<{self.compute_body_loop.step}xindex>"
-                mask_shape = f"vector<{compute_vec_size}xi1>"
-                step_vec = self.cse.generate(self.loads, f"vector.step : {index_shape}")
-                upper_bound = self.get_const_cse(self.compute_body_loop.size, "index")
-                gap = self.cse.generate(self.loads, f"arith.subi %{upper_bound}, %{self.compute_idx} : index")
-                gap_vec = self.cse.generate(self.loads, f"vector.broadcast %{gap} : index to {index_shape}")
-                mask_var = self.cse.generate(self.loads, f"arith.cmpi ult, %{step_vec}, %{gap_vec} : {index_shape}")
-                if padding:
-                    pad_val = self.const_cse.generate(self.const_buffer, f"arith.constant 0x{mlir_common.MLIR_INF['-inf'][mlir_dtype]:x} : {mlir_dtype}")
-                else:
-                    pad_val = self.get_const_cse(0, mlir_dtype)
-                pad_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{pad_val} : {mlir_dtype} to {vshape}")
-                line = f"vector.maskedload %{sram_var}[{compute_index_var}], %{mask_var}, %{pad_vec} : {tile_shape}, {mask_shape}, {vshape} into {vshape}"
-            else:
-                operation = "affine.vector_load"
-                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
+            operation = "affine.vector_load"
+            line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
         else:
             operation = "affine.load"
             line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
@@ -1149,6 +1135,7 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         else:
             # Adjust shape and inital value
             init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
+            self.register_var_info(init_vec, [vec_len, type_name])
         acc_var = init_vec
 
         # Reduction body prepare
@@ -1167,6 +1154,9 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         self.init_cse.reduction_cache[reduction_key] = init_vec
 
         # Reduction body codegen
+        mask_shape, mask_var = self.get_mask()
+        if mask_var is not None:
+            value = ops.where(mask_var, value, init_vec)
         result = reduction_partial_combine_vec(reduction_type, value, body_iter_arg)
         self.compute_body_loop.reduction_vars[body_acc] = (reduction_type, body_iter_arg, iterator, reduced_shape)
         self.compute_body_loop.affine_yield[result] = reduced_shape
@@ -1423,6 +1413,7 @@ def codegen_loops(self):
                 code.writelines(self.compute_body_loop.lines())
                 with contextlib.ExitStack() as stack:
                     stack.enter_context(code.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
+                    code.splice(self.masks)
                     code.splice(self.loads)
                     code.splice(self.compute)
                     code.splice(self.stores)
@@ -1701,55 +1692,6 @@ def get_dma_code(self, dma_type_name, vlane_split_axis, vlane_stride, mlir_dtype
 
         return f"memref.dma_start {src_operand}, {dst_operand}, %{dma_type}, {tag_var}, {dma_attribute} : {src_shape}, {dst_shape}, {tag_shape} {attribute}"
 
-    def adjust_tile_size(self):
-        if self.read_writes is not None:
-            read_writes = list(self.read_writes.reads) + list(self.read_writes.writes)
-            cv_list = []
-            for node in read_writes:
-                if len(node) > 1:
-                    cv_list.append(self.get_constant_vector2(node[1]))
-            max_element = max(cv_list, key=len)
-            max_nr_dim = len(max_element)
-
-            sorted_max_element = sorted(max_element, key=lambda x:x[0])
-            # Force vector tile size when 3D node is originated from view
-            if max_nr_dim == 3 and max_nr_dim != len(self.itervars):
-                self.tile_desc.n_col = min(self.tile_desc.get_tile_size(), sorted_max_element[1][0])
-                self.tile_desc.n_row = 1
-                return
-
-        # Case 1. vector kernel
-        if len(self.itervars) == 1:
-            tile_size = self.tile_desc.get_tile_size() if self.tile_desc.get_tile_size() < self.ranges[0] else self.ranges[0]
-            min_tile_size_unit = self.vector_lane * self.vlen // (8 * self.precision) # TODO: VCIX widening is not implemented
-            self.tile_desc.n_col = math.ceil(tile_size / min_tile_size_unit) * min_tile_size_unit # padding
-            self.tile_desc.n_row = 1
-        elif len(self.itervars) == 0:
-            self.tile_desc.n_col = 1
-            self.tile_desc.n_row = 1
-
-        # Case 2. 2-D tensor (e.g., softmax)
-        if len(self.itervars) == 2 and self.reduction_depth == len(self.itervars):
-            # Avoid too much padding
-            if (self.ranges[0] <= self.vector_lane and self.ranges[0] <= self.tile_desc.n_row):
-                self.tile_desc.n_row = self.ranges[0]
-                self.tile_desc.used_vector_lane = self.ranges[0]
-
-        # Case 2. 2-D reduction (e.g., batchnorm)
-        if len(self.itervars) == 2 and self.reduction_depth == len(self.itervars) - 1:
-            if (((self.ranges[0] + 1) // 2) <= self.vector_lane and ((self.ranges[0] + 1) // 2) <= self.tile_desc.n_row):
-                self.tile_desc.n_row = ((self.ranges[0] + 1) // 2) * 2
-                self.tile_desc.used_vector_lane = (self.ranges[0] + 1) // 2
-
-        # Case 2. 3-D tensor kernel without reduction. Access vector granule!
-        if len(self.itervars) == 3 and self.reduction_depth == len(self.itervars):
-            self.tile_desc.n_col = self.ranges[-1]
-            self.tile_desc.n_row = 1
-
-        # Case 3. N-D tensor kernel with reduction. Not implemented. Need this?
-        if len(self.itervars) >= 3 and self.reduction_depth < len(self.itervars):
-            raise NotImplementedError()
-
     def allocate_sram_buffer(self, dtype, dram_name, tile_desc, raw_index, buffer=None, forced_name=None):
         c_type = mlir_common.DTYPE_TO_C[dtype]
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -1805,6 +1747,22 @@ def get_tag_cse(self, value, shape="memref<1xi32>"):
             self.tags[value] = self.alloc_cse.generate(self.alloc_buffer, f"memref.alloc() : {shape}")
         return self.tags[value]
 
+    def get_mask(self):
+        if self.compute_body_loop.size % self.compute_body_loop.step == 0:
+            return None, None
+        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
+        index_shape = f"vector<{self.compute_body_loop.step}xindex>"
+        mask_shape = f"vector<{compute_vec_size}xi1>"
+
+        upper_bound = self.get_const_cse(self.compute_body_loop.size)
+        step_vec = self.const_cse.generate(self.const_buffer, f"vector.step : {index_shape}")
+
+        gap = self.mask_cse.generate(self.masks, f"arith.subi %{upper_bound}, %{self.compute_idx} : index")
+        gap_vec = self.mask_cse.generate(self.masks, f"vector.broadcast %{gap} : index to {index_shape}")
+        mask_var = self.mask_cse.generate(self.masks, f"arith.cmpi ult, %{step_vec}, %{gap_vec} : {index_shape}")
+        self.register_var_info(mask_var, [compute_vec_size, "i1"])
+        return mask_shape, mask_var
+
     def convert_indirect_indexing(self, index :sympy.Expr):
         if "tmp" not in str(index):
             return index
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index f802f8e8..c6cd4a7e 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -450,6 +450,7 @@ def template_store():
                 stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
                 if self.reduction_fusion:
                     compute_body.writelines(self.reduction_body_loop.lines())
+                    compute_body.splice(self.masks)
                     stack.enter_context(compute_body.indent(attribute="{inner_loop=false}"))
                     compute_body.splice(self.loads)
                     compute_body.splice(self.compute)
@@ -889,7 +890,6 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         name = f"{reduction_type}_buffer{self.reduction_buffer_idx}"
         self.reduction_buffer_idx += 1
         index = "dummy_index" # Not used
-        tile_numel_per_lane = self.compute_body_loop.step * self.reduction_body_loop.size # ???
         sram_var, _ = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index, self.const_buffer)
         self.reduction_epilogue_result[reduction_key] = sram_var
 
@@ -903,6 +903,12 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         self.register_var_info(out, [self.compute_body_loop.step, type_name])
 
         # Reduction body codegen
+        init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
+        init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {vshape}")
+        self.register_var_info(init_vec, [local_tile_desc.get_compute_vec_size(), type_name])
+        mask_shape, mask_var = self.get_mask()
+        if mask_var is not None:
+            value = ops.where(mask_var, value, init_vec)
         result = reduction_partial_combine_vec(reduction_type, value, out)
 
         # Store partial result

From 9b235108760d765663d0ddcc33f2f9bc92b9aa2a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 19 Jul 2025 07:24:01 +0000
Subject: [PATCH 407/432] [Rename] Use encoder instead of decoder

---
 experiments/BERT.py                     |  8 ++---
 test_extension_backend.py               |  4 +--
 tests/Fusion/test_transformer_fusion.py | 46 ++++++++++++-------------
 tests/test_conv2d.py                    |  6 +++-
 tests/test_pool.py                      |  6 ++--
 tests/test_resnet.py                    |  6 ++--
 tests/test_scheduler.py                 |  2 +-
 tests/test_sparsity.py                  | 34 +++++++++---------
 tests/test_spmm_scheduler.py            |  2 +-
 tests/test_transformer.py               | 18 +++++-----
 10 files changed, 68 insertions(+), 64 deletions(-)

diff --git a/experiments/BERT.py b/experiments/BERT.py
index 7086ad9a..3534505d 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -7,8 +7,8 @@
 
 def run_BERT(size, input_seq, config):
     from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    # from tests.test_transformer import DecoderBlock
-    from tests.Fusion.test_transformer_fusion import DecoderBlock
+    # from tests.test_transformer import EncoderBlock
+    from tests.Fusion.test_transformer_fusion import EncoderBlock
     scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config)
     device = scheduler.execution_engine.module.custom_device()
 
@@ -16,10 +16,10 @@ def run_BERT(size, input_seq, config):
     embedding_size = {'base': 768, 'large': 1024, 'xlarge': 2048}
     heads = {'base': 12, 'large': 16, 'xlarge': 32} # hidden/64 https://arxiv.org/pdf/1909.11942
     cpu_query = torch.randn(input_seq, hidden_dim[size])
-    decoder_block = DecoderBlock(embedding_size[size], heads[size]).eval()
+    encoder_block = EncoderBlock(embedding_size[size], heads[size]).eval()
 
     query = cpu_query.clone().to(device=device)
-    opt_fn = torch.compile(dynamic=False)(decoder_block.to(device=device))
+    opt_fn = torch.compile(dynamic=False)(encoder_block.to(device=device))
 
     SchedulerDNNModel.register_model(f"BERT-{size}", opt_fn)
     request = Request(f"BERT-{size}", [query], [], request_queue_idx=0)
diff --git a/test_extension_backend.py b/test_extension_backend.py
index 10bc9854..f0a9353a 100644
--- a/test_extension_backend.py
+++ b/test_extension_backend.py
@@ -12,7 +12,7 @@
 from tests.test_matmul import test_matmul
 from tests.test_bmm import test_BMM
 from tests.test_cnn import test_CNN
-from tests.test_transformer import test_DecoderBlock
+from tests.test_transformer import test_EncoderBlock
 from tests.test_resnet import test_resnet
 from tests.test_mlp import test_mlp, test_mlp_inf
 from tests.MoE.test_moe import test_moe
@@ -46,7 +46,7 @@
     #test_matmul(device, 33, 45, 68)
     #test_BMM(device)
     #test_CNN(device)
-    #test_DecoderBlock(device)
+    #test_EncoderBlock(device)
     #test_resnet(device)
     #test_mlp(device)
     #test_mlp_inf(device, batch_size=64, input_size=256, hidden_size=512, output_size=256, sparsity=0.97)
diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py
index 0f68948e..0e500b5b 100644
--- a/tests/Fusion/test_transformer_fusion.py
+++ b/tests/Fusion/test_transformer_fusion.py
@@ -53,9 +53,9 @@ def forward(self, query, key, value):
         del value
         return self.linears[-1](x)
 
-class DecoderBlock_origin(torch.nn.Module):
+class EncoderBlock_origin(torch.nn.Module):
     def __init__(self, embed_dim, num_heads):
-        super(DecoderBlock_origin, self).__init__()
+        super(EncoderBlock_origin, self).__init__()
         self.multihead_attn = my_MultiheadAttention_origin(num_heads, embed_dim)
         self.layer_norm = torch.nn.LayerNorm(embed_dim)
         self.ffn1 = torch.nn.Linear(embed_dim, embed_dim*4)
@@ -111,9 +111,9 @@ def forward(self, x, residual):
         out = torch.matmul(self.weight, x.transpose(-1, -2)) + self.bias[:, None] # (1, 768, 512)
         return self.layer_norm(out.transpose(-1, -2) + residual)
 
-class DecoderBlock(torch.nn.Module):
+class EncoderBlock(torch.nn.Module):
     def __init__(self, embed_dim, num_heads):
-        super(DecoderBlock, self).__init__()
+        super(EncoderBlock, self).__init__()
         self.multihead_attn = my_MultiheadAttention(num_heads, embed_dim)
         self.layer_norm = torch.nn.LayerNorm(embed_dim)
         self.ffn1 = torch.nn.Linear(embed_dim, embed_dim*4)
@@ -130,18 +130,18 @@ def forward(self, x):
         act_result = self.act(ffn1_result)
         return self.matmulln2(act_result, result)
 
-def test_DecoderBlock(device, head=12, embed_dim=768, input_seq=512):
+def test_EncoderBlock(device, head=12, embed_dim=768, input_seq=512):
     cpu_query = torch.randn(input_seq, embed_dim)
-    decoder_block = DecoderBlock(embed_dim, head)
-    cpu_res = decoder_block(cpu_query)
+    encoder_block = EncoderBlock(embed_dim, head)
+    cpu_res = encoder_block(cpu_query)
 
     query = cpu_query.clone().to(device=device)
-    decoder_block.to(device=device)
+    encoder_block.to(device=device)
     with torch.no_grad():
-        opt_fn = torch.compile(dynamic=False)(decoder_block)
+        opt_fn = torch.compile(dynamic=False)(encoder_block)
         res = opt_fn(query)
 
-    test_result("Decoder Block Forwrad", res, cpu_res)
+    test_result("Encoder Block Forwrad", res, cpu_res)
 
 def test_Attention(device, head=16, seq=512, d_k=64):
     def attention(query, key, value):
@@ -165,18 +165,18 @@ def attention(query, key, value):
 def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
     MHA = my_MultiheadAttention(num_heads, embed_dim)
     cpu_query = torch.randn(input_seq, embed_dim)
-    cpu_res = MHA(cpu_query, cpu_query, cpu_query)
-
-    query = cpu_query.clone().to(device=device)
-    MHA.to(device=device)
-    opt_fn = torch.compile(dynamic=False)(MHA)
-    res = opt_fn(query, query, query)
+    with torch.no_grad():
+        cpu_res = MHA(cpu_query, cpu_query, cpu_query)
+        query = cpu_query.clone().to(device=device)
+        MHA.to(device=device)
+        opt_fn = torch.compile(dynamic=False)(MHA)
+        res = opt_fn(query, query, query)
 
     test_result("MHA Forward", res, cpu_res)
 
-def test_DecoderBlock_validation(head=12, embed_dim=768, input_seq=512):
-    bert_origin = DecoderBlock_origin(embed_dim, head)
-    bert = DecoderBlock(embed_dim, head)
+def test_EncoderBlock_validation(head=12, embed_dim=768, input_seq=512):
+    bert_origin = EncoderBlock_origin(embed_dim, head)
+    bert = EncoderBlock(embed_dim, head)
 
     bert.multihead_attn.linears[0].weight = bert_origin.multihead_attn.linears[0].weight
     bert.multihead_attn.linears[0].bias = bert_origin.multihead_attn.linears[0].bias
@@ -196,7 +196,7 @@ def test_DecoderBlock_validation(head=12, embed_dim=768, input_seq=512):
     origin_res = bert_origin(origin_query)
     res = bert(query)
 
-    test_result("Decoder Block Validation", res, origin_res)
+    test_result("Encoder Block Validation", res, origin_res)
 
 if __name__ == "__main__":
     import os
@@ -206,8 +206,8 @@ def test_DecoderBlock_validation(head=12, embed_dim=768, input_seq=512):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_MHA(device)
-    test_DecoderBlock(device)
-    # test_DecoderBlock_validation()
+    #test_MHA(device)
+    test_EncoderBlock(device)
+    # test_EncoderBlock_validation()
     # test_Attention(device, head=16, seq=512, d_k=64)
     # test_MHA(device, num_heads=12, embed_dim=768)
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 8667792a..96ee05eb 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -43,10 +43,14 @@ def custom_conv2d(a, b, bias):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_conv2d(device, batch_size=1, in_channels=3, out_channels=32, input_size=32, kernel_size=3, stride=1, padding=1)
+    test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0)
     test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=2, padding=3)
     test_conv2d(device, batch_size=2, in_channels=3, out_channels=64, input_size=32//2, kernel_size=7, stride=1, padding=3)
     test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3)
     test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3)
     test_conv2d(device, batch_size=2, in_channels=128, out_channels=256, input_size=13, kernel_size=5, stride=1, padding=2)
     test_conv2d(device, batch_size=2, in_channels=128, out_channels=512, input_size=14, kernel_size=7, stride=1, padding=3)
+    test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=3, stride=2, padding=1)
+    test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=7, kernel_size=3, stride=2, padding=1)
+    test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=2, kernel_size=1, stride=1, padding=0)
+    test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=1, stride=2, padding=0)
diff --git a/tests/test_pool.py b/tests/test_pool.py
index e94df65b..304a5e7c 100644
--- a/tests/test_pool.py
+++ b/tests/test_pool.py
@@ -50,6 +50,6 @@ def avgpool(a):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_maxpool(device, b=1, c=8, h=16, w=16)
-    test_maxpool(device, b=1, c=8, h=112, w=112)
-    test_avgpool(device)
+    #test_maxpool(device, b=1, c=8, h=16, w=16)
+    #test_maxpool(device, b=1, c=8, h=112, w=112)
+    test_avgpool(device, b=1, c=512, h=7, w=7)
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index 5e96b922..f54ce9be 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -18,13 +18,13 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-def test_resnet(device):
+def test_resnet(device, batch=1):
     from torchvision.models import resnet
-    # model = resnet._resnet(resnet.BasicBlock, [1, 1, 0, 0], weights=None, progress=False).eval()
     with torch.no_grad():
+        #model = resnet._resnet(resnet.BasicBlock, [1, 1, 1, 1], weights=None, progress=False).eval()
         model = resnet18().eval()
         model.to(device, memory_format=torch.channels_last)
-        input = torch.randn(1, 3, 224, 224)
+        input = torch.randn(batch, 3, 224, 224)
         x1 = input.to(device=device, memory_format=torch.channels_last)
         x2 = input.cpu().to(memory_format=torch.channels_last)
         opt_fn = torch.compile(dynamic=False)(model)
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index e05fa392..c64093a0 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -2,7 +2,7 @@
 import sys
 import torch
 from torchvision.models import resnet18 as model1
-from test_transformer import DecoderBlock as model2
+from test_transformer import EncoderBlock as model2
 
 base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
 sys.path.append(base_path)
diff --git a/tests/test_sparsity.py b/tests/test_sparsity.py
index b3945520..3e079f83 100644
--- a/tests/test_sparsity.py
+++ b/tests/test_sparsity.py
@@ -8,7 +8,7 @@
 import torch._dynamo
 import torch.utils.cpp_extension
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-from test_transformer import DecoderBlock, test_result
+from test_transformer import EncoderBlock, test_result
 from test_mlp import MLP
 
 def apply_random_zero(tensor, zero_prob, block_size=8):
@@ -35,30 +35,30 @@ def count_zeros_in_tensor_list(tensor_list):
 
 def test_dec_inf(device, sparsity=0.0, block=8):
     torch.manual_seed(0)
-    decoder_block = DecoderBlock(768, 12)
+    encoder_block = EncoderBlock(768, 12)
     cpu_query = torch.randn(512, 768)
     query = cpu_query.clone().to(device=device)
 
-    cpu_y = decoder_block(cpu_query)
+    cpu_y = encoder_block(cpu_query)
     with torch.no_grad():
-        decoder_block.multihead_attn.linears[0].weight.copy_(apply_random_zero(decoder_block.multihead_attn.linears[0].weight, sparsity, block_size=block))
-        decoder_block.multihead_attn.linears[1].weight.copy_(apply_random_zero(decoder_block.multihead_attn.linears[1].weight, sparsity, block_size=block))
-        decoder_block.multihead_attn.linears[2].weight.copy_(apply_random_zero(decoder_block.multihead_attn.linears[2].weight, sparsity, block_size=block))
-        decoder_block.multihead_attn.linears[3].weight.copy_(apply_random_zero(decoder_block.multihead_attn.linears[3].weight, sparsity, block_size=block))
-        decoder_block.ffn1.weight.copy_(apply_random_zero(decoder_block.ffn1.weight, sparsity, block_size=block))
-        decoder_block.ffn2.weight.copy_(apply_random_zero(decoder_block.ffn2.weight, sparsity, block_size=block))
+        encoder_block.multihead_attn.linears[0].weight.copy_(apply_random_zero(encoder_block.multihead_attn.linears[0].weight, sparsity, block_size=block))
+        encoder_block.multihead_attn.linears[1].weight.copy_(apply_random_zero(encoder_block.multihead_attn.linears[1].weight, sparsity, block_size=block))
+        encoder_block.multihead_attn.linears[2].weight.copy_(apply_random_zero(encoder_block.multihead_attn.linears[2].weight, sparsity, block_size=block))
+        encoder_block.multihead_attn.linears[3].weight.copy_(apply_random_zero(encoder_block.multihead_attn.linears[3].weight, sparsity, block_size=block))
+        encoder_block.ffn1.weight.copy_(apply_random_zero(encoder_block.ffn1.weight, sparsity, block_size=block))
+        encoder_block.ffn2.weight.copy_(apply_random_zero(encoder_block.ffn2.weight, sparsity, block_size=block))
 
     count_zeros_in_tensor_list([
-        decoder_block.multihead_attn.linears[0].weight,
-        decoder_block.multihead_attn.linears[1].weight,
-        decoder_block.multihead_attn.linears[2].weight,
-        decoder_block.multihead_attn.linears[3].weight,
-        decoder_block.ffn1.weight,
-        decoder_block.ffn2.weight
+        encoder_block.multihead_attn.linears[0].weight,
+        encoder_block.multihead_attn.linears[1].weight,
+        encoder_block.multihead_attn.linears[2].weight,
+        encoder_block.multihead_attn.linears[3].weight,
+        encoder_block.ffn1.weight,
+        encoder_block.ffn2.weight
     ])
 
-    decoder_block.to(device=device)
-    opt_fn = torch.compile(dynamic=False)(decoder_block)
+    encoder_block.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(encoder_block)
     y = opt_fn(query)
     test_result("MLP Forward", y, cpu_y)
 
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
index 73bbdbae..1cf0d3b3 100644
--- a/tests/test_spmm_scheduler.py
+++ b/tests/test_spmm_scheduler.py
@@ -5,7 +5,7 @@
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
 from test_sparse_core import SparseMLP as model1
-from test_transformer import DecoderBlock as model2
+from test_transformer import EncoderBlock as model2
 CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
 
 if __name__ == "__main__":
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index cfa2a622..4d45707e 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -53,9 +53,9 @@ def forward(self, query, key, value):
         del value
         return self.linears[-1](x)
 
-class DecoderBlock(torch.nn.Module):
+class EncoderBlock(torch.nn.Module):
     def __init__(self, embed_dim, num_heads):
-        super(DecoderBlock, self).__init__()
+        super(EncoderBlock, self).__init__()
         self.multihead_attn = my_MultiheadAttention(num_heads, embed_dim)
         self.layer_norm = torch.nn.LayerNorm(embed_dim)
         self.ffn1 = torch.nn.Linear(embed_dim, embed_dim*4)
@@ -71,17 +71,17 @@ def forward(self, x):
         ffn2_result = self.ffn2(act_result)
         return self.layer_norm(ffn2_result + result)
 
-def test_DecoderBlock(device, head=12, embed_dim=768, input_seq=512):
+def test_EncoderBlock(device, head=12, embed_dim=768, input_seq=512):
     cpu_query = torch.randn(1, input_seq, embed_dim)
-    decoder_block = DecoderBlock(embed_dim, head)
-    cpu_res = decoder_block(cpu_query)
+    encoder_block = EncoderBlock(embed_dim, head)
+    cpu_res = encoder_block(cpu_query)
 
     query = cpu_query.clone().to(device=device)
-    decoder_block.to(device=device)
-    opt_fn = torch.compile(dynamic=False)(decoder_block)
+    encoder_block.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(encoder_block)
     res = opt_fn(query)
 
-    test_result("Decoder Block Forwrad", res, cpu_res)
+    test_result("Encoder Block Forwrad", res, cpu_res)
 
 def test_Attention(device, head=16, seq=512, d_k=64):
     def attention(query, key, value):
@@ -122,6 +122,6 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_DecoderBlock(device)
+    test_EncoderBlock(device)
     # test_Attention(device, head=16, seq=512, d_k=64)
     # test_MHA(device, num_heads=12, embed_dim=768)

From 4d1d0f50e4ebeb36c3bae70e975b275c9d027096 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 19 Jul 2025 08:13:08 +0000
Subject: [PATCH 408/432] [Frontend/Fusion] Relax the prologue fusion condition

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 20 +++++++++-------
 PyTorchSimFrontend/mlir/mlir_template.py   | 28 ++--------------------
 2 files changed, 13 insertions(+), 35 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 773414d5..3eff0ddc 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -65,8 +65,6 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 return False
             if len(node1.read_writes.writes) != 1:
                 return False
-            if len(node1.users) != 1:
-                return False
             # We don't fuse this case...
             if (isinstance(target_node.template, MLIRBMMTemplate) or isinstance(target_node.template, MLIRGemmTemplate)) and base_template_node2[0].group[1][0][0] == 1:
                     return False
@@ -250,12 +248,19 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
             _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
             for node in [template_node, *prologue_nodes, *epilogue_nodes]:
                 node.mark_run()
+            # Partial codgen template nodes
             partial_code = render()
+
+            # Swap load/store functions
+            kernel.load = kernel.load_epilogue
+            kernel.store = kernel.store_epilogue
+            kernel.store_reduction = kernel.store_reduction_epilogue
+            kernel.reduction = kernel.reduction_epilogue
+
+            # Codegen prologue nodes
             if prologue_nodes:
                 # Flush created varaibles, since template fusion doen't share variable
                 with kernel.prologue_buffer_group.as_local():
-                    kernel.load = kernel.load_epilogue
-                    kernel.store = kernel.store_prologue
                     _, (group, reduction_group) = max(
                         [prologue_nodes[-1]], key=lambda x: int(x.is_reduction())
                     ).group
@@ -292,16 +297,12 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
                         }
                         node.codegen((vars, reduction_vars))
 
+            # Codegen epilogue nodes
             tile_desc = kernel.set_tile_size(kernel.epilogue_info)
             kernel.kernel_group.set_tile_info(tile_desc)
             kernel.call_ranges = None
             if epilogue_nodes:
                 with kernel.epilogue_buffer_group.as_local():
-                    kernel.load = kernel.load_epilogue
-                    kernel.store = kernel.store_epilogue
-                    kernel.store_reduction = kernel.store_reduction_epilogue
-                    kernel.reduction = kernel.reduction_epilogue
-
                     _, (group, reduction_group) = max(
                         epilogue_nodes, key=lambda x: int(x.is_reduction())
                     ).group
@@ -315,6 +316,7 @@ def codegen_template_code(self, kernel, render, template_node, prologue_nodes, e
                 if isinstance(partial_code, str)
                 else partial_code.finalize()
             )
+
         # For consistency, white space could make wrong write_path
         buffer = IndentedBuffer()
         buffer.splice(src_code)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index c6cd4a7e..66d6b578 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -423,6 +423,7 @@ def codegen_prologue_body(self):
                     compute_body.splice(self.compute)
                     compute_body.splice(self.stores)
                 body.splice(compute_body)
+            body.splice(self.dma_stores)
         return body
 
     def codegen_epilogue_body(self):
@@ -723,31 +724,6 @@ def get_spad_size_per_lane(self, tile_m, tile_n):
         size = tile_m * ((tile_n + self.vector_lane - 1) // self.vector_lane)
         return max(size, 2) # vector load/store
 
-    def store_prologue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
-        dtype = V.graph.get_dtype(name)
-        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
-
-        # Compute vector unit size
-        vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
-        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
-
-        sram_var = self.buffer_names[name]
-        zero_var = self.get_const_cse(0)
-
-        _, operand_type = self.var_info[value]
-        if mlir_dtype != operand_type:
-            value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
-        compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
-        # Generate vector load instruction
-        if compute_vec_size > 1:
-            operation = "affine.vector_store"
-            line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-        else:
-            operation = "affine.store"
-            line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}"
-        self.stores.writeline(line)
-
     def load_epilogue(self, name: str, index: sympy.Expr):
         index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
@@ -847,7 +823,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         else:
             operation = "affine.store"
             line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}"
-        self.stores.writeline(DeferredLine(name, line))
+        self.stores.writeline(line)
 
         # Generate DMA instruction
         attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"

From 8253c1dbac6d4e00e805d590d3934b110f11df4a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 19 Jul 2025 12:46:49 +0000
Subject: [PATCH 409/432] [Frontend] Avoid tricky cases in the prologue fusion

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 2 ++
 tests/test_conv2d.py                       | 1 +
 2 files changed, 3 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 3eff0ddc..e037207f 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -65,6 +65,8 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 return False
             if len(node1.read_writes.writes) != 1:
                 return False
+            if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME
+                return False
             # We don't fuse this case...
             if (isinstance(target_node.template, MLIRBMMTemplate) or isinstance(target_node.template, MLIRGemmTemplate)) and base_template_node2[0].group[1][0][0] == 1:
                     return False
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 96ee05eb..c679b431 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -43,6 +43,7 @@ def custom_conv2d(a, b, bias):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
+    torch._dynamo.config.cache_size_limit = 64
     test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0)
     test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=2, padding=3)
     test_conv2d(device, batch_size=2, in_channels=3, out_channels=64, input_size=32//2, kernel_size=7, stride=1, padding=3)

From edc3f57261c9a8ad8253b9071800018de4cf07f5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Sat, 19 Jul 2025 13:52:43 +0000
Subject: [PATCH 410/432] [Frontend] Fix store epilogue

---
 PyTorchSimFrontend/mlir/mlir_template.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 66d6b578..1da2e755 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -805,10 +805,12 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         if name not in self.buffer_names:
             sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index)
             self.buffer_names[name] = sram_var
+            store_force = False
         else:
             zero_cse = self.get_const_cse(0)
             sram_dims = len(tile_shape.split("x")) - 1
             sram_index_var = ",".join([f"%{zero_cse}"] * sram_dims)
+            store_force = True
         sram_var = self.buffer_names[name]
         zero_var = self.get_const_cse(0)
 
@@ -823,6 +825,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         else:
             operation = "affine.store"
             line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}"
+        line = line if store_force else DeferredLine(name, line)
         self.stores.writeline(line)
 
         # Generate DMA instruction

From 1d49f43e340343b727f883c04c0812a27965b022 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Jul 2025 04:43:57 +0000
Subject: [PATCH 411/432] [Config] Remove deprecated config

---
 PyTorchSimFrontend/extension_config.py | 3 ---
 scripts/chiplet_prep.sh                | 9 ++++-----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 1761e05c..d60826a1 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -53,9 +53,6 @@
 
 # For block sparse
 CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0))
-CONFIG_FORCE_TILE_M = int(os.environ.get("TORCHSIM_FORCE_TIME_M", default=sys.maxsize))
-CONFIG_FORCE_TILE_N = int(os.environ.get("TORCHSIM_FORCE_TIME_N", default=sys.maxsize))
-CONFIG_FORCE_TILE_K = int(os.environ.get("TORCHSIM_FORCE_TIME_K", default=sys.maxsize))
 
 # For GEMM tile size
 CONFIG_MANUAL_TILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False))
diff --git a/scripts/chiplet_prep.sh b/scripts/chiplet_prep.sh
index 99fc9b30..cddf1a58 100755
--- a/scripts/chiplet_prep.sh
+++ b/scripts/chiplet_prep.sh
@@ -1,14 +1,13 @@
 #!/bin/bash
 
 sizes=(256 512 1024 2048)
-# 각 size에 대해 처리
 for size in "${sizes[@]}"; do
     echo "Processing size: $size"
 
-    # 환경 변수 설정
-    export TORCHSIM_FORCE_TIME_M=$((size / 2))
-    export TORCHSIM_FORCE_TIME_K=$((size / 2))
-    export TORCHSIM_FORCE_TIME_N=$((size / 2))
+    # Set environment variables
+    export TORCHSIM_TILE_M=$((size / 2))
+    export TORCHSIM_TILE_K=$((size / 2))
+    export TORCHSIM_TILE_N=$((size / 2))
     export TORCHSIM_DUMP_PATH=$(pwd)/chiplet_result/$size
     python3 chiplet_prep.py $size
     #python3 chiplet_run.py $(pwd)/chiplet_result

From 903ff136b73d7ead01b15dd599ef90b3705dac08 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Jul 2025 12:43:16 +0000
Subject: [PATCH 412/432] [TogSim] Update tile_stride logic

---
 AsmParser/onnx_utility.py                   |   3 +-
 AsmParser/tog_generator.py                  |   3 +-
 PyTorchSimBackend/include/Instruction.h     |  16 +--
 PyTorchSimBackend/include/TileGraphParser.h |   6 +-
 PyTorchSimBackend/src/Instruction.cc        |  31 ++---
 PyTorchSimBackend/src/TileGraphParser.cc    | 146 ++++++++------------
 6 files changed, 83 insertions(+), 122 deletions(-)

diff --git a/AsmParser/onnx_utility.py b/AsmParser/onnx_utility.py
index d46e8347..4f76ef35 100644
--- a/AsmParser/onnx_utility.py
+++ b/AsmParser/onnx_utility.py
@@ -66,12 +66,13 @@ def __init__(self, tile_info, inst_list=list(), node_id=0):
         super().__init__(node_id)
         self.inst = inst_list
         self.torchsim_base_addr = tile_info["base_addr"]
-        self.torchsim_stride_list = tile_info["stride_list"]
         self.torchsim_tile_size = tile_info["tile_size"]
+        self.torchsim_tile_stride = tile_info["tile_stride"]
         self.torchsim_element_size = tile_info["element_size"]
         self.torchsim_tag_idx_list = tile_info["tag_idx_list"]
         self.torchsim_tag_stride_list = tile_info["tag_stride_list"]
         self.torchsim_loop_idx_list = tile_info["loop_idx_list"]
+        self.torchsim_loop_stride_list = tile_info["loop_stride_list"]
         self.torchsim_is_async = tile_info["is_async"]
         self.torchsim_indirect_mode = tile_info["indirect_mode"]
 
diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 1dea2f8d..5f586d99 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -91,12 +91,13 @@ def _create_node(self, dump_data):
         elif node_type == self.DMANodeKind:
             tile_info = {}
             tile_info["base_addr"] = dump_data["base_address"]
-            tile_info["stride_list"] = dump_data["stride_list"]
             tile_info["tile_size"] = dump_data["tile_size"]
+            tile_info["tile_stride"] = dump_data["tile_stride"]
             tile_info["element_size"] = dump_data["element_size"]
             tile_info["tag_idx_list"] = dump_data["tag_idx_list"]
             tile_info["tag_stride_list"] = dump_data["tag_stride_list"]
             tile_info["loop_idx_list"] = dump_data["loop_idx_list"]
+            tile_info["loop_stride_list"] = dump_data["loop_stride_list"]
             tile_info["is_async"] = dump_data["is_async"]
             tile_info["indirect_mode"] = dump_data["indirect_mode"]
             is_write = dump_data["is_write"]
diff --git a/PyTorchSimBackend/include/Instruction.h b/PyTorchSimBackend/include/Instruction.h
index 84b17d7c..4c14dd81 100644
--- a/PyTorchSimBackend/include/Instruction.h
+++ b/PyTorchSimBackend/include/Instruction.h
@@ -22,9 +22,10 @@ std::string opcode_to_string(Opcode opcode);
 class Instruction : public std::enable_shared_from_this<Instruction> {
  public:
   Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents, addr_type dram_addr,
-              std::vector<size_t> tile_size, size_t precision, std::vector<int> &idx_list,
-              std::vector<int> &stride_list,  std::vector<int> tag_idx_list, std::vector<int> tag_stride_list,
-              std::vector<int> accum_tag_idx_list, std::vector<int> loop_size_list);
+              std::vector<size_t> tile_size, std::vector<int> tile_stride, size_t precision,
+              std::vector<int> tag_idx_list, std::vector<int> tag_stride_list,
+              std::vector<int> accum_tag_idx_list);
+  Instruction(Opcode opcode);
   void finish_instruction();
   void add_child(std::shared_ptr<Instruction> child);
   bool check_ready() { return ready_counter == 0; }
@@ -60,10 +61,6 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   bool load_indirect_index(const std::string& path, uint64_t*& indirect_index, const std::vector<uint64_t>& tile_size);
   void set_trace_address(std::vector<addr_type>& trace_address) { _trace_address = trace_address; }
   size_t get_free_sram_size() { return _free_sram_size; }
-  void adjust_dram_address() {
-    int offset = std::inner_product(_idx_list.begin(), _idx_list.end(), _stride_list.begin(), 0);
-    dram_addr += offset * _precision;
-  }
   addr_type get_base_dram_address() { return dram_addr; }
   void set_free_sram_size(size_t sram_size) { _free_sram_size=sram_size; }
   void* get_owner() { return _owner; }
@@ -73,7 +70,6 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   int get_compute_type() { return _compute_type; }
   void set_numa_id(int numa_id) { _numa_id = numa_id; }
   uint32_t get_numa_id() { return _numa_id; }
-  std::vector<int>& get_idx_list() { return _idx_list; }
   std::vector<int>& get_tag_idx_list() { return _tag_idx_list; }
   std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
   std::vector<int>& get_tag_id() { return _tag_key; }
@@ -103,6 +99,7 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   size_t ready_counter;
   std::set<std::shared_ptr<Instruction>> child_inst;
   std::vector<size_t> tile_size;
+  std::vector<int> tile_stride;
   size_t _tile_numel;
   size_t _nr_waiting_request=0;
   size_t _precision=0;
@@ -110,13 +107,10 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   addr_type dram_addr;
   uint32_t _numa_id = 0; // For DMA instruction
   int _compute_type = 0;
-  std::vector<int> _idx_list;
-  std::vector<int> _stride_list;
   std::vector<int> _tag_idx_list;
   std::vector<int> _tag_stride_list;
   std::vector<int> _tag_key;
   std::vector<int> _accum_tag_idx_list;
-  std::vector<int> _loop_size_list;
   std::vector<addr_type> _trace_address;
   std::string _addr_name;
   int _addr_id;
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
index b5322b76..5b561127 100644
--- a/PyTorchSimBackend/include/TileGraphParser.h
+++ b/PyTorchSimBackend/include/TileGraphParser.h
@@ -175,17 +175,18 @@ class TileMemoryNode : public TileNode {
   std::string get_base_addr_name() { return _base_addr_name; }
   size_t get_precision() { return _element_size; }
   std::vector<size_t> get_tile_size() { return _tile_size; }
-  std::vector<int>& get_stride_list () { return _stride_list; }
+  std::vector<int>& get_tile_stride() { return _tile_stride; }
   std::vector<std::string>& get_tag_idx_list() { return _tag_idx_list; }
   std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
   std::vector<std::string>& get_loop_idx_list() { return _loop_idx_list; }
+  std::vector<int>& get_loop_stride_list () { return _loop_stride_list; }
   bool is_async_node() { return _is_async; }
   bool is_indirect() { return _is_indirect; }
   void print_node() override;
 
  private:
   std::vector<size_t> _tile_size;
-  std::vector<int> _stride_list;
+  std::vector<int> _tile_stride;
   size_t _element_size;
   bool _is_async;
   bool _is_indirect;
@@ -193,6 +194,7 @@ class TileMemoryNode : public TileNode {
   std::vector<std::string> _tag_idx_list;
   std::vector<int> _tag_stride_list;
   std::vector<std::string> _loop_idx_list;
+  std::vector<int> _loop_stride_list;
 };
 
 class TileMemoryWaitNode : public TileNode {
diff --git a/PyTorchSimBackend/src/Instruction.cc b/PyTorchSimBackend/src/Instruction.cc
index b706ca8f..aef9079c 100644
--- a/PyTorchSimBackend/src/Instruction.cc
+++ b/PyTorchSimBackend/src/Instruction.cc
@@ -11,23 +11,22 @@ std::string opcode_to_string(Opcode opcode) {
 }
 
 Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents,
-            addr_type dram_addr, std::vector<size_t> tile_size, size_t precision,
-            std::vector<int>& idx_list, std::vector<int>& stride_list,
+            addr_type dram_addr, std::vector<size_t> tile_size, std::vector<int> tile_stride, size_t precision,
             std::vector<int> tag_idx_list, std::vector<int> tag_stride_list,
-            std::vector<int> accum_tag_idx_list, std::vector<int> loop_size_list)
+            std::vector<int> accum_tag_idx_list)
   : opcode(opcode), compute_cycle(compute_cycle), ready_counter(num_parents), dram_addr(dram_addr),
-    tile_size(tile_size), _precision(precision), _idx_list(idx_list),
-    _stride_list(stride_list), _tag_idx_list(tag_idx_list), _tag_stride_list(tag_stride_list),
-    _accum_tag_idx_list(accum_tag_idx_list), _loop_size_list(loop_size_list) {
+    tile_size(tile_size), tile_stride(tile_stride), _precision(precision),
+    _tag_idx_list(tag_idx_list), _tag_stride_list(tag_stride_list),
+    _accum_tag_idx_list(accum_tag_idx_list) {
   assert(_tag_idx_list.size()==_tag_stride_list.size());
   _tile_numel = 1;
   for (auto dim : tile_size)
     _tile_numel *= dim;
+}
 
-  /* Supporting vector */
-  if (_stride_list.size() == 1) {
-    _stride_list.push_back(1);
-  }
+Instruction::Instruction(Opcode opcode)
+  : opcode(opcode), compute_cycle(0), ready_counter(0), dram_addr(0) {  // zero-init: ready_counter/dram_addr have no in-class default
+  _tile_numel = 1;
 }
 
 void Instruction::finish_instruction() {
@@ -73,8 +72,8 @@ std::shared_ptr<std::set<addr_type>> Instruction::get_dram_address(addr_type dra
   while (tile_size.size() < 4)
     tile_size.insert(tile_size.begin(), 1);
 
-  while (_stride_list.size() < 4)
-    _stride_list.insert(_stride_list.begin(), 0);
+  while (tile_stride.size() < 4)
+    tile_stride.insert(tile_stride.begin(), 0);
   if (_is_indirect_mode) {
     spdlog::trace("[Indirect Access] Indirect mode, dump_path: {}", _indirect_index_path);
     load_indirect_index(_indirect_index_path, indirect_index, tile_size);
@@ -85,10 +84,10 @@ std::shared_ptr<std::set<addr_type>> Instruction::get_dram_address(addr_type dra
     for (int dim1=0; dim1<tile_size.at(1); dim1++) {
       for (int dim2=0; dim2<tile_size.at(2); dim2++) {
         for (int dim3=0; dim3<tile_size.at(3); dim3++) {
-          addr_type address = dim0*_stride_list.at(_stride_list.size() - 4) + \
-                              dim1*_stride_list.at(_stride_list.size() - 3) + \
-                              dim2*_stride_list.at(_stride_list.size() - 2) + \
-                              dim3*_stride_list.at(_stride_list.size() - 1);
+          addr_type address = dim0*tile_stride.at(tile_stride.size() - 4) + \
+                              dim1*tile_stride.at(tile_stride.size() - 3) + \
+                              dim2*tile_stride.at(tile_stride.size() - 2) + \
+                              dim3*tile_stride.at(tile_stride.size() - 1);
           address = dram_addr + address * _precision;
           if (indirect_index != NULL) {
             uint64_t index_val = indirect_index[index_count++];
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 2b13ebc1..0f3e2ce9 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -199,12 +199,12 @@ TileMemoryNode::TileMemoryNode(onnx::NodeProto& node) : TileNode(node) {
       _base_addr_name = attribute.s();
     } else if (attribute.name() == "torchsim_element_size") {
       _element_size = attribute.i();
-    } else if (attribute.name() == "torchsim_stride_list") {
-      for (int i = 0; i < attribute.ints_size(); i++)
-        _stride_list.push_back(attribute.ints(i));
     } else if (attribute.name() == "torchsim_tile_size") {
       for (int i = 0; i < attribute.ints_size(); i++)
         _tile_size.push_back(attribute.ints(i));
+    } else if (attribute.name() == "torchsim_tile_stride") {
+      for (int i = 0; i < attribute.ints_size(); i++)
+        _tile_stride.push_back(attribute.ints(i));
     } else if (attribute.name() == "torchsim_tag_idx_list") {
       for (int i = 0; i < attribute.strings_size(); i++)
         _tag_idx_list.push_back(attribute.strings(i));
@@ -214,6 +214,9 @@ TileMemoryNode::TileMemoryNode(onnx::NodeProto& node) : TileNode(node) {
     } else if (attribute.name() == "torchsim_loop_idx_list") {
       for (int i = 0; i < attribute.strings_size(); i++)
         _loop_idx_list.push_back(attribute.strings(i));
+    } else if (attribute.name() == "torchsim_loop_stride_list") {
+      for (int i = 0; i < attribute.ints_size(); i++)
+        _loop_stride_list.push_back(attribute.ints(i));
     } else if (attribute.name() == "torchsim_is_async") {
       _is_async = attribute.i();
     } else if (attribute.name() == "torchsim_indirect_mode") {
@@ -230,8 +233,9 @@ void TileMemoryNode::print_node() {
   std::string spaces(get_depth(), '\t');
   spdlog::debug("{} base_addr_name: {}", spaces, _base_addr_name);
   spdlog::debug("{} element_size: {}", spaces, _element_size);
-  spdlog::debug("{} stride_list: {} ", spaces, _stride_list);
+  spdlog::debug("{} loop_stride_list: {} ", spaces, _loop_stride_list);
   spdlog::debug("{} tile_size: {} ", spaces, _tile_size);
+  spdlog::debug("{} tile_stride: {} ", spaces, _tile_stride);
   spdlog::debug("{} tag_list: {}", spaces, fmt::join(_tag_idx_list, ", "));
   spdlog::debug("{} tag_stride_list: {}", spaces, fmt::join(_tag_stride_list, ", "));
   spdlog::debug("{} index_list: {}", spaces, fmt::join(_loop_idx_list, ", "));
@@ -342,30 +346,26 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
   for (auto& tile_node: _body_node) {
     if (tile_node->get_type() == TileType::LOAD_NODE) {
       std::shared_ptr<TileMemoryNode> mem_node = std::static_pointer_cast<TileMemoryNode>(tile_node);
-      auto base_addr_name = mem_node->get_base_addr_name();
-      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
-      std::vector<std::string>& tag_idx_list = mem_node->get_tag_idx_list();
-      std::vector<int>& tag_stride_list = mem_node->get_tag_stride_list();
-      std::vector<int> skip_idx_list;
-      std::vector<int> values;
-
-      /* Lookup given name's address */
-      addr_type base_addr = tog_parser->lookup(base_addr_name);
       std::vector<int> iter_list;
-      std::vector<int> tag_list;
-      std::vector<int> accum_tag_list;
-      std::vector<int> loop_size_list;
-      std::vector<uint32_t> outer_loop_idx;
-      std::vector<uint32_t> outer_loop_size;
       int nr_inner_loop = 0;
       auto& loop_idx_list = mem_node->get_loop_idx_list();
       for (auto loop_idx: loop_idx_list) {
-        auto iter_value = getLoopIndexValue(iter, loop_idx);
+        int iter_value = getLoopIndexValue(iter, loop_idx);
         iter_list.push_back(iter_value);
-        loop_size_list.push_back(tog_parser->get_loop_size(loop_idx));
         if (tog_parser->get_loop_type(loop_idx)==LoopType::INNER_LOOP)
           nr_inner_loop++;
       }
+
+      /* Base address setting */
+      std::string base_addr_name = mem_node->get_base_addr_name();
+      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
+      addr_type base_addr = tog_parser->lookup(base_addr_name);
+      addr_type offset = std::inner_product(iter_list.begin(), iter_list.end(), mem_node->get_loop_stride_list().begin(), 0);
+
+      std::vector<int> tag_list;
+      std::vector<int> accum_tag_list;
+      std::vector<uint32_t> outer_loop_idx;
+      std::vector<uint32_t> outer_loop_size;
       /* Add accumulation loop info to accum_tag list */
       for (auto loop_idx = loop_idx_list.begin();
             loop_idx != loop_idx_list.end() - nr_inner_loop; ++loop_idx) {
@@ -387,7 +387,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       }
 
       uint32_t systolic_size = std::stoi(tog_parser->getMetaByName("systolic_size"));
-      for (auto loop_idx: tag_idx_list) {
+      for (auto loop_idx: mem_node->get_tag_idx_list()) {
         if (iter.find(loop_idx) == iter.end())
           tag_list.push_back(0);
         else {
@@ -406,25 +406,32 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         int stride_idx = calculateAddress(outer_loop_size, tog_parser->lookupNumaInfo(base_addr_name));
         numa_id = total_idx / stride_idx;
       }
+
       /* Check need to make this memory node */
+      std::vector<int>& tag_stride_list = mem_node->get_tag_stride_list();
       std::vector<int> key = tog_parser->calc_tag(accum_tag_list, tag_list, tag_stride_list);
       if (tog_parser->check_memory_tag(base_addr_name, key))
         continue;
       tog_parser->register_memory_tag(base_addr_name, key);
 
       printIndexMap("[TOGParser] Load Node " + mem_node->get_base_addr_name() + " Numa_id: " + std::to_string(numa_id), iter);
+      spdlog::trace("[TOGParser] Load Node {} key = [{}], accum = [{}], tag = [{}], stride = [{}]", mem_node->get_base_addr_name(),
+             fmt::join(key, ", "),
+             fmt::join(accum_tag_list, ", "),
+             fmt::join(tag_list, ", "),
+             fmt::join(tag_stride_list, ", "));
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::MOVIN, 0,
-        0, base_addr,
-        mem_node->get_tile_size(), mem_node->get_precision(), iter_list,
-        mem_node->get_stride_list(), tag_list, tag_stride_list, accum_tag_list, loop_size_list
+        0, base_addr+offset,
+        mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_precision(),
+        tag_list, tag_stride_list, accum_tag_list
       );
       inst->set_addr_name(base_addr_name, base_addr_id);
       inst->prepare_tag_key();
       inst->set_nr_inner_loop(nr_inner_loop);
-      inst->adjust_dram_address();
       inst->set_is_async(mem_node->is_async_node());
       inst->set_numa_id(numa_id);
+
       if (mem_node->is_indirect()) {
         inst->set_indirect_index_path(tog_parser->get_indirect_path());
         tog_parser->inc_indirect_counter();
@@ -439,14 +446,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       tile_vec.back()->append_instuction(inst);
     } else if (tile_node->get_type() == TileType::STORE_NODE) {
       std::shared_ptr<TileMemoryNode> mem_node = std::static_pointer_cast<TileMemoryNode>(tile_node);
-      auto base_addr_name = mem_node->get_base_addr_name();
-      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
-      /* Lookup given name's address */
-      addr_type base_addr = tog_parser->lookup(base_addr_name);
-      std::vector<int>& tag_stride_list = mem_node->get_tag_stride_list();
-      std::vector<int> accum_tag_list;
       std::vector<int> iter_list;
-      std::vector<int> loop_size_list;
       std::vector<uint32_t> outer_loop_idx;
       std::vector<uint32_t> outer_loop_size;
       int nr_inner_loop = 0;
@@ -454,7 +454,6 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       for (auto loop_idx: loop_idx_list) {
         auto iter_value = getLoopIndexValue(iter, loop_idx);
         iter_list.push_back(iter_value);
-        loop_size_list.push_back(tog_parser->get_loop_size(loop_idx));
         if (tog_parser->get_loop_type(loop_idx)==LoopType::INNER_LOOP)
           nr_inner_loop++;
         if (tog_parser->get_loop_type(loop_idx)==LoopType::PARALLEL_LOOP) {
@@ -465,6 +464,12 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         }
       }
 
+      /* Lookup given name's address */
+      std::string base_addr_name = mem_node->get_base_addr_name();
+      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
+      addr_type base_addr = tog_parser->lookup(base_addr_name);
+      addr_type offset = std::inner_product(iter_list.begin(), iter_list.end(), mem_node->get_loop_stride_list().begin(), 0);
+
       /* Calc numa id */
       int numa_id = 0;
       auto numa_stride_size = tog_parser->lookupNumaInfo(base_addr_name).size();
@@ -477,14 +482,13 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       printIndexMap("[TOGParser] Store Node " + mem_node->get_base_addr_name() + " Numa_id: " + std::to_string(numa_id), iter);
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::MOVOUT, 0,
-        0, base_addr,
-        mem_node->get_tile_size(), mem_node->get_precision(), iter_list,
-        mem_node->get_stride_list(), std::vector<int>(1), tag_stride_list, accum_tag_list, loop_size_list
+        0, base_addr+offset,
+        mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_precision(),
+        std::vector<int>(1), mem_node->get_tag_stride_list(), std::vector<int>()
       );
       inst->set_addr_name(base_addr_name, base_addr_id);
       inst->prepare_tag_key();
       inst->set_nr_inner_loop(nr_inner_loop);
-      inst->adjust_dram_address();
       inst->set_is_async(mem_node->is_async_node());
       inst->set_numa_id(numa_id);
       if (mem_node->is_indirect()) {
@@ -530,11 +534,16 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
           new_tag_stride_list.push_back(i);
       }
 
+      spdlog::trace("[TOGParser] Wait Node {}, accum = [{}], tag = [{}], stride = [{}]", wait_node->get_base_addr_name(),
+             fmt::join(accum_tag_list, ", "),
+             fmt::join(tag_list, ", "),
+             fmt::join(new_tag_stride_list, ", "));
+
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::BAR, 0,
         0, base_addr,
-        std::vector<size_t>(), 0, iter_list,
-        iter_list, tag_list, new_tag_stride_list, accum_tag_list, std::vector<int>()
+        std::vector<size_t>(), std::vector<int>(), 0,
+        tag_list, new_tag_stride_list, accum_tag_list
       );
       inst->set_addr_name(base_addr_name, base_addr_id);
       inst->prepare_tag_key();
@@ -543,15 +552,14 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
     } else if (tile_node->get_type() == TileType::COMPUTE_NODE) {
       printIndexMap("[TOGParser] Compute Node ", iter);
       std::shared_ptr<TileComputeNode> compute_node = std::static_pointer_cast<TileComputeNode>(tile_node);
-      std::vector<int> iter_list;
       std::vector<int> tag_list = {0};
       std::vector<int> tag_stride_list = {1};
       std::vector<int> accum_tag_list;
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::COMP, compute_node->get_cycle(),
         0, 0,
-        std::vector<size_t>(), 0, iter_list, iter_list,
-        tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
+        std::vector<size_t>(), std::vector<int>(), 0,
+        tag_list, tag_stride_list, accum_tag_list
       );
       inst->set_overlapping_cycle(compute_node->get_overlapping_cycle());
       inst->set_compute_type(compute_node->get_compute_type());
@@ -620,72 +628,28 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
     } else if (tile_node->get_type() == TileType::STONNE_NODE) {
       printIndexMap("[TOGParser] Stonne Node ", iter);
       std::shared_ptr<TileStonneNode> stonne_node = std::static_pointer_cast<TileStonneNode>(tile_node);
-      /* Lookup given name's address */
-      std::vector<int> iter_list;
-      std::vector<int> tag_list;
-      std::vector<int> tag_stride_list;
-      std::vector<int> accum_tag_list;
-
-      /* Put dummy computation instruction */
-      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
-        Opcode::COMP, 0,
-        0, 0,
-        std::vector<size_t>(), 0, iter_list,
-        iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
-      );
+      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(Opcode::COMP);
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);
       tile_vec.back()->set_custom_data(stonne_node->getDesc());
       tile_vec.back()->set_stonne_tile(true);
     } else if (tile_node->get_type() == TileType::STONNE_TRACE_COMPUTE_NODE) {
       std::shared_ptr<TileStonneTraceComputeNode> stonne_node = std::static_pointer_cast<TileStonneTraceComputeNode>(tile_node);
-      /* Lookup given name's address */
-      std::vector<int> iter_list;
-      std::vector<int> tag_list;
-      std::vector<int> tag_stride_list;
-      std::vector<int> accum_tag_list;
-
-      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
-        Opcode::COMP, stonne_node->get_cycle(),
-        0, 0,
-        std::vector<size_t>(), 0, iter_list,
-        iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
-      );
+      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(Opcode::COMP);
+      inst->set_compute_cycle(stonne_node->get_cycle());
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);
       tile_vec.back()->set_stonne_tile(true);
     } else if (tile_node->get_type() == TileType::STONNE_TRACE_LOAD_NODE) {
       std::shared_ptr<TileStonneTraceLoadNode> stonne_node = std::static_pointer_cast<TileStonneTraceLoadNode>(tile_node);
-      /* Lookup given name's address */
-      std::vector<int> iter_list;
-      std::vector<int> tag_list;
-      std::vector<int> tag_stride_list;
-      std::vector<int> accum_tag_list;
-
-      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
-        Opcode::MOVIN, 0,
-        0, 0,
-        std::vector<size_t>(), 0, iter_list,
-        iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
-      );
+      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(Opcode::MOVIN);
       inst->set_trace_address(stonne_node->get_address());
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);
       tile_vec.back()->set_stonne_tile(true);
     } else if (tile_node->get_type() == TileType::STONNE_TRACE_STORE_NODE) {
       std::shared_ptr<TileStonneTraceStoreNode> stonne_node = std::static_pointer_cast<TileStonneTraceStoreNode>(tile_node);
-      /* Lookup given name's address */
-      std::vector<int> iter_list;
-      std::vector<int> tag_list;
-      std::vector<int> tag_stride_list;
-      std::vector<int> accum_tag_list;
-
-      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
-        Opcode::MOVOUT, 0,
-        0, 0,
-        std::vector<size_t>(), 0, iter_list,
-        iter_list, tag_list, tag_stride_list, accum_tag_list, std::vector<int>()
-      );
+      std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(Opcode::MOVOUT);
       inst->set_trace_address(stonne_node->get_address());
       link_map[tile_node] = inst;
       tile_vec.back()->append_instuction(inst);

From 94b13e1a71893ba93fcd79b31deeac7d18cb3b1d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Jul 2025 12:45:46 +0000
Subject: [PATCH 413/432] [Frontend] Make dma tag unique

---
 PyTorchSimFrontend/extension_config.py        |  2 +-
 .../mlir/mlir_codegen_backend.py              | 19 +++++++++++--------
 PyTorchSimFrontend/mlir/mlir_template.py      |  9 ++++-----
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index d60826a1..8994cffe 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -37,7 +37,7 @@
 # Backendsim config
 CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG',
                                         default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json')
-CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", True))
+CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False))
 CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False))
 CONFIG_BACKENDSIM_DRYRUN = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
 CONFIG_BACKENDSIM_DEBUG_LEVEL = os.environ.get("BACKENDSIM_DEBUG_LEVEL", "")
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 79d735a3..725fec5d 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -877,6 +877,7 @@ def __init__(self, kernel_group, reason=None):
         self.spadbuf_counter = 0
         self.dma_read_counter = 1
         self.dma_write_counter = 1
+        self.dma_tag_id = 0
         self.affine_yield = {}
         self.welford_reduce_out = None
         self.reduce_iterator = {}
@@ -1028,7 +1029,7 @@ def load(self, name: str, index: sympy.Expr):
         # MVIN Encoding
         attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding={padding}}}"
         code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, attribute)
+                                 dram_shape, tile_shape, attribute)
         self.cse.generate(self.dma_loads, code, assignment = False) # FIXME: assignment = False does not support caching
         compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
         # Generate vector load instruction
@@ -1090,7 +1091,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         # Generate DMA instruction
         attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, attribute)
+                                 dram_shape, tile_shape, attribute)
         self.dma_stores.writeline(common.DeferredLine(name, code))
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
@@ -1243,7 +1244,7 @@ def store_reduction(self, name, index, value):
         # Generate DMA instruction
         attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, attribute)
+                                 dram_shape, tile_shape, attribute)
         self.reductions_suffix.writeline(common.DeferredLine(name, code))
 
         # Restore origin cse
@@ -1655,7 +1656,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         return local_tile_desc, index_var, dram_stride
 
     def get_dma_code(self, dma_type_name, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, dram_index_var, sram_var, sram_index_var,
-                     tag_name, dram_shape, tile_shape, attribute):
+                     dram_shape, tile_shape, attribute):
         dma_key = (vlane_split_axis, vlane_stride, mlir_dtype)
         if dma_type_name == "MVIN" and dma_key in self.dma_read_cache:
             dma_type, vlane_split_axis, vlane_stride = self.dma_read_cache[dma_key]
@@ -1670,9 +1671,8 @@ def get_dma_code(self, dma_type_name, vlane_split_axis, vlane_stride, mlir_dtype
                 self.dma_read_cache[dma_key] = [dma_type, vlane_split_axis, vlane_stride]
             else:
                 dma_type = self.get_const_cse(DMA_TYPE[f"{dma_type_name}{self.dma_write_counter}"])
-                # self.dma_write_counter += 1 Is it okay?
                 self.dma_write_cache[dma_key] = [dma_type, vlane_split_axis, vlane_stride]
-        tag = self.get_tag_cse(tag_name)
+        tag = self.get_tag_cse()
         zero_cse = self.get_const_cse(0)
 
         # Prepare opearnds and attributes
@@ -1742,9 +1742,12 @@ def get_const_cse(self, value, dtype="index") -> common.CSEVariable:
             self.consts[str(value)+dtype] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : {dtype}")
         return self.consts[str(value)+dtype]
 
-    def get_tag_cse(self, value, shape="memref<1xi32>"):
+    def get_tag_cse(self, value=None, shape="memref<1xi32>"):
+        if value is None:
+            value = self.dma_tag_id
+            self.dma_tag_id += 1
         if value not in self.tags:
-            self.tags[value] = self.alloc_cse.generate(self.alloc_buffer, f"memref.alloc() : {shape}")
+            self.tags[value] = self.alloc_cse.generate(self.alloc_buffer, f"memref.alloc() : {shape} // {value}")
         return self.tags[value]
 
     def get_mask(self):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 1da2e755..0455cbf1 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -663,7 +663,6 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com
         # Prepare code block
         local_code = IndentedBuffer()
         with V.set_kernel_handler(self):
-            tag = f"mvint_{self.dma_read_counter}" if dma_type == "MVIN" else f"mvoutt_{self.dma_write_counter}"
             index_var = self.parse_index_list(index_list, local_code)
             node_layout = self.named_nodes[dram_var].get_layout()
             numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel()
@@ -696,7 +695,7 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com
                 attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}")
             attribute = "  {" + ", ".join(attribute_parts) + "}"
             code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                    tag, dram_shape, tile_shape, "")
+                                     dram_shape, tile_shape, "")
             local_code.writeline(code)
             local_code.writeline(attribute)
         return textwrap.indent(local_code.getvalue(), " "*indent_size).strip()
@@ -749,7 +748,7 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index)
             attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
             code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                     f"{name}_tag", dram_shape, tile_shape, attribute)
+                                     dram_shape, tile_shape, attribute)
             self.cse.generate(self.dma_loads, code, assignment = False)
             self.buffer_names[name] = sram_var
         else:
@@ -831,7 +830,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         # Generate DMA instruction
         attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 f"{name}_tag", dram_shape, tile_shape, attribute)
+                                 dram_shape, tile_shape, attribute)
         self.dma_stores.writeline(DeferredLine(name, code))
 
     def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
@@ -991,7 +990,7 @@ def store_reduction_epilogue(self, name, index, value):
         # Generate DMA instruction
         attribute = f"{{dram_stride={dram_stride}, sram_stride={final_tile_stride}, padding=0}}"
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                f"{name}_tag", dram_shape, final_tile_shape, attribute)
+                                dram_shape, final_tile_shape, attribute)
         self.reductions_suffix.writeline(DeferredLine(name, code))
 
     def set_tile_size(self, template_fusion_info, prologue=False):

From 737ed02eac858481e40201655c55b34b17645193 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Jul 2025 15:14:59 +0000
Subject: [PATCH 414/432] [TOGSim] Handle edge case tag matching

---
 PyTorchSimBackend/src/TileGraphParser.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 0f3e2ce9..12056f94 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -375,6 +375,10 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
           accum_tag_list.push_back(iter_value);
         }
       }
+      /* Default accum tag */
+      if (accum_tag_list.empty()) {
+        accum_tag_list.push_back(0);
+      }
 
       for (auto loop_idx = loop_idx_list.begin();
             loop_idx != loop_idx_list.end(); ++loop_idx) {
@@ -527,6 +531,10 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
           tag_list.push_back(iter_value);
         }
       }
+      /* Default accum tag */
+      if (accum_tag_list.empty()) {
+        accum_tag_list.push_back(0);
+      }
 
       /* Skip accum stride */
       for (auto i : tag_stride_list) {

From b18bcc0f78bad09e642f208811a4c53e6054e98b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 21 Jul 2025 15:15:37 +0000
Subject: [PATCH 415/432] [Frontend/Fusion] Do not allow prologue fusion for
 CONV

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 25 ++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index e037207f..f1c72c44 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -24,6 +24,7 @@ def __init__(self, scheduler):
         self.scheduler = scheduler
         self.scheduler.can_fuse_origin = self.scheduler.can_fuse
         self.scheduler.can_fuse = self.can_fuse_with_exceptions
+        #self.scheduler.enter_context = self.enter_context_fixed # FIXME. Monkey patch: For fixing the inductor bug
         self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
         self._ready_to_flush = False
         self.outer_function = set()
@@ -67,9 +68,14 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
                 return False
             if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME
                 return False
-            # We don't fuse this case...
-            if (isinstance(target_node.template, MLIRBMMTemplate) or isinstance(target_node.template, MLIRGemmTemplate)) and base_template_node2[0].group[1][0][0] == 1:
-                    return False
+
+            # Currently only BMM, MM support prologue fusion
+            if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
+                return False
+            # We don't fuse this edge case...
+            if base_template_node2[0].group[1][0][0] == 1:
+                return False
+
             if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
                 node1 = self.revert_group(node1)
                 return True
@@ -368,4 +374,15 @@ def codegen_template(self, template_node, epilogue_nodes):
             V.graph.wrapper_code.writeline(
                 f"yield ({target_kernel_name}, ({args}))"
             )
-        self._set_flush_status(True)
\ No newline at end of file
+        self._set_flush_status(True)
+
+    def enter_context_fixed(self, node):
+        def get_order(n):
+            if n not in self.scheduler.origin_to_index:
+                self.scheduler.origin_to_index.update({n: i for i, n in enumerate(n.graph.nodes)})
+            return self.scheduler.origin_to_index[n]
+
+        origins = [(get_order(e), idx, e) for n in node.get_nodes() for idx, e in enumerate(n.node.origins)]
+        if origins:
+            _, _, last = max(origins)
+            V.graph.wrapper_code.enter_context(last)

From c90ee3dfdd1f049617591cdb77d66d61efb63d0a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 22 Jul 2025 04:41:12 +0000
Subject: [PATCH 416/432] [Log] Add silent mode for autotune

---
 PyTorchSimFrontend/extension_codecache.py | 13 ++++----
 PyTorchSimFrontend/mlir/mlir_autotune.py  |  2 +-
 Simulator/simulator.py                    | 38 +++++++++++++----------
 3 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 79d2b9d0..55719e08 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -159,7 +159,7 @@ def load(cls, source_code,
              cycle_wrapper_name="cycle_wrapper",
              cycle_binary_name="cycle_bin",
              arg_attributes=[], vectorlane_size=16,
-             spad_info=None, origins=None, **kwargs):
+             spad_info=None, origins=None, silent_mode=False, **kwargs):
         vlen = kwargs['vlen']
         vlenb = vlen // 8
         write_path = get_write_path(source_code)
@@ -239,7 +239,7 @@ def load(cls, source_code,
 
             # Run cyclesim
             cyclesim = CycleSimulator()
-            cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), " ".join(array_size), vectorlane_size)
+            cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), " ".join(array_size), vectorlane_size, silent_mode=silent_mode)
 
             # Create TOG
             w_offset, x_offset = vectorlane_size, vectorlane_size
@@ -335,13 +335,14 @@ def __init__(self):
         self.cycle_wrapper_name = "cycle_wrapper"
         self.cycle_binary_name = "cycle_binary"
 
-    def mlir(self, source_code, arg_attributes=[], vectorlane_size=16, tile_size=[], spad_info=None, origins=None, **kwargs):
+    def mlir(self, source_code, arg_attributes=[], vectorlane_size=16, tile_size=[], spad_info=None, origins=None, silent_mode=False, **kwargs):
         def task():
             key = MLIRCodeCache.load(source_code,
                                           valdiation_wrapper_name=self.validation_binary_name,
                                           validation_binary_name=self.validation_binary_name,
                                           arg_attributes=arg_attributes, vectorlane_size=vectorlane_size,
-                                          tile_size=tile_size, spad_info=spad_info, origins=origins, **kwargs)
+                                          tile_size=tile_size, spad_info=spad_info, origins=origins,
+                                          silent_mode=silent_mode, **kwargs)
             return key
         future = self.submit(task)
         if "loop_size" in kwargs:
@@ -363,7 +364,7 @@ def dummy_simulator(*args, **kwargs):
                 funcsim.run_spike(args, arg_attributes,
                                   runtime_path, self.validation_binary_name,
                                   vectorlane_size=vectorlane_size, spad_info=spad_info,
-                                  cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS)
+                                  cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS, silent_mode=silent_mode)
             if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY:
                 return
 
@@ -373,7 +374,7 @@ def dummy_simulator(*args, **kwargs):
             backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG)
             backsim.vectorlane_size = vectorlane_size
             attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size)
-            result_path = backsim.simulation(onnx_path, attribute_path)
+            result_path = backsim.simulation(onnx_path, attribute_path, silent_mode=silent_mode)
             result = BackendSimulator.get_result_from_file(result_path)
             return result
 
diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
index 804cd2e8..af101f44 100644
--- a/PyTorchSimFrontend/mlir/mlir_autotune.py
+++ b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -50,7 +50,7 @@ def make_run_fn(
             self.source_code, vectorlane_size=self.extra_args["vector_lane"],
             loop_size=None, spad_info=self.extra_args["spad_info"],
             vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"],
-            origins="Unknown")
+            origins="Unknown", silent_mode=True)
 
         args = [
             tensor
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 105edfa2..0d1bab6a 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -74,7 +74,7 @@ def dump_args(self, args, arg_attributes, load_path, dump_path):
 
         return array_size, file_path
 
-    def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=4, spad_info=None, cleanup=False):
+    def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=4, spad_info=None, cleanup=False, silent_mode=False):
         load_path = runtime_path
         dump_path = runtime_path
 
@@ -101,8 +101,8 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
         os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True)
         os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True)
         run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
-
-        print("[SpikeSimulator] cmd> ", run)
+        if not silent_mode:
+            print("[SpikeSimulator] cmd> ", run)
         run_cmd = shlex.split(run)
         try:
             subprocess.check_call(run_cmd)
@@ -146,7 +146,7 @@ class CycleSimulator():
     def __init__(self) -> None:
         pass
 
-    def compile_and_simulate(self, target_binary, array_size, vectorlane_size):
+    def compile_and_simulate(self, target_binary, array_size, vectorlane_size, silent_mode=False):
         def show_progress():
             i = 0
             while not finished:
@@ -160,7 +160,7 @@ def show_progress():
         gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)]
         try:
             # Create progress thread
-            is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False))
+            is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) or silent_mode
             if not is_dryrun:
                 print("[Gem5Simulator] cmd> ", " ".join(gem5_cmd))
                 finished = False
@@ -202,7 +202,7 @@ def get_backend_command(self):
         cmd = f"{bin} --config {config}"
         return cmd
 
-    def simulation(self, model_path, attribute_path=""):
+    def simulation(self, model_path, attribute_path="", silent_mode=False):
         def show_progress():
             i = 0
             while not finished:
@@ -216,21 +216,25 @@ def show_progress():
             cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}"
         if attribute_path:
             cmd = f"{cmd} --attributes_list {attribute_path}"
-        print("[BackendSimulator] cmd> ", cmd)
+        if not silent_mode:
+            print("[BackendSimulator] cmd> ", cmd)
 
         # Create progress thread
-        finished = False
-        progress_thread = threading.Thread(target=show_progress)
-        progress_thread.start()
+        if not silent_mode:
+            finished = False
+            progress_thread = threading.Thread(target=show_progress)
+            progress_thread.start()
         try:
             result = subprocess.check_output(shlex.split(cmd))
-            finished = True
-            progress_thread.join()
+            if not silent_mode:
+                finished = True
+                progress_thread.join()
         except subprocess.CalledProcessError as e:
-            finished = True
-            progress_thread.join()
-            print("[BackendSimulator] Command failed with exit code", e.returncode)
-            print("[BackendSimulator] Error output:", e.output)
+            if not silent_mode:
+                finished = True
+                progress_thread.join()
+                print("[BackendSimulator] Command failed with exit code", e.returncode)
+                print("[BackendSimulator] Error output:", e.output)
             assert 0
         result_path = extension_config.CONFIG_BACKEND_RESULT_PATH_KEY
         if result_path is None:
@@ -242,7 +246,7 @@ def show_progress():
         result_path = os.path.join(result_path, file_name)
         with open(result_path, "w") as f:
             f.write(result.decode())
-            print(f'[BackendSimulator] Simulation of "{model_path}" is stored to "{result_path}"')
+        print(f'[BackendSimulator] Simulation of "{model_path}" is stored to "{result_path}"')
         return result_path
 
     def interactive_simulation(self):

From 381d8670df374c29cce25a821cf51975ba91da78 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 22 Jul 2025 05:03:05 +0000
Subject: [PATCH 417/432] [Frontend] Increase autotune thread count

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 725fec5d..75cc9c7c 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1483,7 +1483,7 @@ def get_cycle(choice):
 
         if len(choices) == 0: # can't autotune
             return None
-        with ThreadPoolExecutor(max_workers=1) as executor:
+        with ThreadPoolExecutor(max_workers=8) as executor:
             results = list(executor.map(get_cycle, choices))
         max_idx = results.index(min(results))
         print(f"[Auto-tune] Optimal tile size: {choices[max_idx][2].tile_desc.get_tile_size()}, vlane_stride: {choices[max_idx][2].tile_desc.vlane_stride}, cycles: {results[max_idx]}")

From 143b95bbfd47f69f797217c2d77f78b00cc990df Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 22 Jul 2025 05:03:31 +0000
Subject: [PATCH 418/432] [Simulator] Use filelock to avoid race condition

---
 PyTorchSimFrontend/extension_codecache.py | 49 ++++++++++++-----------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 55719e08..83561bd4 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -353,30 +353,33 @@ def dummy_simulator(*args, **kwargs):
             validate = kwargs.get('validate', False)
             # Wait for compilation
             key = future.result()
+            from filelock import FileLock
+            lock_dir = get_lock_dir()
+            lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+            with lock:
+                # Run simulator pass
+                result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key))
+                # Dump arguments and meta data
+                dump_metadata(args, arg_attributes, result_path)
+                runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path)
+                if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate:
+                    funcsim = FunctionalSimulator(result_path, key)
+                    funcsim.run_spike(args, arg_attributes,
+                                    runtime_path, self.validation_binary_name,
+                                    vectorlane_size=vectorlane_size, spad_info=spad_info,
+                                    cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS, silent_mode=silent_mode)
+                if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY:
+                    return
 
-            # Run simulator pass
-            result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key))
-            # Dump arguments and meta data
-            dump_metadata(args, arg_attributes, result_path)
-            runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path)
-            if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate:
-                funcsim = FunctionalSimulator(result_path, key)
-                funcsim.run_spike(args, arg_attributes,
-                                  runtime_path, self.validation_binary_name,
-                                  vectorlane_size=vectorlane_size, spad_info=spad_info,
-                                  cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS, silent_mode=silent_mode)
-            if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY:
-                return
-
-            onnx_path = os.path.join(result_path, "tile_graph.onnx")
-            attribute_path = os.path.join(runtime_path, "attribute")
-            backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
-            backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG)
-            backsim.vectorlane_size = vectorlane_size
-            attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size)
-            result_path = backsim.simulation(onnx_path, attribute_path, silent_mode=silent_mode)
-            result = BackendSimulator.get_result_from_file(result_path)
-            return result
+                onnx_path = os.path.join(result_path, "tile_graph.onnx")
+                attribute_path = os.path.join(runtime_path, "attribute")
+                backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend")
+                backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG)
+                backsim.vectorlane_size = vectorlane_size
+                attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size)
+                result_path = backsim.simulation(onnx_path, attribute_path, silent_mode=silent_mode)
+                result = BackendSimulator.get_result_from_file(result_path)
+                return result
 
         def dryrun_simulator(*args, **kwargs):
             autotune = kwargs.get('autotune', False)

From 3e5ee249e11e4a1787de0b41d267d698b2664a31 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 23 Jul 2025 08:43:09 +0000
Subject: [PATCH 419/432] [Frontend/autotune] Add failure case

---
 .../mlir/mlir_codegen_backend.py              | 25 +++++++++++--------
 Simulator/simulator.py                        |  4 ++-
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 75cc9c7c..ff87c1d3 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1467,17 +1467,18 @@ def get_cycle(choice):
                         out = bench_runner(validate=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE)
                     return out[-1]
                 except (extension_codecache.SpadOverflowError, RuntimeError) as e:
-                    if isinstance(e, RuntimeError) and str(e) != "STACK_OVERFLOW":
-                        print(f"Benchmark[trial-{n_try}] failed with unexpected error: {e}")
-                        raise
-                    print(f"Benchmark failed due to spad overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}")
-                    self.kernel_group = kernel_group # Reset to the original tile desc
-                    self.reset("spad_overflow")
-                    src_code = super().codegen_nodes(nodes, kernel_name)
-                    bench_runner = self.run_bench(nodes, kernel_name, src_code)
-                    kernel_group = self.kernel_group
-                    self._prepare_simulator_headers(src_code)
-            raise RuntimeError("[Auto-tune] Exceeded maximum number of autotuning attempts")
+                    return float("inf")
+                    #if isinstance(e, RuntimeError) and str(e) != "STACK_OVERFLOW":
+                    #    print(f"Benchmark[trial-{n_try}] failed with unexpected error: {e}")
+                    #    return float("inf")
+                    #print(f"Benchmark failed due to spad overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}")
+                    #self.kernel_group = kernel_group # Reset to the original tile desc
+                    #self.reset("spad_overflow")
+                    #src_code = super().codegen_nodes(nodes, kernel_name)
+                    #bench_runner = self.run_bench(nodes, kernel_name, src_code)
+                    #kernel_group = self.kernel_group
+                    #self._prepare_simulator_headers(src_code)
+            return float("inf") # Exceeded maximum number of autotuning attempts
 
         choices = self.make_choices(nodes, kernel_name)
 
@@ -1486,6 +1487,8 @@ def get_cycle(choice):
         with ThreadPoolExecutor(max_workers=8) as executor:
             results = list(executor.map(get_cycle, choices))
         max_idx = results.index(min(results))
+        if min(results) == float("inf"):
+            raise RuntimeError("Failed to find optimal tile size...")
         print(f"[Auto-tune] Optimal tile size: {choices[max_idx][2].tile_desc.get_tile_size()}, vlane_stride: {choices[max_idx][2].tile_desc.vlane_stride}, cycles: {results[max_idx]}")
         optimal_src_code = choices[max_idx][1]
         return optimal_src_code
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 0d1bab6a..292c5a9b 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -105,7 +105,9 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
             print("[SpikeSimulator] cmd> ", run)
         run_cmd = shlex.split(run)
         try:
-            subprocess.check_call(run_cmd)
+            stdout_setting = subprocess.DEVNULL if silent_mode else None
+            stderr_setting = subprocess.DEVNULL if silent_mode else None
+            subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting)
         except subprocess.CalledProcessError as e:
             print("[SpikeSimulator] Command failed with exit code", e.returncode)
             error_msg = ""

From 35947ae3eaec609d83c480de188b5d24f190722a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 24 Jul 2025 06:55:02 +0000
Subject: [PATCH 420/432] [TOGSim] minor update in logging system

---
 PyTorchSimBackend/src/TileGraphParser.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 12056f94..9374dcb5 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -419,7 +419,8 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       tog_parser->register_memory_tag(base_addr_name, key);
 
       printIndexMap("[TOGParser] Load Node " + mem_node->get_base_addr_name() + " Numa_id: " + std::to_string(numa_id), iter);
-      spdlog::trace("[TOGParser] Load Node {} key = [{}], accum = [{}], tag = [{}], stride = [{}]", mem_node->get_base_addr_name(),
+      spdlog::trace("[TOGParser] Load Node {}({}) key = [{}], accum = [{}], tag = [{}], stride = [{}]", mem_node->get_base_addr_name(),
+             base_addr_id,
              fmt::join(key, ", "),
              fmt::join(accum_tag_list, ", "),
              fmt::join(tag_list, ", "),
@@ -726,7 +727,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
       spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", "));
     }
   }
-  if (_attribute_json.contains("sram_alloc")) {
+  if (_attribute_json.contains("sram_alloc") and _attribute_json.contains("l2d_type") and _attribute_json["l2d_type"] == "datacache") {
     auto sram_alloc_list = _attribute_json["sram_alloc"];
     spdlog::info("[TOGParser/Attribute] ================= SRAM Alloc Plan ================");
     for (auto it = sram_alloc_list.begin(); it != sram_alloc_list.end(); ++it) {

From 9cda90c45b5219e7db0705fb68717a8351d74115 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 24 Jul 2025 06:55:42 +0000
Subject: [PATCH 421/432] [Frontend/Conv] Fix conv template bias codegen

---
 PyTorchSimFrontend/mlir/mlir_conv_mt_template.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_conv_template.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
index 8cd57077..c5ec004c 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
@@ -66,7 +66,7 @@
           {%- if BIAS %}
           {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N, TILE_O_H, TILE_O_W], indent_size=10) }}
           {%- else %}
-          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
+          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
           {%- endif %}
           affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
             affine.for %tile_k = 0 to {{ I_C * K_W }} step {{ TILE_K }} {
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 73cf710f..9cbd6514 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -67,7 +67,7 @@
           {%- if BIAS %}
           {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N, TILE_O_H, TILE_O_W], indent_size=10) }}
           {%- else %}
-          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : memref<{{ TILE_O_H }}x{{ TILE_O_W }}x{{ TILE_M }}x{{ TILE_N }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_O_W * TILE_M, TILE_N) }}xf32>
+          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
           {%- endif %}
           affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
             affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {

From 3ea1c81ef51f25c85e563f5665f88f655f901c1b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 24 Jul 2025 06:56:28 +0000
Subject: [PATCH 422/432] [Frontend/Template] Update dummy pooling template

---
 .../mlir/mlir_maxpool_template.py             | 63 ++++++++++++-------
 PyTorchSimFrontend/mlir/mlir_template.py      |  3 +-
 2 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
index 5395efb2..6f605d56 100644
--- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
@@ -8,26 +8,21 @@
 from torch._inductor.ir import ReinterpretView
 from torch._inductor.codecache import write_atomic
 import PyTorchSimFrontend.extension_codecache as extension_codecache
+from PyTorchSimFrontend.mlir import mlir_common
+import sympy
 
 # This template only represents the DMA operations
-TEMPLATE = r"""#map0 = affine_map<(d0, d1) -> (d0 * {{ W }} + d1)>
-memref.global @X_spad : memref<{{ in_tile }}x{{ in_tile }}xf32, 1>
-memref.global @Y_spad : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>
+TEMPLATE = r"""
+{{kernel.def_global_vars()}}
 
-func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X], outputs=[Y], names_str="X, Y")}} {
-  %c_mvin = arith.constant 2 : index
-  %c_mvout = arith.constant 3 : index
-  %axis = arith.constant 1 : index
-  %vstride = arith.constant 1 : index
-  %X_buffer = memref.get_global @X_spad : memref<{{ in_tile }}x{{ in_tile }}xf32, 1>
-  %Y_buffer = memref.get_global @Y_spad : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>
-  %tag = memref.alloc() : memref<1xi32>
-  %c0 = arith.constant 0 : index
-  affine.for %i = 0 to {{ BCH }} step {{ out_tile }} {
-    affine.for %j = 0 to {{ W }} step {{ out_tile }} {
-      %index0 = affine.apply #map0(%i, %j)
-      memref.dma_start %X[%index0], %X_buffer[%c0, %c0], %c_mvin, %tag[%c0], %axis, %vstride : memref<{{ IN }}xf32>, memref<{{ in_tile }}x{{ in_tile }}xf32, 1>, memref<1xi32> {dram_stride=[{{W}}, 1]}
-      memref.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %c_mvout, %tag[%c0], %axis, %vstride : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>, memref<{{ OUT }}xf32>, memref<1xi32> {dram_stride=[{{W}}, 1]}
+func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X], outputs=[Y], names_str="X, Y", input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
+  {{- kernel.def_local_vars(indent_size=2) }}
+  affine.for %index0 = 0 to {{ BCH }} step {{ out_tile }} {
+    affine.for %index1 = 0 to {{ W }} step {{ out_tile }} {
+      {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, indent_size=6) }}
+      {{ kernel.def_dma_op("MVOUT", "Y", Y_idx, Y_tile_desc, indent_size=6) }}
     } { outer_loop=true }
   } { outer_loop=true }
   return
@@ -35,8 +30,8 @@
 """
 
 class MLIRMaxPoolTemplate(MLIRTemplate):
-    def __init__(self, input_nodes, layout, kernel_size, stride, padding, dilation, ceil_mode):
-        super().__init__("kernel", input_nodes, layout)
+    def __init__(self, input_nodes, layout, kernel_size, stride, padding, dilation, ceil_mode, input_reorder=None):
+        super().__init__("kernel", input_nodes, layout, input_reorder)
         self.kernel_size = kernel_size
         self.stride = stride
         self.padding = padding
@@ -63,26 +58,46 @@ def render(self,
         BCH = B * C * H
         kernel.loop_size = None
 
+        # Prepare tile descriptors
+        vlane_stride = 1 # Used dummy value
+        vlane_split_axis = 1
+        X_tile_size = [in_tile, in_tile]
+        X_tile_stride = [1, in_tile]
+        X_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        X_tile_desc.set_tile_size_stride(X_tile_size, X_tile_stride)
+        X_tile_desc.set_name("X_buffer")
+        X_idx = [sympy.Symbol("index0"), sympy.Symbol("index1")*W] # To keep index arguemnt order, we used index_list
+
+        Y_tile_size = [out_tile, out_tile]
+        Y_tile_stride = [1, out_tile]
+        Y_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
+        Y_tile_desc.set_name("W_buffer")
+        Y_idx = [sympy.Symbol("index0"), sympy.Symbol("index1")*W]
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
-            IN=X.get_numel(),
-            OUT=Y.get_numel(),
             X=X,
             Y=Y,
             BCH=BCH,
             W=W,
-            in_tile=in_tile,
             out_tile=out_tile,
-            DATA_STYPE="f32",
+            X_idx = X_idx,
+            Y_idx = Y_idx,
+            X_tile_desc = X_tile_desc,
+            Y_tile_desc = Y_tile_desc,
+            input_reorder = self.input_reorder
         )
         kernel.epilogue_info = dict(
             output_node = self.output_node.name,
             sram_var = "Y_buffer",
             dram_var = "Y",
+            dram_tile_desc = Y_tile_desc,
         )
+        kernel.exception_nodes["Y"] = {"numel" : Y.get_numel()}
         code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
-        kernel.add_loop_info([kernel.render_options["IN"]], [kernel.vector_lane, kernel.vector_lane])
+        kernel.add_loop_info([X.get_numel()], [kernel.vector_lane, kernel.vector_lane])
         return code
 
     def codegen_header(self, code, extra_headers):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 0455cbf1..0b2a08f8 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -665,9 +665,10 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com
         with V.set_kernel_handler(self):
             index_var = self.parse_index_list(index_list, local_code)
             node_layout = self.named_nodes[dram_var].get_layout()
-            numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel()
             if dram_var in self.exception_nodes:
                 numel = self.exception_nodes[dram_var]["numel"]
+            else:
+                numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel()
             mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype]
             dram_shape = f"memref<{numel}x{mlir_dtype}>"
             dram_stride = []

From 328bd837cb3a69b4f0af8779dc33e1f53d4a93ad Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 24 Jul 2025 07:00:15 +0000
Subject: [PATCH 423/432] [Frontend/Config] Add timing pool config

---
 PyTorchSimFrontend/extension_config.py   | 2 ++
 PyTorchSimFrontend/mlir/mlir_lowering.py | 5 +++--
 experiments/resnet18.py                  | 3 ++-
 experiments/resnet50.py                  | 1 +
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 8994cffe..05987d7f 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -94,3 +94,5 @@ def load_plan_from_module(module_path):
 
 # For ILS experiment
 CONFIG_TLS_MODE = int(os.environ.get('TORCHSIM_TLS_MODE', default=1))
+
+CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0))
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index aa3cf16e..9aa08754 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -15,7 +15,7 @@
 from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate
 from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
-from PyTorchSimFrontend.extension_config import CONFIG_VECTOR_LANE
+from PyTorchSimFrontend.extension_config import CONFIG_VECTOR_LANE, CONFIG_USE_TIMING_POOLING
 
 aten = torch.ops.aten
 aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm")
@@ -180,4 +180,5 @@ def sparse_addmm(*args, **kwargs):
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
 lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
 lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
-#lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
\ No newline at end of file
+if CONFIG_USE_TIMING_POOLING:
+    lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
\ No newline at end of file
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index c12cc930..5d9dcf86 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -29,7 +29,7 @@ def run_resnet(batch, config):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
@@ -40,6 +40,7 @@ def run_resnet(batch, config):
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1"
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index ec2e26ff..bd52afc1 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -40,6 +40,7 @@ def run_resnet(batch, config):
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1"
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:

From bde2ca45ea1885f729a7492e5d374389d037b36f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 24 Jul 2025 13:51:17 +0000
Subject: [PATCH 424/432] [Test] Add ResNet50 test case

---
 .github/workflows/pull-request.yml | 13 ++++++++++++-
 tests/test_resnet.py               | 21 +++++++++++++++------
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index bc5c9dab..f1366eb6 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -358,7 +358,8 @@ jobs:
           registry: ghcr.io
           username: ${{ github.actor }}
           password: ${{ secrets.GIT_ACCESS_TOKEN }}
-      - name: Run test_resnet.py
+
+      - name: Run test_resnet18.py
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
@@ -368,6 +369,16 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_resnet.py
 
+      - name: Run test_resnet50.py
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          echo "Running test_resnet.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_resnet.py --model_type resnet50
+
   test_transformer:
     name: Run test_transformer.py
     runs-on: self-hosted
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index f54ce9be..97c60528 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -1,7 +1,8 @@
+import argparse
 import torch
 import torch._dynamo
 import torch.utils.cpp_extension
-from torchvision.models import resnet18
+from torchvision.models import resnet18, resnet50
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -18,11 +19,16 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-def test_resnet(device, batch=1):
+def test_resnet(device, batch=1, model_type='resnet18'):
     from torchvision.models import resnet
     with torch.no_grad():
         #model = resnet._resnet(resnet.BasicBlock, [1, 1, 1, 1], weights=None, progress=False).eval()
-        model = resnet18().eval()
+        if model_type == 'resnet50':
+            model = resnet50().eval()
+        elif model_type == 'resnet18':
+            model = resnet18().eval()
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
         model.to(device, memory_format=torch.channels_last)
         input = torch.randn(batch, 3, 224, 224)
         x1 = input.to(device=device, memory_format=torch.channels_last)
@@ -31,16 +37,19 @@ def test_resnet(device, batch=1):
         res = opt_fn(x1)
         cpu_model = model.cpu().to(memory_format=torch.channels_last)
         cpu_res = cpu_model(x2)
-    test_result("ResNet18 inference", res, cpu_res)
+    test_result(f"{model_type} inference", res, cpu_res)
     print("Max diff > ", torch.max(torch.abs(res.cpu() - cpu_res)))
-    print("ResNet18 Simulation Done")
+    print(f"{model_type} Simulation Done")
 
 if __name__ == "__main__":
     import os
     import sys
+    args = argparse.ArgumentParser()
+    args.add_argument('--model_type', type=str, default="resnet18", help='ex) resnet18')
+    args = args.parse_args()
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_resnet(device)
+    test_resnet(device, model_type=args.model_type)

From a20a0c6976dcfc36c4c71b4fc8af2b317b1a193c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 24 Jul 2025 13:52:05 +0000
Subject: [PATCH 425/432] [Frontend/Template] Conv single stride tile size fix

---
 PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
index a4ea0b20..74309b30 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
@@ -194,7 +194,7 @@ def render(self,
         # Prepare tile descriptors
         vlane_stride = 1
         vlane_split_axis = 1
-        X_tile_size = [TILE_I_H, TILE_K_H, TILE_M, TILE_K]
+        X_tile_size = [TILE_I_H, TILE_K_W, TILE_M, TILE_K]
         X_tile_stride = [TILE_K_W*TILE_M*TILE_K, TILE_M*TILE_K, 1, TILE_M]
         X_tile_desc = mlir_common.MLIRMultiDimTile(X_tile_size, kernel.vector_lane, 3, vlane_stride)
         X_tile_desc.set_tile_size_stride(X_tile_size, X_tile_stride)

From 0404422db7a87fd7c85b3bfd9d0b5c3276898cb4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 24 Jul 2025 13:52:58 +0000
Subject: [PATCH 426/432] [Frontend/Fusion] Make reduction-reduction
 configurable

---
 PyTorchSimFrontend/extension_config.py     | 3 ++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 4 ++--
 experiments/layernorm.py                   | 1 +
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 05987d7f..59f3818c 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -68,7 +68,8 @@
 CONFIG_SUBTILE_K = int(os.environ.get('TORCHSIM_SUBTILE_K', default=CONFIG_VECTOR_LANE))
 
 # Advanced fusion options
-CONFIG_FUSION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION', default=True))
+CONFIG_FUSION_REDUCTION_EPILOGUE = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_EPILOGUE', default=True))
+CONFIG_FUSION_REDUCTION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_REDUCTION', default=True))
 CONFIG_FUSION_PROLOGUE = int(os.environ.get('TORCHSIM_FUSION_PROLOGUE', default=True))
 
 # SRAM Buffer allocation plan
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index f1c72c44..786971fe 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -40,7 +40,7 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
         if not (isinstance(node1, (SchedulerNode, FusedSchedulerNode)) and isinstance(node2, (SchedulerNode, FusedSchedulerNode))):
             return False
 
-        if len(base_template_node1) == 1 and len(base_template_node2) == 0 and extension_config.CONFIG_FUSION_REDUCTION:
+        if len(base_template_node1) == 1 and len(base_template_node2) == 0 and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE:
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
             if (isinstance(base_template_node1[0].node.template, MLIRGemmTemplate) or isinstance(base_template_node1[0].node.template, MLIRBMMTemplate)) and node2.is_reduction():
@@ -95,7 +95,7 @@ def can_fuse_horizontal(self, node1, node2):
         _, (vars2, reduce2) = node2.group
 
         # Reduction is currently not supported
-        if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template():
+        if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template() and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION:
             return vars1 == vars2 and reduce1 == reduce2 and node1.inverse_users == node2.inverse_users
         if node1.is_reduction() or node2.is_reduction():
             return False
diff --git a/experiments/layernorm.py b/experiments/layernorm.py
index 378833f7..f149394e 100644
--- a/experiments/layernorm.py
+++ b/experiments/layernorm.py
@@ -39,6 +39,7 @@ def run_layernorm(size, config):
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"LayerNorm_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
     os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0"
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'BACKENDSIM_SPIKE_ONLY' in os.environ:

From 8ef7fc664babaa2cb48c56a64a72ee998d722d55 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 24 Jul 2025 13:58:39 +0000
Subject: [PATCH 427/432] [Artifact] Add cycle validation artifact script

---
 experiments/artifact/baseline_cycle.csv | 24 ++++++++
 experiments/artifact/run_cycle.sh       | 81 +++++++++++++++++++++++++
 experiments/artifact/summary_cycle.py   | 78 ++++++++++++++++++++++++
 3 files changed, 183 insertions(+)
 create mode 100644 experiments/artifact/baseline_cycle.csv
 create mode 100755 experiments/artifact/run_cycle.sh
 create mode 100644 experiments/artifact/summary_cycle.py

diff --git a/experiments/artifact/baseline_cycle.csv b/experiments/artifact/baseline_cycle.csv
new file mode 100644
index 00000000..afd795e4
--- /dev/null
+++ b/experiments/artifact/baseline_cycle.csv
@@ -0,0 +1,24 @@
+Workload,TPUv3,mNPUSim,Timeloop,Maestro,SCALE-Sim v3,TOGSim(Ours)
+gemm_256x256x256,3101,2426,512,522,1020,
+gemm_512x512x512,9490,8584,4096,4162,6128,
+gemm_1024x1024x1024,47099,32625,32768,33282,40896,
+gemm_2048x2048x2048,317435,3046069,262144,266242,294656,
+conv_64x56x56x64x64x3x1x1,1496802,1076160,451584,1683074,610880,
+conv_64x28x28x128x128x3x1x1,241935,577408,225792,391042,269952,
+conv_64x14x14x256x256x3x1x1,246790,540160,451584,167426,327424,
+conv_64x7x7x512x512x3x1x1,247383,1128192,903168,117762,622336,
+layernorm_512x768,24895,,,,,
+layernorm_2048x768,98234,,,,,
+layernorm_8192x768,389863,,,,,
+softmax_512x512,12902,,,,,
+softmax_2048x2048,169750,,,,,
+softmax_8192x8192,2700994,,,,,
+attention_12x512x64,110093,101964,30720,49944,48912,
+attention_16x512x64,145425,135952,40960,66592,65216,
+attention_32x512x64,288250,271904,196608,133184,130432,
+resnet18,844524,232699,495520,1518676,206111,
+resnet50,1094428,524398,721016,1596151,398447,
+bert_base,305077,493756,169984,149820,706886,
+bert_large,445317,785205,286720,249676,237056,
+bert_xlarge,1283925,3030001,1507328,898700,990016,
+
diff --git a/experiments/artifact/run_cycle.sh b/experiments/artifact/run_cycle.sh
new file mode 100755
index 00000000..c5bc82c9
--- /dev/null
+++ b/experiments/artifact/run_cycle.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+set -e
+
+export TORCHSIM_CONFIG=$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
+LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs
+mkdir -p $LOG_DIR
+
+# Matmul
+for sz in "256 256 256" "512 512 512" "1024 1024 1024" "2048 2048 2048"; do
+  name="gemm_${sz// /x}"
+  echo ""
+  echo "==================================================="
+  echo "[*] Running Matmul size=$sz"
+  echo "==================================================="
+  python3 $TORCHSIM_DIR/experiments/gemm.py --size $sz | tee $LOG_DIR/${name}.log
+done
+
+# Conv
+for sz in \
+  "64 56 56 64 64 3 1 1" \
+  "64 28 28 128 128 3 1 1" \
+  "64 14 14 256 256 3 1 1" \
+  "64 7 7 512 512 3 1 1"; do
+  name="conv_${sz// /x}"
+  echo ""
+  echo "==================================================="
+  echo "[*] Running Conv size=$sz"
+  echo "==================================================="
+  python3 $TORCHSIM_DIR/experiments/conv.py --size $sz | tee $LOG_DIR/${name}.log
+done
+
+# Attention
+for sz in "12 512 64" "16 512 64" "32 512 64"; do
+  name="attention_${sz// /x}"
+  echo ""
+  echo "==================================================="
+  echo "[*] Running Attention size=$sz"
+  echo "==================================================="
+  python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log
+done
+
+# LayerNorm
+for sz in "512 768" "2048 768" "8192 768"; do
+  name="layernorm_${sz// /x}"
+  echo ""
+  echo "==================================================="
+  echo "[*] Running LayerNorm size=$sz"
+  echo "==================================================="
+  python3 $TORCHSIM_DIR/experiments/layernorm.py --size $sz | tee $LOG_DIR/${name}.log
+done
+
+# Softmax
+for sz in "512 512" "2048 2048" "8192 8192"; do
+  name="softmax_${sz// /x}"
+  echo ""
+  echo "==================================================="
+  echo "[*] Running Softmax size=$sz"
+  echo "==================================================="
+  python3 $TORCHSIM_DIR/experiments/softmax.py --size $sz | tee $LOG_DIR/${name}.log
+done
+
+# ResNet
+for model in "resnet18" "resnet50"; do
+  echo ""
+  echo "==================================================="
+  echo "[*] Running $model"
+  echo "==================================================="
+  python3 $TORCHSIM_DIR/experiments/${model}.py | tee $LOG_DIR/${model}.log
+done
+
+# BERT
+for model in "base" "large" "xlarge"; do
+  echo ""
+  echo "==================================================="
+  echo "[*] Running BERT size=$model"
+  echo "==================================================="
+  python3 $TORCHSIM_DIR/experiments/BERT.py --size $model | tee $LOG_DIR/bert_${model}.log
+done
+
+# Cycle Summary
+python3 $TORCHSIM_DIR/experiments/artifact/summary_cycle.py
\ No newline at end of file
diff --git a/experiments/artifact/summary_cycle.py b/experiments/artifact/summary_cycle.py
new file mode 100644
index 00000000..01a667f8
--- /dev/null
+++ b/experiments/artifact/summary_cycle.py
@@ -0,0 +1,78 @@
+import os
+import math
+import csv
+import re
+
+LOG_DIR = os.path.join(os.environ.get("TORCHSIM_DIR", "."), "experiments/artifact/logs")
+BASELINE_CSV = os.path.join(os.environ.get("TORCHSIM_DIR", "."), "experiments/artifact/baseline_cycle.csv")
+
+def format_with_error(value, ref, error_list=None):
+    """Format `value` as "<int> (<signed % error vs ref>)"; "N/A" for blank, zero-ref or non-numeric input."""
+    try:
+        if value == "" or ref == "" or float(ref) == 0:
+            return "N/A"
+        measured, baseline = float(value), float(ref)
+        rel_err = ((measured - baseline) / baseline) * 100
+        if error_list is not None:
+            # Record only the magnitude so the caller can average absolute errors.
+            error_list.append(abs(rel_err))
+        pct = f"{rel_err:+.2f}%"
+        return f"{int(measured):>7} ({pct:>8})"
+    except (ValueError, TypeError):
+        return "N/A"
+
+def compute_mean(errors):
+    """Return the mean of the absolute values in `errors` as an "x.xx%" string, or "N/A" if empty."""
+    if not errors:
+        return "N/A"
+    return f"{sum(abs(e) for e in errors) / len(errors):.2f}%"
+
+if __name__ == "__main__":
+    # 1. Map each "<workload>.log" to the first "Total execution cycle" value it contains.
+    cycle_map = {}
+    log_names = os.listdir(LOG_DIR) if os.path.isdir(LOG_DIR) else []  # no logs yet -> TOGSim column is N/A
+    for file in log_names:
+        if file.endswith(".log"):
+            name = file[:-4]
+            with open(os.path.join(LOG_DIR, file)) as f:
+                for line in f:
+                    match = re.search(r"Total execution cycle:\s*([0-9]+)", line)
+                    if match:
+                        cycle_map[name] = int(match.group(1))
+                        break
+
+    # 2. Per-simulator absolute % errors; format_with_error appends to these.
+    mnpusim_errors = []
+    timeloop_errors = []
+    maestro_errors = []
+    scalesim_errors = []
+    togsim_errors = []
+
+    # 3. Table header
+    print("[*] Summary of Total Execution Cycles with TPUv3-relative (%) Error")
+    print("=" * 190)
+    print(f"{'Workload':>30} {'TPUv3':>25} {'mNPUSim':>25} {'Timeloop':>25} {'Maestro':>25} {'SCALE-Sim v3':>25} {'TOGSim(Ours)':>25}")
+    print("=" * 190)
+
+    with open(BASELINE_CSV, newline="", encoding="utf-8-sig") as csvfile:  # utf-8-sig strips a BOM before the header is parsed
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            workload = row["Workload"].lstrip('\ufeff')
+            tpv3 = row["TPUv3"]
+            # Baseline simulators: error is relative to the TPUv3 cycle count of the same row.
+            mnpusim  = format_with_error(row["mNPUSim"], tpv3, mnpusim_errors)
+            timeloop = format_with_error(row["Timeloop"], tpv3, timeloop_errors)
+            maestro  = format_with_error(row["Maestro"], tpv3, maestro_errors)
+            scalesim = format_with_error(row["SCALE-Sim v3"], tpv3, scalesim_errors)
+            # softmax/layernorm rows are printed but kept out of the TOGSim mean (throwaway list).
+            togsim_val = cycle_map.get(workload, "")
+            if "softmax" in workload or "layernorm" in workload:
+                togsim_str = format_with_error(str(togsim_val), tpv3, [])
+            else:
+                togsim_str = format_with_error(str(togsim_val), tpv3, togsim_errors)
+
+            print(f"{workload:>30} {tpv3:>25} {mnpusim:>25} {timeloop:>25} {maestro:>25} {scalesim:>25} {togsim_str:>25}")
+
+    # 4. Mean-absolute-error row (TPUv3 is the reference, hence 0.00%)
+    print("=" * 190)
+    print(f"{'[*] Mean Absolute Error(%)':>30} {'0.00%':>25} {compute_mean(mnpusim_errors):>25} {compute_mean(timeloop_errors):>25} {compute_mean(maestro_errors):>25} {compute_mean(scalesim_errors):>25} {compute_mean(togsim_errors):>25}")
\ No newline at end of file

From 6b6af14783bcb26edb36ab7f9d1628094779fc11 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 25 Jul 2025 10:25:24 +0000
Subject: [PATCH 428/432] [Artifact] Add speedup artifact script

---
 experiments/artifact/baseline_latency.csv     |  11 ++
 .../{ => cycle_validation}/run_cycle.sh       |   6 +-
 .../{ => cycle_validation}/summary_cycle.py   |   0
 experiments/artifact/speedup/run_speedup.sh   | 102 ++++++++++++++++++
 .../artifact/speedup/scripts/ils_parser.sh    |  34 ++++++
 .../speedup/scripts/run_speed_ils_bert.sh     |  65 +++++++++++
 .../speedup/scripts/run_speed_ils_conv.sh     |  66 ++++++++++++
 .../speedup/scripts/run_speed_ils_matmul.sh   |  63 +++++++++++
 .../speedup/scripts/run_speed_ils_resnet.sh   |  73 +++++++++++++
 .../artifact/speedup/summary_speedup.py       |  90 ++++++++++++++++
 10 files changed, 509 insertions(+), 1 deletion(-)
 create mode 100644 experiments/artifact/baseline_latency.csv
 rename experiments/artifact/{ => cycle_validation}/run_cycle.sh (91%)
 rename experiments/artifact/{ => cycle_validation}/summary_cycle.py (100%)
 create mode 100755 experiments/artifact/speedup/run_speedup.sh
 create mode 100755 experiments/artifact/speedup/scripts/ils_parser.sh
 create mode 100755 experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
 create mode 100755 experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
 create mode 100755 experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
 create mode 100755 experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
 create mode 100644 experiments/artifact/speedup/summary_speedup.py

diff --git a/experiments/artifact/baseline_latency.csv b/experiments/artifact/baseline_latency.csv
new file mode 100644
index 00000000..159b10a3
--- /dev/null
+++ b/experiments/artifact/baseline_latency.csv
@@ -0,0 +1,11 @@
+Workload,Accel-Sim,mNPUSim,PyTorchSim-SN,PyTorchSim-CA,PyTorchSim-ILS
+conv_1x56x56x64x64x3x1x1,38.86346,3.915,,,
+conv_1x28x28x128x128x3x1x1,44.58898,2.588,,,
+conv_1x14x14x256x256x3x1x1,70.53904,3.162,,,
+conv_1x7x7x512x512x3x1x1,78.92694,1.527,,,
+gemm_512x512x512,53.5097,5.767,,,
+gemm_1024x1024x1024,150.8592,40.946,,,
+gemm_2048x2048x2048,951.4306,5157.396,,,
+resnet50,1222.504,294.242,,,
+bert_large,1436.558,350.84,,,
+
diff --git a/experiments/artifact/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh
similarity index 91%
rename from experiments/artifact/run_cycle.sh
rename to experiments/artifact/cycle_validation/run_cycle.sh
index c5bc82c9..a32cd0a6 100755
--- a/experiments/artifact/run_cycle.sh
+++ b/experiments/artifact/cycle_validation/run_cycle.sh
@@ -17,6 +17,10 @@ done
 
 # Conv
 for sz in \
+  "1 56 56 64 64 3 1 1" \
+  "1 28 28 128 128 3 1 1" \
+  "1 14 14 256 256 3 1 1" \
+  "1 7 7 512 512 3 1 1" \
   "64 56 56 64 64 3 1 1" \
   "64 28 28 128 128 3 1 1" \
   "64 14 14 256 256 3 1 1" \
@@ -78,4 +82,4 @@ for model in "base" "large" "xlarge"; do
 done
 
 # Cycle Summary
-python3 $TORCHSIM_DIR/experiments/artifact/summary_cycle.py
\ No newline at end of file
+python3 $TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.py | tee "$TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.out"
\ No newline at end of file
diff --git a/experiments/artifact/summary_cycle.py b/experiments/artifact/cycle_validation/summary_cycle.py
similarity index 100%
rename from experiments/artifact/summary_cycle.py
rename to experiments/artifact/cycle_validation/summary_cycle.py
diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh
new file mode 100755
index 00000000..7d0c0da2
--- /dev/null
+++ b/experiments/artifact/speedup/run_speedup.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs
+CONFIG_DIR="$TORCHSIM_DIR/PyTorchSimBackend/configs"
+SIMULATOR_BIN="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator"
+
+configs=(
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
+    "systolic_ws_128x128_c2_booksim_tpuv3.json"
+)
+
+target_list=(
+  "gemm_512x512x512"
+  "gemm_1024x1024x1024"
+  "gemm_2048x2048x2048"
+  "conv_1x56x56x64x64x3x1x1"
+  "conv_1x28x28x128x128x3x1x1"
+  "conv_1x14x14x256x256x3x1x1"
+  "conv_1x7x7x512x512x3x1x1"
+  "resnet50"
+  "bert_large"
+)
+
+TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
+output_dir="$TORCHSIM_DIR/experiments/artifact/speedup/results"
+mkdir -p "$output_dir"
+
+echo "[*] Scanning log files in: $LOG_DIR"
+echo ""
+
+for log_file in "$LOG_DIR"/*.log; do
+  filename=$(basename "$log_file")
+  workload="${filename%.log}"
+
+  if [[ ! " ${target_list[@]} " =~ " ${workload} " ]]; then
+    continue
+  fi
+  echo "==> Workload: $workload"
+
+  declare -a ONNX_ATTR_PAIRS=()
+
+  # === Grep launch line ===
+  while IFS= read -r line; do
+    if [[ "$line" == launch* ]]; then
+      read -r _ onnx_path attr_path _ <<< "$line"
+      ONNX_ATTR_PAIRS+=("$onnx_path|$attr_path")
+    fi
+  done < "$log_file"
+
+  # Normal configs
+  for config in "${configs[@]}"; do
+    output_file="$output_dir/${workload}_${config}.txt" 
+    echo "Running with config=$config"
+    echo "===== config=$config | model=$workload =====" >> "$output_file"
+    sum_all_iters=0.0
+    iter_count=0
+
+     # === Run 5 iterations ===
+    for iter in {1..5}; do
+      echo "[Iter $iter] Running simulation for workload=$workload config=$config"
+      cmd=""
+      for pair in "${ONNX_ATTR_PAIRS[@]}"; do
+        IFS="|" read -r onnx_path attr_path <<< "$pair"
+        cmd+=" $SIMULATOR_BIN --config $CONFIG_DIR/$config --models_list $onnx_path --attributes_list $attr_path;"
+      done
+
+      output=$(bash -c "$cmd")
+      sim_times=$(echo "$output" | grep "Simulation time:" | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
+
+      if [[ -n "$sim_times" ]]; then
+        sum_per_iter=0.0
+        while IFS= read -r sim_time; do
+          echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file"
+          sum_per_iter=$(awk -v a="$sum_per_iter" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}')
+        done <<< "$sim_times"
+
+        echo "Iteration $iter: total_simulation_time = $sum_per_iter" >> "$output_file"
+        sum_all_iters=$(awk -v a="$sum_all_iters" -v b="$sum_per_iter" 'BEGIN {printf "%.6f", a + b}')
+        iter_count=$((iter_count + 1))
+      else
+        echo "Iteration $iter: No simulation time found." >> "$output_file"
+      fi
+    done
+
+    # === Final average ===
+    if [[ $iter_count -gt 0 ]]; then
+      avg=$(awk -v total="$sum_all_iters" -v n="$iter_count" 'BEGIN {printf "%.6f", total / n}')
+      echo "Average simulation time for $workload with config $config: $avg seconds"
+      echo "Average simulation time = $avg" >> "$output_file"
+    else
+      echo "No valid simulation times found for config $config"
+      echo "Average simulation time = NA" >> "$output_file"
+    fi
+  done
+done
+
+# ILS mode should be run separately
+$TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
+$TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
+$TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
+$TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
+
+python3 $TORCHSIM_DIR/experiments/artifact/speedup/summary_speedup.py | tee "$TORCHSIM_DIR/experiments/artifact/speedup/summary_speedup.log"
\ No newline at end of file
diff --git a/experiments/artifact/speedup/scripts/ils_parser.sh b/experiments/artifact/speedup/scripts/ils_parser.sh
new file mode 100755
index 00000000..a02d8edb
--- /dev/null
+++ b/experiments/artifact/speedup/scripts/ils_parser.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Reads simulator output on stdin and prints "Simulation time: <gem5 + TOGSim> seconds".
+# Each "launch ... tile_graph.onnx" line adds the time recorded in <dir>/m5out/sto.log.
+
+gem5_time=""
+togsim_time=""
+total_gem5=0
+total_togsim=0
+
+while IFS= read -r line; do
+  if [[ "$line" == launch* ]]; then
+    tile_graph_path=$(echo "$line" | awk '{for (i=1; i<=NF; i++) if ($i ~ /tile_graph\.onnx$/) print $i}')
+    if [[ -n "$tile_graph_path" ]]; then
+      dir_path=$(dirname "$tile_graph_path")
+      sto_log_path="$dir_path/m5out/sto.log"
+      echo "sto.log path: $sto_log_path"
+      gem5_time=$(grep "Simulation time:" "$sto_log_path" | \
+                sed -E 's/^Simulation time: ([0-9.]+) seconds$/\1/')
+      echo "GEM5: $gem5_time"
+      total_gem5=$(echo "$total_gem5 + $gem5_time" | bc)
+    fi
+  fi
+  if [[ "$line" == *"Simulation time:"* ]]; then
+    togsim_time=$(echo "$line" | sed -E 's/.*Simulation time: ([0-9.]+) seconds/\1/')
+    echo "TOGSim: $togsim_time"
+    # Accumulate TOGSim time as well; previously total_togsim stayed 0 and the total was gem5-only.
+    [[ "$togsim_time" =~ ^[0-9.]+$ ]] && total_togsim=$(echo "$total_togsim + $togsim_time" | bc)
+  fi
+done
+
+if [[ -n "$total_gem5" && -n "$total_togsim" ]]; then
+  total_time=$(python3 -c "print(round($total_gem5 + $total_togsim, 6))")
+  echo "Simulation time: $total_time seconds"
+fi
\ No newline at end of file
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
new file mode 100755
index 00000000..66829f02
--- /dev/null
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# ILS speed benchmark for BERT.
+# Runs the workload 5 times per (config, size) pair, reads the final
+# "Simulation time:" value produced through ils_parser.sh, and appends each
+# iteration's value plus the average to results/ils_<ops>_<config>.txt.
+# Expects TORCHSIM_DIR to be set.
+
+base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
+config=(
+    # "systolic_ws_8x8_c1_simple_noc.json"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
+    #"systolic_ws_128x128_c2_booksim_tpuv3.json"
+    # "systolic_ws_128x128_c2_simple_noc_tpuv4.json"
+)
+TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
+SIZE_LIST=(
+  # "base"
+  "large"
+  # "xlarge"
+)
+seq=512
+output_dir="$base_dir/results"
+mkdir -p "$output_dir"
+
+for i in "${config[@]}"; do
+  echo "Running with config=$i"
+  for size in "${SIZE_LIST[@]}"; do
+    ops="bert_$size"
+    output_file="$output_dir/ils_${ops}_${i}.txt"
+    workload="$TORCHSIM_DIR/experiments/BERT.py --size $size --input_size $seq"
+    echo "===== config=$i | model=$ops =====" >> "$output_file"
+    sum=0.0
+    count=0
+    config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i"
+
+    for iter in {1..5}; do
+      echo "[Iter $iter] Running simulation for workload=ils_$ops config=$i"
+      output=$(bash -c "
+        export TORCHSIM_TLS_MODE=0;
+        export TORCHSIM_VALIDATION_MODE=0;
+        export TORCHSIM_CONFIG=$config_path;
+        export AUTOTUNE=0;
+        printenv;
+        python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
+      ")
+
+      # Use the last "Simulation time:" line (the parser's total) and keep its decimal value.
+      sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
+
+      if [[ -n "$sim_time" ]]; then
+        echo "Iteration $iter: Simulation time = $sim_time"
+        echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file"
+        sum=$(awk -v a="$sum" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}')
+        count=$((count + 1))
+      else
+        echo "Iteration $iter: Simulation time not found."
+        echo "Iteration $iter: simulation_time = NA" >> "$output_file"
+      fi
+    done
+
+    if [[ $count -gt 0 ]]; then
+      avg=$(awk -v total="$sum" -v n="$count" 'BEGIN {printf "%.6f", total / n}')
+      echo "Average simulation time for $ops with config $i: $avg seconds"
+      echo "Average simulation time = $avg" >> "$output_file"
+    else
+      echo "No valid simulation times found for $ops with config $i"
+      echo "Average simulation time = NA" >> "$output_file"
+    fi
+    echo "" >> "$output_file"
+  done
+done
\ No newline at end of file
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
new file mode 100755
index 00000000..2f9718f1
--- /dev/null
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# ILS speed benchmark for convolution shapes.
+# Runs the workload 5 times per (config, shape) pair, reads the final
+# "Simulation time:" value produced through ils_parser.sh, and appends each
+# iteration's value plus the average to results/ils_<ops>_<config>.txt.
+# Expects TORCHSIM_DIR to be set.
+
+base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
+config=(
+    # "systolic_ws_8x8_c1_simple_noc.json"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
+    #"systolic_ws_128x128_c2_booksim_tpuv3.json"
+    # "systolic_ws_128x128_c2_simple_noc_tpuv4.json"
+)
+TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
+SHAPE_LIST=(
+  #B H W I_C O_C K S P
+  "1 56 56 64 64 3 1 1"
+  "1 28 28 128 128 3 1 1"
+  "1 14 14 256 256 3 1 1"
+  "1 7 7 512 512 3 1 1"
+)
+output_dir="$base_dir/results"
+mkdir -p "$output_dir"
+
+for i in "${config[@]}"; do
+  echo "Running with config=$i"
+  for shape in "${SHAPE_LIST[@]}"; do
+    ops="conv_${shape// /x}"
+    output_file="$output_dir/ils_${ops}_${i}.txt"
+    workload="$TORCHSIM_DIR/experiments/conv.py --size $shape"
+    echo "===== config=$i | model=$ops =====" >> "$output_file"
+    sum=0.0
+    count=0
+    config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i"
+
+    for iter in {1..5}; do
+      echo "[Iter $iter] Running simulation for workload=ils_$ops config=$i"
+      output=$(bash -c "
+        export TORCHSIM_TLS_MODE=0;
+        export TORCHSIM_VALIDATION_MODE=0;
+        export TORCHSIM_CONFIG=$config_path;
+        export AUTOTUNE=0;
+        printenv;
+        python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
+      ")
+
+      # Use the last "Simulation time:" line (the parser's total) and keep its decimal value.
+      sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
+
+      if [[ -n "$sim_time" ]]; then
+        echo "Iteration $iter: Simulation time = $sim_time"
+        echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file"
+        sum=$(awk -v a="$sum" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}')
+        count=$((count + 1))
+      else
+        echo "Iteration $iter: Simulation time not found."
+        echo "Iteration $iter: simulation_time = NA" >> "$output_file"
+      fi
+    done
+
+    if [[ $count -gt 0 ]]; then
+      avg=$(awk -v total="$sum" -v n="$count" 'BEGIN {printf "%.6f", total / n}')
+      echo "Average simulation time for $ops with config $i: $avg seconds"
+      echo "Average simulation time = $avg" >> "$output_file"
+    else
+      echo "No valid simulation times found for $ops with config $i"
+      echo "Average simulation time = NA" >> "$output_file"
+    fi
+    echo "" >> "$output_file"
+  done
+done
\ No newline at end of file
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
new file mode 100755
index 00000000..8ff7e2b6
--- /dev/null
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# ILS speed benchmark for GEMM shapes.
+# Runs the workload 5 times per (config, shape) pair, reads the final
+# "Simulation time:" value produced through ils_parser.sh, and appends each
+# iteration's value plus the average to results/ils_<ops>_<config>.txt.
+# Expects TORCHSIM_DIR to be set.
+# NOTE(review): this script exports TORCHSIM_VALIDATION_MODE=1 while the
+# sibling conv/bert/resnet scripts export 0 -- confirm this is intended.
+
+base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
+config=(
+    # "systolic_ws_8x8_c1_simple_noc.json"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
+    #"systolic_ws_128x128_c2_booksim_tpuv3.json"
+    # "systolic_ws_128x128_c2_simple_noc_tpuv4.json"
+)
+TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
+SHAPE_LIST=(
+  "512 512 512"
+  "1024 1024 1024"
+  "2048 2048 2048"
+)
+output_dir="$base_dir/results"
+mkdir -p "$output_dir"
+
+for i in "${config[@]}"; do
+  echo "Running with config=$i"
+  for shape in "${SHAPE_LIST[@]}"; do
+    ops="gemm_${shape// /x}"
+    output_file="$output_dir/ils_${ops}_${i}.txt"
+    workload="$TORCHSIM_DIR/experiments/gemm.py --size $shape"
+    echo "===== config=$i | model=$ops =====" >> "$output_file"
+    sum=0.0
+    count=0
+    config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i"
+
+    for iter in {1..5}; do
+      echo "[Iter $iter] Running simulation for workload=ils_$ops config=$i"
+      output=$(bash -c "
+        export TORCHSIM_TLS_MODE=0;
+        export TORCHSIM_VALIDATION_MODE=1;
+        export TORCHSIM_CONFIG=$config_path;
+        export AUTOTUNE=0;
+        printenv;
+        python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
+      ")
+
+      # Use the last "Simulation time:" line (the parser's total) and keep its decimal value.
+      sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
+
+      if [[ -n "$sim_time" ]]; then
+        echo "Iteration $iter: Simulation time = $sim_time"
+        echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file"
+        sum=$(awk -v a="$sum" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}')
+        count=$((count + 1))
+      else
+        echo "Iteration $iter: Simulation time not found."
+        echo "Iteration $iter: simulation_time = NA" >> "$output_file"
+      fi
+    done
+
+    if [[ $count -gt 0 ]]; then
+      avg=$(awk -v total="$sum" -v n="$count" 'BEGIN {printf "%.6f", total / n}')
+      echo "Average simulation time for $ops with config $i: $avg seconds"
+      echo "Average simulation time = $avg" >> "$output_file"
+    else
+      echo "No valid simulation times found for $ops with config $i"
+      echo "Average simulation time = NA" >> "$output_file"
+    fi
+    echo "" >> "$output_file"
+  done
+done
\ No newline at end of file
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
new file mode 100755
index 00000000..aa35735c
--- /dev/null
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# ILS speed benchmark for ResNet.
+# Runs the workload 5 times per (config, size, batch) combination, reads the
+# final "Simulation time:" value produced through ils_parser.sh, and appends
+# each iteration's value plus the average to results/ils_<ops>_<config>.txt.
+# Expects TORCHSIM_DIR to be set.
+# NOTE(review): the result file name does not include the batch size, so
+# enabling several BATCH_LIST entries appends them all to one file.
+
+base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
+config=(
+    # "systolic_ws_8x8_c1_simple_noc.json"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
+    #"systolic_ws_128x128_c2_booksim_tpuv3.json"
+    # "systolic_ws_128x128_c2_simple_noc_tpuv4.json"
+)
+TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
+SIZE_LIST=(
+  # "18"
+  "50"
+)
+BATCH_LIST=(
+  "1"
+  # "8"
+  # "16"
+  # "32"
+  # "64"
+  # "128"
+)
+output_dir="$base_dir/results"
+mkdir -p "$output_dir"
+
+for i in "${config[@]}"; do
+  echo "Running with config=$i"
+  for size in "${SIZE_LIST[@]}"; do
+    for batch in "${BATCH_LIST[@]}"; do
+      ops="resnet$size"
+      output_file="$output_dir/ils_${ops}_${i}.txt"
+      workload="$TORCHSIM_DIR/experiments/resnet$size.py --batch $batch"
+      echo "===== config=$i | model=$ops =====" >> "$output_file"
+      sum=0.0
+      count=0
+      config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i"
+
+      for iter in {1..5}; do
+        echo "[Iter $iter] Running simulation for workload=ils_$ops config=$i"
+        output=$(bash -c "
+          export TORCHSIM_TLS_MODE=0;
+          export TORCHSIM_VALIDATION_MODE=0;
+          export TORCHSIM_CONFIG=$config_path;
+          export AUTOTUNE=0;
+          printenv;
+          python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
+        ")
+
+        # Use the last "Simulation time:" line (the parser's total) and keep its decimal value.
+        sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
+
+        if [[ -n "$sim_time" ]]; then
+          echo "Iteration $iter: Simulation time = $sim_time"
+          echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file"
+          sum=$(awk -v a="$sum" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}')
+          count=$((count + 1))
+        else
+          echo "Iteration $iter: Simulation time not found."
+          echo "Iteration $iter: simulation_time = NA" >> "$output_file"
+        fi
+      done
+
+      if [[ $count -gt 0 ]]; then
+        avg=$(awk -v total="$sum" -v n="$count" 'BEGIN {printf "%.6f", total / n}')
+        echo "Average simulation time for $ops with config $i: $avg seconds"
+        echo "Average simulation time = $avg" >> "$output_file"
+      else
+        echo "No valid simulation times found for $ops with config $i"
+        echo "Average simulation time = NA" >> "$output_file"
+      fi
+      echo "" >> "$output_file"
+    done
+  done
+done
\ No newline at end of file
diff --git a/experiments/artifact/speedup/summary_speedup.py b/experiments/artifact/speedup/summary_speedup.py
new file mode 100644
index 00000000..e454d752
--- /dev/null
+++ b/experiments/artifact/speedup/summary_speedup.py
@@ -0,0 +1,90 @@
+import os
+import csv
+import re
+
+LOG_DIR = os.path.join(os.environ.get("TORCHSIM_DIR", "."), "experiments/artifact/speedup/results")
+BASELINE_CSV = os.path.join(os.environ.get("TORCHSIM_DIR", "."), "experiments/artifact/baseline_latency.csv")
+
+def format_with_speedup(value, ref, speedup_list=None):
+    try:
+        if value == "" or ref == "" or float(value) == 0:
+            return "N/A"
+        val = float(value)
+        ref = float(ref)
+        spd = ref / val
+        if speedup_list is not None:
+            speedup_list.append(spd)
+        val_str = f"{float(val):>7.3f}"
+        spd_str = f"{spd:.2f}×"
+        return f"{val_str} ({spd_str:>7})"
+    except (ValueError, TypeError):
+        return "N/A"
+
+def compute_geomean(errors):
+    if not errors:
+        return "N/A"
+    filtered = [abs(e) for e in errors if e > 0]
+    if not filtered:
+        return "0.00x"
+    prod = 1.0
+    for e in filtered:
+        prod *= e
+    geo = prod ** (1.0 / len(filtered))
+    return f"{geo:.2f}x"
+
+if __name__ == "__main__":
+    # 1. Generate cycle_map
+    average_time_map = {}
+    for file in os.listdir(LOG_DIR):
+        if file.endswith(".txt"):
+            full_path = os.path.join(LOG_DIR, file)
+            full_name = file[:-4]
+            name = full_name.split("_systolic", 1)[0]
+            if "ils" in full_name:
+                name = name
+            elif "booksim" in full_name:
+                name = name +"cn"
+            elif "simple_noc" in full_name:
+                name = name +"sn"
+            else:
+                raise ValueError(f"Unsupported file name format: {file}")
+            with open(full_path) as f:
+                for line in f:
+                    match = re.search(r"Average simulation time\s*=\s*([0-9]+(?:\.[0-9]+)?)", line)
+                    if match:
+                        average_time_map[name] = float(match.group(1))
+                        break
+
+    # Speedup list init
+    accelsim_speedup = []
+    mnpusim_speedup = []
+    torchsim_ils_sn_speedup = []
+    torchsim_sn_speedup = []
+    torchsim_cn_speedup = []
+
+    # Header
+    print("[*] Summary of Latency (Seconds) and Speedup (vs Accel-Sim)")
+    print("=" * 165)
+    print(f"{'Workload':>30} {'Accel-Sim':>25} {'mNPUSim':>25} {'PyTorchSim(ILS)-SN':>25} {'PyTorchSim-SN':>25} {'PyTorchSim-CN':>25}")
+    print("=" * 165)
+
+    with open(BASELINE_CSV, newline="") as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            workload = row["Workload"].lstrip('\ufeff')
+            accelsim = row["Accel-Sim"]
+    
+            mnpusim = format_with_speedup(row["mNPUSim"], accelsim, mnpusim_speedup)
+
+            togsim_ils_sn_val = average_time_map.get("ils_" + workload, "")
+            togsim_sn_val = average_time_map.get(workload+"sn", "")
+            togsim_cn_val = average_time_map.get(workload+"cn", "")
+            torchsim_ils_sn = format_with_speedup(togsim_ils_sn_val, accelsim, torchsim_ils_sn_speedup)
+            torchsim_sn = format_with_speedup(togsim_sn_val, accelsim, torchsim_sn_speedup)
+            torchsim_cn = format_with_speedup(togsim_cn_val, accelsim, torchsim_cn_speedup)
+
+            print(f"{workload:>30} {accelsim:>25} {mnpusim:>25} {torchsim_ils_sn:>25} {torchsim_sn:>25} {torchsim_cn:>25}")
+
+    # MAE row
+    print("=" * 165)
+    print(f"{'[*] Geomean Speedup':>30} {'1x':>25} {compute_geomean(mnpusim_speedup):>25} {compute_geomean(torchsim_ils_sn_speedup):>25} {compute_geomean(torchsim_sn_speedup):>25} {compute_geomean(torchsim_cn_speedup):>25}")
\ No newline at end of file

From 3e427db02b94c63e5794e711ca3b1e8341e735af Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 28 Jul 2025 06:46:22 +0000
Subject: [PATCH 429/432] [Artifact] Add plot generation function

---
 .../cycle_validation/summary_cycle.py         | 95 ++++++++++++++++---
 .../artifact/speedup/summary_speedup.py       | 90 +++++++++++++++---
 2 files changed, 158 insertions(+), 27 deletions(-)

diff --git a/experiments/artifact/cycle_validation/summary_cycle.py b/experiments/artifact/cycle_validation/summary_cycle.py
index 01a667f8..ac3904be 100644
--- a/experiments/artifact/cycle_validation/summary_cycle.py
+++ b/experiments/artifact/cycle_validation/summary_cycle.py
@@ -2,14 +2,66 @@
 import math
 import csv
 import re
+import matplotlib.pyplot as plt
+import numpy as np
 
-LOG_DIR = os.path.join(os.environ.get("TORCHSIM_DIR", "."), "experiments/artifact/logs")
-BASELINE_CSV = os.path.join(os.environ.get("TORCHSIM_DIR", "."), "experiments/artifact/baseline_cycle.csv")
+TORCHSIM_DIR = os.environ.get("TORCHSIM_DIR", ".")
+LOG_DIR = os.path.join(TORCHSIM_DIR, "experiments/artifact/logs")
+BASELINE_CSV = os.path.join(TORCHSIM_DIR, "experiments/artifact/baseline_cycle.csv")
+
+def plot_error_bars(data: dict, filename: str):
+    """Save a grouped bar chart of percentage errors to ``filename``.
+
+    ``data`` maps each x-axis label to a list holding one value per entry
+    of the palette below, in palette order. Only the bars of the last
+    label are annotated with their value.
+    """
+    palette = {
+        'SCALE-Sim v3': 'gold',
+        'mNPUSim': 'orange',
+        'Timeloop': 'green',
+        'Maestro': 'violet',
+        'PyTorchSim-SN': 'royalblue',
+    }
+    width = 1
+    n_sims = len(palette)
+    stride = width * (n_sims + 2)  # leaves a two-bar gap between groups
+    rows = list(data.values())
+    starts = [stride * g for g in range(len(rows))]
+    _, ax = plt.subplots(figsize=(48, 8))
+
+    for idx, (sim, shade) in enumerate(palette.items()):
+        heights = [row[idx] for row in rows]
+        xs = [start + width * idx for start in starts]
+        ax.bar(xs, heights, width=width, color=shade, label=sim, edgecolor='black')
+        last = heights[-1]
+        positive = last >= 0
+        ax.text(
+            xs[-1],
+            last + 2 if positive else last - 6,
+            f'{last:.1f}%',
+            ha='center',
+            va='bottom' if positive else 'top',
+            fontsize=9,
+            rotation=90,
+        )
+
+    # One tick per group, placed under its middle bar.
+    ax.set_xticks([start + width * (n_sims // 2) for start in starts])
+    ax.set_xticklabels(list(data.keys()), rotation=20, ha='right')
+    ax.set_ylim(-100, 150)
+    ax.set_yticks(np.arange(-100, 151, 50))
+    ax.yaxis.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
+    ax.legend()
+
+    plt.savefig(filename)
+    plt.close()
+    print(f"Saved plot to {filename}")
 
 def format_with_error(value, ref, error_list=None):
     try:
         if value == "" or ref == "" or float(ref) == 0:
-            return "N/A"
+            return "N/A", 0.0
         val = float(value)
         ref = float(ref)
         err = ((val - ref) / ref) * 100
@@ -17,15 +69,15 @@ def format_with_error(value, ref, error_list=None):
             error_list.append(abs(err))
         val_str = f"{int(val):>7}"
         err_str = f"{err:+.2f}%"
-        return f"{val_str} ({err_str:>8})"
+        return f"{val_str} ({err_str:>8})", err
     except (ValueError, TypeError):
-        return "N/A"
+        return "N/A", 0.0
 
-def compute_mean(errors):
-    if not errors:
-        return "N/A"
-    abs_errors = [abs(err) for err in errors]
-    return f"{sum(abs_errors) / len(errors):.2f}%"
+def compute_mae(errors):
+    """Return the mean absolute value of ``errors`` as a float (0.0 when empty)."""
+    if not errors:
+        return 0.0  # numeric placeholder: callers format with ":.2f" and plot it
+    return sum(abs(err) for err in errors) / len(errors)
 
 if __name__ == "__main__":
     # 1. Generate cycle_map
@@ -48,6 +100,9 @@ def compute_mean(errors):
     scalesim_errors = []
     togsim_errors = []
 
+    # Plot data
+    plot_data ={}
+
     # Header
     print("[*] Summary of Total Execution Cycles with TPUv3-relative (%) Error")
     print("=" * 190)
@@ -60,19 +115,29 @@ def compute_mean(errors):
             workload = row["Workload"].lstrip('\ufeff')
             tpv3 = row["TPUv3"]
     
-            mnpusim  = format_with_error(row["mNPUSim"], tpv3, mnpusim_errors)
-            timeloop = format_with_error(row["Timeloop"], tpv3, timeloop_errors)
-            maestro  = format_with_error(row["Maestro"], tpv3, maestro_errors)
-            scalesim = format_with_error(row["SCALE-Sim v3"], tpv3, scalesim_errors)
+            mnpusim, mnpusim_err   = format_with_error(row["mNPUSim"], tpv3, mnpusim_errors)
+            timeloop, timeloop_err = format_with_error(row["Timeloop"], tpv3, timeloop_errors)
+            maestro, maestro_err   = format_with_error(row["Maestro"], tpv3, maestro_errors)
+            scalesim, scalesim_err = format_with_error(row["SCALE-Sim v3"], tpv3, scalesim_errors)
     
             togsim_val = cycle_map.get(workload, "")
             if "softmax" in workload or "layernorm" in workload:
-                togsim_str = format_with_error(str(togsim_val), tpv3, [])
+                togsim_str, togsim_err = format_with_error(str(togsim_val), tpv3, [])
             else:
-                togsim_str = format_with_error(str(togsim_val), tpv3, togsim_errors)
-    
+                togsim_str, togsim_err = format_with_error(str(togsim_val), tpv3, togsim_errors)
+            plot_data[workload] = [scalesim_err, mnpusim_err, timeloop_err, maestro_err, togsim_err]
             print(f"{workload:>30} {tpv3:>25} {mnpusim:>25} {timeloop:>25} {maestro:>25} {scalesim:>25} {togsim_str:>25}")
 
     # MAE row
+    mae_mnpusim = compute_mae(mnpusim_errors)
+    mae_timeloop = compute_mae(timeloop_errors)
+    mae_maestro = compute_mae(maestro_errors)
+    mae_scalesim = compute_mae(scalesim_errors)
+    mae_togsim = compute_mae(togsim_errors)
+    plot_data["MAE"] = [mae_scalesim, mae_mnpusim, mae_timeloop, mae_maestro, mae_togsim]
     print("=" * 190)
-    print(f"{'[*] Mean Absolute Error(%)':>30} {'0.00%':>25} {compute_mean(mnpusim_errors):>25} {compute_mean(timeloop_errors):>25} {compute_mean(maestro_errors):>25} {compute_mean(scalesim_errors):>25} {compute_mean(togsim_errors):>25}")
\ No newline at end of file
+    print(f"{'[*] Mean Absolute Error(%)':>30} {'0.00%':>25} {mae_mnpusim:>24.2f}% {mae_timeloop:>24.2f}% {mae_maestro:>24.2f}% {mae_scalesim:>24.2f}% {mae_togsim:>24.2f}%")
+
+    # Plot the error bars
+    path = os.path.join(TORCHSIM_DIR, "experiments/artifact/cycle_validation/cycle_validation.png")
+    plot_error_bars(plot_data, path)
diff --git a/experiments/artifact/speedup/summary_speedup.py b/experiments/artifact/speedup/summary_speedup.py
index e454d752..73c5b089 100644
--- a/experiments/artifact/speedup/summary_speedup.py
+++ b/experiments/artifact/speedup/summary_speedup.py
@@ -1,14 +1,69 @@
 import os
 import csv
 import re
+import matplotlib.pyplot as plt
+import numpy as np
 
-LOG_DIR = os.path.join(os.environ.get("TORCHSIM_DIR", "."), "experiments/artifact/speedup/results")
-BASELINE_CSV = os.path.join(os.environ.get("TORCHSIM_DIR", "."), "experiments/artifact/baseline_latency.csv")
+TORCHSIM_DIR = os.environ.get("TORCHSIM_DIR", ".")
+LOG_DIR = os.path.join(TORCHSIM_DIR, "experiments/artifact/speedup/results")
+BASELINE_CSV = os.path.join(TORCHSIM_DIR, "experiments/artifact/baseline_latency.csv")
+
+
+def plot_speedup_bars(data: dict, filename: str):
+    """Save a grouped, log-scale bar chart of speedups to ``filename``.
+
+    ``data`` maps each x-axis label to a list holding one value per entry
+    of the palette below, in palette order. Only the bars of the last
+    label are annotated with their value.
+    """
+    palette = {
+        'Accel-Sim': '#A6A6A6',
+        'mNPUSim': '#E97132',
+        'PyTorchSim(ILS)-SN': '#4EA72E',
+        'PyTorchSim-SN': '#0070C0',
+        'PyTorchSim-CN': '#A6CAEC',
+    }
+    width = 1
+    n_sims = len(palette)
+    stride = width * (n_sims + 2)  # leaves a two-bar gap between groups
+    rows = list(data.values())
+    starts = [stride * g for g in range(len(rows))]
+    _, ax = plt.subplots(figsize=(48, 16))
+
+    for idx, (sim, shade) in enumerate(palette.items()):
+        heights = [row[idx] for row in rows]
+        xs = [start + width * idx for start in starts]
+        ax.bar(xs, heights, width=width, color=shade, label=sim, edgecolor='black')
+        last = heights[-1]
+        positive = last >= 0
+        ax.text(
+            xs[-1],
+            last + 2 if positive else last - 6,
+            f'{last:.1f}x',
+            ha='center',
+            va='bottom' if positive else 'top',
+            fontsize=9,
+            rotation=90,
+        )
+
+    # One tick per group, placed under its middle bar.
+    ax.set_xticks([start + width * (n_sims // 2) for start in starts])
+    ax.set_xticklabels(list(data.keys()), rotation=20, ha='right')
+    ax.set_yscale('log')
+    ax.set_ylim(0.1, 150)
+    ax.set_yticks([0.1, 1, 10, 100])
+    ax.get_yaxis().set_major_formatter(plt.ScalarFormatter())
+    ax.yaxis.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
+    ax.legend()
+
+    plt.savefig(filename)
+    plt.close()
+    print(f"Saved plot to {filename}")
 
 def format_with_speedup(value, ref, speedup_list=None):
     try:
         if value == "" or ref == "" or float(value) == 0:
-            return "N/A"
+            return "N/A", 0.0
         val = float(value)
         ref = float(ref)
         spd = ref / val
@@ -16,21 +71,21 @@ def format_with_speedup(value, ref, speedup_list=None):
             speedup_list.append(spd)
         val_str = f"{float(val):>7.3f}"
         spd_str = f"{spd:.2f}×"
-        return f"{val_str} ({spd_str:>7})"
+        return f"{val_str} ({spd_str:>7})", spd
     except (ValueError, TypeError):
-        return "N/A"
+        return "N/A", 0.0
 
 def compute_geomean(errors):
-    if not errors:
-        return "N/A"
+    """Return the geometric mean of the positive entries as a float (0.0 if none)."""
     filtered = [abs(e) for e in errors if e > 0]
     if not filtered:
-        return "0.00x"
+        # Numeric placeholder: callers format the result with ":.2f" and plot it.
+        return 0.0
     prod = 1.0
     for e in filtered:
         prod *= e
     geo = prod ** (1.0 / len(filtered))
-    return f"{geo:.2f}x"
+    return geo
 
 if __name__ == "__main__":
     # 1. Generate cycle_map
@@ -62,6 +117,9 @@ def compute_geomean(errors):
     torchsim_sn_speedup = []
     torchsim_cn_speedup = []
 
+    # Plot data
+    plot_data ={}
+
     # Header
     print("[*] Summary of Latency (Seconds) and Speedup (vs Accel-Sim)")
     print("=" * 165)
@@ -74,17 +132,25 @@ def compute_geomean(errors):
             workload = row["Workload"].lstrip('\ufeff')
             accelsim = row["Accel-Sim"]
     
-            mnpusim = format_with_speedup(row["mNPUSim"], accelsim, mnpusim_speedup)
+            mnpusim, mnpusim_spd = format_with_speedup(row["mNPUSim"], accelsim, mnpusim_speedup)
 
             togsim_ils_sn_val = average_time_map.get("ils_" + workload, "")
             togsim_sn_val = average_time_map.get(workload+"sn", "")
             togsim_cn_val = average_time_map.get(workload+"cn", "")
-            torchsim_ils_sn = format_with_speedup(togsim_ils_sn_val, accelsim, torchsim_ils_sn_speedup)
-            torchsim_sn = format_with_speedup(togsim_sn_val, accelsim, torchsim_sn_speedup)
-            torchsim_cn = format_with_speedup(togsim_cn_val, accelsim, torchsim_cn_speedup)
-
+            torchsim_ils_sn, ils_sn_spd = format_with_speedup(togsim_ils_sn_val, accelsim, torchsim_ils_sn_speedup)
+            torchsim_sn, sn_spd = format_with_speedup(togsim_sn_val, accelsim, torchsim_sn_speedup)
+            torchsim_cn, cn_spd = format_with_speedup(togsim_cn_val, accelsim, torchsim_cn_speedup)
+            plot_data[workload] = [1.0, mnpusim_spd, ils_sn_spd, sn_spd, cn_spd]
             print(f"{workload:>30} {accelsim:>25} {mnpusim:>25} {torchsim_ils_sn:>25} {torchsim_sn:>25} {torchsim_cn:>25}")
 
     # MAE row
+    geomean_accelsim = 1.0
+    geomean_mnpusim = compute_geomean(mnpusim_speedup)
+    geomean_torchsim_ils_sn = compute_geomean(torchsim_ils_sn_speedup)
+    geomean_torchsim_sn = compute_geomean(torchsim_sn_speedup)
+    geomean_torchsim_cn = compute_geomean(torchsim_cn_speedup)
+    plot_data["Geomean"] = [geomean_accelsim, geomean_mnpusim, geomean_torchsim_ils_sn, geomean_torchsim_sn, geomean_torchsim_cn]
     print("=" * 165)
-    print(f"{'[*] Geomean Speedup':>30} {'1x':>25} {compute_geomean(mnpusim_speedup):>25} {compute_geomean(torchsim_ils_sn_speedup):>25} {compute_geomean(torchsim_sn_speedup):>25} {compute_geomean(torchsim_cn_speedup):>25}")
\ No newline at end of file
+    print(f"{'Geomean Speedup':>30} {'1x':>25} {geomean_mnpusim:>24.2f}x {geomean_torchsim_ils_sn:>24.2f}x {geomean_torchsim_sn:>24.2f}x {geomean_torchsim_cn:>24.2f}x")
+    path = os.path.join(TORCHSIM_DIR, "experiments/artifact/speedup/speedup.png")
+    plot_speedup_bars(plot_data, path)

From 63eaab0975622054a022e0798f4d251f9283cdb0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 28 Jul 2025 07:46:07 +0000
Subject: [PATCH 430/432] [CI] Add tag release workflow

---
 .github/workflows/tag_release.yml             | 67 +++++++++++++++++++
 .../artifact/speedup/scripts/ils_parser.sh    |  2 +-
 2 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/tag_release.yml

diff --git a/.github/workflows/tag_release.yml b/.github/workflows/tag_release.yml
new file mode 100644
index 00000000..c6bc3c7a
--- /dev/null
+++ b/.github/workflows/tag_release.yml
@@ -0,0 +1,67 @@
+name: Build & Push Docker Image on Tag
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+jobs:
+  build:
+    runs-on: self-hosted
+
+    permissions:
+      contents: read
+      packages: write
+      id-token: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set Tag Environment
+        run: |
+          echo "IMAGE_TAG=torchsim-ci:${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
+          echo "GITHUB_SHA=$GITHUB_SHA" >> $GITHUB_ENV
+          echo "GITHUB_SHA=$GITHUB_SHA"
+
+      - name: Pull Cached Image & Set environment
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available"
+          gem5_response_file=/tmp/releases-gem5-latest.json
+          response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} )
+          GEM5_ASSET_ID=$(cat ${gem5_response_file} | jq ".assets[0]."id"")
+          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID"
+          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV
+
+          llvm_response_file=/tmp/releases-gem5-latest.json
+          response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} )
+          LLVM_ASSET_ID=$(cat ${llvm_response_file} | jq ".assets[0]."id"")
+          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID"
+          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV
+
+          mkdir -p /tmp/torchsim-ci/${GITHUB_SHA}
+          echo "DUMP_PATH=/tmp/torchsim-ci/${GITHUB_SHA}"
+
+      - name: Build and Push Docker Image
+        uses: docker/build-push-action@v4
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          build-args: |
+            GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
+            LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
+            GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
+            TORCHSIM_SHA=${{ env.GITHUB_SHA }}
+          tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}}
\ No newline at end of file
diff --git a/experiments/artifact/speedup/scripts/ils_parser.sh b/experiments/artifact/speedup/scripts/ils_parser.sh
index a02d8edb..913daeea 100755
--- a/experiments/artifact/speedup/scripts/ils_parser.sh
+++ b/experiments/artifact/speedup/scripts/ils_parser.sh
@@ -19,7 +19,7 @@ while IFS= read -r line; do
       gem5_time=$(grep "Simulation time:" "$sto_log_path" | \
                 sed -E 's/^Simulation time: ([0-9.]+) seconds$/\1/')
       echo "GEM5: $gem5_time" 
-      total_gem5=$(echo "$total_gem5 + $gem5_time" | bc)
+      total_gem5=$(awk -v a="$total_gem5" -v b="$gem5_time" 'BEGIN {printf "%.6f", a+b}')
     fi
   fi
   if [[ "$line" == *"Simulation time:"* ]]; then

From b01daa65a273b8254bfee6d23928926778ff7208 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 30 Jul 2025 01:55:44 +0000
Subject: [PATCH 431/432] [Artifact] Minor fix for summary script

---
 experiments/artifact/cycle_validation/summary_cycle.py | 2 +-
 experiments/artifact/speedup/summary_speedup.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/experiments/artifact/cycle_validation/summary_cycle.py b/experiments/artifact/cycle_validation/summary_cycle.py
index ac3904be..529d0161 100644
--- a/experiments/artifact/cycle_validation/summary_cycle.py
+++ b/experiments/artifact/cycle_validation/summary_cycle.py
@@ -86,7 +86,7 @@ def compute_mae(errors):
         if file.endswith(".log"):
             full_path = os.path.join(LOG_DIR, file)
             name = file[:-4]
-            with open(full_path) as f:
+            with open(full_path, errors="ignore") as f:
                 for line in f:
                     match = re.search(r"Total execution cycle:\s*([0-9]+)", line)
                     if match:
diff --git a/experiments/artifact/speedup/summary_speedup.py b/experiments/artifact/speedup/summary_speedup.py
index 73c5b089..67a741a0 100644
--- a/experiments/artifact/speedup/summary_speedup.py
+++ b/experiments/artifact/speedup/summary_speedup.py
@@ -103,7 +103,7 @@ def compute_geomean(errors):
                 name = name +"sn"
             else:
                 raise ValueError(f"Unsupported file name format: {file}")
-            with open(full_path) as f:
+            with open(full_path, errors="ignore") as f:
                 for line in f:
                     match = re.search(r"Average simulation time\s*=\s*([0-9]+(?:\.[0-9]+)?)", line)
                     if match:

From 5ec17ff2d18c08d10ae4bf5b101cd9e131a3fba3 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 30 Jul 2025 02:46:13 +0000
Subject: [PATCH 432/432] [CI] Fix docker file and action credential issue

---
 .github/workflows/docker-image.yml        | 13 +++++++------
 .github/workflows/pull-request.yml        | 11 +++++++----
 .github/workflows/pull-request_mobile.yml | 13 +++++++------
 .github/workflows/tag_release.yml         | 11 +++++++----
 Dockerfile                                | 22 ++++++++++++++--------
 5 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index e5bab560..ece4a6d5 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -18,8 +18,10 @@ jobs:
       # Step 1: Checkout the repository
       - name: Checkout Code
         uses: actions/checkout@v4
-      # Step 2: Log in to GitHub Container Registry (optional)
-      # If you need to push the built image, authenticate here.
+        with:
+          repository: PSAL-POSTECH/PyTorchSim
+          ref: ${{ github.sha }}
+          submodules: recursive
       - name: Log in to GitHub Container Registry
         uses: docker/login-action@v3
         with:
@@ -53,9 +55,7 @@ jobs:
 
       # Step 4: Build and Push Docker Image
       - name: Build and Push Docker Image
-        uses: docker/build-push-action@v4
-        env:
-          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        uses: docker/build-push-action@v6
         with:
           context: .
           file: ./Dockerfile
@@ -63,8 +63,9 @@ jobs:
           build-args: |
             GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
             LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
-            GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
             TORCHSIM_SHA=${{ env.GITHUB_SHA }}
+          secrets: |
+            GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }}
           tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG }}
 
   test_add:
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index f1366eb6..ecdbf861 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -18,6 +18,10 @@ jobs:
       # Step 1: Checkout the repository
       - name: Checkout Code
         uses: actions/checkout@v4
+        with:
+          repository: PSAL-POSTECH/PyTorchSim
+          ref: ${{ github.event.pull_request.head.sha }}
+          submodules: recursive
       # Step 2: Log in to GitHub Container Registry (optional)
       # If you need to push the built image, authenticate here.
       - name: Log in to GitHub Container Registry
@@ -53,9 +57,7 @@ jobs:
 
       # Step 4: Build and Push Docker Image
       - name: Build and Push Docker Image
-        uses: docker/build-push-action@v4
-        env:
-          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        uses: docker/build-push-action@v6
         with:
           context: .
           file: ./Dockerfile
@@ -63,8 +65,9 @@ jobs:
           build-args: |
             GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
             LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
-            GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
             TORCHSIM_SHA=${{ env.GITHUB_SHA }}
+          secrets: |
+            GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }}
           tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}}
 
   test_add:
diff --git a/.github/workflows/pull-request_mobile.yml b/.github/workflows/pull-request_mobile.yml
index 0043eaf4..053e3eac 100644
--- a/.github/workflows/pull-request_mobile.yml
+++ b/.github/workflows/pull-request_mobile.yml
@@ -18,8 +18,10 @@ jobs:
       # Step 1: Checkout the repository
       - name: Checkout Code
         uses: actions/checkout@v4
-      # Step 2: Log in to GitHub Container Registry (optional)
-      # If you need to push the built image, authenticate here.
+        with:
+          repository: PSAL-POSTECH/PyTorchSim
+          ref: ${{ github.event.pull_request.head.sha }}
+          submodules: recursive
       - name: Log in to GitHub Container Registry
         uses: docker/login-action@v3
         with:
@@ -53,9 +55,7 @@ jobs:
 
       # Step 4: Build and Push Docker Image
       - name: Build and Push Docker Image
-        uses: docker/build-push-action@v4
-        env:
-          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        uses: docker/build-push-action@v6
         with:
           context: .
           file: ./Dockerfile
@@ -63,8 +63,9 @@ jobs:
           build-args: |
             GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
             LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
-            GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
             TORCHSIM_SHA=${{ env.GITHUB_SHA }}
+          secrets: |
+            GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }}
           tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}}
 
   test_add:
diff --git a/.github/workflows/tag_release.yml b/.github/workflows/tag_release.yml
index c6bc3c7a..258c0e40 100644
--- a/.github/workflows/tag_release.yml
+++ b/.github/workflows/tag_release.yml
@@ -17,6 +17,10 @@ jobs:
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
+        with:
+          repository: PSAL-POSTECH/PyTorchSim
+          ref: ${{ github.ref_name }}
+          submodules: recursive
 
       - name: Log in to GitHub Container Registry
         uses: docker/login-action@v3
@@ -52,9 +56,7 @@ jobs:
           echo "DUMP_PATH=/tmp/torchsim-ci/${GITHUB_SHA}"
 
       - name: Build and Push Docker Image
-        uses: docker/build-push-action@v4
-        env:
-          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        uses: docker/build-push-action@v6
         with:
           context: .
           file: ./Dockerfile
@@ -62,6 +64,7 @@ jobs:
           build-args: |
             GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
             LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
-            GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
             TORCHSIM_SHA=${{ env.GITHUB_SHA }}
+          secrets: |
+            GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }}
           tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}}
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 8e149883..44f6fd5e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,3 +1,4 @@
+# syntax=docker/dockerfile:1.4
 # Copyright (c) 2020 The Regents of the University of California
 # All Rights Reserved.
 #
@@ -26,7 +27,6 @@
 FROM ghcr.io/psal-postech/torchsim_base:latest
 
 # Pass Access Token securely
-ARG GIT_ACCESS_TOKEN
 ARG GEM5_ASSET_ID
 ARG LLVM_ASSET_ID
 ARG TORCHSIM_SHA
@@ -34,14 +34,18 @@ ENV PATH $PATH:/root/.local/bin
 ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
 
 # Download GEM5 for torchsim
-RUN curl -L -H "Accept: application/octet-stream" -H "Authorization: Bearer ${GIT_ACCESS_TOKEN}"  https://api.github.com/repos/PSAL-POSTECH/gem5/releases/assets/${GEM5_ASSET_ID} -o /tmp/gem5-release.tar.gz && \
+RUN --mount=type=secret,id=GIT_ACCESS_TOKEN \
+    GIT_ACCESS_TOKEN=$(cat /run/secrets/GIT_ACCESS_TOKEN) && \
+    curl -L -H "Accept: application/octet-stream" -H "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/gem5/releases/assets/${GEM5_ASSET_ID} -o /tmp/gem5-release.tar.gz && \
     mkdir -p /gem5 && \
     tar -xzf /tmp/gem5-release.tar.gz -C /gem5 && \
     rm /tmp/gem5-release.tar.gz
 ENV GEM5_PATH /gem5/release/gem5.opt
 
 # Download LLVM RISC-V for torchsim
-RUN curl -L -H "Accept: application/octet-stream" -H "Authorization: Bearer ${GIT_ACCESS_TOKEN}"  https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/assets/${LLVM_ASSET_ID} -o /tmp/riscv-llvm-release.tar.gz && \
+RUN --mount=type=secret,id=GIT_ACCESS_TOKEN \
+    GIT_ACCESS_TOKEN=$(cat /run/secrets/GIT_ACCESS_TOKEN) && \
+    curl -L -H "Accept: application/octet-stream" -H "Authorization: Bearer ${GIT_ACCESS_TOKEN}"  https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/assets/${LLVM_ASSET_ID} -o /tmp/riscv-llvm-release.tar.gz && \
     tar -xzf /tmp/riscv-llvm-release.tar.gz -C / && \
     rm /tmp/riscv-llvm-release.tar.gz
 
@@ -52,18 +56,20 @@ ENV TORCHSIM_DIR /workspace/PyTorchSim
 ENV LLVM_DIR /riscv-llvm
 
 # Install Spike simulator
-RUN git clone https://${GIT_ACCESS_TOKEN}@github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \
-    ../configure --prefix=$RISCV && make -j && make install
+RUN --mount=type=secret,id=GIT_ACCESS_TOKEN \
+    GIT_ACCESS_TOKEN=$(cat /run/secrets/GIT_ACCESS_TOKEN) && \
+    git clone https://$GIT_ACCESS_TOKEN@github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \
+    ../configure --prefix=$RISCV && make -j && make install && cd ../../ && rm -rf riscv-isa-sim
 
 # Install Proxy kernel
 RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \
      cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 && mkdir build && cd build && \
     ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install
 
-# Prepare ONNXim project
-RUN git clone https://${GIT_ACCESS_TOKEN}@github.com/PSAL-POSTECH/PyTorchSim.git && cd PyTorchSim && git checkout ${TORCHSIM_SHA}
+# Prepare PyTorchSim project
+COPY . /workspace/PyTorchSim
+
 RUN cd PyTorchSim/PyTorchSimBackend && \
-    git submodule update --recursive --init && \
     mkdir -p build && \
     cd build && \
     conan install .. --build=missing && \