From dcca9de414b018f423f40a2d3b2e110e4cd5b351 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Fri, 27 Dec 2024 05:20:29 +0000
Subject: [PATCH 01/24] [Fix] Fusion mm + relu

---
 .../mlir/mlir_codegen_backend.py              | 24 ++++++++++++++-----
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  2 +-
 PyTorchSimFrontend/mlir/mlir_template.py      |  7 +++---
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index faa08889..29c85c7c 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -378,7 +378,7 @@ def find_node_by_name(self, name):
                 if output_node.data.name == name:
                     return output_node
 
-    def get_dma_info(self, name, index, dtype, is_store):
+    def get_dma_info(self, name, index, dtype):
         current_tile = MLIRTile(self.tile_desc.n_row, self.tile_desc.n_col, self.tile_desc.vector_lane, self.tile_desc.used_vector_lane)
         cv = self.get_constant_vector(index)
         cv2 = self.get_constant_vector2(index)
@@ -574,7 +574,7 @@ def load(self, name: str, index: sympy.Expr):
         var = self.args.input(name)
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
-        stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype, 0)
+        stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype)
         dram_tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
         # Define scratch pad buffer
@@ -609,6 +609,10 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
 
+        chunk_size = self.tile_desc.get_chunk_size()
+        chunk = chunk_size << 1 | (self.tile_desc.tile_per_lane_layout == MLIRTile.TILE_PER_LANE_COL_WISE)
+        self.consts.add(chunk)
+
         if name in self.buffer_names:
             buffer = self.buffer_names[name]
         else:
@@ -623,7 +627,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.cse.generate(self.stores, line, assignment = False)
 
         self.tags.add(f"{name}_tag")
-        code = f"affine.dma_start %{buffer}[0, 0], %{var}[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>, memref<{self.render_options['M'] * self.render_options['N']}x{type_name}>, memref<1xi32>" #FIXME: Using constant index and tag
+        code = f"affine.dma_start %{buffer}[0, 0], %{var}[%index2], %tag[0], %c_mvout, %N, %c{chunk} : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>, memref<{self.render_options['M'] * self.render_options['N']}x{type_name}>, memref<1xi32>" #FIXME: Using constant index and tag
         self.cse.generate(self.stores, code, assignment = False)
 
     def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
@@ -635,7 +639,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
-        stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype, 1)
+        stride, chunk, tile_shape, tile_size_per_lane = self.get_dma_info(name, index, dtype)
         dram_tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
         # Define scratch pad buffer
@@ -795,7 +799,7 @@ def store_reduction(self, name, index, value):
         code = f"affine.dma_start %{buffer}[0, 0], %{var}[{prefix}{indices}], %{name}_tag[0], %c{dmaType}, %c{mm_stride}, %c{chunk} : memref<{tile_row}x{tile_col}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
-    def codegen_body(self):
+    def codegen_body(self, subtile):
         # if not (
         #     self.loads
         #     or self.stores
@@ -807,13 +811,17 @@ def template_store(options):
             async_flag = 1
             line = f"affine.dma_start %Y_buffer[0, 0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set"\
                    f": memref<{options['TILE_M']}x{options['TILE_N']}xf32, 1>,"\
-                   f"memref<{options['M'] * options['N']}xf32>, memref<1xi32> " #FIXME: Using constant index and tag
+                   f"memref<{options['M'] * options['N']}xf32>, memref<1xi32>" #FIXME: Using constant index
             self.cse.generate(self.stores, line, assignment = False)
         self.body.splice(self.codegen_init())
         self.body.splice(self.loads)
         self.body.splice(self.compute)
         if len(self.stores._lines) == 0:
             template_store(self.render_options)
+        if subtile:
+            for i in range(len(self.stores._lines)):
+                if "affine.dma_start" in self.stores._lines[i]:
+                    self.stores._lines[i] += f" {{ subtile_size=[{self.vector_lane}, {self.vector_lane}], async=1 }}"
         self.body.splice(self.stores)
         self.loads.clear()
         self.compute.clear()
@@ -907,6 +915,10 @@ def _codegen_kernel(self, arg_defs, kernel_name):
         return code
 
     def adjust_tile_size(self):
+        if self.is_template_kernel:
+            self.tile_desc.n_row = self.render_options['TILE_M']
+            self.tile_desc.n_col = self.render_options['TILE_N']
+            return
         if self.read_writes is not None:
             read_writes = list(self.read_writes.reads) + list(self.read_writes.writes)
             cv_list = []
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 83d43174..c3ff6012 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -62,7 +62,7 @@
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
-    {{kernel.store_output()}} { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
+      {{kernel.store_output(subtile=True)}}
     } { outer_loop=true }
   } { outer_loop=true }
   return
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 476e73f5..82e39c47 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -20,7 +20,7 @@
 
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
 from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo
-from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
+from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, MLIRTile
 
 class MLIRTemplateKernel(MLIRKernel, BaseMLIRHardwareInfo):
     def __init__(self,
@@ -182,10 +182,9 @@ def hook():
         self.render_hooks["<OUPUT>"] = hook
         return "<OUPUT>"
 
-    def store_output(self):
-
+    def store_output(self, subtile=False):
         def hook():
-            self.codegen_body()
+            self.codegen_body(subtile=subtile)
             return textwrap.indent(self.body.getvalue(), "      ").strip()  #TODO: First line is not indented
 
         assert "<STORE_OUTPUT>" not in self.render_hooks

From 1847629da289f1dd5a7a637ff72198908585b515 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 2 Jan 2025 13:11:30 +0000
Subject: [PATCH 02/24] [Fix] specify variable name in c wrapper

---
 PyTorchSimFrontend/mlir/mlir_caller_codegen.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
index b6a2d7d7..92f250df 100644
--- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
+++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
@@ -28,7 +28,7 @@ def load_arg(self):
             if self.is_in_arg(arg_attribute[0]):
                 argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name]
                 self.load_args[arg_name] = argv_idx
-                self.writeline(f'if(load_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}')
+                self.writeline(f'if(load_arg(c_{arg_name}, sizeof(c_{arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}')
                 with self.code.indent():
                     self.writeline(f'return -1{self.ending}')
                 self.writeline(self.closed_bracket)
@@ -37,7 +37,7 @@ def dump_arg(self):
         for arg_name, arg_attribute in self.arg_attributes:
             if self.is_out_arg(arg_attribute[0]):
                 argv_idx = self.get_argv_idx() if not self.is_inout_arg(arg_attribute[0]) else self.load_args[arg_name]
-                self.writeline(f'if(dump_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}')
+                self.writeline(f'if(dump_arg(c_{arg_name}, sizeof(c_{arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}')
                 with self.code.indent():
                     self.writeline(f'return -1{self.ending}')
                 self.writeline(self.closed_bracket)
@@ -53,7 +53,7 @@ def generate_args_define(self):
         for arg_name, (_, arg_type, arg_size) in self.arg_attributes:
             if not arg_name in name_set:
                 if self.validation:
-                    self.writeline(f'{DTYPE_TO_C[arg_type]} {arg_name}[{arg_size}]{self.ending}')
+                    self.writeline(f'{DTYPE_TO_C[arg_type]} c_{arg_name}[{arg_size}]{self.ending}')
                 else:
                     if torch.is_floating_point(torch.tensor([], dtype=arg_type)):
                         bits = torch.finfo(arg_type).bits
@@ -61,7 +61,7 @@ def generate_args_define(self):
                         bits = 8
                     else:
                         bits = torch.iinfo(arg_type).bits
-                    self.writeline(f'{DTYPE_TO_C[arg_type]}* {arg_name} = malloc({arg_size * bits // 8}){self.ending}')
+                    self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({arg_size * bits // 8}){self.ending}')
                 name_set.add(arg_name)
         self.writeline(self.newline)
 
@@ -77,7 +77,7 @@ def generate_main(self):
             else:
                 self.generate_args_define()
 
-            func_arguments = [f"{arg_name}, {arg_name}, 0, {arg_shape}, 1" if arg_type != torch.bool else f"{arg_name}, {arg_name}, 0, {(arg_shape + 7) // 8}, 1" for arg_name, (_, arg_type, arg_shape) in self.arg_attributes]
+            func_arguments = [f"c_{arg_name}, c_{arg_name}, 0, {arg_shape}, 1" if arg_type != torch.bool else f"c_{arg_name}, c_{arg_name}, 0, {(arg_shape + 7) // 8}, 1" for arg_name, (_, arg_type, arg_shape) in self.arg_attributes]
             self.writeline(f"wrapper_{self.kernel_name}({', '.join(func_arguments)}){self.ending}{self.newline}")
 
             if self.validation:

From 1d95e4acbeefbb51da4e764e2e9a10863320f22c Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 2 Jan 2025 13:26:30 +0000
Subject: [PATCH 03/24] [Temporal] block schedule node fusion

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 29c85c7c..35819333 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1080,6 +1080,7 @@ def can_fuse_vertical(self, node1, node2):
         return self.can_fuse_horizontal(node1, node2) and not node1.is_reduction()
 
     def can_fuse_horizontal(self, node1, node2):
+        return False
         _, (vars1, reduce1) = node1.group
         _, (vars2, reduce2) = node2.group
         if vars1 == vars2 and reduce1 == reduce2:

From 87673acaa64ce8f95f00d45584d67fb8db2e5aa5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 2 Jan 2025 12:47:42 +0000
Subject: [PATCH 04/24] [CI] Build base image on base branch

---
 .github/workflows/docker-base-image.yml | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/.github/workflows/docker-base-image.yml b/.github/workflows/docker-base-image.yml
index 708c9efe..dd8c0953 100644
--- a/.github/workflows/docker-base-image.yml
+++ b/.github/workflows/docker-base-image.yml
@@ -2,7 +2,7 @@ name: Docker Base Image CI
 
 on:
   push:
-    branches: [ "master" ]
+    branches: [ "base" ]
 
 jobs:
   build:
@@ -27,21 +27,8 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      # Step 3: Check if the Base Image Exists
-      - name: Check if Base Image Exists
-        id: check-image
-        run: |
-          if docker pull ghcr.io/psal-postech/torchsim_base:latest; then
-            echo "Base image already exists. Skipping build and push."
-            echo "exists=true" >> $GITHUB_ENV
-          else
-            echo "Base image does not exist. Proceeding with build and push."
-            echo "exists=false" >> $GITHUB_ENV
-          fi
-
       # Step 4: Build and Push Docker Image
       - name: Build and Push Docker Image
-        if: env.exists == 'false'
         uses: docker/build-push-action@v4
         with:
           context: .

From fbe662f8a5b5397db33e3cee7082d54141b6ab52 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 07:34:49 +0000
Subject: [PATCH 05/24] [Backend] Support DMA's constant indexing

---
 PyTorchSimBackend/src/TileGraphParser.cc | 34 +++++++++++++++++++-----
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index 4a88f4b9..f3c5f22d 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -23,6 +23,26 @@ uint32_t calculateAddress(const std::vector<uint32_t>& loop_size, const std::vec
   return address;
 }
 
+
+int getLoopIndexValue(const std::map<std::string, int>& iter, const std::string& loop_idx) {
+    // Check if loop_idx starts with "c"
+    if (!loop_idx.empty() && loop_idx[0] == 'c') {
+        // Extract substring after 'c' and convert to integer
+        const char* numberPart = loop_idx.c_str() + 1; // Skip the first character 'c'
+        int convertedValue = std::atoi(numberPart);
+        return convertedValue;
+    }
+
+    // If loop_idx does not start with 'c', check in the map
+    auto it = iter.find(loop_idx);
+    if (it != iter.end()) {
+        return it->second;
+    }
+
+    // If loop_idx is not found, throw an exception
+    throw std::runtime_error("Key not found in map and does not start with 'c': " + loop_idx);
+}
+
 std::vector<uint32_t> calc_output_idx(TileGraphParser* tog_parser, std::map<std::string, int>& iter) {
   // Extract outer loop
   // Extract inner loop
@@ -272,7 +292,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       int nr_inner_loop = 0;
       auto& loop_idx_list = mem_node->get_loop_idx_list();
       for (auto loop_idx: loop_idx_list) {
-        auto iter_value = iter.at(loop_idx);
+        auto iter_value = getLoopIndexValue(iter, loop_idx);
         iter_list.push_back(iter_value);
         loop_size_list.push_back(tog_parser->get_loop_size(loop_idx));
         if (tog_parser->get_loop_type(loop_idx)==LoopType::INNER_LOOP)
@@ -283,7 +303,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
             loop_idx != loop_idx_list.end() - nr_inner_loop; ++loop_idx) {
         // Check loop type and process
         if (tog_parser->get_loop_type(*loop_idx)==LoopType::ACCUMULATION_LOOP) {
-          auto iter_value = iter.at(*loop_idx);
+          auto iter_value = getLoopIndexValue(iter, *loop_idx);
           tag_list.push_back(iter_value);
         }
       }
@@ -292,7 +312,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
             loop_idx != loop_idx_list.end(); ++loop_idx) {
         if (tog_parser->get_loop_type(*loop_idx)==LoopType::PARALLEL_LOOP) {
           uint32_t step = (uint32_t)tog_parser->get_loop_step(*loop_idx);
-          auto iter_value = iter.at(*loop_idx) / step;
+          auto iter_value = getLoopIndexValue(iter, *loop_idx) / step;
           outer_loop_idx.push_back(iter_value);
           outer_loop_size.push_back(tog_parser->get_loop_size(*loop_idx));
         }
@@ -302,7 +322,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         if (iter.find(loop_idx) == iter.end())
           tag_list.push_back(0);
         else {
-          auto iter_value = iter.at(loop_idx);
+          auto iter_value = getLoopIndexValue(iter, loop_idx);
           tag_list.push_back(iter_value);
         }
       }
@@ -342,14 +362,14 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       int nr_inner_loop = 0;
       auto& loop_idx_list = mem_node->get_loop_idx_list();
       for (auto loop_idx: loop_idx_list) {
-        auto iter_value = iter.at(loop_idx);
+        auto iter_value = getLoopIndexValue(iter, loop_idx);
         iter_list.push_back(iter_value);
         loop_size_list.push_back(tog_parser->get_loop_size(loop_idx));
         if (tog_parser->get_loop_type(loop_idx)==LoopType::INNER_LOOP)
           nr_inner_loop++;
         if (tog_parser->get_loop_type(loop_idx)==LoopType::PARALLEL_LOOP) {
           uint32_t step = (uint32_t) tog_parser->get_loop_step(loop_idx);
-          auto iter_value = iter.at(loop_idx) / step;
+          auto iter_value = getLoopIndexValue(iter, loop_idx) / step;
           outer_loop_idx.push_back(iter_value);
           outer_loop_size.push_back(tog_parser->get_loop_size(loop_idx)/ step);
         }
@@ -400,7 +420,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         if (iter.find(loop_idx) == iter.end())
           tag_list.push_back(0);
         else {
-          auto iter_value = iter.at(loop_idx) * inner_step;
+          auto iter_value = getLoopIndexValue(iter, loop_idx) * inner_step;
           tag_list.push_back(iter_value);
         }
       }

From f583e22e90d1a031a60accf518127a358f003b72 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 08:25:05 +0000
Subject: [PATCH 06/24] [Backend] Allowing null tile operation graph

---
 AsmParser/tog_generator.py               | 28 +++++++++++++-----------
 PyTorchSimBackend/src/TileGraphParser.cc |  6 ++---
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 9a39b782..1b5971e2 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -56,8 +56,9 @@ def decrease_depth_stack(self):
 
     def load_file(self, path):
         self.module = import_module_from_path(self.module_name, path)
-        self.raw_graph = self.module.graph
-        self.parse_graph()
+        if hasattr(self.module, "graph"):
+            self.raw_graph = self.module.graph
+            self.parse_graph()
 
     def _create_node(self, dump_data):
         node_id = dump_data["node_id"]
@@ -193,17 +194,18 @@ def parse_graph(self):
 
     def generate_tile_graph(self, name="tile_graph", cycle_list=list, offset=int, vector_lane=int):
         node_list = list(self.node_dict.values())[1:]
-        node_list[0].set_parent([])
-        for iter_node in self.node_dict.values():
-            if isinstance(iter_node, compute_node):
-                if cycle_list:
-                    iter_node.torchsim_cycle = cycle_list.pop(0)
-                else:
-                    print("[TOGGen] Error compute cycle timing is missing...!")
-                    iter_node.torchsim_cycle = 10
-                # FIXME.
-                if iter_node.torchsim_compute_type == 1:
-                    iter_node.torchsim_overlapping_cycle = iter_node.torchsim_cycle - offset
+        if len(node_list):
+            node_list[0].set_parent([])
+            for iter_node in self.node_dict.values():
+                if isinstance(iter_node, compute_node):
+                    if cycle_list:
+                        iter_node.torchsim_cycle = cycle_list.pop(0)
+                    else:
+                        print("[TOGGen] Error compute cycle timing is missing...!")
+                        iter_node.torchsim_cycle = 10
+                    # FIXME.
+                    if iter_node.torchsim_compute_type == 1:
+                        iter_node.torchsim_overlapping_cycle = iter_node.torchsim_cycle - offset
 
         origin_info = "_".join(map(str, self.origins))
         onnx_node_list = [node.to_onnx() for node in node_list] # Exclude root node
diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
index f3c5f22d..b9ea2b08 100644
--- a/PyTorchSimBackend/src/TileGraphParser.cc
+++ b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -638,13 +638,13 @@ TileGraphParser::TileGraphParser(std::string onnx_path, json& attribute_json) {
       tile->print_node();
   }
 
+  _tile_graph = std::make_unique<TileGraph>(TileGraph(onnx_path, graph_name));
   /* Generate subgraph */
   if (_loop_nodes.empty()) {
-    spdlog::error("[TileGraphParser] No loop found...");
-    exit(EXIT_FAILURE);
+    spdlog::warn("[TileGraphParser] Null Kernel \"{}\"", onnx_path);
+    return;
   }
 
-  _tile_graph = std::make_unique<TileGraph>(TileGraph(onnx_path, graph_name));
   int last_outer_idx = -1;
   /* Extract outer loop */
   for (int i=0;i<_loop_nodes.size();i++) {

From 4c96fbb8cedec3731edc5ce034ab5a6de39851c2 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 08:27:38 +0000
Subject: [PATCH 07/24] [Frontend] Fix dma_start mlir code

dma_start operation support index operation. If the literal value
(e.g., 0) is uses, then mlir c++ code is not work correctly.
---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  |  6 +--
 .../mlir/mlir_codegen_backend.py              | 53 +++++++++++++------
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  9 ++--
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 10 ++--
 .../mlir/mlir_maxpool_template.py             |  5 +-
 5 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 41a2e0cd..9d12f57b 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -52,16 +52,16 @@
         affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
           %index0 = affine.apply #map0(%b, %t_m, %t_k)
           %index1 = affine.apply #map1(%b, %t_k, %t_n)
-          affine.dma_start %X[%index0], %X_buffer[0, 0], %tag[0], %c_mvin,
+          affine.dma_start %X[%index0], %X_buffer[%c0, %c0], %tag[0], %c_mvin,
           {%- if X_transposed -%} %M, %x_chunk {%- else -%} %K, %c_set {%- endif -%}
              : memref<{{ B * M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1{% if X_transposed %}, transpose=1{% endif %} }
-          affine.dma_start %W[%index1], %W_buffer[0, 0], %tag[0], %c_mvin2,
+          affine.dma_start %W[%index1], %W_buffer[%c0, %c0], %tag[0], %c_mvin2,
           {%- if W_transposed -%} %K, %w_chunk {%- else -%} %N, %c_set {%- endif -%}
              : memref<{{ B * K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1{% if W_transposed %}, transpose=1{% endif %} }
           linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
-        affine.dma_start %Y_buffer[0, 0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ B * M * N }}xf32>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
+        affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ B * M * N }}xf32>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
       } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 35819333..1f3f74ee 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -485,7 +485,7 @@ def get_dma_info(self, name, index, dtype):
 
     def parse_indices(self, expr):
         if len(expr.args) == 0:
-            return expr, expr
+            return expr
 
         # Extract index var
         expr_str = str(expr)
@@ -509,7 +509,7 @@ def parse_indices(self, expr):
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])
         index = self.cse.generate(self.loads, f"affine.apply #{map_var}({args})")
-        return index, expr
+        return index
 
     def codegen_nodes(self, nodes, kernel_name):
         _, (group, reduction_group) = max(
@@ -551,10 +551,11 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         else:
             mvin3 = 14
             self.consts.add(mvin3)
+            self.consts.add(0)
             dram_tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
             buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.loads, index)
             self.buffer_names[name] = buffer
-            line = f"affine.dma_start %{var}[%index2], %{buffer}[0, 0], %tag[0], %c{mvin3}, %N, %c_set : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
+            line = f"affine.dma_start %{var}[%index2], %{buffer}[%c0, %c0], %tag[0], %c{mvin3}, %N, %c_set : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
             self.cse.generate(self.loads, line, assignment = False)
 
         tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
@@ -569,8 +570,11 @@ def load(self, name: str, index: sympy.Expr):
         if self.is_template_kernel:
             return self.load_epilogue(name, index)
         index = self.rename_indexing(index)
-        indices, index = self.parse_indices(index)
-        prefix = "" if index.is_number else "%"
+        indices = self.parse_indices(index)
+        prefix = self.newvar_prefix
+        if index.is_number:
+            prefix = prefix + "c"
+            self.consts.add(int(index))
         var = self.args.input(name)
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -592,7 +596,8 @@ def load(self, name: str, index: sympy.Expr):
             self.consts.add(chunk)
             self.dma_cache[dma_key] = dmaType, stride, chunk
         self.tags.add(f"{name}_tag")
-        code = f"affine.dma_start %{var}[{prefix}{indices}], %{buffer}[0, 0], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
+        self.consts.add(0)
+        code = f"affine.dma_start %{var}[{prefix}{indices}], %{buffer}[%c0, %c0], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>"
         self.cse.generate(self.loads, code, assignment = False) # FIXME: assignment = False does not support caching
 
         operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load"
@@ -603,8 +608,11 @@ def load(self, name: str, index: sympy.Expr):
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
-        indices, index = self.parse_indices(index)
-        prefix = "" if index.is_number else "%"
+        indices = self.parse_indices(index)
+        prefix = self.newvar_prefix
+        if index.is_number:
+            prefix = prefix + "c"
+            self.consts.add(int(index))
         var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -627,15 +635,19 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.cse.generate(self.stores, line, assignment = False)
 
         self.tags.add(f"{name}_tag")
-        code = f"affine.dma_start %{buffer}[0, 0], %{var}[%index2], %tag[0], %c_mvout, %N, %c{chunk} : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>, memref<{self.render_options['M'] * self.render_options['N']}x{type_name}>, memref<1xi32>" #FIXME: Using constant index and tag
+        self.consts.add(0)
+        code = f"affine.dma_start %{buffer}[%c0, %c0], %{var}[%index2], %tag[0], %c_mvout, %N, %c{chunk} : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>, memref<{self.render_options['M'] * self.render_options['N']}x{type_name}>, memref<1xi32>" #FIXME: Using constant index and tag
         self.cse.generate(self.stores, code, assignment = False)
 
     def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         if self.is_template_kernel:
             return self.store_epilogue(name, index, value, args, kwargs)
         index = self.rename_indexing(index)
-        indices, index = self.parse_indices(index)
-        prefix = "" if index.is_number else "%"
+        indices = self.parse_indices(index)
+        prefix = self.newvar_prefix
+        if index.is_number:
+            prefix = prefix + "c"
+            self.consts.add(int(index))
         var = self.args.output(name)
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -657,8 +669,9 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         line = f"{operation} %{value}, %{buffer}[0, 0] : memref<{dram_tile_shape}x{type_name}, 1>{shape}"
         self.cse.generate(self.stores, line, assignment = False)
+        self.consts.add(0)
         self.tags.add(f"{name}_tag")
-        code = f"affine.dma_start %{buffer}[0, 0], %{var}[{prefix}{indices}], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{dram_tile_shape}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
+        code = f"affine.dma_start %{buffer}[%c0, %c0], %{var}[{prefix}{indices}], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{dram_tile_shape}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
         self.cse.generate(self.stores, code, assignment = False)
 
     def reduction(self, dtype, src_dtype, reduction_type, value):
@@ -739,9 +752,11 @@ def store_reduction(self, name, index, value):
         dtype = V.graph.get_dtype(name)
         type_name = mlir_common.DTYPE_TO_MLIR[dtype]
         index = self.rename_indexing(index)
-        indices, index = self.parse_indices(index)
-        prefix = "" if index.is_number else "%"
-
+        indices = self.parse_indices(index)
+        prefix = self.newvar_prefix
+        if index.is_number:
+            prefix = prefix + "c"
+            self.consts.add(int(index))
         # Tile is always reuduced in inner loop
         tile_col = self.tile_desc.n_row
         tile_row = 1
@@ -796,7 +811,8 @@ def store_reduction(self, name, index, value):
         self.consts.add(chunk)
         self.tags.add(f"{name}_tag")
         # Change row, col
-        code = f"affine.dma_start %{buffer}[0, 0], %{var}[{prefix}{indices}], %{name}_tag[0], %c{dmaType}, %c{mm_stride}, %c{chunk} : memref<{tile_row}x{tile_col}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
+        self.consts.add(0)
+        code = f"affine.dma_start %{buffer}[%c0, %c0], %{var}[{prefix}{indices}], %{name}_tag[0], %c{dmaType}, %c{mm_stride}, %c{chunk} : memref<{tile_row}x{tile_col}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
     def codegen_body(self, subtile):
@@ -809,7 +825,8 @@ def codegen_body(self, subtile):
         def template_store(options):
             subtile_size = [self.vector_lane, self.vector_lane]
             async_flag = 1
-            line = f"affine.dma_start %Y_buffer[0, 0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set"\
+            self.consts.add(0)
+            line = f"affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set"\
                    f": memref<{options['TILE_M']}x{options['TILE_N']}xf32, 1>,"\
                    f"memref<{options['M'] * options['N']}xf32>, memref<1xi32>" #FIXME: Using constant index
             self.cse.generate(self.stores, line, assignment = False)
@@ -848,6 +865,8 @@ def codegen_loops(self):
         loops = [LoopLevel(var, size, idx-len(self.itervars), tile_row=tile_row, tile_col=tile_col) for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))]
         loops, reductions = [LoopNest(loops[: self.reduction_depth]),
                              LoopNest(loops[self.reduction_depth :])]
+        if (self.reduction_depth==0):
+            loops = LoopNest([LoopLevel("dummy", 1, 1, 0)])
         reductions.mark_reduction(self.reduction_vars)
         if len(self.affine_yield) > 0:
             vars = ', '.join([f"%{name}" for name, _ in self.affine_yield.items()])
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 56d43a11..1d640c75 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -27,6 +27,7 @@
   %c_mvin3 = arith.constant 14 : index
   %c_mvout = arith.constant 3 : index
   %c_set = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
 
   %N = arith.constant {{ N }} : index
   %K = arith.constant {{ K }} : index
@@ -38,16 +39,16 @@
   affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
         %index2 = affine.apply #map1(%t_m, %t_n)
-        affine.dma_start %B[%index2], %Y_buffer[0, 0], %tag[0], %c_mvin3, %N, %c_set : memref<{{ M * N }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
+        affine.dma_start %B[%index2], %Y_buffer[%c0, %c0], %tag[0], %c_mvin3, %N, %c_set : memref<{{ M * N }}xf32>, memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
         affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
             %index0 = affine.apply #map0(%t_m, %t_k)
             %index1 = affine.apply #map1(%t_k, %t_n)
-            affine.dma_start %X[%index0], %X_buffer[0, 0], %tag[0], %c_mvin, %K, %c_set : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1 }
-            affine.dma_start %W[%index1], %W_buffer[0, 0], %tag[0], %c_mvin2, %N, %c_set : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1 }
+            affine.dma_start %X[%index0], %X_buffer[%c0, %c0], %tag[0], %c_mvin, %K, %c_set : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1 }
+            affine.dma_start %W[%index1], %W_buffer[%c0, %c0], %tag[0], %c_mvin2, %N, %c_set : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1 }
             linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                     outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
-        affine.dma_start %Y_buffer[0, 0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ M * N }}xf32>, memref<1xi32>  { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
+        affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ M * N }}xf32>, memref<1xi32>  { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
     } { outer_loop=true }
   } { outer_loop=true }
   return
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index c3ff6012..63708810 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -33,8 +33,8 @@
   %W_buffer = memref.get_global @W_spad : memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
   %Y_buffer = memref.get_global @Y_spad : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>{% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% else %}
-  %c0 = arith.constant 0 : index{% endif %}
+  %v0 = arith.constant dense<0.0> : vector<{{ TILE_M * TILE_N // kernel.vector_lane }}xf32>{% endif %}
+  %c0 = arith.constant 0 : index
 
   affine.for %t_m = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %t_n = 0 to {{ N }} step {{ TILE_N }} {
@@ -42,7 +42,7 @@
       {% if Bias -%}
       affine.dma_start %Bias[
         {%- if Bias_rank == 2 -%} %index2 {%- else -%} %t_n {%- endif -%}
-        ], %Y_buffer[0, 0], %tag[0], %c_mvin3, %
+        ], %Y_buffer[%c0, %c0], %tag[0], %c_mvin3, %
         {%- if Bias_rank == 2 -%} N {%- else -%} c0 {%- endif -%}
         , %c_set : memref<
         {%- if Bias_rank == 2 -%}  {{ M * N }} {%- else -%} {{ N }} {%- endif -%}
@@ -53,10 +53,10 @@
       affine.for %t_k = 0 to {{ K }} step {{ TILE_K }} {
         %index0 = affine.apply #map0(%t_m, %t_k)
         %index1 = affine.apply #map1(%t_k, %t_n)
-        affine.dma_start %X[%index0], %X_buffer[0, 0], %tag[0], %c_mvin,
+        affine.dma_start %X[%index0], %X_buffer[%c0, %c0], %tag[0], %c_mvin,
         {%- if X_transposed -%} %M, %x_chunk {%- else -%} %K, %x_chunk {%- endif -%}
            : memref<{{ M * K }}xf32>, memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ TILE_K }}], async=1{% if X_transposed %}, transpose=1{% endif %} }
-        affine.dma_start %W[%index1], %W_buffer[0, 0], %tag[0], %c_mvin2,
+        affine.dma_start %W[%index1], %W_buffer[%c0, %c0], %tag[0], %c_mvin2,
         {%- if W_transposed -%} %K, %w_chunk {%- else -%} %N, %w_chunk {%- endif -%}
            : memref<{{ K * N }}xf32>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>, memref<1xi32> { subtile_size=[{{ TILE_K }}, {{ kernel.vector_lane }}], async=1{% if W_transposed %}, transpose=1{% endif %} }
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
index 87c9ce0e..1f93f82a 100644
--- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py
@@ -23,11 +23,12 @@
   %X_buffer = memref.get_global @X_spad : memref<{{ in_tile }}x{{ in_tile }}xf32, 1>
   %Y_buffer = memref.get_global @Y_spad : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>
   %tag = memref.alloc() : memref<1xi32>
+  %c0 = arith.constant 0 : index
   affine.for %i = 0 to {{ BCH }} step {{ out_tile }} {
     affine.for %j = 0 to {{ W }} step {{ out_tile }} {
       %index0 = affine.apply #map0(%i, %j)
-      affine.dma_start %X[%index0], %X_buffer[0, 0], %tag[0], %c_mvin, %dummy, %in_chunk : memref<{{ IN }}xf32>, memref<{{ in_tile }}x{{ in_tile }}xf32, 1>, memref<1xi32>
-      affine.dma_start %Y_buffer[0, 0], %Y[%index0], %tag[0], %c_mvout, %dummy, %out_chunk : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>, memref<{{ OUT }}xf32>, memref<1xi32>
+      affine.dma_start %X[%index0], %X_buffer[%c0, %c0], %tag[0], %c_mvin, %dummy, %in_chunk : memref<{{ IN }}xf32>, memref<{{ in_tile }}x{{ in_tile }}xf32, 1>, memref<1xi32>
+      affine.dma_start %Y_buffer[%c0, %c0], %Y[%index0], %tag[0], %c_mvout, %dummy, %out_chunk : memref<{{ out_tile }}x{{ out_tile }}xf32, 1>, memref<{{ OUT }}xf32>, memref<1xi32>
     } { outer_loop=true }
   } { outer_loop=true }
   return

From f69639941766e2e8c4a242658aa9af8ad89946e0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 09:11:52 +0000
Subject: [PATCH 08/24] [Backendsim] Fix null TOG stat

---
 PyTorchSimBackend/src/Core.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index a253540e..3cd29a14 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -310,6 +310,8 @@ bool Core::can_issue_compute(std::shared_ptr<Instruction>& inst) {
 
 void Core::print_stats() {
   update_stats();
+  if (_core_cycle == 0)
+    _core_cycle = 1;
   spdlog::info(
       "Core [{}] : MatMul active cycle {} Vector active cycle {} ",
       _id, _stat_tot_compute_cycle[SYSTOLIC_ARRAY], _stat_tot_compute_cycle[VECTOR_UNIT]);

From 2bc18b3bbe54914d4dc5c0a842850ad3a6fe682e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 10:06:31 +0000
Subject: [PATCH 09/24] [Frontend] Add missing c0 in the bmm template

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 9d12f57b..70dc796b 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -23,8 +23,8 @@
   %c_mvin3 = arith.constant 14 : index{% endif %}
   %c_mvout = arith.constant 3 : index
   %c_set = arith.constant 2 : index
-  %c{{ TILE_K * 2 + 0}} = arith.constant {{ TILE_K * 2 + 0}} : index{% if Bias_rank == 1 %}
-  %c0 = arith.constant 0 : index{% endif %}{% if X_transposed %}
+  %c{{ TILE_K * 2 + 0}} = arith.constant {{ TILE_K * 2 + 0}} : index
+  %c0 = arith.constant 0 : index{% if X_transposed %}
   %x_chunk = arith.constant {{ kernel.vector_lane * 2 + 0 }} : index{% endif %}{% if W_transposed %}
   %w_chunk = arith.constant {{ TILE_K * 2 + 0 }} : index{% endif %}
   %M = arith.constant {{ M }} : index

From ad8760816ea14649c3f657a3f582ad9178b3e563 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 10:53:55 +0000
Subject: [PATCH 10/24] [Test] Fix test sparsity case

---
 tests/test_sparsity.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_sparsity.py b/tests/test_sparsity.py
index d4f4c273..c72dbb98 100644
--- a/tests/test_sparsity.py
+++ b/tests/test_sparsity.py
@@ -8,7 +8,8 @@
 import torch._dynamo
 import torch.utils.cpp_extension
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-from test_extension_backend import DecoderBlock, MLP, test_result
+from test_transformer import DecoderBlock, test_result
+from test_mlp import MLP
 
 def apply_random_zero(tensor, zero_prob, block_size=8):
     if not 0 <= zero_prob <= 1:

From 0cdbfba29893046b3117e2e39572af8f5a33d61c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 11:25:09 +0000
Subject: [PATCH 11/24] [Backendsim] Fix null TOG stat

---
 PyTorchSimBackend/src/Core.cc | 2 --
 PyTorchSimBackend/src/main.cc | 4 ++++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc
index 3cd29a14..a253540e 100644
--- a/PyTorchSimBackend/src/Core.cc
+++ b/PyTorchSimBackend/src/Core.cc
@@ -310,8 +310,6 @@ bool Core::can_issue_compute(std::shared_ptr<Instruction>& inst) {
 
 void Core::print_stats() {
   update_stats();
-  if (_core_cycle == 0)
-    _core_cycle = 1;
   spdlog::info(
       "Core [{}] : MatMul active cycle {} Vector active cycle {} ",
       _id, _stat_tot_compute_cycle[SYSTOLIC_ARRAY], _stat_tot_compute_cycle[VECTOR_UNIT]);
diff --git a/PyTorchSimBackend/src/main.cc b/PyTorchSimBackend/src/main.cc
index e32b3eed..67f19d6d 100644
--- a/PyTorchSimBackend/src/main.cc
+++ b/PyTorchSimBackend/src/main.cc
@@ -95,6 +95,8 @@ void interactive_mode(Simulator* simulator) {
     if (isDryRun)
       std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> ";
   }
+  if (simulator->get_core_cycle()==0)
+    simulator->until(0);
   simulator->print_core_stat();
 }
 
@@ -148,6 +150,8 @@ int main(int argc, char** argv) {
     /* launch kernels */
     launchKernel(simulator, onnx_path, attribute_path);
     simulator->run_simulator();
+    if (simulator->get_core_cycle()==0)
+      simulator->until(1);
     simulator->print_core_stat();
   } else if (execution_mode.compare("interactive") == 0) {
     /* Get onnx_path, attribute from user input, request_time */

From e55f70d424b49686a251c07c2a0379b19b289dce Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 12:20:11 +0000
Subject: [PATCH 12/24] [CI] Fix checkout logic

---
 .github/workflows/docker-image.yml | 3 ++-
 Dockerfile                         | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 70c58a04..a908924d 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -37,6 +37,7 @@ jobs:
           docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available"
           echo "IMAGE_TAG=torchsim-ci:$GITHUB_SHA" >> $GITHUB_ENV
           echo "GITHUB_SHA=$GITHUB_SHA" >> $GITHUB_ENV
+          echo "GITHUB_REF_NAME=$GITHUB_REF_NAME" >> $GITHUB_ENV
 
           gem5_response_file=/tmp/releases-gem5-latest.json
           response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} )
@@ -66,7 +67,7 @@ jobs:
             GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
             LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
             GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
-            TORCHSIM_SHA=${{ env.GITHUB_SHA }}
+            GITHUB_REF_NAME = ${{ env.GITHUB_REF_NAME }}
           tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG }}
 
   test_add:
diff --git a/Dockerfile b/Dockerfile
index 8e149883..530b0152 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ FROM ghcr.io/psal-postech/torchsim_base:latest
 ARG GIT_ACCESS_TOKEN
 ARG GEM5_ASSET_ID
 ARG LLVM_ASSET_ID
-ARG TORCHSIM_SHA
+ARG GITHUB_REF_NAME
 ENV PATH $PATH:/root/.local/bin
 ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
 
@@ -61,7 +61,7 @@ RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \
     ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install
 
 # Prepare ONNXim project
-RUN git clone https://${GIT_ACCESS_TOKEN}@github.com/PSAL-POSTECH/PyTorchSim.git && cd PyTorchSim && git checkout ${TORCHSIM_SHA}
+RUN git clone https://${GIT_ACCESS_TOKEN}@github.com/PSAL-POSTECH/PyTorchSim.git && cd PyTorchSim && git checkout ${GITHUB_REF_NAME}
 RUN cd PyTorchSim/PyTorchSimBackend && \
     git submodule update --recursive --init && \
     mkdir -p build && \

From 37de49ce01c93d84d45567d35202eed3c76e3853 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 12:41:41 +0000
Subject: [PATCH 13/24] [CI] Seperate PR, docker test

---
 .github/workflows/docker-image.yml |   3 +-
 .github/workflows/pull-request.yml | 373 +++++++++++++++++++++++++++++
 Dockerfile                         |   4 +-
 3 files changed, 376 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/pull-request.yml

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index a908924d..70c58a04 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -37,7 +37,6 @@ jobs:
           docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available"
           echo "IMAGE_TAG=torchsim-ci:$GITHUB_SHA" >> $GITHUB_ENV
           echo "GITHUB_SHA=$GITHUB_SHA" >> $GITHUB_ENV
-          echo "GITHUB_REF_NAME=$GITHUB_REF_NAME" >> $GITHUB_ENV
 
           gem5_response_file=/tmp/releases-gem5-latest.json
           response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} )
@@ -67,7 +66,7 @@ jobs:
             GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
             LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
             GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
-            GITHUB_REF_NAME = ${{ env.GITHUB_REF_NAME }}
+            TORCHSIM_SHA=${{ env.GITHUB_SHA }}
           tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG }}
 
   test_add:
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
new file mode 100644
index 00000000..7b558692
--- /dev/null
+++ b/.github/workflows/pull-request.yml
@@ -0,0 +1,373 @@
+name: PR test CI
+
+on:
+  pull_request:
+    branches: [ "master", "develop" ]
+
+jobs:
+  build:
+    runs-on: self-hosted
+
+    permissions:
+      contents: read
+      packages: write
+      attestations: write
+      id-token: write
+
+    steps:
+      # Step 1: Checkout the repository
+      - name: Checkout Code
+        uses: actions/checkout@v4
+      # Step 2: Log in to GitHub Container Registry (optional)
+      # If you need to push the built image, authenticate here.
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # Step 3: Pull the Cached Image
+      - name: Pull Cached Image & Set environment
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        run: |
+          docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available"
+          echo "IMAGE_TAG=torchsim-ci:${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV
+          echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV
+
+          gem5_response_file=/tmp/releases-gem5-latest.json
+          response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} )
+          GEM5_ASSET_ID=$(cat ${gem5_response_file} | jq ".assets[0]."id"")
+          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID"
+          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV
+
+          llvm_response_file=/tmp/releases-gem5-latest.json
+          response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} )
+          LLVM_ASSET_ID=$(cat ${llvm_response_file} | jq ".assets[0]."id"")
+          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID"
+          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV
+
+          mkdir -p /tmp/torchsim-ci/${GITHUB_SHA}
+          echo "DUMP_PATH=/tmp/torchsim-ci/${GITHUB_SHA}"
+
+      # Step 4: Build and Push Docker Image
+      - name: Build and Push Docker Image
+        uses: docker/build-push-action@v4
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          build-args: |
+            GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
+            LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
+            GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
+            TORCHSIM_SHA=${{ env.GITHUB_SHA }}
+          tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG }}
+
+  test_add:
+    name: Run test_add.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_add.py
+        run: |
+          echo "Running test_add.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_add.py
+
+  test_relu:
+    name: Run test_relu.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_relu.py
+        run: |
+          echo "Running test_relu.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_relu.py
+
+  test_batchnorm:
+    name: Run test_batchnorm.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_batchnorm.py
+        run: |
+          echo "Running test_batchnorm.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_batchnorm.py
+
+  test_bmm:
+    name: Run test_bmm.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_bmm.py
+        run: |
+          echo "Running test_bmm.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_bmm.py
+
+  test_cnn:
+    name: Run test_cnn.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_cnn.py
+        run: |
+          echo "Running test_cnn.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_cnn.py
+
+  test_conv2d:
+    name: Run test_conv2d.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_conv2d.py
+        run: |
+          echo "Running test_conv2d.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_conv2d.py
+
+  test_matmul:
+    name: Run test_matmul.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_matmul.py
+        run: |
+          echo "Running test_matmul.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_matmul.py
+
+  test_reduce:
+    name: Run test_reduce.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_reduce.py
+        run: |
+          echo "Running test_reduce.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_reduce.py
+
+  test_softmax:
+    name: Run test_softmax.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_softmax.py
+        run: |
+          echo "Running test_softmax.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_softmax.py
+
+  test_transpose2D:
+    name: Run test_transpose2D.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_transpose2D.py
+        run: |
+          echo "Running test_transpose2D.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose2D.py
+
+  test_view3D_2D:
+    name: Run test_view3D_2D.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_view3D_2D.py
+        run: |
+          echo "Running test_view3D_2D.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_view3D_2D.py
+
+  test_layernorm:
+    name: Run test_layernorm.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_layernorm.py
+        run: |
+          echo "Running test_layernorm.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_layernorm.py
+
+  test_mlp:
+    name: Run test_mlp.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_mlp.py
+        run: |
+          echo "Running test_mlp.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_mlp.py
+
+  test_resnet:
+    name: Run test_resnet.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_resnet.py
+        run: |
+          echo "Running test_resnet.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_resnet.py
+
+  test_transformer:
+    name: Run test_transformer.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_transformer.py
+        run: |
+          echo "Running test_transformer.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transformer.py
+
+  test_transpose3D:
+    name: Run test_transpose3D.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_transpose3D.py
+        run: |
+          echo "Running test_transpose3D.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose3D.py
+
+  test_sparsity:
+    name: Run test_sparsity.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_sparsity.py
+        run: |
+          echo "Running test_sparsity.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_sparsity.py
+
+  test_pool:
+    name: Run test_pool.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_pool.py
+        run: |
+          echo "Running test_pool.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_pool.py
+
+  test_perceptron:
+    name: Run test_perceptron.py
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_single_perceptron.py
+        run: |
+          echo "Running test_single_perceptron.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_single_perceptron.py
+
+  test_fusion:
+    name: Run test_fusion
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_addmm_residual.py
+        run: |
+          echo "Running test_addmm_residual.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py
+      - name: Run test_matmul_activation.py
+        run: |
+          echo "Running test_matmul_activation.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py
+      - name: Run test_matmul_scalar.py
+        run: |
+          echo "Running test_matmul_scalar.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py
+
+  test_moe:
+    name: Run test_moe
+    runs-on: self-hosted
+    needs: build
+    steps:
+      - name: Run test_moe.py
+        run: |
+          echo "Running test_moe.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/MoE/test_moe.py
+
+  test_cleanup:
+    name: Clean test cases
+    runs-on: self-hosted
+    needs: [test_add, test_batchnorm, test_bmm, test_cnn, test_conv2d,
+            test_matmul, test_reduce, test_softmax,
+            test_transpose2D, test_view3D_2D, test_layernorm,
+            test_mlp, test_resnet, test_transformer, test_transpose3D,
+            test_sparsity, test_relu, test_pool, test_perceptron,
+            test_fusion, test_moe]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+      - name: Clean test case
+        run: |
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 530b0152..8e149883 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ FROM ghcr.io/psal-postech/torchsim_base:latest
 ARG GIT_ACCESS_TOKEN
 ARG GEM5_ASSET_ID
 ARG LLVM_ASSET_ID
-ARG GITHUB_REF_NAME
+ARG TORCHSIM_SHA
 ENV PATH $PATH:/root/.local/bin
 ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
 
@@ -61,7 +61,7 @@ RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \
     ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install
 
 # Prepare ONNXim project
-RUN git clone https://${GIT_ACCESS_TOKEN}@github.com/PSAL-POSTECH/PyTorchSim.git && cd PyTorchSim && git checkout ${GITHUB_REF_NAME}
+RUN git clone https://${GIT_ACCESS_TOKEN}@github.com/PSAL-POSTECH/PyTorchSim.git && cd PyTorchSim && git checkout ${TORCHSIM_SHA}
 RUN cd PyTorchSim/PyTorchSimBackend && \
     git submodule update --recursive --init && \
     mkdir -p build && \

From cd724da6239206d0f9ff461e5f9fa534dfc73ab1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 12:45:25 +0000
Subject: [PATCH 14/24] [CI] Docker test on master branch

---
 .github/workflows/docker-image.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 70c58a04..2b420ff8 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -2,9 +2,7 @@ name: Docker Image CI
 
 on:
   push:
-    branches: [ "master", "develop" ]
-  pull_request:
-    branches: [ "master", "develop" ]
+    branches: [ "master" ]
 
 jobs:
   build:

From 0f957082b5765dadb524438834871582fc365438 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 3 Jan 2025 22:13:01 +0900
Subject: [PATCH 15/24] [CI] Update pull-request.yml

---
 .github/workflows/pull-request.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 7b558692..92f5aa7d 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -65,7 +65,7 @@ jobs:
             LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
             GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
             TORCHSIM_SHA=${{ env.GITHUB_SHA }}
-          tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG }}
+          tags: ghcr.io/psal-postech/${GITHUB_SHA}
 
   test_add:
     name: Run test_add.py
@@ -370,4 +370,4 @@ jobs:
         run: |
           docker run --rm \
             -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump
\ No newline at end of file
+            ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump

From 9f893e672333baca119c7d3a427aacc573d5bdb4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 6 Jan 2025 04:18:29 +0000
Subject: [PATCH 16/24] [CI] Fix tag name reference

---
 .github/workflows/pull-request.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 92f5aa7d..003a0d01 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -33,9 +33,9 @@ jobs:
           GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }}
         run: |
           docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available"
-          echo "IMAGE_TAG=torchsim-ci:${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV
+          echo "IMAGE_TAG=torchsim-ci:${GITHUB_SHA}" >> $GITHUB_ENV
           echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV
-
+          echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}"
           gem5_response_file=/tmp/releases-gem5-latest.json
           response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} )
           GEM5_ASSET_ID=$(cat ${gem5_response_file} | jq ".assets[0]."id"")
@@ -65,7 +65,7 @@ jobs:
             LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
             GIT_ACCESS_TOKEN=${{ env.GIT_ACCESS_TOKEN }}
             TORCHSIM_SHA=${{ env.GITHUB_SHA }}
-          tags: ghcr.io/psal-postech/${GITHUB_SHA}
+          tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}}
 
   test_add:
     name: Run test_add.py

From 7df1abe7a69f1ac363147fcb87a42aa8b87663ca Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 3 Jan 2025 11:44:19 +0000
Subject: [PATCH 17/24] [Frontend] GEMM combination mapping

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  2 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  2 +-
 PyTorchSimFrontend/mlir/mlir_template.py      | 25 +++++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 1d640c75..f9c95ae2 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -174,7 +174,7 @@ def render(self,
         M = self.gemm_input_shape[2] * self.gemm_input_shape[3]
         N = self.gemm_weight_shape[0]
         K = self.gemm_weight_shape[1]
-        TILE_M, TILE_N, TILE_K = kernel.gemmini_gemm_mapping(M, N, K)
+        TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
         kernel.tile_size = [TILE_M, TILE_N, TILE_K]
         kernel.loop_size = [M, N, K]
 
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 63708810..afd0d9d6 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -110,7 +110,7 @@ def render(self,
             TILE_M, TILE_N, TILE_K = 0, 0, 0
             template = EMPTY_TEMPLATE
         else:
-            TILE_M, TILE_N, TILE_K = kernel.gemmini_gemm_mapping(M, N, K)
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K)
             template = GEMM_TEMPLATE
         kernel.tile_size = [TILE_M, TILE_N, TILE_K]
         kernel.loop_size =[M, N, K]
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 82e39c47..20db65af 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -106,6 +106,31 @@ def gemmini_gemm_mapping(self, M, N, K):
 
         return inner_I, inner_J, inner_K
 
+    def gemm_combination_mapping(self, M, N, K):
+        spad_size = self.spad_info["spad_size"] * self.vector_lane
+        max_spad_size = spad_size // 2 # double buffer
+        M_padded = ((M + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
+        N_padded = ((N + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
+        K_padded = ((K + self.vector_lane - 1) // self.vector_lane) * self.vector_lane
+
+        max_used_spad_size = 0
+        mapping = (self.vector_lane, self.vector_lane, self.vector_lane)
+        for tile_M in range(self.vector_lane, M_padded + 1, self.vector_lane):
+            for tile_N in range(self.vector_lane, N_padded + 1, self.vector_lane):
+                for tile_K in range(self.vector_lane, K_padded + 1, self.vector_lane):
+                    used_spad_size = (tile_M * tile_K + tile_K * tile_N + tile_M * tile_N) * self.precision
+                    if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size:
+                        max_used_spad_size = used_spad_size
+                        mapping = (tile_M, tile_N, tile_K)
+
+        Outer_M = math.ceil(M_padded / mapping[0])
+        Outer_N = math.ceil(N_padded / mapping[1])
+        Outer_K = math.ceil(K_padded / mapping[2])
+
+        # split mapping equally to avoid unnecessary padding
+        mapping = (M_padded // Outer_M, N_padded // Outer_N, K_padded // Outer_K)
+        return mapping
+
     def meta_kernel(self):
         wrapper = V.graph.wrapper_code
         arg_attributes = self.kernel_arg_attributes

From ff66ded515d8653cd33abc3ea4a73dfbaf8a9ef4 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Fri, 3 Jan 2025 11:45:11 +0000
Subject: [PATCH 18/24] [Frontend] CONV bias & test case

---
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  5 +++--
 tests/test_conv2d.py                          | 20 ++++++++++---------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index f9c95ae2..edc5bf0a 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -60,7 +60,7 @@
 def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }}{% endif %}, {{ OUT }}):
     {{ INPUT }}_cpu = {{ INPUT }}.cpu()
     {{ WEIGHT }}_cpu = {{ WEIGHT }}.cpu(){% if BIAS %}
-    {{ BIAS }}_cpu = {{ BIAS }}.cpu(){% endif %} #FIXME: BIAS is not used in the current implementation
+    {{ BIAS }}_cpu = {{ BIAS }}.cpu(){% endif %}
     {{ OUT }}_cpu = {{ OUT }}.cpu()
 
     # Torch support NCHW, so we need to transpose for now
@@ -111,7 +111,8 @@ def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}{% if BIAS %}, {{ BIAS }}{% endif %
             {% endif %}
 
     {{ OUT }}_cpu = {{ OUT }}_cpu.reshape(output_shape)
-    {{ OUT }}_cpu = {{ OUT }}_cpu.permute(0, 3, 1, 2)
+    {{ OUT }}_cpu = {{ OUT }}_cpu.permute(0, 3, 1, 2){% if BIAS %}
+    {{ OUT }}_cpu += {{ BIAS }}_cpu.reshape(-1, 1, 1) #TODO: BIAS should be added in the kernel{% endif %}
     {{ OUT }}.copy_({{ OUT }}_cpu)
 """
 
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 10f47faa..29924156 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -13,20 +13,22 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-def test_conv2d(device):
-    def custom_conv2d(a, b):
+def test_conv2d(device, batch_size=1, in_channels=8, out_channels=16, input_size=64, kernel_size=3, stride=1, padding=0):
+    def custom_conv2d(a, b, bias):
         i_c = a.shape[1]
         o_c = b.shape[0]
-        conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=1, padding=0, dilation=1)
+        conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1)
         conv2d.weight = torch.nn.Parameter(b)
+        conv2d.bias = torch.nn.Parameter(bias)
         return conv2d(a)
     torch.manual_seed(0)
-    conv_input = torch.randn(1, 8, 64, 64).to(device=device)
-    conv_kernel = torch.randn(16, 8, 3, 3).to(device=device)
+    conv_input = torch.randn(batch_size, in_channels, input_size, input_size).to(memory_format=torch.channels_last, device=device)
+    conv_kernel = torch.randn(out_channels, in_channels, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device)
+    conv_bias = torch.randn(out_channels).to(device=device)
     opt_fn = torch.compile(dynamic=False)(custom_conv2d)
-    res = opt_fn(conv_input, conv_kernel)
-    out = custom_conv2d(conv_input.cpu(), conv_kernel.cpu())
-    test_result("Conv2d Forward", res, out, rtol=1e-1, atol=1e-1)
+    res = opt_fn(conv_input, conv_kernel, conv_bias)
+    out = custom_conv2d(conv_input.cpu(), conv_kernel.cpu(), conv_bias.cpu())
+    test_result("Conv2d Forward", res, out, rtol=1e-3, atol=1e-3)
     print("Max diff > ", torch.max(torch.abs(res.cpu() - out)))
 
 if __name__ == "__main__":
@@ -37,4 +39,4 @@ def custom_conv2d(a, b):
     from Scheduler.scheduler import ExecutionEngine
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
-    test_conv2d(device)
+    test_conv2d(device, batch_size=1, in_channels=128, out_channels=128, input_size=28, kernel_size=3, stride=1, padding=1)

From 8f11838d32bda16471b853f9b9a41e8dd04053d2 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 6 Jan 2025 07:38:00 +0000
Subject: [PATCH 19/24] [CI] Add exit when test failed to MoE, MLP

---
 tests/MLP/test_mlp.py | 128 +-----------------------------------------
 tests/MoE/test_moe.py | 127 +----------------------------------------
 2 files changed, 2 insertions(+), 253 deletions(-)

diff --git a/tests/MLP/test_mlp.py b/tests/MLP/test_mlp.py
index 5927502d..6f6c9444 100644
--- a/tests/MLP/test_mlp.py
+++ b/tests/MLP/test_mlp.py
@@ -21,133 +21,6 @@
 
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-try:
-    from PyTorchSimFrontend.mlir.mlir_codegen_backend import (
-        MLIRScheduling,
-        ExtensionWrapperCodegen,
-    )
-except ImportError:
-    from PyTorchSimFrontend.mlir.mlir_codegen_backend import (
-        MLIRScheduling,
-        ExtensionWrapperCodegen,
-    )
-
-from torch._C import FileCheck
-from torch._inductor import metrics
-from torch._inductor.codegen.common import (
-    get_scheduling_for_device,
-    get_wrapper_codegen_for_device,
-    register_backend_for_device,
-)
-from torch.testing._internal.common_utils import IS_MACOS
-from torch.testing._internal.common_utils import TestCase as TorchTestCase
-
-
-def remove_build_path():
-    if sys.platform == "win32":
-        # Not wiping extensions build folder because Windows
-        return
-    default_build_root = torch.utils.cpp_extension.get_default_build_root()
-    if os.path.exists(default_build_root):
-        shutil.rmtree(default_build_root, ignore_errors=True)
-
-class TestCase(TorchTestCase):
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls._stack = contextlib.ExitStack()
-        cls._stack.enter_context(
-            config.patch(
-                {
-                    "debug": True,
-                    "debug_index_asserts": True,
-                    "cpp.min_chunk_size": 1,
-                    "triton.autotune_pointwise": False,  # too slow
-                    "implicit_fallbacks": False,
-                    "generate_intermediate_hooks": True,
-                }
-            )
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        cls._stack.close()
-        super().tearDownClass()
-
-    def setUp(self):
-        torch._dynamo.reset()
-        torch._inductor.metrics.reset()
-        super().setUp()
-        self._start = time.perf_counter()
-
-    def tearDown(self):
-        super().tearDown()
-        torch._dynamo.reset()
-        if os.environ.get("ERROR_ON_SLOW") == "1":
-            elapsed = time.perf_counter() - self._start
-            assert elapsed < 120
-
-
-class ExtensionBackendTests(TestCase):
-    module = None
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        # Build Extension
-        remove_build_path()
-        source_file_path = os.path.dirname(os.path.abspath(__file__))
-        source_file = os.path.join(
-            source_file_path, "PyTorchSimFrontend/extension_device.cpp"
-        )
-        cls.module = torch.utils.cpp_extension.load(
-            name="extension_device",
-            sources=[
-                str(source_file),
-            ],
-            extra_cflags=["-g"],
-            verbose=True,
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        cls._stack.close()
-        super().tearDownClass()
-
-        remove_build_path()
-
-    def setUp(self):
-        torch._dynamo.reset()
-        super().setUp()
-
-        # cpp extensions use relative paths. Those paths are relative to
-        # this file, so we'll change the working directory temporarily
-        self.old_working_dir = os.getcwd()
-        os.chdir(os.path.dirname(os.path.abspath(__file__)))
-        assert self.module is not None
-
-    def tearDown(self):
-        super().tearDown()
-        torch._dynamo.reset()
-
-        # return the working directory (see setUp)
-        os.chdir(self.old_working_dir)
-
-    def test_open_device_registration(self):
-        torch.utils.rename_privateuse1_backend("extension_device")
-
-        register_backend_for_device(
-            "extension_device", MLIRScheduling, ExtensionWrapperCodegen
-        )
-        self.assertTrue(
-            get_scheduling_for_device("extension_device") == MLIRScheduling
-        )
-        self.assertTrue(
-            get_wrapper_codegen_for_device("extension_device")
-            == ExtensionWrapperCodegen
-        )
-
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     pass_message = f"|{name} Test Passed|"
     fail_message = f"|{name} Test Failed|"
@@ -162,6 +35,7 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
 
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
+        exit(1)
 
 class MLP(nn.Module):
     def __init__(self, input_size, output_size, hidden_size):
diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py
index a7ab630b..ff6dd00b 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/MoE/test_moe.py
@@ -22,132 +22,6 @@
 
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-try:
-    from PyTorchSimFrontend.mlir.mlir_codegen_backend import (
-        MLIRScheduling,
-        ExtensionWrapperCodegen,
-    )
-except ImportError:
-    from .PyTorchSimFrontend.mlir.mlir_codegen_backend import (
-        MLIRScheduling,
-        ExtensionWrapperCodegen,
-    )
-
-from torch._C import FileCheck
-from torch._inductor import metrics
-from torch._inductor.codegen.common import (
-    get_scheduling_for_device,
-    get_wrapper_codegen_for_device,
-    register_backend_for_device,
-)
-from torch.testing._internal.common_utils import IS_MACOS
-from torch.testing._internal.common_utils import TestCase as TorchTestCase
-
-def remove_build_path():
-    if sys.platform == "win32":
-        # Not wiping extensions build folder because Windows
-        return
-    default_build_root = torch.utils.cpp_extension.get_default_build_root()
-    if os.path.exists(default_build_root):
-        shutil.rmtree(default_build_root, ignore_errors=True)
-
-class TestCase(TorchTestCase):
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls._stack = contextlib.ExitStack()
-        cls._stack.enter_context(
-            config.patch(
-                {
-                    "debug": True,
-                    "debug_index_asserts": True,
-                    "cpp.min_chunk_size": 1,
-                    "triton.autotune_pointwise": False,  # too slow
-                    "implicit_fallbacks": False,
-                    "generate_intermediate_hooks": True,
-                }
-            )
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        cls._stack.close()
-        super().tearDownClass()
-
-    def setUp(self):
-        torch._dynamo.reset()
-        torch._inductor.metrics.reset()
-        super().setUp()
-        self._start = time.perf_counter()
-
-    def tearDown(self):
-        super().tearDown()
-        torch._dynamo.reset()
-        if os.environ.get("ERROR_ON_SLOW") == "1":
-            elapsed = time.perf_counter() - self._start
-            assert elapsed < 120
-
-
-class ExtensionBackendTests(TestCase):
-    module = None
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        # Build Extension
-        remove_build_path()
-        source_file_path = os.path.dirname(os.path.abspath(__file__))
-        source_file = os.path.join(
-            source_file_path, "PyTorchSimFrontend/extension_device.cpp"
-        )
-        cls.module = torch.utils.cpp_extension.load(
-            name="extension_device",
-            sources=[
-                str(source_file),
-            ],
-            extra_cflags=["-g"],
-            verbose=True,
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        cls._stack.close()
-        super().tearDownClass()
-
-        remove_build_path()
-
-    def setUp(self):
-        torch._dynamo.reset()
-        super().setUp()
-
-        # cpp extensions use relative paths. Those paths are relative to
-        # this file, so we'll change the working directory temporarily
-        self.old_working_dir = os.getcwd()
-        os.chdir(os.path.dirname(os.path.abspath(__file__)))
-        assert self.module is not None
-
-    def tearDown(self):
-        super().tearDown()
-        torch._dynamo.reset()
-
-        # return the working directory (see setUp)
-        os.chdir(self.old_working_dir)
-
-    def test_open_device_registration(self):
-        torch.utils.rename_privateuse1_backend("extension_device")
-
-        register_backend_for_device(
-            "extension_device", MLIRScheduling, ExtensionWrapperCodegen
-        )
-        self.assertTrue(
-            get_scheduling_for_device("extension_device") == MLIRScheduling
-        )
-        self.assertTrue(
-            get_wrapper_codegen_for_device("extension_device")
-            == ExtensionWrapperCodegen
-        )
-
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     pass_message = f"|{name} Test Passed|"
     fail_message = f"|{name} Test Failed|"
@@ -162,6 +36,7 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
 
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
+        exit(1)
 
 class SparseDispatcher(object):
     """Helper for implementing a mixture of experts.

From 7c33c7eab708f29bf5666bc79d1a62fabee92e85 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Tue, 7 Jan 2025 04:33:11 +0000
Subject: [PATCH 20/24] [Fix] #91 MVOUT coarse-grained DMA

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 +-----
 PyTorchSimFrontend/mlir/mlir_conv_template.py   | 2 +-
 PyTorchSimFrontend/mlir/mlir_gemm_template.py   | 2 +-
 PyTorchSimFrontend/mlir/mlir_template.py        | 4 ++--
 4 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 1f3f74ee..140ba3fd 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -815,7 +815,7 @@ def store_reduction(self, name, index, value):
         code = f"affine.dma_start %{buffer}[%c0, %c0], %{var}[{prefix}{indices}], %{name}_tag[0], %c{dmaType}, %c{mm_stride}, %c{chunk} : memref<{tile_row}x{tile_col}x{type_name}, 1>, memref<{self.buffer_types[name][1]}x{type_name}>, memref<1xi32>"
         self.cse.generate(self.reductions_suffix, code, assignment = False)
 
-    def codegen_body(self, subtile):
+    def codegen_body(self):
         # if not (
         #     self.loads
         #     or self.stores
@@ -835,10 +835,6 @@ def template_store(options):
         self.body.splice(self.compute)
         if len(self.stores._lines) == 0:
             template_store(self.render_options)
-        if subtile:
-            for i in range(len(self.stores._lines)):
-                if "affine.dma_start" in self.stores._lines[i]:
-                    self.stores._lines[i] += f" {{ subtile_size=[{self.vector_lane}, {self.vector_lane}], async=1 }}"
         self.body.splice(self.stores)
         self.loads.clear()
         self.compute.clear()
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index edc5bf0a..3f52a61d 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -48,7 +48,7 @@
             linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                     outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
-        affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ M * N }}xf32>, memref<1xi32>  { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
+        affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ M * N }}xf32>, memref<1xi32>  { async=1 }
     } { outer_loop=true }
   } { outer_loop=true }
   return
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index afd0d9d6..954059c0 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -62,7 +62,7 @@
         linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                 outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
       } { accumulation_loop=true }
-      {{kernel.store_output(subtile=True)}}
+      {{kernel.store_output()}}
     } { outer_loop=true }
   } { outer_loop=true }
   return
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 20db65af..63d00dab 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -207,9 +207,9 @@ def hook():
         self.render_hooks["<OUPUT>"] = hook
         return "<OUPUT>"
 
-    def store_output(self, subtile=False):
+    def store_output(self):
         def hook():
-            self.codegen_body(subtile=subtile)
+            self.codegen_body()
             return textwrap.indent(self.body.getvalue(), "      ").strip()  #TODO: First line is not indented
 
         assert "<STORE_OUTPUT>" not in self.render_hooks

From 4086a47a2c3bbf544b834489451569baca76a317 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 8 Jan 2025 11:33:55 +0000
Subject: [PATCH 21/24] [Fix] BMM coarse-grained MVOUT

---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py | 2 +-
 tests/test_bmm.py                            | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 70dc796b..cd99d52e 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -61,7 +61,7 @@
           linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
                   outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>)
         } { accumulation_loop=true }
-        affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ B * M * N }}xf32>, memref<1xi32> { subtile_size=[{{ kernel.vector_lane }}, {{ kernel.vector_lane }}], async=1 }
+        affine.dma_start %Y_buffer[%c0, %c0], %Y[%index2], %tag[0], %c_mvout, %N, %c_set : memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>, memref<{{ B * M * N }}xf32>, memref<1xi32> { async=1 }
       } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
diff --git a/tests/test_bmm.py b/tests/test_bmm.py
index 1e0eab59..483980d8 100644
--- a/tests/test_bmm.py
+++ b/tests/test_bmm.py
@@ -13,12 +13,12 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("cpu out: ", cpu_out)
         exit(1)
 
-def test_BMM(device):
+def test_BMM(device, batch_size=1, m=32, n=16, k=64):
     def bmm(a, b):
         return torch.bmm(a, b.transpose(1, 2))
     torch.manual_seed(0)
-    a = torch.randn(1, 32, 64).to(device=device)
-    b = torch.randn(1, 16, 64).to(device=device)
+    a = torch.randn(batch_size, m, k).to(device=device)
+    b = torch.randn(batch_size, n, k).to(device=device)
     opt_fn = torch.compile(dynamic=False)(bmm)
     res = opt_fn(a, b)
     out = bmm(a.cpu(), b.cpu())
@@ -33,3 +33,4 @@ def bmm(a, b):
     module = ExecutionEngine.setup_device()
     device = module.custom_device()
     test_BMM(device)
+    test_BMM(device, 2, 512, 512, 512)

From d06d1974951a0f03ecacbab83c29f095be5213f7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 9 Jan 2025 01:30:50 +0000
Subject: [PATCH 22/24] [Frontend] Make operator return type #96

Currently, we don't tracking the all variables type. Return type
could be diverse accroding to operations. Now, opeations should
return it return type. So every type of variables(SSA form) are
tracking and we can support automatic type casting and broadcasting
---
 .../llvm/llvm_codegen_backend.py              |   2 +-
 .../mlir/mlir_codegen_backend.py              | 524 +++++++++++++++---
 PyTorchSimFrontend/mlir/mlir_common.py        |  36 +-
 3 files changed, 440 insertions(+), 122 deletions(-)

diff --git a/PyTorchSimFrontend/llvm/llvm_codegen_backend.py b/PyTorchSimFrontend/llvm/llvm_codegen_backend.py
index e8daa889..6951b5bd 100644
--- a/PyTorchSimFrontend/llvm/llvm_codegen_backend.py
+++ b/PyTorchSimFrontend/llvm/llvm_codegen_backend.py
@@ -212,7 +212,7 @@ def maximum(operand1, operand2, tile_size=4):
 
     @staticmethod
     def relu(x, tile_size=4):
-        return ops.maximum(x, ops.constant(0.0, torch.int32))
+        return ops.maximum(x, ops.constant(0.0, "f32"))
 
 SYMPY_TO_LLVM = {
     sympy.core.mul.Mul: "mul",
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 140ba3fd..0bbda189 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -107,53 +107,131 @@ def write_header(self):
         )
 
 class ExtensionOverrides(common.OpOverrides):
+    # Binary element wise operations
     @staticmethod
-    def add(operand1, operand2, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.add{dtype[0]} %{operand1}, %{operand2} : {shape}'
+    def binary_elementwise_common(operand1, operand2, var_info):
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
+        # Tile size check
+        if op_type1[0] != op_type2[0]:
+            # Try to broad cast
+            lhs_tile_size, lhs_dtype = op_type1
+            rhs_tile_size, rhs_dtype = op_type2
+            if lhs_tile_size > rhs_tile_size:
+                operand2 = ops.broadcast(operand2, operand1, var_info=var_info)
+                op_type2 = var_info[operand2]
+            elif lhs_tile_size < rhs_tile_size:
+                operand1 = ops.broadcast(operand1, operand2, var_info=var_info)
+                op_type1 = var_info[operand1]
+
+        # Data type check
+        if op_type1[1] != op_type2[1]:
+            if op_type1[1] == "index" or op_type1 == "index":
+                if op_type1[1] == "index":
+                    operand1 = ops.index_cast(operand1, op_type2[1], var_info)
+                    op_type1 = var_info[operand1]
+                if op_type2[1] == "index":
+                    operand2 = ops.index_cast(operand2, op_type1[1], var_info)
+                    op_type2 = var_info[operand2]
+            elif op_type1[1][0] == "i" and op_type2[1][0] == "f":
+                operand1 = ops.to_dtype(operand1, op_type2[1], var_info)
+                op_type1 = var_info[operand1]
+            elif op_type1[1][0] == "f" and op_type2[1][0] == "i":
+                operand2 = ops.to_dtype(operand2, op_type1[1], var_info)
+                op_type2 = var_info[operand2]
+            else:
+                raise NotImplementedError("Unsupported type converting")
+
+        # Updated var info
+        tile_size = op_type1[0]
+        ret_type = op_type1[1]
+        return tile_size, ret_type, operand1, operand2
 
     @staticmethod
-    def sub(operand1, operand2, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.sub{dtype[0]} %{operand1}, %{operand2} : {shape}'
+    def add(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        opcode = f'arith.add{ret_type[0]}'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def mul(operand1, operand2, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.mul{dtype[0]} %{operand1}, %{operand2} : {shape}'
+    def sub(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        opcode = f'arith.sub{ret_type[0]}'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def div(operand1, operand2, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.div{dtype[0]} %{operand1}, %{operand2} : {shape}'
+    def mul(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        opcode = f'arith.mul{ret_type[0]}'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def truediv(operand1, operand2, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.div{dtype[0]} %{operand1}, %{operand2} : {shape}'
+    def div(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        if ret_type[0] == "f":
+            opcode = f'arith.divf'
+        else:
+            opcode = f'arith.divui'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def truediv(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        if ret_type[0] == "f":
+            opcode = f'arith.divf'
+        else:
+            opcode = f'arith.divui'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def to_dtype(x, dst_type, src_dtype=None, tile_size=16, dtype="f32"):
-        mlir_dtype = mlir_common.DTYPE_TO_MLIR[dst_type]
-        src_mlir_dtype = mlir_common.DTYPE_TO_MLIR[src_dtype]
+    def minimum(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        if ret_type[0] == "f":
+            opcode = f'arith.minimumf'
+        else:
+            opcode = f'arith.minimumui'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
-        dst_bits = 1 if dst_type == torch.bool else torch.finfo(dst_type).bits if dst_type.is_floating_point else torch.iinfo(dst_type).bits
-        src_bits = 1 if src_dtype == torch.bool else torch.finfo(src_dtype).bits if src_dtype.is_floating_point else torch.iinfo(src_dtype).bits
-        shape = f"vector<{tile_size}x{mlir_dtype}>" if tile_size > 1 else mlir_dtype
+    @staticmethod
+    def maximum(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        if ret_type[0] == "f":
+            opcode = f'arith.maximumf'
+        else:
+            opcode = f'arith.maximumui'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def to_dtype(operand, dst_mlir_dtype, *args, var_info=None):
+        src_mlir_dtype = var_info[operand][1]
+        tile_size = var_info[operand][0]
+
+        dst_bits = int(dst_mlir_dtype[1:])
+        src_bits = int(src_mlir_dtype[1:])
+        shape = f"vector<{tile_size}x{dst_mlir_dtype}>" if tile_size > 1 else dst_mlir_dtype
         src_shape = f"vector<{tile_size}x{src_mlir_dtype}>" if tile_size > 1 else src_mlir_dtype
-        if dst_type.is_floating_point and not src_dtype.is_floating_point:
+        if dst_mlir_dtype[0] == "i" and src_mlir_dtype[0] == "f":
             raise NotImplementedError("floating point to integer conversion")
-        elif not dst_type.is_floating_point and src_dtype.is_floating_point:
+        if dst_mlir_dtype[0] == "f" and src_mlir_dtype[0] == "i":
             raise NotImplementedError("integer to floating point conversion")
         else:
             if dst_bits > src_bits:
-                return f"arith.extui %{x} : {src_shape} to {shape}"
+                return f"arith.extui %{operand} : {src_shape} to {shape}"
             elif dst_bits < src_bits:
-                return f"arith.trunc %{x} : {src_shape} to {shape}"
+                return f"arith.trunc %{operand} : {src_shape} to {shape}"
 
     @staticmethod
-    def constant(value, src_type, tile_size=16, dtype="f32"):
-        src_type = mlir_common.DTYPE_TO_MLIR[src_type]
+    def constant(value, src_type, *args, var_info=None):
+        if isinstance(src_type, torch.dtype):
+            src_type = mlir_common.DTYPE_TO_MLIR[src_type]
+
         # if value represented by e notation, convert to float (ex 1e-3 -> 1.0e-3)
         if "e" in str(value):
             value = float(value)
@@ -161,89 +239,346 @@ def constant(value, src_type, tile_size=16, dtype="f32"):
             value = format(value, ".20f")
         if src_type[0] == "i":
             value = int(value)
-        return f'arith.constant {value} : {src_type}'
+        return f'arith.constant {value} : {src_type}', [1, src_type]
 
+    # transcendental functions
     @staticmethod
-    def exp(operand, tile_size=16, dtype="f32"):
+    def exp(operand, *args, var_info=None):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.exp %{operand} : {shape}'
+        return f'math.exp %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def maximum(operand1, operand2, tile_size=16, dtype="f32"):
+    def sqrt(operand, *args, var_info=None):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype[0] != "f":
+            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            var_info[operand] = dtype
+
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.maximum{dtype[0]} %{operand1}, %{operand2} : {shape}'
+        return f'math.sqrt %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def sqrt(x, tile_size=16, dtype="f32"):
+    def rsqrt(operand, *args, var_info=None):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype[0] != "f":
+            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            var_info[operand] = dtype
+
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.sqrt %{x} : {shape}'
+        return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def ne(operand1, operand2, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else "i1"
-        return f'arith.cmp{dtype[0]} one, %{operand1}, %{operand2} : {shape}'
+    def pow(operand1, operand2, *args, var_info=None):
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
+
+        # Type check & auto cast
+        if op_type1[1][0] != "f":
+            operand1, dtype = ops.to_dtype(operand1, "f32", var_info=var_info)
+            var_info[operand1] = dtype
+
+        # Type check & auto cast
+        if op_type2[1][0] != "f":
+            operand2, dtype = ops.to_dtype(operand2, "f32", var_info=var_info)
+            var_info[operand2] = dtype
+
+        op_type1 = var_info[operand1]
+        tile_size = op_type1[0]
+        dtype = op_type1[1]
+
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f"math.pow{dtype[0]} %{operand1}, %{operand2} : {shape}", []
 
     @staticmethod
-    def lt(operand1, operand2, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else "i1"
-        return f'arith.cmp{dtype[0]} olt, %{operand1}, %{operand2} : {shape}'
+    def log(operand, *args, var_info=None):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype[0] != "f":
+            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            var_info[operand] = dtype
+
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.log %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def gt(operand1, operand2, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else "i1"
-        return f'arith.cmp{dtype[0]} ogt, %{operand1}, %{operand2} : {shape}'
+    def reciprocal(operand, *args, var_info):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype[0] != "f":
+            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            var_info[operand] = dtype
 
+        return ops.div(ops.constant(1.0, dtype), operand), [tile_size, dtype]
+
+    # Logical operations
     @staticmethod
-    def le(operand1, operand2, tile_size=16, dtype="f32"):
+    def neg(operand, *args, var_info=None):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype[0] != "f":
+            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            var_info[operand] = dtype
+
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.cmp{dtype[0]} ole, %{operand1}, %{operand2} : {shape}'
+        return f'arith.negf %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def relu(x, tile_size=16, dtype=None):
-        return ops.maximum(x, ops.constant(0.0, torch.float32))
+    def eq(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "oeq"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "eq"
+        else:
+            raise ValueError(f"Unsupported data type for 'eq' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def sigmoid(x, tile_size=16, dtype=None):
-        one = ops.constant(1, torch.float32)
-        return ops.truediv(one, ops.add(one, ops.exp(ops.neg(x))))
+    def ne(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "one"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "sne"
+        else:
+            raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def neg(x, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.neg{dtype[0]} %{x} : {shape}'
+    def lt(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "olt"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "slt"
+        else:
+            raise ValueError(f"Unsupported data type for 'lt' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def where(condition, x, y, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        cond_shape = f"vector<{tile_size}xi1>," if tile_size > 1 else ""
-        return f"arith.select %{condition}, %{x}, %{y} : {cond_shape} {shape}"
+    def gt(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "ogt"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "sgt"
+        else:
+            raise ValueError(f"Unsupported data type for 'gt' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def logical_not(operand, tile_size=16, dtype="f32"):
-        tile_size=16
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        result_shape = f"vector<{tile_size}xi1>" if tile_size > 1 else "i1"
+    def le(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "ole"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "sle"
+        else:
+            raise ValueError(f"Unsupported data type for 'le' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+
+    @staticmethod
+    def ge(operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "oge"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "sge"
+        else:
+            raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+
+    @staticmethod
+    def and_(operand1, operand2, *args, var_info=None):
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
+
+        # Type check & auto cast
+        if op_type1[1][0] != "i":
+            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            var_info[operand1] = dtype
+
+        # Type check & auto cast
+        if op_type2[1][0] != "i":
+            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            var_info[operand2] = dtype
+
+        ret_type = op_type1[1]
+        tile_size = op_type1[0]
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'arith.andi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def or_(operand1, operand2, *args, var_info=None):
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
+
+        # Type check & auto cast
+        if op_type1[1][0] != "i":
+            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            var_info[operand1] = dtype
+
+        # Type check & auto cast
+        if op_type2[1][0] != "i":
+            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            var_info[operand2] = dtype
+
+        ret_type = op_type1[1]
+        tile_size = op_type1[0]
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def xor(operand1, operand2, *args, var_info=None):
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
+
+        # Type check & auto cast
+        if op_type1[1][0] != "i":
+            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            var_info[operand1] = dtype
+
+        # Type check & auto cast
+        if op_type2[1][0] != "i":
+            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            var_info[operand2] = dtype
+
+        ret_type = op_type1[1]
+        tile_size = op_type1[0]
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+
+    @staticmethod
+    def logical_and(operand, *args, var_info=None):
+        raise NotImplementedError("logical_and")
+
+    @staticmethod
+    def logical_not(operand, *args, var_info=None):
         raise NotImplementedError("logical_not")
-        return f"arith.cmp{dtype[0]} oeq, %{operand}, %zero_vec{tile_size} : {shape} -> {result_shape}"
 
     @staticmethod
-    def rsqrt(x, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.rsqrt %{x} : {shape}'
+    def logical_or(operand, *args, var_info=None):
+        raise NotImplementedError("logical_not")
 
     @staticmethod
-    def pow(a, b, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f"math.pow{dtype[0]} %{a}, %{b} : {shape}"
+    def logical_xor(operand, *args, var_info=None):
+        raise NotImplementedError("logical_not")
 
     @staticmethod
-    def log(x, tile_size=16, dtype="f32"):
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.log %{x} : {shape}'
+    def relu(operand, *args, var_info=None):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        ret_type = "f32"
+        return ops.maximum(operand, ops.constant(0.0, "f32")), [tile_size, ret_type]
+
+    @staticmethod
+    def sigmoid(operand, *args, var_info=None):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        ret_type = "f32"
+        one = ops.constant(1, "f32")
+        return ops.truediv(one, ops.add(one, ops.exp(ops.neg(operand)))), [tile_size, ret_type]
+
+    # Special operaitons
+    @staticmethod
+    def where(condition, operand1, operand2, *args, var_info=None):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        cond_type = var_info[condition]
+        if cond_type[0] != tile_size:
+            condition = ops.broadcast(condition, operand1, var_info=var_info)
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        cond_shape = f"vector<{tile_size}xi1>," if tile_size > 1 else ""
+        return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape} {shape}", [tile_size, ret_type]
+
 
     @staticmethod
-    def reciprocal(a, tile_size=16, dtype="f32"):
-        return ops.div(ops.constant(1.0, torch.float32), a)
+    def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False):
+        result = body()
+        val = ops.constant(0.0, "f32")
+        result = ops.where(mask, result, val)
+        return result, var_info[result]
+
+    @staticmethod
+    def _index_expr(operand, *args, var_info=None, **kwargs):
+        symbols = sorted(operand.free_symbols)
+        renamed_symbols = {symbol: sympy.Symbol(f"d{i}") for i, symbol in enumerate(symbols)}
+
+        renamed_expression = operand.subs(renamed_symbols)
+
+        affine_map_str = "(" + ", ".join([f"d{i}" for i in range(len(symbols))]) + ") -> ("
+        affine_map_str += sympy.printing.ccode(renamed_expression) + ")"
+
+        map_operands = [f"%{str(symbol)}" for symbol in symbols]
+        return f"affine.apply affine_map<{affine_map_str}>({', '.join(map_operands)})", [1, "index"]
+
+    @staticmethod
+    def index_expr(operand, *args, var_info=None, **kwargs):
+        result = ops._index_expr(operand)
+        ret_type = [1, "index"]
+        return result, ret_type
+
+    @staticmethod
+    def index_cast(operand, target_type, *args, var_info=None, **kwrags):
+        return f"arith.index_cast %{operand} : index to {target_type}", [1, target_type]
+
+    @staticmethod
+    def broadcast(operand1, operand2, *args, var_info=None):
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
+        src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>" if op_type1[0] > 1 else op_type1[1]
+        des_shape = f"vector<{op_type2[0]}x{op_type1[1]}>" if op_type2[0] > 1 else op_type1[1] # Use tile size only
+        expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}"
+        return expand, op_type2
 
 RTYPE_TO_MLIR = {
     "sum": "add",
@@ -325,7 +660,7 @@ def __init__(self):
         self.reduction_suffix = IndentedBuffer()
         self.body = IndentedBuffer()
         self.global_vars = IndentedBuffer()
-        self.global_vars_set = set()
+        self.global_vars_dict = dict()
         self.header = IndentedBuffer()
         self.gem5_header = IndentedBuffer()
         self.reduction_vars = {}
@@ -563,7 +898,8 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 else ""
         line = f"{operation} %{buffer}[0, 0] : memref<{self.render_options['TILE_M']}x{self.render_options['TILE_N']}x{type_name}, 1>{shape}"
         out = self.cse.generate(self.loads, line)
-        self.tile_info[out] = tile_size_per_lane, dtype
+        var_info = [tile_size_per_lane, mlir_common.DTYPE_TO_MLIR[dtype]]
+        self.register_var_info(out, var_info)
         return out
 
     def load(self, name: str, index: sympy.Expr):
@@ -582,7 +918,7 @@ def load(self, name: str, index: sympy.Expr):
         dram_tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
         # Define scratch pad buffer
-        buffer, indices = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, dram_tile_shape, self.loads, indices)
+        buffer, indices = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, dram_tile_shape, self.loads, indices, index)
         # MVIN Encoding
         dma_key = (stride, chunk, dtype)
         if dma_key in self.dma_cache:
@@ -604,7 +940,8 @@ def load(self, name: str, index: sympy.Expr):
         shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 else ""
         line = f"{operation} %{buffer}[0, 0] : memref<{dram_tile_shape}x{type_name}, 1>{shape}"
         out = self.cse.generate(self.loads, line)
-        self.tile_info[out] = tile_size_per_lane, dtype
+        var_info = [tile_size_per_lane, mlir_common.DTYPE_TO_MLIR[dtype]]
+        self.register_var_info(out, var_info)
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
@@ -625,7 +962,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             buffer = self.buffer_names[name]
         else:
             dram_tile_shape = f"{self.render_options['TILE_M']}x{self.render_options['TILE_N']}"
-            buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.stores, indices)
+            buffer, indices = self.get_scratchpad_buffer(dtype, name, self.render_options['TILE_M'], self.render_options['TILE_N'], dram_tile_shape, self.stores, indices, index)
             self.buffer_names[name] = buffer
 
         tile_size_per_lane = self.render_options['TILE_M'] * self.render_options['TILE_N'] // self.vector_lane
@@ -655,7 +992,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dram_tile_shape = f"{tile_shape[0]}x{tile_shape[1]}"
 
         # Define scratch pad buffer
-        buffer, indices = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, dram_tile_shape, self.stores, indices)
+        buffer, indices = self.get_scratchpad_buffer(dtype, name, self.tile_desc.n_row, self.tile_desc.n_col, dram_tile_shape, self.stores, indices, index)
 
         # MVOUT Encoding
         dmaType = 3 # MVIN 2, MVIN2 1, MVIN3 14, MVOUT 3
@@ -663,7 +1000,7 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.consts.add(stride)
         self.consts.add(chunk)
 
-        store_size = self.tile_info[value][0]
+        store_size = self.var_info[value][0]
         operation = "affine.vector_store" if tile_size_per_lane > 1 and store_size > 1 else "affine.store"
         shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 and store_size > 1 else ""
 
@@ -728,13 +1065,14 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
                     init_vec = init
                     axis = "0, 1"
                     acc_var = init
-                    self.tile_info[acc] = 1, dtype
+                    var_info = [1, mlir_common.DTYPE_TO_MLIR[dtype]]
                 else:
                     reduced_shape = f"vector<{vec_len}x{type_name}>"
                     init_vec = self.cse.generate(self.reduction_prefix, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
                     axis = "0"
                     acc_var = init_vec
-                    self.tile_info[acc] = vec_len, dtype
+                    var_info = [vec_len, mlir_common.DTYPE_TO_MLIR[dtype]]
+                self.register_var_info(acc, var_info)
             else:
                 raise NotImplementedError()
 
@@ -761,7 +1099,7 @@ def store_reduction(self, name, index, value):
         tile_col = self.tile_desc.n_row
         tile_row = 1
         dram_tile_shape = f"{tile_row}x{tile_col}"
-        buffer, indices = self.get_scratchpad_buffer(dtype, name, tile_row, tile_col, dram_tile_shape, self.reductions_suffix, indices)
+        buffer, indices = self.get_scratchpad_buffer(dtype, name, tile_row, tile_col, dram_tile_shape, self.reductions_suffix, indices, index)
         if self.welford_reduce_out is not None:
             # raise NotImplementedError()
             sum, sqr_sum, _ = self.welford_reduce_out
@@ -769,7 +1107,7 @@ def store_reduction(self, name, index, value):
             # mean
             divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.ranges[self.reduction_depth])} : f32")
             if self.buffer_types[name][1] > 1:
-                divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.tile_info[sum][0]}x{type_name}>")
+                divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{type_name}>")
             else:
                 divider_vec = f"f{self.buffer_types[name][1]}"
             mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sum}, %{divider_vec} : {shape}")
@@ -1001,7 +1339,7 @@ def set_ranges(self, lengths, reduction_lengths, read_writes):
             self.itervars[self.reduction_depth :],
         )
 
-    def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices):
+    def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape, code_buffer, indices, raw_index):
         c_type = mlir_common.DTYPE_TO_C[dtype]
         mlir_type = mlir_common.DTYPE_TO_MLIR[dtype]
         # Make sure each lane's buffer has at least two element
@@ -1014,13 +1352,17 @@ def get_scratchpad_buffer(self, dtype, name, tile_row, tile_col, dram_tile_shape
                 mapping = self.map_cse.generate(self.global_vars, f"affine_map<({indices}) -> ({indices} floordiv 8)>")
             indices = self.cse.generate(self.loads, f"affine.apply #{mapping}(%{indices})") # FIXME. Only loads?
 
-        if name not in self.global_vars_set:
+        if name not in self.global_vars_dict:
+            self.global_vars_dict[name] = set()
+
+        if str(raw_index) not in self.global_vars_dict[name]:
+            new_name = f"{name}_{len(self.global_vars_dict[name])}"
             # Add definition to header
-            self.header.writeline(f"{c_type} {name}_spad[{tile_size // self.vector_lane}] __attribute__ ((section(\".spad\")));")
-            self.gem5_header.writeline(f"{c_type} {name}_spad[{tile_size}];")
-            self.global_vars_set.add(name)
-            self.global_vars.writeline(f"memref.global @{name}_spad : memref<{dram_tile_shape}x{mlir_type}, 1>")
-        buffer = self.cse.generate(code_buffer, f"memref.get_global @{name}_spad : memref<{dram_tile_shape}x{mlir_type}, 1>")
+            self.header.writeline(f"{c_type} {new_name}_spad[{tile_size // self.vector_lane}] __attribute__ ((section(\".spad\")));")
+            self.gem5_header.writeline(f"{c_type} {new_name}_spad[{tile_size}];")
+            self.global_vars.writeline(f"memref.global @{new_name}_spad : memref<{dram_tile_shape}x{mlir_type}, 1>")
+        self.global_vars_dict[name].add(str(raw_index))
+        buffer = self.cse.generate(code_buffer, f"memref.get_global @{new_name}_spad : memref<{dram_tile_shape}x{mlir_type}, 1>")
         return buffer, indices
 
     def roundup_vectorlane(self, size, amp=1):
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index f76fd0cc..912704b5 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -172,7 +172,7 @@ def __init__(self, args=None):
         self.tile_col = extension_config.CONFIG_TILE_COL
         if self.tile_col == -1:
             self.tile_col = 8 # FIXME: tile_col is not always vector_lane * vlen
-        self.tile_info = {}
+        self.var_info = {}
 
     def load(self, name: str, index: sympy.Expr):
         raise NotImplementedError()
@@ -193,32 +193,8 @@ def check_dtype_in_args(self, args):
                 dtype = arg
         return dtype
 
-    def expand(self, args, buf_bounds):
-        cse_args = [arg for arg in args if isinstance(arg, common.CSEVariable)]
-        if len(cse_args) == 0:
-            return args, 1, self.check_dtype_in_args(args)
-        elif len(cse_args) == 1:
-            if not cse_args[0] in self.tile_info:
-                return args, 1, self.check_dtype_in_args(cse_args)
-            info = self.tile_info[cse_args[0]]
-            return args, info[0], info[1]
-        lhs_idx = args.index(cse_args[-2])
-        rhs_idx = args.index(cse_args[-1])
-        if not args[lhs_idx] in self.tile_info or not args[rhs_idx] in self.tile_info:
-            return args, 1, self.check_dtype_in_args(args)
-        lhs_tile_size, lhs_dtype = self.tile_info[args[lhs_idx]]
-        rhs_tile_size, rhs_dtype = self.tile_info[args[rhs_idx]]
-        lhs_shape = f"vector<{lhs_tile_size}x{DTYPE_TO_MLIR[lhs_dtype]}>" if lhs_tile_size > 1 else DTYPE_TO_MLIR[lhs_dtype]
-        rhs_shape = f"vector<{rhs_tile_size}x{DTYPE_TO_MLIR[rhs_dtype]}>" if rhs_tile_size > 1 else DTYPE_TO_MLIR[rhs_dtype]
-        temp = list(args)
-        if lhs_tile_size > rhs_tile_size:
-            expand = f"vector.broadcast %{args[rhs_idx]} : {rhs_shape} to {lhs_shape}"
-            temp[rhs_idx] = self.cse.generate(self.compute, expand, bounds=buf_bounds)
-        elif lhs_tile_size < rhs_tile_size:
-            expand = f"vector.broadcast %{args[lhs_idx]} : {lhs_shape} to {rhs_shape}"
-            temp[lhs_idx] = self.cse.generate(self.compute, expand, bounds=buf_bounds)
-        args = tuple(temp)
-        return args, max(lhs_tile_size, rhs_tile_size), lhs_dtype
+    def register_var_info(self, var, var_info):
+        self.var_info[var] = var_info
 
     def __enter__(self):
         class CSEProxy:
@@ -235,13 +211,13 @@ def inner(*args, **kwargs):
                         buf_bounds = self.node_to_bounds.get(
                             fx_node, ValueRanges.unknown()
                         )
-                    args, tile_size, dtype = self.expand(args, buf_bounds)
+                    code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info)
                     csevar = self.cse.generate(
                         self.compute,
-                        getattr(parent_handler, name)(*args, tile_size=tile_size, dtype=DTYPE_TO_MLIR[dtype], **kwargs),  # type: ignore[has-type]
+                        code,
                         bounds=buf_bounds,
                     )
-                    self.tile_info[csevar] = tile_size, dtype
+                    self.register_var_info(csevar, ret_info)
                     csevar.update_on_args(name, args, kwargs)
                     return csevar
 

From f90d46d9b8071b65e78e99553bc91d9af1f47740 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 9 Jan 2025 04:44:43 +0000
Subject: [PATCH 23/24] [Frontend] Make custom pooling disabled

---
 .../mlir/mlir_codegen_backend.py              | 29 +++++++++++++++----
 PyTorchSimFrontend/mlir/mlir_lowering.py      |  3 +-
 tests/test_resnet.py                          |  8 ++---
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 0bbda189..3d65aa53 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -108,6 +108,15 @@ def write_header(self):
 
 class ExtensionOverrides(common.OpOverrides):
     # Binary element wise operations
+    @staticmethod
+    def custom_cast(operand, target_type, *args, var_info=None):
+        dtype = var_info[operand][1]
+        if dtype == "index":
+            ret = ops.index_cast(operand, target_type, var_info=var_info)
+        else:
+            ret = ops.to_dtype(operand, target_type, var_info=var_info)
+        return ret, var_info[ret]
+
     @staticmethod
     def binary_elementwise_common(operand1, operand2, var_info):
         op_type1 = var_info[operand1]
@@ -533,8 +542,12 @@ def sigmoid(operand, *args, var_info=None):
     def where(condition, operand1, operand2, *args, var_info=None):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         cond_type = var_info[condition]
-        if cond_type[0] != tile_size:
+        if cond_type[0] < tile_size:
             condition = ops.broadcast(condition, operand1, var_info=var_info)
+        elif cond_type[0] > tile_size:
+            operand1 = ops.broadcast(operand1, condition, var_info=var_info)
+            operand2 = ops.broadcast(operand2, condition, var_info=var_info)
+        tile_size, ret_type = var_info[operand1]
 
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         cond_shape = f"vector<{tile_size}xi1>," if tile_size > 1 else ""
@@ -550,7 +563,7 @@ def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", n
 
     @staticmethod
     def _index_expr(operand, *args, var_info=None, **kwargs):
-        symbols = sorted(operand.free_symbols)
+        symbols = sorted([str(i) for i in operand.free_symbols])
         renamed_symbols = {symbol: sympy.Symbol(f"d{i}") for i, symbol in enumerate(symbols)}
 
         renamed_expression = operand.subs(renamed_symbols)
@@ -569,7 +582,11 @@ def index_expr(operand, *args, var_info=None, **kwargs):
 
     @staticmethod
     def index_cast(operand, target_type, *args, var_info=None, **kwrags):
-        return f"arith.index_cast %{operand} : index to {target_type}", [1, target_type]
+        op_type = var_info[operand]
+        src_shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else op_type[1]
+        des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type
+        return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type]
+
 
     @staticmethod
     def broadcast(operand1, operand2, *args, var_info=None):
@@ -578,7 +595,7 @@ def broadcast(operand1, operand2, *args, var_info=None):
         src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>" if op_type1[0] > 1 else op_type1[1]
         des_shape = f"vector<{op_type2[0]}x{op_type1[1]}>" if op_type2[0] > 1 else op_type1[1] # Use tile size only
         expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}"
-        return expand, op_type2
+        return expand, [op_type2[0], op_type1[1]]
 
 RTYPE_TO_MLIR = {
     "sum": "add",
@@ -1000,9 +1017,11 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         self.consts.add(stride)
         self.consts.add(chunk)
 
-        store_size = self.var_info[value][0]
+        store_size, operand_type = self.var_info[value]
         operation = "affine.vector_store" if tile_size_per_lane > 1 and store_size > 1 else "affine.store"
         shape = f", vector<{tile_size_per_lane}x{type_name}>" if tile_size_per_lane > 1 and store_size > 1 else ""
+        if type_name != operand_type:
+            value = ops.custom_cast(value, type_name, var_info=self.var_info)
 
         line = f"{operation} %{value}, %{buffer}[0, 0] : memref<{dram_tile_shape}x{type_name}, 1>{shape}"
         self.cse.generate(self.stores, line, assignment = False)
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index c1950f82..e7ca37eb 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -144,5 +144,4 @@ def custom_maxpool(
 lowerings.update({getattr(aten.mm, overload): tuned_mm for overload in aten.mm.overloads()})
 lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()})
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
-lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
-lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()})
\ No newline at end of file
+lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
\ No newline at end of file
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index 1fe0c674..37f8a583 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -14,11 +14,11 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         exit(1)
 
 def test_resnet(device):
-    from torchvision.models import resnet18
-    model = resnet18().eval()
-    model.to(device)
+    from torchvision.models import resnet
+    model = resnet._resnet(resnet.BasicBlock, [1, 1, 0, 0], weights=None, progress=False).eval()
+    model.to(device, memory_format=torch.channels_last)
     input = torch.randn(1, 3, 224, 224).to(device=device)
-    x1 = input.to(device=device)
+    x1 = input.to(device=device, memory_format=torch.channels_last)
     opt_fn = torch.compile(dynamic=False)(model)
     res = opt_fn(x1)
     print("ResNet18 Simulation Done")

From 9893b09c7cf287635e1aedf68f11c513fa055f59 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 9 Jan 2025 10:58:55 +0000
Subject: [PATCH 24/24] [CI] Add build condition

---
 .github/workflows/docker-base-image.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/docker-base-image.yml b/.github/workflows/docker-base-image.yml
index dd8c0953..82614c8c 100644
--- a/.github/workflows/docker-base-image.yml
+++ b/.github/workflows/docker-base-image.yml
@@ -3,6 +3,8 @@ name: Docker Base Image CI
 on:
   push:
     branches: [ "base" ]
+  repository_dispatch:
+    types: [ build_base ]
 
 jobs:
   build: