diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 61eb96e1..eba48da2 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -6,7 +6,7 @@ on: jobs: build-and-test: - runs-on: ubuntu-latest + runs-on: self-hosted permissions: contents: read @@ -35,6 +35,7 @@ jobs: context: . file: ./Dockerfile push: true + no-cache: true tags: ghcr.io/psal-postech/torchsim-test:${{ github.sha }} # Step 4: Wait for GHCR propagation diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index c27df48a..32d6543c 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -662,3 +662,37 @@ jobs: -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py + + test_accuracy: + name: Run test_accuracy + runs-on: self-hosted + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Prepare volume directory + run: mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} + + - name: Run run_cycle.sh + run: | + echo "Running run_cycle.sh" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} bash -c \ + "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && \ + cp PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out" + ls /tmp/torchsim-ci/${GITHUB_SHA} + + - name: Upload Accuracy Report Artifact + uses: actions/upload-artifact@v4 + with: + name: accuracy-report + path: /tmp/torchsim-ci/${{ github.sha }}/summary_cycle.out + if-no-files-found: error diff --git a/.gitignore 
b/.gitignore index 88eb2fb8..9decced5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ __pycache__/ -PyTorchSimBackend/build/ +TOGSim/build/ .vscode diff --git a/.gitmodules b/.gitmodules index f65e5f2b..24f9ccaf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,18 +1,15 @@ -[submodule "PyTorchSimBackend/extern/onnx"] - path = PyTorchSimBackend/extern/onnx +[submodule "TOGSim/extern/onnx"] + path = TOGSim/extern/onnx url = https://github.com/onnx/onnx.git -[submodule "PyTorchSimBackend/extern/protobuf"] - path = PyTorchSimBackend/extern/protobuf +[submodule "TOGSim/extern/protobuf"] + path = TOGSim/extern/protobuf url = https://github.com/protocolbuffers/protobuf.git -[submodule "PyTorchSimBackend/extern/booksim"] - path = PyTorchSimBackend/extern/booksim +[submodule "TOGSim/extern/booksim"] + path = TOGSim/extern/booksim url = https://github.com/PSAL-POSTECH/booksim.git -[submodule "PyTorchSimBackend/extern/torch2timeloop"] - path = PyTorchSimBackend/extern/torch2timeloop - url = https://github.com/Accelergy-Project/pytorch2timeloop-converter.git -[submodule "PyTorchSimBackend/extern/ramulator2"] - path = PyTorchSimBackend/extern/ramulator2 +[submodule "TOGSim/extern/ramulator2"] + path = TOGSim/extern/ramulator2 url = https://github.com/PSAL-POSTECH/ramulator2 -[submodule "PyTorchSimBackend/extern/stonneCore"] - path = PyTorchSimBackend/extern/stonneCore +[submodule "TOGSim/extern/stonneCore"] + path = TOGSim/extern/stonneCore url = https://github.com/PSAL-POSTECH/stonne_core.git diff --git a/Dockerfile b/Dockerfile index 293dcb60..37721940 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ FROM ghcr.io/psal-postech/torchsim_base:latest # Prepare PyTorchSim project COPY . /workspace/PyTorchSim -RUN cd PyTorchSim/PyTorchSimBackend && \ +RUN cd PyTorchSim/TOGSim && \ mkdir -p build && \ cd build && \ conan install .. 
--build=missing && \ diff --git a/Dockerfile.ksc2025 b/Dockerfile.ksc2025 new file mode 100644 index 00000000..2ac210e0 --- /dev/null +++ b/Dockerfile.ksc2025 @@ -0,0 +1,90 @@ +# Copyright (c) 2020 The Regents of the University of California +# All Rights Reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime + +# Copied from Gem5 Docker file +ENV DEBIAN_FRONTEND=noninteractive +RUN apt -y update && apt -y upgrade && \ + apt -y install build-essential git m4 scons zlib1g zlib1g-dev \ + libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \ + python3-dev python-is-python3 doxygen libboost-all-dev \ + libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \ + python3-venv black libssl-dev libasan5 libubsan1 +RUN pip install mypy pre-commit jupyter + +# Pass Access Token securely +ENV PATH=$PATH:/root/.local/bin +ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH + +# Build Gem5 +RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch TorchSim +RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) +ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt + +# Build LLVM RISC-V +RUN git clone https://github.com/PSAL-POSTECH/llvm-project.git --branch torchsim --depth 1 +RUN cd llvm-project && mkdir build && cd build && \ + cmake -DLLVM_ENABLE_PROJECTS=mlir -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/riscv-llvm -DLLVM_TARGETS_TO_BUILD=RISCV -G "Unix Makefiles" ../llvm && \ + make -j && make install + +# Store RISC-V LLVM for TorchSim +ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin +ENV TORCHSIM_LLVM_INCLUDE_PATH=/riscv-llvm/include +ENV TORCHSIM_DIR=/workspace/PyTorchSim +ENV LLVM_DIR=/riscv-llvm + +# Download RISC-V tool chain +RUN apt install -y wget && \ + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ + tar -zxvf riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && tar -zxvf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz 
&& \ + rm *.tar.gz + +ENV RISCV=/workspace/riscv +ENV PATH=$RISCV/bin:$PATH + +# Install Spike simulator +RUN apt -y install device-tree-compiler +RUN git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \ + ../configure --prefix=$RISCV && make -j && make install + +# Install Proxy kernel +RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ + cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 && mkdir build && cd build && \ + ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install + +# Install torchsim dependency +RUN apt install -y ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 + +# Prepare ONNXim project +RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial +RUN cd PyTorchSim/TOGSim && \ + git submodule update --recursive --init && \ + mkdir -p build && \ + cd build && \ + conan install .. --build=missing && \ + cmake .. 
&& \ + make -j$(nproc) \ No newline at end of file diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json deleted file mode 100644 index 8f196e81..00000000 --- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "core_type" : ["stonne", "ws_mesh"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json deleted file mode 100644 index c7ef15f7..00000000 --- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - 
"dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json deleted file mode 100644 index 2293e197..00000000 --- a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 8, - - "dram_type" : "ramulator2", - "dram_freq" : 700, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json deleted file mode 100644 index 08548638..00000000 --- a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 65536, - 
"core_print_interval" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 32, - - "dram_type" : "simple", - "dram_freq" : 1000, - "dram_channels": 1, - "dram_req_size": 32, - "dram_latency" : 100, - "dram_print_interval": 10000, - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json deleted file mode 100644 index 5d7b0d35..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - - "dram_type" : "ramulator2", - "dram_freq" :700, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json deleted file mode 100644 index 38acafc0..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 
10000, - - "dram_type" : "ramulator2", - "dram_freq" : 700, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 10000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json deleted file mode 100644 index 7348d5bc..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0": 0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json deleted file mode 100644 index 69ec8bd0..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - 
"num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0": 0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json deleted file mode 100644 index bff4e224..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1050, - "sram_size" : 16777216, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 4, - - "dram_type" : "ramulator2", - "dram_freq" :1200, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 19200, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json deleted file mode 100644 index b2661894..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "num_cores" : 2, - 
"core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_num_partitions" : 2, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - "icnt_print_interval" : 10000, - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json deleted file mode 100644 index 922ede5b..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_num_partitions" : 1, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json deleted file mode 100644 index 034542fe..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - - "dram_type" : "ramulator2", - "dram_freq" :700, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 20000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json deleted file mode 100644 index 82f42c00..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json deleted file mode 100644 index 132a52e6..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json deleted file mode 100644 index a93e8ae2..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1050, - "sram_size" : 32768, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 4, - - "dram_type" : "ramulator2", - "dram_freq" :1200, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 48000, - "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": 
{ - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json deleted file mode 100644 index e9a64f2e..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json deleted file mode 100644 index 37e18b35..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json deleted file mode 100644 index 
49225d77..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json deleted file mode 100644 index 4ea2c6ff..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json deleted file mode 100644 index 8aee751b..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 
100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json deleted file mode 100644 index f76fec32..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "core_type" : ["ws_mesh","ws_mesh"], - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m4.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json deleted file mode 100644 index 7571b830..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - 
"dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m8.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json deleted file mode 100644 index be163336..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/extern/torch2timeloop b/PyTorchSimBackend/extern/torch2timeloop deleted file mode 160000 index 62aa1754..00000000 --- a/PyTorchSimBackend/extern/torch2timeloop +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 62aa175421165cc9cd7dfb182a02fc3e26c01e3a diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc deleted file mode 100644 index 7744b0f5..00000000 --- a/PyTorchSimBackend/src/TMA.cc +++ /dev/null @@ -1,48 +0,0 @@ -#include "TMA.h" -#include "TileGraph.h" - -TMA::TMA(uint32_t id, uint32_t dram_req_size) { - _id = id; - _dram_req_size = dram_req_size; - _current_inst = nullptr; - _finished = true; -} - -void TMA::issue_tile(std::shared_ptr inst) { - _current_inst = std::move(inst); - std::vector& tile_size = _current_inst->get_tile_size(); - if (tile_size.size() <= 0 || tile_size.size() 
> get_max_dim()) { - spdlog::error("[TMA {}] issued tile is not supported format..", _id); - exit(EXIT_FAILURE); - } - _finished = false; -} - -std::shared_ptr> TMA::get_memory_access() { - auto addr_set = _current_inst->get_dram_address(_dram_req_size); - auto access_vec = std::make_shared>(); - Tile* owner = (Tile*)_current_inst->get_owner(); - std::shared_ptr owner_subgraph = owner->get_owner(); - unsigned long long base_daddr = _current_inst->get_base_dram_address(); - // Todo. We use a ternsor level buffer allocation, so we don't need to check all memfetch - bool is_cacheable = owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); - spdlog::trace("[SRAM Trace] Core-{}, Address: 0x{:016x}, Is_cacheable: {}", _id, base_daddr, is_cacheable); - spdlog::trace("[NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", - _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write()); - - for (auto addr: *addr_set) { - mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R; - mf_type type = _current_inst->is_dma_write() ? 
mf_type::WRITE_REQUEST : mf_type::READ_REQUEST; - mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, _current_inst->get_numa_id(), static_cast(_current_inst.get())); - access->set_cacheable(is_cacheable); - _current_inst->inc_waiting_request(); - access_vec->push_back(access); - } - _finished = true; - return access_vec; -} - -uint32_t TMA::generate_mem_access_id() { - static uint32_t id_counter{0}; - return id_counter++; -} \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 20152e9f..577c45e9 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -7,7 +7,7 @@ from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen from PyTorchSimFrontend import extension_config -from Simulator.simulator import FunctionalSimulator, CycleSimulator, BackendSimulator +from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator LOCK_TIMEOUT = 600 @@ -27,21 +27,6 @@ def dump_metadata(args, arg_attributes, path): file.write(f'{arg_name}=({arg_attribute[0]}, {arg.dtype}, {arg.shape})\n') return -def parse_stack_sizes(file_path): - meta_path = file_path.split(".")[0]+".meta" - cmd = ["riscv64-unknown-elf-objcopy", "--dump-section", f".stack_sizes={meta_path}", file_path, "/dev/null"] - subprocess.run(cmd, check=True) - - with open(meta_path, 'rb') as f: - stack_sizes_data = list(f.read()) - if len(stack_sizes_data) <= 17: - raise ValueError("Invalid .stack_sizes section size") - - stack_size_bytes = stack_sizes_data[8:-9] - stack_size = int.from_bytes(stack_size_bytes, byteorder='little') - return stack_size - - def llvm_compile_command(input, output): opt_output = f"{input[:-3]}_opt.ll" return [re.sub(r"[ \n]+", " ", @@ -142,6 +127,10 @@ class SpadOverflowError(Exception): def __init__(self, message="SPAD overflow occurred."): 
super().__init__(message) +class TileSizeError(Exception): + def __init__(self, message="SPAD overflow occurred."): + super().__init__(message) + class MLIRCodeCache: cache = dict() clear = staticmethod(cache.clear) # Todo: Cache @@ -176,7 +165,7 @@ def load(cls, source_code, else: link_option = "" # Generate LLVM kernel calller and binary for validation - if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: + if extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: # Use custom malloc to avoid size error new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free" cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen) @@ -193,7 +182,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, arg_attributes) + val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, arg_attributes) val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name, new_link_option) @@ -224,7 +213,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return key # Generate MLIR kernel calller and binary for cycle calculation @@ -278,8 +267,12 @@ def task(): loop_size = kwargs["loop_size"] else: loop_size = [] + + # In the autotune mode, skip validation to speed up + autotune = kwargs.get('autotune', False) + validate = kwargs.get('validate', False) if not autotune else False + def dummy_simulator(*args, **kwargs): - validate = kwargs.get('validate', False) # Wait for compilation key = future.result() from filelock import FileLock @@ -291,57 +284,49 @@ def dummy_simulator(*args, **kwargs): # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = 
FunctionalSimulator.get_runtime_dump_path(result_path) - if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate: + if not autotune and (extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE or validate): funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, vectorlane_size=vectorlane_size, spad_info=spad_info, cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS, silent_mode=silent_mode) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + backsim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG) backsim.vectorlane_size = vectorlane_size attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size) result_path = backsim.simulation(onnx_path, attribute_path, silent_mode=silent_mode) - result = BackendSimulator.get_result_from_file(result_path) + result = TOGSimulator.get_result_from_file(result_path) return result def dryrun_simulator(*args, **kwargs): - autotune = kwargs.get('autotune', False) key = future.result() - # Run simulator pass - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key)) - # Dump arguments and meta data - dump_metadata(args, arg_attributes, result_path) - runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: - return - - if autotune: - onnx_path = os.path.join(result_path, "tile_graph.onnx") - attribute_path = os.path.join(runtime_path, "attribute") - backend_path = 
os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) - backsim.vectorlane_size = vectorlane_size - attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size) - result_path_2 = backsim.simulation(onnx_path, attribute_path) - result = BackendSimulator.get_result_from_file(result_path_2) - return result_path, runtime_path, result + from filelock import FileLock + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + # Run simulator pass + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key)) + # Dump arguments and meta data + dump_metadata(args, arg_attributes, result_path) + runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: + return - # Todo. Support valude dependent mode for graph mode - if False: # extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: - funcsim = FunctionalSimulator(result_path, key) - funcsim.run_spike(args, arg_attributes, - runtime_path, self.validation_binary_name, - vectorlane_size=vectorlane_size, spad_info=spad_info, - cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) + # Todo. 
Support valude dependent mode for graph mode + if False: # extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: + funcsim = FunctionalSimulator(result_path, key) + funcsim.run_spike(args, arg_attributes, + runtime_path, self.validation_binary_name, + vectorlane_size=vectorlane_size, spad_info=spad_info, + cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) return result_path, runtime_path, None - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) and not autotune target_simulator = dryrun_simulator if is_dryrun else dummy_simulator target_simulator.arg_attributes = arg_attributes target_simulator.future = future diff --git a/PyTorchSimFrontend/extension_codegen_backend.py b/PyTorchSimFrontend/extension_codegen_backend.py deleted file mode 100644 index e569d251..00000000 --- a/PyTorchSimFrontend/extension_codegen_backend.py +++ /dev/null @@ -1,216 +0,0 @@ -import dataclasses -import contextlib -from typing import List -from typing import Dict -from torch._inductor.codegen import cpp, wrapper, common -from torch._inductor.scheduler import BaseScheduling -from torch._inductor.virtualized import V -from torch._inductor.utils import IndentedBuffer -import sympy - -cexpr = cpp.CppPrinter().doprint - -class ExtensionWrapperCodegen(wrapper.WrapperCodeGen): - def __init__(self): - super().__init__() - -class ExtensionOverrides(common.OpOverrides): - pass - -class ExtensionKernel(common.Kernel): - overrides = ExtensionOverrides - newvar_prefix = "auto " - suffix = ";" - - def __init__(self, args=None): - super().__init__(args) - self.call_ranges = None - self.ranges = None - self.itervars = None - self.reduction_depth = None - self.reduction_prefix = IndentedBuffer() - self.reduction_suffix = IndentedBuffer() - self.reduction_vars = {} - self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc") - - def load(self, name: str, index: sympy.Expr): - index = 
self.rename_indexing(index) - var = self.args.input(name) - line = f"{var}[{index}]" - dtype = V.graph.get_dtype(name) - self.cse.prefix = cpp.DTYPE_TO_CPP[dtype] + " " - return self.cse.generate(self.loads, line) - - def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): - index = self.rename_indexing(index) - var = self.args.output(name) - line = f"{var}[{index}] = {value}" - self.cse.generate(self.stores, line, assignment = False) - - def reduction(self, dtype, src_dtype, reduction_type, value): - argmax_or_argmin = reduction_type in {"argmax", "argmin"} - if argmax_or_argmin: - raise NotImplementedError() #TODO: argmin, argmax - else: - reduction_key = src_dtype, reduction_type, value - acc = self.reduction_cse.generate( - self.loads, f"reduction {reduction_key}", write=False - ) - self.reduction_vars[acc] = reduction_type - acc_type = cpp.reduction_acc_type(reduction_type, dtype) - self.reduction_prefix.writeline(f"{acc_type} {acc} = {cpp.reduction_init(reduction_type, dtype)};") - line = f"{acc} = {cpp.reduction_combine(reduction_type, acc, value)}" - self.cse.generate(self.stores, line, assignment = False) - self.reduction_cse.reduction_cache[reduction_key] = acc - return acc - - def store_reduction(self, name, index, value): - index = self.rename_indexing(index) - var = self.args.output(name) - self.reduction_suffix.writeline(f"{var}[{index}] = {value};")\ - - def codegen_loops(self): - code = common.BracesBuffer() - # Loop body part - loops = [LoopLevel(var, size) for var, size in zip(self.itervars, self.ranges)] - loops, reductions = [LoopNest(loops[: self.reduction_depth]), - LoopNest(loops[self.reduction_depth :])] - reductions.mark_reduction(self.reduction_vars) - - with contextlib.ExitStack() as stack: - loops.codegen(code, stack) - with contextlib.ExitStack() as stack_outer: - if self.reduction_prefix: - stack_outer.enter_context(code.indent()) - code.splice(self.reduction_prefix) - - with contextlib.ExitStack() as stack: - 
reductions.codegen(code, stack) - code.splice(self.loads) - code.splice(self.compute) - code.splice(self.stores) - code.splice(self.reduction_suffix) - return code - - def define_kernel(self, wrapper, src_code, kernel_name): - if src_code in wrapper.src_to_kernel: - kernel_name = wrapper.src_to_kernel[src_code] - else: - wrapper.src_to_kernel[src_code] = kernel_name - wrapper.define_kernel(kernel_name, src_code, cuda=False) - - def codegen_kernel(self, wrapper): - arg_defs, call_args, arg_types = self.args.cpp_argdefs() - arg_defs = ",\n".ljust(25).join(arg_defs) - arg_types = ",".join(arg_types) - code = common.BracesBuffer() - - # Todo. kernel name custom - kernel_name = f"Extensin_Kernel" - kernel_decl_name = kernel_name if V.graph.cpp_wrapper else "kernel" - code.writeline(f'extern "C" void {kernel_decl_name}({arg_defs})') - with code.indent(): - for old, new in self.args.aliases(): - code.writeline(f"auto {old} = {new};") - # Loop body part - code.splice(self.codegen_loops()) - - codecache_def = IndentedBuffer() - if not V.graph.cpp_wrapper: - codecache_def.writeline("async_compile.cpp('''") - codecache_def.splice(code) - if not V.graph.cpp_wrapper: - codecache_def.writeline("''')") - - self.define_kernel(wrapper, codecache_def.getvalue(), kernel_name) - # generate the code to call this - wrapper.generate_kernel_call(kernel_name, call_args, cuda=False) - print(code.getvalue()) - return code.getvalue() - - def set_ranges(self, lengths, reduction_lengths): - if self.call_ranges: - assert self.call_ranges == tuple(lengths) + tuple( - reduction_lengths - ), f"{self.call_ranges} == {tuple(lengths)} + {tuple(reduction_lengths)}" - assert self.reduction_depth == len(lengths) - else: - self.call_ranges = tuple(lengths) + tuple(reduction_lengths) - self.ranges = [self.rename_indexing(x) for x in self.call_ranges] - self.itervars = [sympy.Symbol(f"i{n}") for n in range(len(self.ranges))] - self.reduction_depth = len(lengths) - return ( - self.itervars[: 
self.reduction_depth], - self.itervars[self.reduction_depth :], - ) - -@dataclasses.dataclass -class LoopLevel: - var: sympy.Expr - size: sympy.Expr - reduction_vars: Dict[str, str] = None - - # Todo. Type change for reduction - INDEX_TYPE = "long" - def lines(self): - line = f"for({self.INDEX_TYPE} {self.var}=0; {self.var}<{cexpr(self.size)}; ++{self.var})" - return [line] - -@dataclasses.dataclass -class LoopNest: - loops: List[LoopLevel] - - def __bool__(self): - return bool(self.loops) - - def mark_reduction(self, reduction_vars): - for loop in self.loops: - loop.reduction_vars = reduction_vars - - def mark_parallel(self, par_depth): - loops = self.loops - loops[0].parallel = par_depth - for i in range(1, par_depth): - loops[i].collapsed = True - loops[0].simd = loops[par_depth - 1].simd - - def codegen(self, code, stack): - for loop in self.loops: - code.writelines(loop.lines()) - stack.enter_context(code.indent()) - -class ExtensionScheduling(BaseScheduling): - count = 0 - def __init__(self, scheduler): - self.scheduler = scheduler - self._scheduling = cpp.CppScheduling(scheduler) - - def can_fuse_vertical(self, node1, node2): - return False - - def can_fuse_horizontal(self, node1, node2): - return False - - def group_fn(self, sizes): - return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes) - - def codegen_nodes(self, nodes): - _, (group, reduction_group) = max( - nodes, key=lambda x: int(x.is_reduction()) - ).group - - ex_kernel = ExtensionKernel() - for node in nodes: - vars, reduction_vars = ex_kernel.set_ranges(group, reduction_group) - with ex_kernel: - node.run(vars, reduction_vars) - - wrapper = V.graph.wrapper_code - ex_kernel.codegen_kernel(wrapper) - pass - - def codegen_sync(self): - pass - - def flush(self): - self._scheduling.flush() \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 59f3818c..3d6fbb76 100644 --- a/PyTorchSimFrontend/extension_config.py +++ 
b/PyTorchSimFrontend/extension_config.py @@ -3,74 +3,124 @@ import tempfile import importlib -# Hardware info config -CONFIG_VECTOR_LANE = int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128)) -CONFIG_VECTOR_LANE_STRIDE = int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2)) -CONFIG_SPAD_INFO = { - "spad_vaddr" : 0xD0000000, - "spad_paddr" : 0x2000000000, - "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane -} -CONFIG_PRECISION = 4 # 32bit -CONFIG_NUM_CORES = 1 -CONFIG_VLEN = 256 # 256bits / 32bits = 8 [elements] - -# Tile size config -CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - -# DUMP PATH -CONFIG_BACKEND_RESULT_PATH_KEY = os.getenv("BACKEND_RESULT_PATH_KEY") - -CONFIG_TORCHSIM_DUMP_PATH = os.environ.get('TORCHSIM_DUMP_PATH', - default = f"{tempfile.gettempdir()}/torchinductor") -CONFIG_TORCHSIM_DUMP_FILE = int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) -CONFIG_TORCHSIM_VALIDATION_MODE = int(os.environ.get('TORCHSIM_VALIDATION_MODE', default=True)) -CONFIG_CLEANUP_DUMP_ARGS = int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) - -# LLVM PATH -CONFIG_TORCHSIM_LLVM_PATH = os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") -CONFIG_TORCHSIM_CUSTOM_PASS_PATH = os.environ.get('TORCHSIM_CUSTOM_PASS_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/GemminiLowerPass/build") -CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) -CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) - -# Backendsim config -CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG', - default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') -CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False)) -CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False)) -CONFIG_BACKENDSIM_DRYRUN = 
int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) -CONFIG_BACKENDSIM_DEBUG_LEVEL = os.environ.get("BACKENDSIM_DEBUG_LEVEL", "") - -# GEM5 config -CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") -CONFIG_GEM5_SCRIPT_PATH = os.environ.get('GEM5_SCRIPT_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/gem5_script/script_systolic.py") - -# AUTOTUNE config -CONFIG_AUTOTUNE = int(os.environ.get('AUTOTUNE', default=True)) -CONFIG_MAX_AUTOTUNE_TRY = int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) - -# For block sparse -CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0)) - -# For GEMM tile size -CONFIG_MANUAL_TILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False)) -CONFIG_TILE_M = int(os.environ.get('TORCHSIM_TILE_M', default=CONFIG_VECTOR_LANE)) -CONFIG_TILE_N = int(os.environ.get('TORCHSIM_TILE_N', default=CONFIG_VECTOR_LANE)) -CONFIG_TILE_K = int(os.environ.get('TORCHSIM_TILE_K', default=CONFIG_VECTOR_LANE)) -CONFIG_GEMM_CHEATSHEET_PATH = os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/validation/gemm_tpuv3_cheatsheet.json") -CONFIG_SUBTILE = int(os.environ.get('TORCHSIM_SUBTILE', default=True)) -CONFIG_MANUAL_SUBTILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False)) -CONFIG_SUBTILE_M = int(os.environ.get('TORCHSIM_SUBTILE_M', default=CONFIG_VECTOR_LANE)) -CONFIG_SUBTILE_N = int(os.environ.get('TORCHSIM_SUBTILE_N', default=CONFIG_VECTOR_LANE)) -CONFIG_SUBTILE_K = int(os.environ.get('TORCHSIM_SUBTILE_K', default=CONFIG_VECTOR_LANE)) - -# Advanced fusion options -CONFIG_FUSION_REDUCTION_EPILOGUE = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_EPILOGUE', default=True)) -CONFIG_FUSION_REDUCTION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_REDUCTION', default=True)) -CONFIG_FUSION_PROLOGUE = int(os.environ.get('TORCHSIM_FUSION_PROLOGUE', default=True)) +def __getattr__(name): + + # Hardware info config + if name == 
"CONFIG_VECTOR_LANE": + return int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128)) + if name == "CONFIG_VECTOR_LANE_STRIDE": + return int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2)) + if name == "CONFIG_SPAD_INFO": + return { + "spad_vaddr" : 0xD0000000, + "spad_paddr" : 0x2000000000, + "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane + } + if name == "CONFIG_PRECISION": + return 4 # 32bit + if name == "CONFIG_NUM_CORES": + return 1 + if name == "CONFIG_VLEN": + return 256 # 256bits / 32bits = 8 [elements] + + # Tile size config + if name == "CONFIG_TORCHSIM_DIR": + return os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') + + if name == "CONFIG_TORCHSIM_DUMP_PATH": + return os.environ.get('TORCHSIM_DUMP_PATH', default = f"{tempfile.gettempdir()}/torchinductor") + if name == "CONFIG_TORCHSIM_DUMP_FILE": + return int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) + if name == "CONFIG_TORCHSIM_FUNCTIONAL_MODE": + return int(os.environ.get('TORCHSIM_FUNCTIONAL_MODE', default=True)) + if name == "CONFIG_TORCHSIM_TIMING_MODE": + return int(os.environ.get("TORCHSIM_TIMING_MODE", True)) + if name == "CONFIG_CLEANUP_DUMP_ARGS": + return int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) + + # LLVM PATH + if name == "CONFIG_TORCHSIM_LLVM_PATH": + return os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") + if name == "CONFIG_TORCHSIM_CUSTOM_PASS_PATH": + return os.environ.get('TORCHSIM_CUSTOM_PASS_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/GemminiLowerPass/build") + if name == "CONFIG_TORCHSIM_DUMP_MLIR_IR": + return int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) + if name == "CONFIG_TORCHSIM_DUMP_LLVM_IR": + return int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) + + # TOGSim config + if name == "CONFIG_TOGSIM_CONFIG": + return os.environ.get('TORCHSIM_CONFIG', + 
default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json") + if name == "CONFIG_TOGSIM_EAGER_MODE": + return int(os.environ.get("TOGSIM_EAGER_MODE", default=False)) + if name == "CONFIG_TOGSIM_DRYRUN": + return int(os.environ.get('TOGSIM_DRYRUN', default=False)) + if name == "CONFIG_TOGSIM_DEBUG_LEVEL": + return os.environ.get("TOGSIM_DEBUG_LEVEL", "") + + # GEM5 config + if name == "CONFIG_GEM5_PATH": + return os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") + if name == "CONFIG_GEM5_SCRIPT_PATH": + return os.environ.get('GEM5_SCRIPT_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/gem5_script/script_systolic.py") + + # AUTOTUNE config + if name == "CONFIG_AUTOTUNE": + return int(os.environ.get('AUTOTUNE', default=False)) + if name == "CONFIG_AUTOTUNE_TEMPLATE": + return int(os.environ.get('AUTOTUNE_TEMPLATE', default=False)) + if name == "CONFIG_MAX_AUTOTUNE_TRY": + return int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) + if name == "CONFIG_AUTOTUNE_TEMPLATE_TOPK": + return int(os.environ.get('AUTOTUNE_TEMPLATE_TOPK', default=4)) + + # For block sparse + if name == "CONFIG_BLOCK_SPARSE": + return int(os.environ.get('BLOCK_SPARSE', default=0)) + + # For GEMM tile size + if name == "CONFIG_MANUAL_TILE_SIZE": + return int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False)) + if name == "CONFIG_TILE_M": + return int(os.getenv("TORCHSIM_TILE_M", __getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_TILE_N": + return int(os.getenv("TORCHSIM_TILE_N", __getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_TILE_K": + return int(os.getenv("TORCHSIM_TILE_K", __getattr__("CONFIG_VECTOR_LANE"))) + + if name == "CONFIG_SUBTILE": + return int(os.environ.get('TORCHSIM_SUBTILE', default=True)) + if name == "CONFIG_MANUAL_SUBTILE_SIZE": + return int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False)) + if name == "CONFIG_SUBTILE_M": + return 
int(os.environ.get('TORCHSIM_SUBTILE_M', default=__getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_SUBTILE_N": + return int(os.environ.get('TORCHSIM_SUBTILE_N', default=__getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_SUBTILE_K": + return int(os.environ.get('TORCHSIM_SUBTILE_K', default=__getattr__("CONFIG_VECTOR_LANE"))) + + if name == "CONFIG_GEMM_CHEATSHEET_PATH": + return os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/validation/gemm_tpuv3_cheatsheet.json") + # Compiler Optimization + if name == "CONFIG_COMPILER_OPTIMIZATION": + return os.environ.get('TORCHSIM_COMPILER_OPTIMIZATION', default="all") # options: all, none, custom + # Advanced fusion options + if name == "CONFIG_FUSION": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "fusion" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_REDUCTION_EPILOGUE": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "reduction_epliogue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_REDUCTION_REDUCTION": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "reduction_reduction" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_PROLOGUE": + return True if ((__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all") or ("prologue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION"))) else False + if name == "CONFIG_SINGLE_BATCH_CONV": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "single_batch_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_MULTI_TILE_CONV": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "multi_tile_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False # SRAM Buffer allocation plan def load_plan_from_module(module_path): @@ -97,3 +147,5 @@ def 
load_plan_from_module(module_path): CONFIG_TLS_MODE = int(os.environ.get('TORCHSIM_TLS_MODE', default=1)) CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0)) + +CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0)) \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py index 22a727c5..167544f2 100644 --- a/PyTorchSimFrontend/extension_op.py +++ b/PyTorchSimFrontend/extension_op.py @@ -13,7 +13,7 @@ from torch._inductor.codecache import write from PyTorchSimFrontend.extension_codecache import get_write_path from PyTorchSimFrontend import extension_config -from Simulator.simulator import BackendSimulator, TORCH_TO_NUMPY +from Simulator.simulator import TOGSimulator, TORCH_TO_NUMPY graph_template = { 0: { @@ -46,7 +46,7 @@ class MLIRExternKernelChoice(ExternKernelChoice): def call_name(self): - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) if is_dryrun: return f"yield from sparse_mm_dummy_stonne_outer" return f"torch.ops.extension_op.{self.name}" @@ -275,11 +275,11 @@ def prepare_outer_product_matrix(a, b, out): def sparse_mm_stonne_outer(a, b, out): onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out) - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json' - backsim = BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_single_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = backsim.simulation(onnx_path) - BackendSimulator.get_result_from_file(result_path) + 
TOGSimulator.get_result_from_file(result_path) # Load result data #with open(c_result_path, 'rb') as f: diff --git a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py deleted file mode 100644 index 3690f533..00000000 --- a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py +++ /dev/null @@ -1,236 +0,0 @@ -import os -import subprocess -import shlex -import re - -from torch._inductor.utils import IndentedBuffer -from torch._inductor.codegen import cpp -from torch._inductor.codecache import write_atomic - -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs - -class LLVMKernelCallerCodeGen(): - """ - Generate C that calls the llvm kernel. - """ - - def __init__(self, validation, arg_attributes): - super().__init__() - self.code = IndentedBuffer() - self.ending = ";" - self.open_bracket = "{" - self.closed_bracket = "}" - self.newline = "\n" - self.kernel_name = "kernel" - self.validation = validation - self.n_arg = len(arg_attributes) - self.arg_attributes = arg_attributes - self.arg_use_count = 1 - self.load_args = {} - self.kernel_start_addr = "" - self.kernel_end_addr = "" - - def get_argv_idx(self): - self.arg_use_count += 1 - return self.arg_use_count-1 - - def write_header(self): - self.writeline('#include ') - self.writeline('#include ') - self.writeline("#include ") - if self.validation: - self.writeline("#include ") - self.writeline('#include ') - self.writeline('#include ') - - def is_in_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_in(value) - - def is_out_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_out(value) - - def load_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_in_arg(arg_name): - argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name] - self.load_args[arg_name] = argv_idx - self.writeline(f'if(load_arg({arg_name}, 
sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def dump_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_out_arg(arg_name): - argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name] - self.writeline(f'if(dump_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def write_exit(self): - self.writeline(f'return 0{self.ending}') - - def generate_kernel_declare(self): - args_type_p = [f'{cpp.DTYPE_TO_CPP[arg_type[1]]}*' for arg_type in self.arg_attributes.values()] - - self.writeline(f"void {self.kernel_name}({', '.join(args_type_p)}){self.ending}{self.newline}") - - def generate_args_define(self): - for arg_name, (_, arg_type, arg_shape) in self.arg_attributes.items(): - self.writeline(f'{cpp.DTYPE_TO_CPP[arg_type]} {arg_name}[atoi(argv[{self.get_argv_idx()}])] __attribute__ ((aligned (4096))){self.ending}') - self.writeline(self.newline) - - def generate_load_dump_fn(self): - self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000000){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) 
{self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - self.writeline(self.closed_bracket) - - def generate_main(self): - self.writeline(f'{self.newline}int main(int argc, char *argv[]) {self.open_bracket}{self.newline}') - with self.code.indent(): - if self.validation: - self.load_arg() - self.writeline(self.newline) - - self.writeline(f"{self.kernel_name}({', '.join(list(self.arg_attributes))}){self.ending}{self.newline}") - - if self.validation: - self.dump_arg() - - self.write_exit() - self.writeline(self.closed_bracket) - - def writeline(self, line): - self.code.writeline(line) - - def generate_wrapper_file(self, path, name): - self.dump_path = path - - self.write_header() - self.generate_kernel_declare() - - if self.validation: - self.generate_load_dump_fn() - self.generate_main() - - write_path = os.path.join(path, name+".c",) - write_atomic(write_path, self.code.getvalue()) - return - - def add_extention(self, name, extension): - return name + "." 
+ extension - - def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): - main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) - main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) - kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) - kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) - - main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' - kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' - - target = os.path.join(write_path, binary_name) - link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' - - main_compile_cmd = shlex.split(main_compile) - kernel_compile_cmd = shlex.split(kernel_compile) - link_cmd = shlex.split(link) - - try: - subprocess.check_call(main_compile_cmd) - subprocess.check_call(kernel_compile_cmd) - subprocess.check_call(link_cmd) - except subprocess.CalledProcessError as e: - print("Command failed with exit code", e.returncode) - print("Error output:", e.output) - assert(0) - - def parse_stack_sizes(self, file_path, vlenb=256): - with open(file_path, 'r') as f: - stack_sizes_data = f.readlines() - - in_proc = False - stack_base = None - dynamic_expr = None - max_offset = 0 - - for line in stack_sizes_data: - line = line.strip() - if line.startswith(".cfi_startproc"): - in_proc = True - continue - elif line.startswith(".cfi_endproc") and in_proc: - if dynamic_expr: - total_stack = eval(dynamic_expr, {"vlenb": vlenb}) - return total_stack - elif stack_base: - return stack_base - else: - return max_offset - - # Skip outer function - if not in_proc: - continue - - if line.startswith(".cfi_def_cfa_offset"): - stack_base = int(line.split()[-1]) - - if ".cfi_escape" in line and "#" in line: - comment = line.split("#")[-1].strip() - m = re.search(r"sp \+ 
(\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) - if m: - base, scale = int(m.group(1)), int(m.group(2)) - dynamic_expr = f"{base} + {scale} * vlenb" - - def get_spad_size(self, binary_path): - cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if result.returncode != 0: - raise RuntimeError(f"Readelf error: {result.stderr}") - - output = result.stdout - spad_start = None - spad_end = None - for line in output.splitlines(): - if '.spad' in line and 'SECTION' in line: - parts = line.split() - spad_start = int(parts[1], 16) - elif 'spad_end' in line: - parts = line.split() - spad_end = int(parts[1], 16) - - if spad_start is None or spad_end is None: - return 0 - spad_size = spad_end - spad_start - return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_common.py b/PyTorchSimFrontend/llvm/llvm_common.py deleted file mode 100644 index 1c76b826..00000000 --- a/PyTorchSimFrontend/llvm/llvm_common.py +++ /dev/null @@ -1,304 +0,0 @@ -import torch -from torch._inductor.codegen import common -from torch._inductor.virtualized import V -import sympy - -from typing import Callable - -import sympy - -import torch.fx -from torch.utils._sympy.value_ranges import ValueRanges - -from torch._inductor.utils import ( - free_symbol_startswith, - get_sympy_Expr_dtype, - IndentedBuffer, - sympy_subs, - unique, -) - -schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") - -DTYPE_TO_LLVM = { - torch.float32: "float", - torch.float64: "double", - torch.float16: "half", - torch.int64: "i64", - torch.int32: "i32", - torch.int16: "i16", - torch.int8: "i8", - torch.uint8: "i8", - torch.bool: "i8", - torch.bfloat16: "bfloat", -} - -DTYPE_SIZE = { - torch.float32: 4, - torch.float64: 8, - torch.float16: 2, - torch.int64: 8, - torch.int32: 4, - torch.int16: 2, - torch.int8: 1, - torch.uint8: 1, - torch.bool: 1, - torch.bfloat16: 2, -} - -DTYPE_LOWP_FP = [ - 
torch.bfloat16, - torch.float16, -] - -class LLVMKernelArgs(common.KernelArgs): - LLVM_ARGS_IN = 0x01 - LLVM_ARGS_OUT = 0x02 - LLVM_ARGS_INOUT = 0x04 - LLVM_ARGS_VAR = 0x08 - - @staticmethod - def is_llvm_arg_in(value): - return (LLVMKernelArgs.LLVM_ARGS_IN & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - @staticmethod - def is_llvm_arg_out(value): - return (LLVMKernelArgs.LLVM_ARGS_OUT & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - def llvm_argdefs(self, only_args=False): - buffer_types = {x.get_name(): [x.get_dtype(), x.get_numel()] for x in V.graph.buffers} - for name, val in V.graph.graph_inputs.items(): - if isinstance(val, sympy.Expr): - buffer_types[name] = [get_sympy_Expr_dtype(val), 1] - else: - buffer_types[name] = [val.get_dtype(), val.get_numel()] - buffer_types.update( - {name: val.dtype for name, val in V.graph.constants.items()} - ) - - call_args = [] - arg_defs = [] - arg_attributes = {} - for inplaced in unique(self.inplace_buffers.values()): - if self._buffer_is_marked_removed(inplaced): - continue - outer = inplaced.other_names[-1] - inner = inplaced.inner_name - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_INOUT] + buffer_types[outer] - for outer, inner in self.input_buffers.items(): - if outer in self.inplace_buffers: - continue - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_IN] + buffer_types[outer] - for outer, inner in self.output_buffers.items(): - if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner): - continue - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_OUT] + buffer_types[outer] - for outer, inner in self.sizevars.items(): - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_VAR] + 
buffer_types[outer] - return arg_defs, call_args, arg_attributes - -class BaseLLVMKernel(common.Kernel): - newvar_prefix = "%" - name_prefix = "body" - vector_prefix = "vector_body" - suffix = "" - overrides = None - load_format = None - store_format = None - - def __init__(self, args=None): - super().__init__(args) - self.vector_compute = IndentedBuffer() - self.reductions_suffix = IndentedBuffer() - self.cse = common.CSE(self.newvar_prefix, self.suffix, self.name_prefix) - self.vector_cse = common.CSE(self.newvar_prefix, self.suffix, self.vector_prefix) - self.tile_size = None - self.tile_shape = {} - - def load(self, name: str, index: sympy.Expr): - raise NotImplementedError() - - def store_reduction(self, name, index, value): - raise NotImplementedError() - - def store(self, name, index, value, mode=None): - raise NotImplementedError() - - def reduction(self, dtype, src_dtype, reduction_type, value): - raise NotImplementedError() - - def widening(self, args, buf_bounds): - if not args[0] in self.tile_shape or not args[1] in self.tile_shape: - return args, [1, 1] - tile_shape0 = self.tile_shape[args[0]] - tile_shape1 = self.tile_shape[args[1]] - vec_len0 = tile_shape0[0] * tile_shape0[1] - vec_len1 = tile_shape1[0] * tile_shape1[1] - if tile_shape0 != tile_shape1: - temp = list(args) - idx = 0 if tile_shape0[0] != tile_shape1[0] else 1 - if tile_shape0[idx] > tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape1[idx-1]}" for i in range(vec_len0)] - else: - indexes = [f"i32 {i//tile_shape1[idx-1]}" for i in range(vec_len0)] - line = f"shufflevector <{vec_len1} x float> %{args[1]}, <{vec_len1} x float> undef, <{vec_len0} x i32> <{', '.join(indexes)}>" - temp[1] = self.cse.generate(self.compute, line, bounds=buf_bounds) - elif tile_shape0[idx] < tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape0[idx-1]}" for i in range(vec_len1)] - else: - indexes = [f"i32 {i//tile_shape0[idx-1]}" for i in range(vec_len1)] - line = f"shufflevector 
<{vec_len0} x float> %{args[0]}, <{vec_len0} x float> undef, <{vec_len1} x i32> <{', '.join(indexes)}>" - temp[0] = self.cse.generate(self.compute, line, bounds=buf_bounds) - args = tuple(temp) - return args, max(tile_shape0, tile_shape1) - - def __enter__(self): - class CSEProxy: - self.name = "CSEProxy" - - @staticmethod - def __getattr__(name: str) -> Callable[..., common.CSEVariable]: # type: ignore[misc] - def inner(*args, **kwargs): - # TritonTemplateKernel has no current_node - buf_bounds = ValueRanges.unknown() - if hasattr(V.interpreter, "current_node"): - fx_node = V.interpreter.current_node - assert isinstance(self.node_to_bounds, dict) - buf_bounds = self.node_to_bounds.get( - fx_node, ValueRanges.unknown() - ) - - vector_csevar = None - if isinstance(args[0], list): - vector_args = (args[0][0], args[1][0]) - vector_csevar = self.vector_cse.generate( - self.vector_compute, - getattr(parent_handler, "vector_" + name)(*vector_args, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - vector_csevar.update_on_args(name, vector_args, kwargs) - args = (args[0][1], args[1][1]) - if len(args) == 2: - args, tile_shape = self.widening(args, buf_bounds) - elif len(args) == 1: - tile_shape = self.tile_shape[args[0]] - else: - assert(0) # not implemented yet. 
- vec_len = tile_shape[0] * tile_shape[1] - csevar = self.cse.generate( - self.compute, - getattr(parent_handler, name)(*args, tile_size=vec_len, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - self.tile_shape[csevar] = tile_shape - csevar.update_on_args(name, args, kwargs) - if vector_csevar is not None: - return [vector_csevar, csevar] - return csevar - - return inner - - @staticmethod - def indirect_indexing(index_var, size, check=True): - # Skip CSE since this doesn't return an expression - return self.indirect_indexing(index_var, size, check) # type: ignore[attr-defined] - - @staticmethod - def load(name: str, index: sympy.Expr): - if name in self.cse.invalidated_stores: - # A load from an invalidated store requires us to - # keep the actual buffer around - V.kernel.must_keep_buffers.add(name) - if free_symbol_startswith(index, "%"): - return self.indirect_load(name, index) - store_cache = self.cse.store_cache - if name in store_cache: - return store_cache[name] - return self.load(name, index) - - @staticmethod - def store(name, index, value, mode=None): - self.store_buffer_names.add(name) - if mode is None: - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - if name not in V.graph.removed_buffers: - return self.store(name, index, value, mode=mode) - - @staticmethod - def store_reduction(name, index, value): - self.store_buffer_names.add(name) - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - - if name not in V.graph.removed_buffers: - return self.store_reduction(name, index, value) - - @staticmethod - def reduction(dtype, src_dtype, reduction_type, value): - return self.reduction(dtype, src_dtype, reduction_type, value) - - @staticmethod - def bucketize( - values, - offsets_name: str, - offsets_size: sympy.Expr, - 
indexing_dtype: torch.dtype, - right: bool, - ): - """ - [Note: Inductor bucketize op] - - Given values (tensor) and offsets_name (reference to the name of a 1D - tensor), calculate the bucket that each value belongs to. - - e.g. for values [-1, 0, 1, 2, 3, 4, 5, 9], offsets [0, 4, 4, 8], right=True - return = [ 0, 1, 1, 1, 1, 3, 3, 4]. - - When right == False, bucket i refers to range (offsets[i], offsets[i+1]]. - When right == True, bucket i refers to range [offsets[i], offsets[i+1]). - - Offsets must be non-decreasing or the result is undefined. - """ - return self.bucketize( - values, offsets_name, offsets_size, indexing_dtype, right - ) - - super().__enter__() - assert self.overrides - parent_handler = self.overrides(V.get_ops_handler()) - self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) - self.exit_stack.enter_context(V.set_kernel_handler(self)) - return self - - def rename_indexing(self, index) -> sympy.Expr: - # adds the necessary kernel args for index expressions - # and renames variables in index expressions to kernel arg names - if isinstance(index, (list, tuple)): - return [self.rename_indexing(x) for x in index] - index = V.graph.sizevars.simplify(index) - sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name) - replacements = { - x: self.args.size(x) - for x in sorted_symbols - if x.name.startswith("s") or x.name.startswith("ps") - } - return sympy_subs(index, replacements) diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index af101f44..e52d6cff 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -1,20 +1,28 @@ import functools import torch +import os import dataclasses -from torch._inductor.autotune_process import BenchmarkRequest from torch._inductor.autotune_process import TensorMeta +from torch._inductor.codecache import get_hash, write +from PyTorchSimFrontend import extension_config +from Simulator.simulator import 
TOGSimulator from typing import ( Any, Callable, - Dict, Iterable, List, Optional, - Sequence, - TYPE_CHECKING, Union, ) + +# FIXME. Avoid circular import +def hash_prefix(hash_value): + return hash_value[1:12] + +def get_write_path(src_code): + return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(get_hash(src_code.strip()))) + @dataclasses.dataclass class MLIRBenchmarkRequest(): def __init__( @@ -46,16 +54,30 @@ def make_run_fn( ) -> Callable[[], None]: from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile custom_async_compile = CustomAsyncCompile() + + # Check already cached result. + write_path = get_write_path(self.source_code) + key, _ = write(self.source_code, "mlir", specified_dir=write_path) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "togsim_result/0") + if os.path.exists(result_path): + result = TOGSimulator.get_result_from_file(result_path) + def cached_run_fn(*args, **kwargs): + return result + return cached_run_fn + + # Run a candidate code run_method = custom_async_compile.mlir( self.source_code, vectorlane_size=self.extra_args["vector_lane"], loop_size=None, spad_info=self.extra_args["spad_info"], vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"], - origins="Unknown", silent_mode=True) + origins="Unknown", silent_mode=True, + validate=self.extra_args['validate'], autotune=self.extra_args['autotune']) args = [ tensor for tensor in list(input_tensors) + list(output_tensors) ] + # Generate partial function. 
return functools.partial( run_method, diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py index 79e03bd5..178ea987 100644 --- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py @@ -6,8 +6,6 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common BMM_TEMPLATE = r""" @@ -162,51 +160,31 @@ def render(self, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, prologue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): - if template_buffer_node is not None: - self.output_node = template_buffer_node - - # Extract input arguments info - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - W_tensor = empty_strided(W.layout.size, W.layout.stride) - X_tensor = empty_strided(X.layout.size, X.layout.stride) - if len(W_tensor.size()) > 3 or len(W_tensor.size()) == 2: - W_tensor = W_tensor.view([-1, W_tensor.shape[-2], W_tensor.shape[-1]]) - if len(X_tensor.size()) > 3 or len(X_tensor.size()) == 2: - X_tensor = X_tensor.view([-1, X_tensor.shape[-2], X_tensor.shape[-1]]) - B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2] - - W_stride = W_tensor.stride() - X_stride = X_tensor.stride() - - # Select tile size - n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 - TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node) - SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or prologue_nodes else kernel.vector_lane - SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or 
prologue_nodes else kernel.vector_lane - SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + X, W, Y, Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + if tile_info is None: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node)[0] + else: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info TOG_latency = M if TILE_M > M else TILE_M kernel.loop_size = [TOG_latency, TILE_N, TILE_K] - TILE_K = TILE_K // 2 if prologue_nodes else TILE_K # Select template code nr_reduction_nodes = [node for node in epilogue_nodes if node.is_reduction()] if epilogue_nodes is not None else [] if nr_reduction_nodes: - template = BMM_REDUCTION_TEMPLATE - epilogue_dim_aliasing = {"index0":"index0", "index1":"index2", "index2": "index1"} - nr_rdim = 1 + template = BMM_REDUCTION_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index2", "index2": "index1"} + nr_rdim = 1 elif prologue_nodes: - template = BMM_PROLOGUE_TEMPLATE - epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} - nr_rdim = 0 + template = BMM_PROLOGUE_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} + nr_rdim = 0 else: - template = BMM_TEMPLATE - epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} - nr_rdim = 0 + template = BMM_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} + nr_rdim = 0 # Prepare tile descriptors vlane_stride = 1 @@ -323,19 +301,53 @@ def render(self, dram_idx = Y_idx, dram_tile_desc = Y_tile_desc, nr_rdim = nr_rdim, + r_dim_size = M, dim_aliasing = epilogue_dim_aliasing ) code = self._template_from_string(template).render(**kernel.render_options) kernel.add_loop_info([kernel.render_options["M"], 
kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) \ No newline at end of file + def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + + # Extract input arguments info + X, W = self.input_nodes[0], self.input_nodes[1] + Y = self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + W_tensor = empty_strided(W.layout.size, W.layout.stride) + X_tensor = empty_strided(X.layout.size, X.layout.stride) + if len(W_tensor.size()) > 3 or len(W_tensor.size()) == 2: + W_tensor = W_tensor.view([-1, W_tensor.shape[-2], W_tensor.shape[-1]]) + if len(X_tensor.size()) > 3 or len(X_tensor.size()) == 2: + X_tensor = X_tensor.view([-1, X_tensor.shape[-2], X_tensor.shape[-1]]) + B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2] + + W_stride = W_tensor.stride() + X_stride = X_tensor.stride() + + # Select tile size + n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 + n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 + return X,W,Y,Bias,W_tensor,X_tensor,B,M,N,K,n_extra_node, n_prologue_node + + def get_tile_candidates(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + prologue_nodes: Optional[List[IRNode]] = 
None, + **kwargs): + X, W, Y, Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + return self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node) + + def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): + tile_candidates = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node) + for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or n_prologue_node else kernel.vector_lane + SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + TILE_K = TILE_K // 2 if n_prologue_node else TILE_K + tile_candidates[idx] = TILE_M,TILE_N,TILE_K,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py index 3fff9958..dff6b0fd 100644 --- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py +++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py @@ -1,16 +1,46 @@ +import os +import subprocess +import shlex +import re import torch -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.llvm.llvm_caller_codegen import LLVMKernelCallerCodeGen -from PyTorchSimFrontend.mlir.mlir_common import DTYPE_TO_C +from torch._inductor.utils import IndentedBuffer +from torch._inductor.codecache import write_atomic +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs, DTYPE_TO_C -class MLIRKernelCallerCodeGen(LLVMKernelCallerCodeGen): +class MLIRKernelCallerCodeGen(): + """ + Generate C that calls the llvm kernel. 
+ """ def __init__(self, validation, arg_attributes, cycle_sim=False): - super().__init__(validation, arg_attributes) + super().__init__() + self.code = IndentedBuffer() + self.ending = ";" + self.open_bracket = "{" + self.closed_bracket = "}" + self.newline = "\n" + self.kernel_name = "kernel" + self.validation = validation + self.n_arg = len(arg_attributes) + self.arg_attributes = arg_attributes + self.arg_use_count = 1 + self.load_args = {} + self.kernel_start_addr = "" + self.kernel_end_addr = "" self.cycle_sim = cycle_sim + def get_argv_idx(self): + self.arg_use_count += 1 + return self.arg_use_count-1 + def write_header(self): - super().write_header() + self.writeline('#include ') + self.writeline('#include ') + self.writeline("#include ") + if self.validation: + self.writeline("#include ") + self.writeline('#include ') + self.writeline('#include ') global_var_header = "gem5_global_var.h" if self.cycle_sim else "global_var.h" self.writeline(f"#include \"{global_var_header}\"") @@ -42,6 +72,9 @@ def dump_arg(self): self.writeline(f'return -1{self.ending}') self.writeline(self.closed_bracket) + def write_exit(self): + self.writeline(f'return 0{self.ending}') + def generate_kernel_declare(self): # memref to llvm arguments (memref -> ptr, ptr, i64, , ) allocated pointer, aligned pointer, offset, size, stride args_type_p = [f'{DTYPE_TO_C[arg_type[1]]}*, {DTYPE_TO_C[arg_type[1]]}*, int64_t, int64_t, int64_t' for (_, arg_type) in self.arg_attributes] @@ -86,4 +119,142 @@ def generate_main(self): self.dump_arg() self.write_exit() - self.writeline(self.closed_bracket) \ No newline at end of file + self.writeline(self.closed_bracket) + + def generate_load_dump_fn(self): + self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000000){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + 
self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + + def writeline(self, line): + self.code.writeline(line) + + def generate_wrapper_file(self, path, name): + self.dump_path = path + + self.write_header() + self.generate_kernel_declare() + + if self.validation: + self.generate_load_dump_fn() + self.generate_main() + + write_path = os.path.join(path, name+".c",) + write_atomic(write_path, self.code.getvalue()) + return + + def add_extention(self, name, extension): + return name + "." 
+ extension + + def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): + main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) + main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) + kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) + kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) + + main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' + kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' + + target = os.path.join(write_path, binary_name) + link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' + + main_compile_cmd = shlex.split(main_compile) + kernel_compile_cmd = shlex.split(kernel_compile) + link_cmd = shlex.split(link) + + try: + subprocess.check_call(main_compile_cmd) + subprocess.check_call(kernel_compile_cmd) + subprocess.check_call(link_cmd) + except subprocess.CalledProcessError as e: + print("Command failed with exit code", e.returncode) + print("Error output:", e.output) + assert(0) + + def parse_stack_sizes(self, file_path, vlenb=256): + with open(file_path, 'r') as f: + stack_sizes_data = f.readlines() + + in_proc = False + stack_base = None + dynamic_expr = None + max_offset = 0 + + for line in stack_sizes_data: + line = line.strip() + if line.startswith(".cfi_startproc"): + in_proc = True + continue + elif line.startswith(".cfi_endproc") and in_proc: + if dynamic_expr: + total_stack = eval(dynamic_expr, {"vlenb": vlenb}) + return total_stack + elif stack_base: + return stack_base + else: + return max_offset + + # Skip outer function + if not in_proc: + continue + + if line.startswith(".cfi_def_cfa_offset"): + stack_base = int(line.split()[-1]) + + if ".cfi_escape" in line and "#" in line: + comment = line.split("#")[-1].strip() + m = re.search(r"sp \+ 
(\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) + if m: + base, scale = int(m.group(1)), int(m.group(2)) + dynamic_expr = f"{base} + {scale} * vlenb" + + def get_spad_size(self, binary_path): + cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Readelf error: {result.stderr}") + + output = result.stdout + spad_start = None + spad_end = None + for line in output.splitlines(): + if '.spad' in line and 'SECTION' in line: + parts = line.split() + spad_start = int(parts[1], 16) + elif 'spad_end' in line: + parts = line.split() + spad_end = int(parts[1], 16) + + if spad_start is None or spad_end is None: + return 0 + spad_size = spad_end - spad_start + return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 21d2868e..c24260ce 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -2,26 +2,29 @@ import sympy import re import os +import math from functools import reduce from operator import mul import torch from collections import defaultdict from concurrent.futures import ThreadPoolExecutor +from torch._dynamo.testing import rand_strided +from torch._inductor.autotune_process import TensorMeta from torch._dynamo.utils import dynamo_timed from torch._inductor.codegen import cpp, wrapper, common, memory_planning from torch._inductor.virtualized import V, _ops as ops -from torch._inductor.codecache import write_atomic, write +from torch._inductor.codecache import write_atomic from torch._inductor.utils import ( IndentedBuffer, is_welford_reduction, sympy_product ) from torch.utils._sympy.functions import ModularIndexing, FloorDiv -import PyTorchSimFrontend.extension_codecache as extension_codecache - +from PyTorchSimFrontend import extension_codecache from 
PyTorchSimFrontend import extension_config from . import mlir_common from .mlir_common import LoopLevel, LoopNest +from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest def reduction_init(reduction_type, dtype): if dtype in cpp.DTYPE_LOWP_FP: @@ -96,8 +99,8 @@ def write_header(self): from torch import device, empty, empty_strided from {extension_codecache.__name__} import CustomAsyncCompile - from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_BACKENDSIM_EAGER_MODE - from Simulator.simulator import BackendSimulator + from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE + from Simulator.simulator import TOGSimulator from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer from torch._inductor.select_algorithm import extern_kernels @@ -119,7 +122,7 @@ def sram_plan_prefix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Alloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_alloc(buffer_name, [start, end]) + TOGSimulator.sram_alloc(buffer_name, [start, end]) def sram_plan_postfix(buffer_name, buffer): if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN): @@ -128,7 +131,7 @@ def sram_plan_postfix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Dealloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_dealloc(buffer_name, [start, end]) + TOGSimulator.sram_dealloc(buffer_name, [start, end]) def host2device_memcopy(buffer): pass @@ -421,6 +424,10 @@ def exp(operand, *args, var_info=None, **kwargs): shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f'math.exp %{operand} : {shape}', [tile_size, dtype] + @staticmethod + def exp2(operand, *args, var_info=None, **kwargs): + raise NotImplementedError() + @staticmethod def erf(operand, *args, var_info=None, **kwargs): # Check scalar @@ -1076,8 +1083,8 @@ def 
load(self, name: str, index: sympy.Expr): # Extract sram info local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, buffer=apply_buffer) - vlane_split_axis = local_tile_desc.vlane_split_axis - vlane_stride = local_tile_desc.vlane_stride + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride tile_numel_per_lane = local_tile_desc.get_numel_per_lane() tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) tile_stride = local_tile_desc.get_tile_stride() @@ -1123,8 +1130,8 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): # Prepare dma instruction local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index) - vlane_split_axis = local_tile_desc.vlane_split_axis - vlane_stride = local_tile_desc.vlane_stride + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) @@ -1271,8 +1278,8 @@ def store_reduction(self, name, index, value): # Tile is always reuduced in inner loop local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix) - vlane_split_axis = local_tile_desc.vlane_split_axis - vlane_stride = local_tile_desc.vlane_stride + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) @@ -1288,7 +1295,7 @@ def store_reduction(self, name, index, value): # mean reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1) divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(reduction_numel)} : f32") - if self.buffer_types[name][1] > 1: + if compute_vec_size > 1: divider_vec = 
self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>") else: divider_vec = divider @@ -1354,15 +1361,15 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): self.register_var_info(div_vec, [compute_vec_size, "index"]) self.register_var_info(mod_vec, [compute_vec_size, "index"]) dim = ops.modular(ops.div(vector_index, div_vec), mod_vec) - if idx == tile_desc.vlane_split_axis: # Need to add vector lane offset - offset = tile_desc.vlane_stride #* strides[idx] - outer_sz = tile_size[idx] // tile_desc.vlane_stride + if idx == tile_desc.vmap.vlane_split_axis: # Need to add vector lane offset + offset = tile_desc.vmap.vlane_stride #* strides[idx] + outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride nr_vector_lane = self.get_const_cse(self.vector_lane, "index") nr_vector_lane_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{nr_vector_lane} : index to vector<{compute_vec_size}xindex>") self.register_var_info(nr_vector_lane_vec, [compute_vec_size, "index"]) - vlane_stride_coeff = self.get_const_cse(tile_desc.vlane_stride, "index") + vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index") vlane_outer_coeff = self.get_const_cse(outer_sz, "index") vlane_stride_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_stride_coeff} : index to vector<{compute_vec_size}xindex>") vlane_outer_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_outer_coeff} : index to vector<{compute_vec_size}xindex>") @@ -1432,9 +1439,9 @@ def index_expr(self, index, dtype): # FIXME. 
This is a temporary solution to get tile stride of the reduction case tile_desc = mlir_common.MLIRMultiDimTile( base_tile_desc.get_tile_size(), - base_tile_desc.vector_lane, - base_tile_desc.vlane_split_axis, - base_tile_desc.vlane_stride, + base_tile_desc.vmap.vector_lane, + base_tile_desc.vmap.vlane_split_axis, + base_tile_desc.vmap.vlane_stride, base_tile_desc.get_compute_vec_size(), ) axis_order = list(range(len(tile_desc.get_tile_size()))) @@ -1536,83 +1543,148 @@ def codegen_loops(self): def make_choices(self, nodes, kernel_name): choices = [] initial_tile_size = self.kernel_group.tile_desc.get_tile_size() - previous_ranges = self.ranges - prevent_infinite_loop = 0 - if len(initial_tile_size) < 2: - return choices # Can't autotune for 1-D tile size + prev_ranges = self.ranges + prev_tail_threshold = self.kernel_group.tile_desc.tail_ratio_threshold + + # Allow more tail ratio during autotuning + self.kernel_group.tile_desc.tail_ratio_threshold = 0.3 + + if prev_ranges == [1] or len(prev_ranges) == 0: + return choices + #if len(initial_tile_size) < 2: + # return choices # Can't autotune for 1-D tile size + for vlane_stride in [2, 4, 8]: - os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = str(vlane_stride) - previous_tile_size = initial_tile_size - increase_dim = -2 # increase the first dimension - while previous_tile_size[increase_dim] * 2 <= previous_ranges[increase_dim] and previous_tile_size[increase_dim] <= 2 ** 13 and prevent_infinite_loop < 10: - incrase_dim = -1 # only increase the last dimension - prevent_infinite_loop += 1 - while previous_tile_size[incrase_dim] * 2 <= previous_ranges[incrase_dim] and previous_tile_size[incrase_dim] <= 2 ** 13: + self.kernel_group.tile_desc.set_tile_size(initial_tile_size) + self.kernel_group.tile_desc.vmap.vlane_stride = vlane_stride + prevent_infinite_loop = 0 + + # Get the dimension to increase + candidate_axes = [ + axis for axis, constr in enumerate(self.kernel_group.tile_desc.tile_constraint) + if not constr.fixed + ] 
+ search_space = set() + + # Try initial tile size + self.reset(None) + src_code = super().codegen_nodes(nodes, kernel_name) + current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) + search_space.add(current_tile_sz) + + print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") + self._prepare_simulator_headers(src_code) + bench_runner = self.run_bench(nodes, kernel_name, src_code) + choices.append((bench_runner, src_code, current_tile_sz, self.kernel_group.tile_desc.vmap.vlane_stride)) + + while prevent_infinite_loop < 10 and candidate_axes: + for axis in list(candidate_axes): + prev_tile_sz = self.kernel_group.tile_desc.get_tile_size() + + # If tile size is maximized for this axis, remove from candidate axes + if prev_tile_sz[axis] >= prev_ranges[axis] * 2 or prev_tile_sz[axis] >= 2 ** 13: + candidate_axes.remove(axis) + self.reset(None) + continue + + # Try increase tile size for this axis + try: + self.kernel_group.tile_desc.scale_tile_dim(axis, prev_ranges[axis], 2) + except extension_codecache.TileSizeError as e: + # Failed to find proper tile size + candidate_axes.remove(axis) + self.reset(None) + continue + + self.reset(None) src_code = super().codegen_nodes(nodes, kernel_name) - if self.stop_autotune: - print(f"[Auto-tune] Skipping autotuning due to enough tile size: {self.kernel_group.tile_desc.get_tile_size()}") - break - print(f"[Auto-tune] Trying tile size: {self.kernel_group.tile_desc.get_tile_size()}, vlane_stride: {vlane_stride}") - previous_tile_size = self.kernel_group.tile_desc.get_tile_size() + current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) + + # FIXME. How to intergrate this constraint to tile system? 
+ pad = self.kernel_group.tile_desc.vmap.get_used_vlane(current_tile_sz) * self.kernel_group.tile_desc.vmap.vlane_stride + vlane_size = current_tile_sz[self.kernel_group.tile_desc.vmap.vlane_split_axis] + if vlane_size > pad and vlane_size % pad: + prevent_infinite_loop += 1 + continue + + # If tile size is converged for this axis, remove from candidate axes + if current_tile_sz in search_space: + candidate_axes.remove(axis) + continue + + # Add this choice + search_space.add(current_tile_sz) + print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") self._prepare_simulator_headers(src_code) bench_runner = self.run_bench(nodes, kernel_name, src_code) - choices.append((bench_runner, src_code, self.kernel_group)) - self.reset(f"tile_size_{incrase_dim}") - previous_tile_size[incrase_dim] = initial_tile_size[incrase_dim] - self.kernel_group.tile_desc.set_tile_size(previous_tile_size) - self.reset(f"tile_size_{increase_dim}") - self.reset("vlane_stride") + choices.append((bench_runner, src_code, self.kernel_group.tile_desc.get_tile_size(), self.kernel_group.tile_desc.vmap.vlane_stride)) + prevent_infinite_loop += 1 + self.kernel_group.tile_desc.prev_tail_threshold = prev_tail_threshold return choices - def autotune(self, nodes, kernel_name): + def autotune(self, *args): def get_cycle(choice): - bench_runner, src_code, kernel_group = choice + bench_runner = choice[0] for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY): # TODO: make simple try: - # bench_runner = self.run_bench(nodes, kernel_name, src_code) - if int(os.environ.get('BACKENDSIM_DRYRUN', default=False)): - _, _, out = bench_runner(autotune=1) - else: - out = bench_runner(validate=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE) + out = bench_runner() return out[-1] except (extension_codecache.SpadOverflowError, RuntimeError) as e: return float("inf") - #if 
isinstance(e, RuntimeError) and str(e) != "STACK_OVERFLOW": - # print(f"Benchmark[trial-{n_try}] failed with unexpected error: {e}") - # return float("inf") - #print(f"Benchmark failed due to spad overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}") - #self.kernel_group = kernel_group # Reset to the original tile desc - #self.reset("spad_overflow") - #src_code = super().codegen_nodes(nodes, kernel_name) - #bench_runner = self.run_bench(nodes, kernel_name, src_code) - #kernel_group = self.kernel_group - #self._prepare_simulator_headers(src_code) return float("inf") # Exceeded maximum number of autotuning attempts - - choices = self.make_choices(nodes, kernel_name) + choices = self.make_choices(*args) if len(choices) == 0: # can't autotune - return None + return [None, None] with ThreadPoolExecutor(max_workers=8) as executor: results = list(executor.map(get_cycle, choices)) max_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") - print(f"[Auto-tune] Optimal tile size: {choices[max_idx][2].tile_desc.get_tile_size()}, vlane_stride: {choices[max_idx][2].tile_desc.vlane_stride}, cycles: {results[max_idx]}") - optimal_src_code = choices[max_idx][1] - return optimal_src_code + self._log_autotune_result(choices[max_idx], results[max_idx]) + optimal_src_code, loop_size = choices[max_idx][1], choices[max_idx][-1] + return optimal_src_code, loop_size + + def run_bench(self, nodes, kernel_name, src_code): + _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() + input_call_args = tuple(self.args.input_buffers.keys()) + output_call_args = tuple(self.args.output_buffers.keys()) + full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) + full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) + + bmreq = MLIRBenchmarkRequest( + kernel_name=kernel_name, + input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), + 
output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), + extra_args={ + "vector_lane" : self.vector_lane, + "spad_info": self.spad_info, + "vlen" : self.vlen, + "arg_attributes" : arg_attributes, + "validate" : extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + "autotune" : True, + }, + source_code=src_code, + ) + dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] + dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] + return bmreq.make_run_fn(dummy_inputs, dummy_outputs) + + def _log_autotune_result(self, best_choice, best_cycle): + print( + f"[Auto-tune] Optimal tile size: {list(best_choice[2])}, " + f"vlane_stride: {best_choice[3]}, " + f"cycles: {best_cycle}" + ) def codegen_nodes(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) - if not extension_config.CONFIG_AUTOTUNE or extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: - return src_code - else: - optimal_src_code = self.autotune(nodes, kernel_name) - if optimal_src_code: + if extension_config.CONFIG_AUTOTUNE and extension_config.CONFIG_TORCHSIM_TIMING_MODE: + optimal_src_code = self.autotune(nodes, kernel_name)[0] + if optimal_src_code is not None: return optimal_src_code - else: - return src_code + return src_code def _prepare_simulator_headers(self, src_code): write_path = extension_codecache.get_write_path(src_code) @@ -1664,78 +1736,73 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims) - if kg_tile_desc.vlane_split_axis in local_dims: - local_vlane_split_axis = local_dims.index(kg_tile_desc.vlane_split_axis) + if kg_tile_desc.vmap.vlane_split_axis in local_dims: + local_vlane_split_axis = 
local_dims.index(kg_tile_desc.vmap.vlane_split_axis) else: local_vlane_split_axis = max(len(local_dims) - 1, 0) # Case 0. Tile is 0-D scalar if len(local_dims) == 0: if not store_reduction: - local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vlane_stride]) # Force it to use vector instruction. - local_tile_desc.vlane_split_axis = local_vlane_split_axis # last axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vmap.vlane_stride]) # Force it to use vector instruction. + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis # last axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: local_tile_desc.set_tile_size([1]) - local_tile_desc.vlane_split_axis = 0 - local_tile_desc.vlane_stride = 1 + local_tile_desc.vmap.vlane_split_axis = 0 + local_tile_desc.vmap.vlane_stride = 1 dram_stride = [0] # Edge case # Case 1. Tile is 1-D vector type elif len(local_dims) == 1 and len(local_dims) <= self.reduction_depth: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(local_dims[0])]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 2. Tile is 1-D vector type with reduction elif len(local_dims) == 1 and len(local_dims) == self.reduction_depth + 1: local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(local_dims[0])]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis + 1 - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + 1 + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 3. 
Tile is 2-D tile elif len(local_dims) == 2: is_reduction = self.reduction_depth == 1 and not store_reduction if is_reduction: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [1, 0]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 3. Tile is 3-D tile elif len(local_dims) == 3: is_reduction = self.reduction_depth < 3 and not store_reduction if is_reduction: axis_order = [1, 2, 0] if self.get_nr_rdim()==1 else [2, 1, 0] local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], axis_order) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 4. Tile is 4-D tile (e.g., Convolution epilogue) elif len(local_dims) == 4: is_reduction = self.reduction_depth < 3 and not store_reduction if is_reduction: raise NotImplementedError("Currently not implemented... 
;)") local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: raise NotImplementedError("Currently not implemented... ;)") if len(implicit_local_dims)!=0 and len(local_dims) != len(implicit_local_dims) and self.is_modular_indexing(index): - tile_size = local_tile_desc.get_tile_size() - new_tile_size = [] - new_vlane_split_axis = local_tile_desc.vlane_split_axis - implicit_dim_size = list(kg_tile_desc.implicit_dim_size.values()) - for i, target_dim_size in enumerate(implicit_dim_size): - new_tile_size += [1]*(len(target_dim_size)-1) + tile_size[i:i+1] - if local_tile_desc.vlane_split_axis >= i: - new_vlane_split_axis += len(target_dim_size)-1 - # Update - local_tile_desc.set_tile_size(new_tile_size) - local_tile_desc.vlane_split_axis = new_vlane_split_axis + for axis_constraints in self.kernel_group.tile_desc.implicit_dim_size.values(): + if len(axis_constraints) <= 1: + continue + sorted_constraints = sorted(axis_constraints, key=lambda c: int(c.args[1])) + for constraint in sorted_constraints[1:]: + index = index.replace(constraint.original_expr, 0) # Calculate dram stride dram_stride = [0] * local_tile_desc.get_nr_dim() @@ -1780,6 +1847,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe new_tile_sizes = list(self.kernel_group.tile_desc.get_tile_size()) new_tile_sizes[dim_idx] = new_size self.kernel_group.tile_desc.set_tile_size(new_tile_sizes) + self.kernel_group.tile_desc.tile_constraint[dim_idx].fixed = True # Send recompile signal self.reset("recompile") diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 73996351..c655dde3 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ 
b/PyTorchSimFrontend/mlir/mlir_common.py @@ -1,19 +1,18 @@ import dataclasses import math +from dataclasses import dataclass from typing import Dict from typing import List from collections import defaultdict from functools import reduce from operator import mul import torch -from torch._dynamo.testing import rand_strided -from torch._inductor.autotune_process import TensorMeta from torch._inductor.codegen import common from torch._inductor.codegen import cpp from torch._inductor.virtualized import V from torch._inductor.ir import MultiOutputLayout from torch._inductor.dependencies import MemoryDep, StarDep, WeakDep -from torch.utils._sympy.functions import ModularIndexing +from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Mod import sympy import contextlib @@ -32,7 +31,7 @@ unique, ) from PyTorchSimFrontend import extension_config -from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest +from PyTorchSimFrontend import extension_codecache schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") DTYPE_TO_MLIR = { @@ -209,169 +208,72 @@ def set_info(outer, inner, arg_type): set_info(outer, inner, self.MLIR_ARGS_VAR) return arg_defs, call_args, arg_attributes, buffer_types -class MLIRMultiDimTile(): - def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None, vec_size=None): - self.name = "" - self._tile_size = list(tile_size) - self._tile_stride = None - self.tile_axis_order = list(range(len(tile_size))) - self.vec_size = vec_size - self.update_tile_stride() - - # Vector lane mapping config +class VectorLaneMapping(): + def __init__(self, vector_lane: int, forced_vec_size: int, vlane_split_axis: int, vlane_stride: int): self.vector_lane = vector_lane self.vlane_split_axis = vlane_split_axis self.vlane_stride = vlane_stride - self.implicit_dim_size = None - self.nr_rdim = 0 - - # Dram offset - self.offset = sympy.Integer(0) - - def set_name(self, name: str): - self.name = name - - def 
set_tile_size(self, tile_size, tile_axis_order=None): - self._tile_size = tile_size - if tile_axis_order is None: - self.tile_axis_order = list(range(len(tile_size))) - else: - self.tile_axis_order = tile_axis_order - self.update_tile_stride() - - def set_tile_size_stride(self, tile_size, tile_stride): - self._tile_size = tile_size - self._tile_stride = tile_stride - - def get_name(self) -> str: - return self.name - - def get_tile_size(self): - return self._tile_size - - def get_numel(self): - """ - Return size of multi-dimensional tile - """ - size = 1 - for dim_size in self._tile_size: - size *= dim_size - return size - - def get_numel_per_lane(self): - tile_size_per_lane = self.get_tile_size_per_lane() - size = 1 - for dim_size in tile_size_per_lane: - size *= dim_size - return size - - def update_tile_stride(self): - strides = [1] * len(self._tile_size) - init = 1 + self.forced_vec_size = forced_vec_size - original_indices = list(range(len(self.tile_axis_order))) - sorted_pairs = sorted( - zip(self.tile_axis_order, self._tile_size, original_indices), - key=lambda x: x[0], reverse=True + def get_used_vlane(self, tile_size: list[int]): + return min( + math.ceil(tile_size[self.vlane_split_axis] / self.vlane_stride), + self.vector_lane ) - for _, size, original_indices in sorted_pairs: - strides[original_indices] = init - init *= size - self._tile_stride = strides - def get_tile_stride(self): - return self._tile_stride + def get_tile_size_per_lane(self, tile_size: list[int]): + per_lane = tile_size.copy() + used = self.get_used_vlane(tile_size) + if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(per_lane): + raise AssertionError("Not allowed split_axis") + per_lane[self.vlane_split_axis] = math.ceil(per_lane[self.vlane_split_axis] / used) + return per_lane - def get_tile_stride_per_lane(self): - tile_stride = list(self.get_tile_stride()) # original strides - tile_size = list(self.get_tile_size()) # original tile size - split_axis = self.vlane_split_axis 
+ def get_numel_per_lane(self, tile_size: list[int]): + return math.prod(self.get_tile_size_per_lane(tile_size)) - tile_size_per_lane = self.get_tile_size_per_lane() - coeff = tile_size[split_axis]//tile_size_per_lane[split_axis] + def get_tile_stride_per_lane(self, tile_size: list[int], tile_stride: list[int]): + tile_stride = tile_stride.copy() # original strides + get_tile_size_per_lane = self.get_tile_size_per_lane(tile_size) + coeff = tile_size[self.vlane_split_axis]//get_tile_size_per_lane[self.vlane_split_axis] # Propagate stride according to per-lane tile size for i in range(len(tile_stride)): - if tile_stride[i] > tile_stride[split_axis]: + if tile_stride[i] > tile_stride[self.vlane_split_axis]: tile_stride[i] = tile_stride[i] // coeff return tile_stride - def get_tile_size_per_lane(self): - tile_size_per_lane = list(self._tile_size) - if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(tile_size_per_lane): - raise AssertionError("Not allowed split_axis") - used_vlane = self.get_used_vlane() - tile_size_per_lane[self.vlane_split_axis] = \ - self.div_round_up(tile_size_per_lane[self.vlane_split_axis], used_vlane) - return tile_size_per_lane - - def get_nr_dim(self): - """ - Return number of dimensions - """ - return len(self._tile_size) - - def get_dim_size(self, index): - if isinstance(index, int): - return self._tile_size[index] - elif "index" in str(index): - return self._tile_size[int(str(index)[5:])] - raise NotImplementedError("Unsupported format of index") - - def get_mlir_shape(self, dtype): - str_tile_size = [str(dim) for dim in self._tile_size] - shape = "x".join(str_tile_size) - return f"memref<{shape}x{dtype}, 1>" - - def get_mlir_vshape(self, mlir_dtype): - return f"vector<{self.get_compute_vec_size()}x{mlir_dtype}>" if self.get_compute_vec_size() > 1 else f"{mlir_dtype}" - - def get_used_vlane(self): - """ - Return number of used vector lane - """ - if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(self._tile_size): - raise 
AssertionError("Not allowed split_axis") - return min(self.div_round_up(self._tile_size[self.vlane_split_axis], self.vlane_stride), self.vector_lane) - - def get_vlane_stride(self): - return self.vlane_stride - - def get_compute_vec_size(self): - # Granule size used in compute loop - if self.vec_size is not None: - return self.vec_size - if self.nr_rdim: - assert self.nr_rdim!=0 - val = self.get_numel_per_lane() // self.get_reduction_numel() - if self.get_numel_per_lane() >= val * 8: - return val*8 - elif self.get_numel_per_lane() >= val * 4: - return val*4 - elif self.get_numel_per_lane() >= val * 2: - return val*2 + def get_compute_vec_size(self, tile_size: list[int], reduction_numel: int, nr_rdim: int) -> int: + if self.forced_vec_size is not None: + return self.forced_vec_size + + per_lane = self.get_numel_per_lane(tile_size) + stride = self.vlane_stride + if nr_rdim: + val = per_lane // max(reduction_numel, 1) + for mult in [8, 4, 2]: + if per_lane >= val * mult: + return val * mult return val - if (self.get_numel_per_lane() // self.vlane_stride) >= 8: - return self.vlane_stride * 8 - if (self.get_numel_per_lane() // self.vlane_stride) >= 4: - return self.vlane_stride * 4 - if (self.get_numel_per_lane() // self.vlane_stride) >= 2: - return self.vlane_stride * 2 - return self.vlane_stride + for mult in [8, 4, 2]: + if (per_lane // stride) >= mult: + return stride * mult + return stride - @staticmethod - def div_round_up(size, round_val): - return (size + round_val - 1) // round_val +class TileAdjustMixin(): + def __init__(self): + self.tail_ratio_threshold = 0.01 def apply_divisor(self, axis: int, divisor: int, mode: str = "split"): - # Apply divisor to tile size at given axis. - # This method based on axis order. 
+ """Split or pad a given axis of the tile.""" old_size = self._tile_size[axis] - if divisor == 1: + if divisor <= 1: return - padded = self.div_round_up(old_size, divisor) * divisor - outer = self.div_round_up(old_size, divisor) - inner = divisor + + padded = math.ceil(old_size / divisor) * divisor + outer = math.ceil(old_size / divisor) + inner = divisor + if mode == "pad": self._tile_size[axis] = padded self.update_tile_stride() @@ -382,54 +284,277 @@ def apply_divisor(self, axis: int, divisor: int, mode: str = "split"): new_sizes.insert(axis + 1, inner) self._tile_size = new_sizes - # Update tile_axis_order old_order_val = self.tile_axis_order[axis] new_order = list(self.tile_axis_order) new_order.insert(axis + 1, old_order_val + 0.1) - sorted_pairs = sorted( - zip(range(len(new_order)), new_order), - key=lambda x: x[1] - ) - self.tile_axis_order = [idx for idx, _ in sorted_pairs] + self.tile_axis_order = [idx for idx, _ in sorted( + zip(range(len(new_order)), new_order), key=lambda x: x[1] + )] self.update_tile_stride() - if self.vlane_split_axis == axis: - self.vlane_split_axis = axis - elif self.vlane_split_axis > axis: - self.vlane_split_axis += 1 + # Adjust split axis for vmap + if self.vmap.vlane_split_axis > axis: + self.vmap.vlane_split_axis += 1 return - else: - raise ValueError(f"Unknown mode: {mode}. Supported modes are 'pad' and 'split'.") - def get_reduction_numel(self): - return reduce(mul, self.get_tile_size()[-1*self.nr_rdim:], 1) + raise ValueError(f"Unknown mode: {mode}. 
Supported: 'pad', 'split'.") - def is_dim_dividable(self, dim_sizes): + def is_dim_dividable(self, dim_sizes: list[int]) -> bool: if len(dim_sizes) != len(self._tile_size): - raise ValueError("dim_sizes must match the tile size dimensions.") - dim_sizes_cpy = [int(d) for d in dim_sizes] - remain = dim_sizes_cpy[self.vlane_split_axis] % self.vlane_stride + raise ValueError("dim_sizes must match the tile size dimensions") + + dim_sizes_cpy = list(dim_sizes) + axis, stride = self.vmap.vlane_split_axis, self.vmap.vlane_stride + remain = dim_sizes_cpy[axis] % stride if remain: - dim_sizes_cpy[self.vlane_split_axis] += self.vlane_stride - remain + dim_sizes_cpy[axis] += stride - remain + return all(d % t == 0 for d, t in zip(dim_sizes_cpy, self._tile_size)) - def adjust_tile_to_divisible(self, dim_sizes): + def adjust_tile_to_divisible(self, dim_sizes: list[int]) -> list[int]: + """Adjust current tile to be divisible by given dimensions.""" + if len(dim_sizes) != len(self._tile_size): + raise ValueError("dim_sizes must match the tile size dimensions") + def _adjust_one(dim_size, tile_size): for candidate in range(tile_size, 0, -1): if dim_size % candidate == 0: return candidate return 1 - if len(dim_sizes) != len(self._tile_size): - raise ValueError("dim_sizes must match the tile size dimensions.") candidate_tile_size = [_adjust_one(d, t) for d, t in zip(dim_sizes, self._tile_size)] - # FIXME. Is this the only solution? 
- # Round up - remain = candidate_tile_size[self.vlane_split_axis] % self.vlane_stride + for i in range(len(candidate_tile_size)): + self.tile_constraint[i].must_divide_dim = True + + axis, stride = self.vmap.vlane_split_axis, self.vmap.vlane_stride + remain = candidate_tile_size[axis] % stride + if remain: - candidate_tile_size[self.vlane_split_axis] += self.vlane_stride - remain + candidate_tile_size[axis] += stride - remain + self.tile_constraint[axis].must_divide_dim = False return candidate_tile_size + def scale_tile_dim(self, axis, dim_sz, scale_factor=2): + axis_constrinat = self.tile_constraint[axis] + current_sz = self._tile_size[axis] + new_sz = axis_constrinat.adjust(current_sz, int(current_sz * scale_factor), dim_sz) + self._tile_size[axis] = new_sz + self.update_tile_stride() + return current_sz != new_sz + + def decrease_tile_size(self, dim_size): + tile_size = self._tile_size + vlane_split_axis, vlane_stride, vector_lane = self.vmap.vlane_split_axis, self.vmap.vlane_stride, self.vmap.vector_lane + tile_size = list(tile_size) + + # Decrease vlane_split_axis when it is too large + if tile_size[vlane_split_axis] > 2 * vlane_stride * vector_lane: + if self.scale_tile_dim(vlane_split_axis, dim_size[vlane_split_axis], scale_factor=0.5): + return + + for i in range(len(tile_size)): + if i == vlane_split_axis: + continue + if tile_size[i] > 1: + if self.scale_tile_dim(i, dim_size[i], scale_factor=0.5): + return + + # Decrease vlane_split_axis at the end to maximize the vlane usage + self.scale_tile_dim(vlane_split_axis, dim_size[vlane_split_axis], scale_factor=0.5) + return + + def trim_large_tail(self, ranges: list[int]): + for i, (dim_range, tile_range) in enumerate(zip(ranges, self._tile_size)): + ALPHA = 1.0 + BETA = 0.5 + constraint = self.tile_constraint[i] + if constraint.fixed: + continue + elif constraint.must_divide_dim: + BETA = 0 + + padding_ratio = TileAdjustMixin.get_padding_ratio(tile_range, dim_range) + if padding_ratio < 
self.tail_ratio_threshold: + continue + best_tile = tile_range + best_cost = ( + ALPHA * padding_ratio + + BETA * (dim_range / tile_range) + ) + + min_tile = 1 + for candidate in range(tile_range - 1, min_tile - 1, -1): + new_candidate = constraint.adjust(tile_range, candidate, dim_range) + ratio = TileAdjustMixin.get_padding_ratio(new_candidate, dim_range) + iter_penalty = (dim_range / new_candidate) + + cost = ALPHA * ratio + BETA * iter_penalty + if cost < best_cost: + best_tile, best_cost = new_candidate, cost + self._tile_size[i] = best_tile + + def select_vlane_axis(self): + best_vlane_split_axis = 0 + best_used_vlane = math.ceil(self._tile_size[0] / self.vmap.vlane_stride) + for i, dim in enumerate(self._tile_size[1:len(self._tile_size)-self.nr_rdim]): + used_vlane = math.ceil(dim / self.vmap.vlane_stride) + if used_vlane > best_used_vlane: + best_used_vlane = used_vlane + best_vlane_split_axis = i+1 + self.vmap.vlane_split_axis = best_vlane_split_axis + + def pad_vlane_tile(self): + # FIXME. this doesn't follow tile constraints... 
+ vlane_split_axis, vlane_stride, vector_lane = self.vmap.vlane_split_axis, self.vmap.vlane_stride, self.vmap.vector_lane + used_vlane = min(math.ceil(self._tile_size[vlane_split_axis] / vlane_stride), vector_lane) + padded_size = used_vlane * vlane_stride + self._tile_size[vlane_split_axis] = math.ceil(self._tile_size[vlane_split_axis] / padded_size) * padded_size + + def apply_constraints(self, constraints, ranges): + for idx, (axis_constraints, axis_size) in enumerate(zip(constraints.values(), ranges)): + for const in axis_constraints: + if const.args[1] == 1: + continue + divider = int(const.args[1]) + + if not self.tile_constraint[idx].fixed: + self.tile_constraint[idx].fixed = True + self._tile_size[idx] = divider + elif self.tile_constraint[idx].fixed and self._tile_size[idx] > divider: + self._tile_size[idx] = divider + self.update_tile_stride() + + @staticmethod + def init_tile_size(ranges, vlane_stride, vector_lane): + nr_dim = len(ranges) + tile_size = [1] * nr_dim + if len(tile_size) == 2: + tile_size[-1] = vlane_stride * vector_lane + tile_size[-2] = 2 * vector_lane + elif len(tile_size) == 0: # Scalar + tile_size = [1] + ranges = [1] + elif len(tile_size) == 1 and ranges[0]==1: + tile_size[0] = 1 + elif len(tile_size) == 1: + tile_size[0] = 2 * vlane_stride * vector_lane + elif len(tile_size) == 3: + tile_size[-1] = vector_lane + tile_size[-2] = 4 * vector_lane + tile_size[-3] = 2 + elif len(tile_size) == 4: + tile_size[-1] = vector_lane + tile_size[-2] = 4 * vector_lane + tile_size[-3] = 2 + tile_size[-4] = 1 + else: + raise NotImplementedError("dummy tile size fail!") + return tile_size + + @staticmethod + def get_padding_ratio(tile_range: int, dim_range: int) -> float: + if tile_range <= 0 or dim_range <= 0: + raise ValueError("tile_range and dim_range must be positive integers") + tail = dim_range % tile_range + padding = (tile_range - tail) % tile_range + return float(padding / dim_range) + +@dataclass +class TileConstraint: + multiple_of: int = 
1 + must_divide_dim: bool = False + fixed: bool = False + + def adjust(self, old: int, new: int, dim: int) -> int: + if self.fixed: + return old # Fixed tile size + + tail = new % self.multiple_of + new -= tail + if not self.must_divide_dim: + return max(new, self.multiple_of) + + while new > 0: + if dim % new == 0: + return new + new -= self.multiple_of + raise extension_codecache.TileSizeError("Cannot find suitable tile size under the given constraints.") + +class MLIRMultiDimTile(TileAdjustMixin): + def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None, forced_vec_size=None): + super().__init__() + self.name = "" + self._tile_size = list(tile_size) + self._tile_stride = None + self.tile_constraint = [TileConstraint(vlane_stride) for _ in tile_size] + self.tile_axis_order = list(range(len(tile_size))) + self.update_tile_stride() + + # Vector lane mapping config + self.vmap = VectorLaneMapping( + vector_lane=vector_lane, + forced_vec_size=forced_vec_size, + vlane_split_axis=vlane_split_axis, + vlane_stride=vlane_stride + ) + + self.implicit_dim_size = None + self.nr_rdim = 0 + self.offset = sympy.Integer(0) # Dram offset + + def set_name(self, name: str): self.name = name + def get_name(self) -> str: return self.name + def get_tile_size(self): return list(self._tile_size) + def get_tile_stride(self): return list(self._tile_stride) + def get_numel(self) -> int :return math.prod(self._tile_size) + def get_nr_dim(self) -> str: return len(self._tile_size) + def get_reduction_numel(self): return reduce(mul, self.get_tile_size()[-1*self.nr_rdim:], 1) + + def set_tile_size(self, tile_size, tile_axis_order=None, constraints=None): + self._tile_size = list(tile_size) + self.tile_axis_order = list(range(len(tile_size))) if tile_axis_order is None else tile_axis_order + self.update_tile_stride() + + def set_tile_size_stride(self, tile_size, tile_stride): + self._tile_size = list(tile_size) + self._tile_stride = list(tile_stride) + + def 
update_tile_stride(self): + strides = [1] * len(self._tile_size) + init = 1 + + original_indices = list(range(len(self.tile_axis_order))) + sorted_pairs = sorted( + zip(self.tile_axis_order, self._tile_size, original_indices), + key=lambda x: x[0], reverse=True + ) + for _, size, original_indices in sorted_pairs: + strides[original_indices] = init + init *= size + self._tile_stride = strides + + def get_dim_size(self, index): + if isinstance(index, int): + return self._tile_size[index] + elif "index" in str(index): + return self._tile_size[int(str(index)[5:])] + raise NotImplementedError("Unsupported format of index") + + # Vector mapping delegation + def get_tile_size_per_lane(self): return self.vmap.get_tile_size_per_lane(self._tile_size) + def get_used_vlane(self): return self.vmap.get_used_vlane(self._tile_size) + def get_numel_per_lane(self): return self.vmap.get_numel_per_lane(self._tile_size) + def get_tile_stride_per_lane(self): return self.vmap.get_tile_stride_per_lane(self._tile_size, self._tile_stride) + def get_compute_vec_size(self): return self.vmap.get_compute_vec_size(self._tile_size, self.get_reduction_numel(), self.nr_rdim) + + # Helper functions for codegen + def get_mlir_shape(self, dtype): + shape = "x".join([str(dim) for dim in self._tile_size]) + return f"memref<{shape}x{dtype}, 1>" + + def get_mlir_vshape(self, mlir_dtype): + return f"vector<{self.get_compute_vec_size()}x{mlir_dtype}>" if self.get_compute_vec_size() > 1 else f"{mlir_dtype}" + class MLIRWrapperKenrelGroup(cpp.KernelGroup): def __init__(self): super().__init__() @@ -525,191 +650,96 @@ def call_kernel(self, kernel_name): def is_modular_indexing(self, expr): return "ModularIndexing" in str(expr) - def compute_tile_size(self, nodes, vars, reduction_vars): - # Handle implict dims. Input operand could have larger dimension space. 
- implicit_ranges = False - target_operand : MemoryDep = None - implicit_dim_size = defaultdict(list) - for read_operand in nodes[0].read_writes.reads: - read_operand : MemoryDep - if isinstance(read_operand, StarDep) or isinstance(read_operand, WeakDep): # FIXME: WeakDep & StarDep are not supported (MoE case) - continue - read_index = read_operand.index - for arg in read_index.args: - if "ModularIndexing" in str(arg) or "//" in str(arg): - implicit_ranges = True - target_operand = read_operand - break - - if implicit_ranges: - #print("This operation contain implicit dimension space!") - linearized_stride = [1] * len(target_operand.var_names) - for i in range(len(target_operand[3])-2, -1, -1): - linearized_stride[i] = linearized_stride[i+1] * target_operand[3][i+1] - - linearized_index = sympy.Integer(0) - for dim, stride in zip(target_operand[2], linearized_stride): - linearized_index += stride * dim - - new_dim_expression = [] - new_dim_size = [] - for arg in target_operand.index.args: + def implicit_dim_ops(self, nodes): + target_patterns = (ModularIndexing, FloorDiv, Mod) + target_operands = [] + for target_node in nodes: + for read_operand in target_node.read_writes.reads: + read_operand: MemoryDep + if isinstance(read_operand, StarDep) or isinstance(read_operand, WeakDep): + continue + read_index = read_operand.index + for arg_expr in read_index.args: + if arg_expr.atoms(*target_patterns): + target_operands.append(read_operand) + return target_operands + + def extract_dividers(self, implicit_ops): + # When a specific axis is processed, the key constraint to verify is the divider. + # The tile size must be forced to match the divider size. 
+ dim_dividers = defaultdict(set) + for operand in implicit_ops: + subs_map = { + s: sympy.symbols(s.name.replace("c", "index", 1)) + for s in operand.index.free_symbols + } + rev_subs_map = { + sympy.symbols(s.name.replace("c", "index", 1)) : s + for s in operand.index.free_symbols + } + new_index = operand.index.subs(subs_map) + for arg in new_index.args: if len(arg.free_symbols) != 1: raise NotImplementedError("Not supporting this view operation...!") - if arg.is_Mul and arg.args[0].is_number: arg = arg.args[1] if isinstance(arg, ModularIndexing): modular_expr = ModularIndexing(arg.args[0], arg.args[1], arg.args[2]) + modular_expr.original_expr = arg elif arg.is_symbol: - modular_expr = ModularIndexing(arg, 1, target_operand.ranges[arg]) + modular_expr = ModularIndexing(arg, 1, operand.ranges[rev_subs_map[arg]]) + modular_expr.original_expr = arg elif "//" in str(arg): - modular_expr = ModularIndexing(arg.args[0], arg.args[1], target_operand.ranges[arg.args[0]]//arg.args[1]) + modular_expr = ModularIndexing(arg.args[0], arg.args[1], operand.ranges[rev_subs_map[arg.args[0]]]//arg.args[1]) + modular_expr.original_expr = arg else: raise NotImplementedError("What is this case?") - new_dim_expression.append(modular_expr) - new_dim_size.append(modular_expr.args[2]) - implicit_dim_size[int(str(modular_expr.args[0])[1:])].append(int(modular_expr.args[2])) - - # Sanity check - for dim, sub_dims in implicit_dim_size.items(): - sz = reduce(mul, sub_dims, 1) - if sz != target_operand[3][dim]: - raise NotImplementedError("Not supporting type...") - - vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop - - # FIXME: Naive decrease tile size - def decrease_tile_size(tile_size, vlane_split_axis): - is_decreased = False - - # Decrease vlane_split_axis when it is too large - if tile_size[vlane_split_axis] > vlane_stride * self.vector_lane: - tile_size[vlane_split_axis] = int(tile_size[vlane_split_axis] // 2) - return tile_size - - for i in 
range(len(tile_size)): - if i == vlane_split_axis: - continue - if tile_size[i] > 1: - tile_size[i] = int(tile_size[i] // 2) - is_decreased = True - break - - # Decrease vlane_split_axis at the end to maximize the vlane usage - if not is_decreased: - if tile_size[vlane_split_axis] > 1: - tile_size[vlane_split_axis] = int(tile_size[vlane_split_axis] // 2) - return tile_size - - # Dummy tile size - def dummy_tile_size(): - tile_size = [1] * (len(vars) + len(reduction_vars)) - if len(tile_size) == 2: - tile_size[-1] = vlane_stride * self.vector_lane - tile_size[-2] = 2 * self.vector_lane - elif len(tile_size) == 0: # Scalar - tile_size = [1] - self.ranges = [1] - elif len(tile_size) == 1: - tile_size[0] = 2 * vlane_stride * self.vector_lane - elif len(tile_size) == 3: - tile_size[-1] = self.vector_lane - tile_size[-2] = 4 * self.vector_lane - tile_size[-3] = 2 - elif len(tile_size) == 4: - tile_size[-1] = self.vector_lane - tile_size[-2] = 4 * self.vector_lane - tile_size[-3] = 2 - tile_size[-4] = 1 - else: - raise NotImplementedError("dummy tile size fail!") - return tile_size + dim_dividers[modular_expr.args[0]].add(modular_expr) + return dim_dividers + def compute_tile_size(self, nodes, vars, reduction_vars): + vlane_split_axis = len(vars) - 1 vlane_stride = extension_config.CONFIG_VECTOR_LANE_STRIDE - if self.recodegen is None: - tile_size = dummy_tile_size() - else: + + # Set initial tile size & vector lane mapping + if self.kernel_group.tile_desc is None: + tile_size = MLIRMultiDimTile.init_tile_size(self.ranges, vlane_stride, self.vector_lane) + init_tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane, vlane_split_axis, vlane_stride) + init_tile_desc.nr_rdim = len(reduction_vars) + self.kernel_group.set_tile_info(init_tile_desc) + + # Handle edge case + if len(self.ranges)==1 and self.ranges[0] == 1: # Scalar case 2 + self.kernel_group.tile_desc.vmap.vlane_stride = 1 + self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 + elif vlane_split_axis == -1: # 
Reduction only case + self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 + self.kernel_group.tile_desc.vmap.vlane_stride = self.kernel_group.tile_desc.get_tile_size()[0] + + # Handle implict dims. Input operand could be high dimension tensor. + # Note: https://github.com/PSAL-POSTECH/PyTorchSim/issues/173 + implicit_ops = self.implicit_dim_ops(nodes) + if implicit_ops: + tile_constraints = self.extract_dividers(implicit_ops) + self.kernel_group.tile_desc.apply_constraints(tile_constraints, self.ranges) + self.kernel_group.tile_desc.implicit_dim_size = tile_constraints + + # Check recodegen reason + if self.recodegen is not None: if self.recodegen == "spad_overflow": - tile_size = self.kernel_group.tile_desc.get_tile_size() - decrease_tile_size(tile_size, vlane_split_axis) - elif self.recodegen == "vlane_stride": - tile_size = dummy_tile_size() - elif "tile_size" in self.recodegen: - dim = int(self.recodegen.split("_")[-1]) - tile_size = self.kernel_group.tile_desc.get_tile_size() # TODO: - tile_size[dim] = tile_size[dim] * 2 + self.kernel_group.tile_desc.decrease_tile_size(self.ranges) elif self.recodegen == "recompile": return self.kernel_group.tile_desc else: raise NotImplementedError(f"Unknown recodegen reason: {self.recodegen}") - # FIXME: Not considering removed buffers - n_buffer = sum( - len(node.read_writes.reads) + len(node.read_writes.writes) - for node in nodes - ) - - spad_overflow = True - # Find proper tile size - while spad_overflow: - # Adjust tile size to avoid too much paddings - for i in range(1, len(tile_size)+1): - target_range = self.ranges[-i] - if implicit_ranges: - target_range = implicit_dim_size[len(tile_size)-i][-1] - - if tile_size[-i] > target_range: - remains = (target_range % vlane_stride) - self.stop_autotune = True - tile_size[-i] = target_range - if remains: - tile_size[-i] += vlane_stride - remains - - # Adjust tile size - for i in range(len(vars)): - if tile_size[i] >= self.vector_lane: # maximize used vector lane - 
vlane_split_axis = i - used_vlane = min((tile_size[vlane_split_axis] + vlane_stride - 1) // vlane_stride, self.vector_lane) - padded_size = used_vlane * vlane_stride - tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size - - # Check spad overflow - spad_usage_per_vlane = n_buffer * math.prod(tile_size) * self.precision // used_vlane - if spad_usage_per_vlane >= self.spad_info["spad_size"]: - new_tile_size = decrease_tile_size(tile_size.copy(), vlane_split_axis) - if new_tile_size == tile_size: - raise NotImplementedError("Error: Cannot find proper tile size") - tile_size = new_tile_size - spad_overflow = True - self.stop_autotune = True # for auto-tune - continue - else: - spad_overflow = False - - # Maximize the utilizaiotn of vectorlane - if len(reduction_vars): - minimum_stride = max(self.roundup_vectorlane(tile_size[vlane_split_axis]) // self.vector_lane, 2) - vlane_stride = min(minimum_stride, 8) - - # Handle scalar case - if len(self.ranges)==1 and self.ranges[0] == 1: - vlane_stride = 1 - vlane_split_axis = 0 - tile_size[0] = 1 - elif vlane_split_axis == -1: - vlane_split_axis = 0 - vlane_stride = tile_size[0] - - # Select tile info. 
- # Note: Kernel Group have to share same tile desc for fusion - tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane) - tile_desc.vlane_split_axis = vlane_split_axis - tile_desc.vlane_stride = vlane_stride - tile_desc.implicit_dim_size = implicit_dim_size - tile_desc.nr_rdim = len(reduction_vars) - return tile_desc + # Adjust tile size & vector lane mapping + self.kernel_group.tile_desc.trim_large_tail(self.ranges) + self.kernel_group.tile_desc.select_vlane_axis() + self.kernel_group.tile_desc.pad_vlane_tile() + self.kernel_group.tile_desc.update_tile_stride() + return self.kernel_group.tile_desc def codegen_nodes(self, nodes, kernel_name): recompile_try = 0 @@ -724,7 +754,6 @@ def codegen_nodes(self, nodes, kernel_name): tile_desc = self.compute_tile_size(nodes, vars, reduction_vars) self.compute_body_loop.size = tile_desc.get_numel_per_lane() self.compute_body_loop.step = tile_desc.get_compute_vec_size() - self.kernel_group.set_tile_info(tile_desc) try: _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs() with self as kernel: @@ -743,29 +772,6 @@ def codegen_nodes(self, nodes, kernel_name): self.meta_kernel() return src_code - def run_bench(self, nodes, kernel_name, src_code): - _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() - input_call_args = tuple(self.args.input_buffers.keys()) - output_call_args = tuple(self.args.output_buffers.keys()) - full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) - full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) - - bmreq = MLIRBenchmarkRequest( - kernel_name=kernel_name, - input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), - output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), - extra_args={ - "vector_lane" : self.vector_lane, - "spad_info": self.spad_info, - "vlen" : self.vlen, - "arg_attributes" : arg_attributes - }, - source_code=src_code, - ) - dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, 
extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] - dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] - return bmreq.make_run_fn(dummy_inputs, dummy_outputs) - def codegen_kernel(self, kernel_name): arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs() arg_defs = ",\n".ljust(25).join(arg_defs) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py new file mode 100644 index 00000000..77826730 --- /dev/null +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -0,0 +1,120 @@ +import os +import math +from typing import List, Optional + +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs +from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel +from torch._inductor.ir import IRNode +from PyTorchSimFrontend import extension_config + +class MLIRConvCommonTemplate(MLIRTemplate): + WRAPPER_TEMPLATE = None + def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): + super().__init__("kernel", input_nodes, layout, input_reorder) + self.stride = kwargs["stride"] + self.padding = kwargs["padding"] + self.dilation = kwargs["dilation"] + self.weight_shape = [str(i) for i in input_nodes[1].layout.size] + self.input_shape = [str(i) for i in input_nodes[0].layout.size] + self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ + + "_".join([str(i) for i in self.stride]) \ + + "_" + "_".join([str(i) for i in self.padding]) \ + + "_" + "_".join([str(i) for i in self.dilation]) + self.kernel_args = ['X', 'W', 'Bias', 'Y'] + + def get_padded_input_size(self, X): + input_padded = list(X.layout.size) + input_padded[2] += 2 * self.padding[0] + input_padded[3] += 2 * self.padding[1] + return math.prod(input_padded) + + def render(self, + 
kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, + **kwargs): + raise NotImplementedError() + + def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): + raise NotImplementedError() + + def extract_info(self, kernel, template_buffer_node, epilogue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + self.kernel = kernel + self.epilogue_nodes = epilogue_nodes + + X, W = self.input_nodes[0], self.input_nodes[1] + Y = self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + if epilogue_nodes is not None: + extra_node_rw = { + item.name for epilogue_node in epilogue_nodes + for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes + if item.name != Y.name + } + n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 + + BATCH, I_C, I_H, I_W = X.layout.size + O_C, _, K_H, K_W = W.layout.size + O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] + O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] + PADDING_H=self.padding[0] + PADDING_W=self.padding[1] + STRIDE_H=self.stride[0] + STRIDE_W=self.stride[1] + return X,W,Y,Bias,n_extra_node,BATCH,I_C,I_H,I_W,O_C,K_H,K_W,O_H,O_W,PADDING_H,PADDING_W,STRIDE_H,STRIDE_W + + def get_tile_candidates(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + **kwargs): + # Extract input arguments info + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) + return self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + + def outer_func_render(self, kernel_name, input_args): + X, W = self.input_nodes[0], self.input_nodes[1] + Y = 
self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) + options = dict( + kernel=self.kernel, + KERNEL_NAME=kernel_name, + FUNC_NAME=self.function_name + f"_{len(input_args)}", + INPUT=X, + WEIGHT=W, + BIAS=Bias, + OUTPUT=Y, + PADDING_H=self.padding[0], + PADDING_W=self.padding[1], + VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + TOGSIM_EAGER_MODE=eager_mode, + input_reorder=self.input_reorder + ) + code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options) + return code, self.function_name + f"_{len(input_args)}" + + def get_arg_attributes(self): + arg_attributes = [] + + X = self.input_nodes[0] + X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] + X_shape[0] += 2 * self.padding[0] + X_shape[1] += 2 * self.padding[1] + + def compute_stride(shape): + stride = [1] * len(shape) + for i in range(len(shape)-2, -1, -1): + stride[i] = stride[i+1] * shape[i+1] + return stride + + X_stride = compute_stride(X_shape) + arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) + + return arg_attributes diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 6dd17576..0bf01421 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -1,17 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as 
extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Multi Channel Tile Conv2D kernel @@ -104,7 +97,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class MLIRConvMultiTileTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -126,67 +120,30 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvMultiTileTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], 
self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -284,69 +241,13 @@ def 
render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) - SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_K = TILE_K - - TOG_latency = O_W if TILE_M > O_W else TILE_M - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - 
X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file + tile_candidates = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index 8b1bf7c5..92b9a525 100644 --- 
a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -1,17 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Single Batch Conv2D kernel @@ -105,7 +98,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class MLIRConvSingleBatchTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -127,67 +121,30 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvSingleBatchTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] 
- - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, 
TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] # Prepare tile descriptors @@ -283,66 +240,13 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - SUB_TILE_K = TILE_K - TOG_latency = O_W if TILE_M > O_W else TILE_M - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - 
INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file + tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + 
SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index 2284c86c..ab124852 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -1,17 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Single Batch Conv2D (Stride != 1) kernel @@ -105,7 +98,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class MLIRConvSingleBatchStridedTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -127,67 +121,30 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvSingleBatchStridedTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", 
input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, 
I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -284,66 +241,13 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - SUB_TILE_K = TILE_K - TOG_latency = O_W if TILE_M > O_W else TILE_M - return 
TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - 
self.hash_value = get_hash(code.strip()) \ No newline at end of file + tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 890b76b7..66aa0a27 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -1,17 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Conv2D kernel @@ -109,7 +102,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class 
MLIRConvTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -131,67 +125,29 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if 
item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) - SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info + TOG_latency = BATCH if TILE_M > BATCH else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -289,68 +245,14 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) - 
SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - SUB_TILE_K = TILE_K - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N - TOG_latency = BATCH if TILE_M > BATCH else TILE_M - TOG_latency = 8 if TOG_latency < 8 else TOG_latency - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, 
[MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file + tile_candidates = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index ae793c06..6271b548 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -1,4 +1,3 @@ -import os import json from pathlib import Path from torch import empty_strided @@ -8,8 +7,6 @@ from PyTorchSimFrontend.mlir.mlir_template import 
MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir import mlir_common @@ -114,30 +111,13 @@ def render(self, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, prologue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): - if template_buffer_node is not None: - self.output_node = template_buffer_node - - # Extract input arguments info - X, W, Y = self.input_nodes[0], self.input_nodes[1], self.output_node - X_tensor = empty_strided(X.layout.size, X.layout.stride) - W_tensor = empty_strided(W.layout.size, W.layout.stride) - if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2: - raise NotImplementedError("Please report this case to us...") - - # Extract fusion info - n_epilogue_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 - n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 - n_extra_read = set() - if epilogue_nodes is not None: - for enode in epilogue_nodes: - n_extra_read.update(enode.node.get_read_names()) - if self.output_node.name in n_extra_read: - n_extra_read.remove(self.output_node.name) - - # Select tile size - M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1] - TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) + X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + if tile_info is None: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node)[0] + else: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info 
# Select template code if (M == 0) or (N == 0) or (K == 0): # exception for MoE @@ -275,12 +255,47 @@ def render(self, dram_idx = Y_idx, dram_tile_desc = Y_tile_desc, nr_rdim = nr_rdim, + r_dim_size = M, dim_aliasing = epilogue_dim_aliasing ) code = self._template_from_string(template).render(**kernel.render_options) kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code + def get_tile_candidates(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + prologue_nodes: Optional[List[IRNode]] = None, + **kwargs): + X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + return self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) + + def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + + # Extract input arguments info + X, W, Y = self.input_nodes[0], self.input_nodes[1], self.output_node + X_tensor = empty_strided(X.layout.size, X.layout.stride) + W_tensor = empty_strided(W.layout.size, W.layout.stride) + if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2: + raise NotImplementedError("Please report this case to us...") + + # Extract fusion info + n_epilogue_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 + n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 + n_extra_read = set() + if epilogue_nodes is not None: + for enode in epilogue_nodes: + n_extra_read.update(enode.node.get_read_names()) + if self.output_node.name in n_extra_read: + n_extra_read.remove(self.output_node.name) + + # Select tile size + M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1] + return 
X,W,Y,M,N,K,n_epilogue_node,n_prologue_node,len(n_extra_read) + def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): # Check cheat sheet cheatsheet_path = extension_config.CONFIG_GEMM_CHEATSHEET_PATH @@ -292,52 +307,49 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no data = json.load(f) gemm_shape = f"{M}_{K}_{N}" - if gemm_shape in data: + if extension_config.CONFIG_MANUAL_TILE_SIZE: + # case 1: use manual tile size + TILE_M = extension_config.CONFIG_TILE_M + TILE_N = extension_config.CONFIG_TILE_N + TILE_K = extension_config.CONFIG_TILE_K + tile_candidates = [[TILE_M, TILE_N, TILE_K]] + elif gemm_shape in data: + # case 2: cached tile size tile_info = data[gemm_shape] TILE_M = tile_info["TILE_M"] TILE_N = tile_info["TILE_N"] TILE_K = tile_info["TILE_K"] - else: # case 2: use gemm_combination_mapping + tile_candidates = [[TILE_M, TILE_N, TILE_K]] + else: + # case 3: use gemm_combination_mapping min_tile = (n_extra_node + n_prologue_node) == 0 - TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=True) - # case 3: use manual tile size - if extension_config.CONFIG_MANUAL_TILE_SIZE: - TILE_M = extension_config.CONFIG_TILE_M - TILE_N = extension_config.CONFIG_TILE_N - TILE_K = extension_config.CONFIG_TILE_K + tile_candidates = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True) # Edge case if (M == 0) or (N == 0) or (K == 0): TILE_M, TILE_N, TILE_K = 1, 1, 1 + tile_candidates = [[TILE_M, TILE_N, TILE_K]] - # Calculate Sub Tile Size for fine-grained DMA - if extension_config.CONFIG_SUBTILE: - # Case 1: adjust selective fine-grained DMA (SFG-DMA) - SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane - if (TILE_M == M and TILE_N == N and TILE_N <= 512): - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - else: # Avoid Row 
Conflict of weights + full_tile_candidates = [] + for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + # Calculate Sub Tile Size for fine-grained DMA + if extension_config.CONFIG_SUBTILE: + # Case 1: adjust selective fine-grained DMA (SFG-DMA) + SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane + if (TILE_M == M and TILE_N == N and TILE_N <= 512): + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + else: # Avoid Row Conflict of weights + SUB_TILE_N = TILE_N + SUB_TILE_K = TILE_K + # Case 2: use manual sub tile size (FG-DMA) + if extension_config.CONFIG_MANUAL_SUBTILE_SIZE: + SUB_TILE_M = extension_config.CONFIG_SUBTILE_M + SUB_TILE_N = extension_config.CONFIG_SUBTILE_N + SUB_TILE_K = extension_config.CONFIG_SUBTILE_K + # Case 3: None Subtile + else: + SUB_TILE_M = TILE_M SUB_TILE_N = TILE_N - SUB_TILE_K = TILE_K - # Case 2: use manual sub tile size (FG-DMA) - if extension_config.CONFIG_MANUAL_SUBTILE_SIZE: - SUB_TILE_M = extension_config.CONFIG_SUBTILE_M - SUB_TILE_N = extension_config.CONFIG_SUBTILE_N - SUB_TILE_K = extension_config.CONFIG_SUBTILE_K - # Case 3: None Subtile - else: - SUB_TILE_M = TILE_M - SUB_TILE_N = TILE_N - SUB_TILE_K = TILE_K - return TILE_M,TILE_N,TILE_K, SUB_TILE_M,SUB_TILE_N,SUB_TILE_K - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) + SUB_TILE_K = TILE_K + full_tile_candidates.append([TILE_M,TILE_N,TILE_K, SUB_TILE_M,SUB_TILE_N,SUB_TILE_K]) + return full_tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py 
b/PyTorchSimFrontend/mlir/mlir_lowering.py index 6508ea86..af59d88f 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -15,7 +15,7 @@ from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate -from PyTorchSimFrontend.extension_config import CONFIG_VECTOR_LANE, CONFIG_USE_TIMING_POOLING +from PyTorchSimFrontend import extension_config aten = torch.ops.aten aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm") @@ -106,11 +106,11 @@ def convolution( layout = conv_layout(x, weight, None, **kwargs) # Select conv kernel - if BATCH == 1 and stride[0] == 1: + if BATCH == 1 and stride[0] == 1 and extension_config.CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchTemplate([x, weight, bias], layout, **kwargs) - elif BATCH == 1 and stride[0] != 1: + elif BATCH == 1 and stride[0] != 1 and extension_config.CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchStridedTemplate([x, weight, bias], layout, **kwargs) - elif I_C < CONFIG_VECTOR_LANE // 8: # 8 is hard-coded for now. This should be changed to a better heuristic. + elif I_C < extension_config.CONFIG_VECTOR_LANE // 8 and extension_config.CONFIG_MULTI_TILE_CONV: # 8 is hard-coded for now. This should be changed to a better heuristic. 
mlir_template = MLIRConvMultiTileTemplate([x, weight, bias], layout, **kwargs) else: mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs) @@ -187,5 +187,5 @@ def custom_unsafe_index(x, indices): lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()}) lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()}) lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()}) -if CONFIG_USE_TIMING_POOLING: +if extension_config.CONFIG_USE_TIMING_POOLING: lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py index 6f605d56..3658f992 100644 --- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py +++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py @@ -1,13 +1,9 @@ -import os from typing import List, Optional, cast from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import Buffer from torch._inductor.ir import IRNode -from torch._inductor.ir import ReinterpretView -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common import sympy @@ -42,6 +38,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): if template_buffer_node is not None: self.output_node = template_buffer_node @@ -99,14 +96,3 @@ def render(self, code = self._template_from_string(TEMPLATE).render(**kernel.render_options) 
kernel.add_loop_info([X.get_numel()], [kernel.vector_lane, kernel.vector_lane]) return code - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 2bbdb41d..38603319 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -3,7 +3,7 @@ import sympy from functools import reduce import operator -from sympy import symbols, sympify, Symbol +from sympy import symbols, sympify from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel @@ -94,6 +94,8 @@ def can_fuse_vertical(self, node1, node2): return self.can_fuse_horizontal(node1, node2) def can_fuse_horizontal(self, node1, node2): + if not extension_config.CONFIG_FUSION: + return False if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size: return False _, (vars1, reduce1) = node1.group @@ -214,7 +216,7 @@ def codegen_nodes(self, nodes): ex_kernel.call_kernel(kernel_name) _, args, _, _ = ex_kernel.args.mlir_argdefs() args = ", ".join(args) - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): V.graph.wrapper_code.writeline( f"yield ({kernel_name}, ({args}))" @@ -259,85 +261,6 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False) 
return kernel_name - def codegen_template_code(self, kernel, render, template_node, prologue_nodes, epilogue_nodes): - with kernel: - _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() - for node in [template_node, *prologue_nodes, *epilogue_nodes]: - node.mark_run() - # Partial codgen template nodes - partial_code = render() - - # Swap load/store functions - kernel.load = kernel.load_epilogue - kernel.store = kernel.store_epilogue - kernel.store_reduction = kernel.store_reduction_epilogue - kernel.reduction = kernel.reduction_epilogue - - # Codegen prologue nodes - if prologue_nodes: - # Flush created varaibles, since template fusion doen't share variable - with kernel.prologue_buffer_group.as_local(): - _, (group, reduction_group) = max( - [prologue_nodes[-1]], key=lambda x: int(x.is_reduction()) - ).group - prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True) - kernel.kernel_group.set_tile_info(prologue_tile_desc) - vars, reduction_vars = kernel.set_ranges(group, reduction_group) - for node in prologue_nodes: - # Reuse created spad - read_list = sorted([i.name for i in node.read_writes.reads]) - candidate_found = False - # Why? There is a case that memdep.get_size() != data.get_size() - buf_dict = {} - buf_dict.update({val.name : val for val in V.graph.buffers}) - buf_dict.update(V.graph.graph_inputs) - for candidate_read in read_list: - if candidate_read in buf_dict and reduce(operator.mul, buf_dict[candidate_read].get_size(), 1) == node.node.get_numel(): - prologue_input_arg = candidate_read - candidate_found = True - break - assert(candidate_found) - assert(len(node.read_writes.writes)==1) - prologue_output_arg = list(node.read_writes.writes)[0].name - template_buf = self.kernel_group.args.input_buffers[prologue_output_arg] - target_buf = f"{template_buf}_buffer" # FIXME. How to pass spad buffer name? 
- - # To skip the dma code gen - kernel.buffer_names[prologue_input_arg] = target_buf - kernel.buffer_names[prologue_output_arg] = target_buf - - # Edge delete - kernel.kernel_group.args.input_buffers = { - (arg if buf != template_buf else prologue_input_arg): buf - for arg, buf in kernel.kernel_group.args.input_buffers.items() - } - node.codegen((vars, reduction_vars)) - - # Codegen epilogue nodes - tile_desc = kernel.set_tile_size(kernel.epilogue_info) - kernel.kernel_group.set_tile_info(tile_desc) - kernel.call_ranges = None - if epilogue_nodes: - with kernel.epilogue_buffer_group.as_local(): - _, (group, reduction_group) = max( - epilogue_nodes, key=lambda x: int(x.is_reduction()) - ).group - vars, reduction_vars = kernel.set_ranges(group, reduction_group) - for node in epilogue_nodes: - node.codegen((vars, reduction_vars)) - - with V.set_kernel_handler(kernel): - src_code = ( - partial_code - if isinstance(partial_code, str) - else partial_code.finalize() - ) - - # For consistency, white space could make wrong write_path - buffer = IndentedBuffer() - buffer.splice(src_code) - return buffer.getvalue() - def codegen_template(self, template_node, epilogue_nodes): # Handle prologue pattern prologue_nodes = [] @@ -350,24 +273,13 @@ def codegen_template(self, template_node, epilogue_nodes): epilogue_nodes = epilogue_nodes[i+1:] break - _, (numel, rnumel) = template_node.group + # Generate template code template_buffer = template_node.node - kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) + kernel, tile_candidates, render = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() - - src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes) 
- wrapper = V.graph.wrapper_code - - if src_code in wrapper.src_to_kernel: # [CONV] check inner function is already defined - kernel_name = wrapper.src_to_kernel[src_code] - kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_name=kernel_name) # update kernel name - src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes) + src_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) with V.set_kernel_handler(kernel): - spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n" - spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({kernel.spad_info['spad_size']*kernel.vector_lane})));" - codegen_header(src_code, (kernel.header.getvalue()+spad_end_symbol+spad_section_end_symbol, kernel.gem5_header.getvalue())) - kernel.meta_kernel() kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info, kernel.loop_size, origins={str(i) for i in template_node.node.origins}) self.define_function(kernel) @@ -375,7 +287,7 @@ def codegen_template(self, template_node, epilogue_nodes): kernel.call_kernel(kernel_name) V.graph.removed_buffers |= kernel.removed_buffers _, args, _, _ = self.kernel_group.args.mlir_argdefs() - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name + f"_{len(args)}" args = ", ".join(args) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 820d5c0d..df3621eb 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -6,26 +6,30 @@ import contextlib import math import sympy +from 
functools import reduce +import operator from collections import OrderedDict from typing import List, Optional from unittest.mock import patch -from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE, DeferredLine -from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, View +from torch._inductor.codegen.common import KernelTemplate, ChoiceCaller, CSE, DeferredLine +from torch._inductor.ir import Buffer, IRNode, TemplateBuffer from torch._inductor.select_algorithm import PartialRender from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller from torch._inductor.autotune_process import TensorMeta from torch._inductor.virtualized import V, NullHandler, _ops as ops from torch._inductor.utils import IndentedBuffer +from torch._inductor.codecache import write_atomic +import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, reduction_partial_combine_vec, reduction_combine_vec, is_welford_reduction from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common -from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR +from PyTorchSimFrontend import extension_config from . 
import mlir_common class IndentedBufferGroup: @@ -93,7 +97,8 @@ def __init__(self, kernel_group = None, outer_func_name=None, outer_func_render=None, - kernel_arg_attributes=None) -> None: + kernel_arg_attributes=None, + reason=None) -> None: super().__init__(kernel_group if kernel_group is not None else mlir_common.MLIRWrapperKenrelGroup()) self.kernel_name = kernel_name self.input_nodes = input_nodes @@ -125,6 +130,15 @@ def __init__(self, self.reduction_mean = [] # Dim info self.dim_aliasing = {} + self.reason = reason + + def reset(self, reason): + self.__init__( + self.kernel_name, self.input_nodes, + self.call_size, self.kernel_group, + self.outer_func_name, self.outer_func_render, + self.kernel_arg_attributes, reason + ) def add_loop_info(self, mat_size, tile_size): for idx, (loop_size, stride) in enumerate(zip(mat_size, tile_size)): @@ -185,7 +199,8 @@ def gemmini_gemm_mapping(self, M, N, K): return inner_I, inner_J, inner_K - def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False): + def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False, is_conv=False): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 # double buffer @@ -219,7 +234,7 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) if check_spad_size: - dir_path = f"{CONFIG_TORCHSIM_DIR}/validation/gemm_candidates" + dir_path = f"{extension_config.CONFIG_TORCHSIM_DIR}/validation/gemm_candidates" os.makedirs(dir_path, exist_ok=True) file_path = f"{dir_path}/gemm_{M}_{K}_{N}.txt" line_to_write = f"{tile_M} {tile_K} {tile_N}\n" @@ -249,52 +264,22 @@ def 
gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p max_used_spad_size = used_spad_size maximize_i_j = tile_M * tile_N mapping = (tile_M, tile_N, tile_K) - return mapping - - def search_mapping_space(self, mapping, idx, increment, stride, dilation, n_extra_node=0): - if idx == 0 or idx == 1 or idx == 4 or idx == 5 or idx == 6: - raise NotImplementedError("Only O_H and O_W are supported for search_mapping_space") - spad_size_per_lane = self.spad_info["spad_size"] - spad_size = spad_size_per_lane * self.vector_lane - max_spad_size = spad_size // 2 # double buffer - max_spad_per_lane = spad_size_per_lane // 2 # double buffer - - mapping = list(mapping) - mapping[idx] += increment - k_h, k_w, o_h, o_w, M, N, K = mapping - i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0] - i_w = 1 + (o_w - 1) * stride[1] + (k_w - 1) * dilation[1] - weight_size = k_w * k_h * K * N - input_size = i_w * i_h * M * K - output_size = o_w * o_h * M * N - used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision - weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N) - input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) - output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) - used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane: - mapping = (k_h, k_w, o_h, o_w, M, N, K) - else: - mapping[idx] -= increment - - return mapping + if check_spad_size: + tile_candidates.append((used_spad_size, (tile_M, tile_N, tile_K))) - def pseudo_auto_tune(self, mapping, stride, dilation, O_H, O_W, n_extra_node=0): - # pseudo auto-tune - if mapping[2] == 1 and not (O_H == 1): - mapping = self.search_mapping_space(mapping, 2, 1, stride, dilation, n_extra_node=n_extra_node) - if mapping[3] == 1 and not (O_W == 1): - mapping = 
self.search_mapping_space(mapping, 3, 1, stride, dilation, n_extra_node=n_extra_node) - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 # double buffer max_spad_per_lane = spad_size_per_lane // 2 # double buffer max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False) + M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] max_k_h_w = 1 # maximize kernel size max_o_h_w = 1 # maximize output size K = min(K, self.vector_lane) @@ -312,27 +297,30 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w and max_o_h_w <= o_h * o_w: - max_used_spad_size = used_spad_size - max_k_h_w = k_h * k_w - max_o_h_w = o_h * o_w - mapping = (k_h, k_w, o_h, o_w, M, N, K) + check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) + if check_spad_size: + tile_candidates.append((used_spad_size, (k_h, k_w, o_h, o_w, M, N, K))) + if max_used_spad_size < used_spad_size and max_k_h_w <= k_h * k_w and max_o_h_w <= o_h * o_w: + max_used_spad_size = used_spad_size + max_k_h_w = k_h * k_w + max_o_h_w = o_h * o_w + mapping = (k_h, k_w, o_h, 
o_w, M, N, K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") - # FIXME: this should be implemented with auto-tuning - mapping = self.pseudo_auto_tune(mapping, stride, dilation, O_H, O_W, n_extra_node=n_extra_node) - - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False) + M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] max_k_h_w = K_W for o_h in sympy.divisors(O_H): for o_w in sympy.divisors(O_W): @@ -347,22 +335,28 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h: - max_used_spad_size = used_spad_size - max_k_h_w = k_h - mapping = (k_h, K_W, o_h, o_w, M, N, K) + check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) + if check_spad_size: + tile_candidates.append((used_spad_size, (k_h, K_W, o_h, o_w, M, N, K))) + if max_used_spad_size < used_spad_size and max_k_h_w <= k_h: + max_used_spad_size = used_spad_size + max_k_h_w = k_h + mapping = (k_h, K_W, o_h, o_w, M, N, 
K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False) + M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] max_k_h_w = 1 for o_h in sympy.divisors(O_H): for k_h in sympy.divisors(K_H): @@ -377,13 +371,18 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * k_w, K) output_size_per_lane = self.get_spad_size_per_lane(M * o_h * (1 + n_extra_node), N) used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w: - max_used_spad_size = used_spad_size - max_k_h_w = k_h * k_w - mapping = (k_h, k_w, o_h, M, M, N, K) + check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) + if check_spad_size: + tile_candidates.append((used_spad_size, (k_h, k_w, o_h, M, M, N, K))) + if max_used_spad_size < used_spad_size and max_k_h_w <= k_h * k_w: + max_used_spad_size = used_spad_size + max_k_h_w = k_h * k_w + mapping = (k_h, k_w, o_h, M, M, N, K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: 
x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def meta_kernel(self): wrapper = V.graph.wrapper_code @@ -407,6 +406,131 @@ def call_kernel(self, kernel_name): kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args, cuda=False) + def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info): + with self as kernel: + _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() + for node in [template_node, *prologue_nodes, *epilogue_nodes]: + node.mark_run() + + # Partial codgen template nodes + partial_code = render(kwargs={**render.keywords['kwargs'], 'tile_info': tile_info}) + + # Swap load/store functions + kernel.load = kernel.load_epilogue + kernel.store = kernel.store_epilogue + kernel.store_reduction = kernel.store_reduction_epilogue + kernel.reduction = kernel.reduction_epilogue + + # Codegen prologue nodes + if prologue_nodes: + # Flush created varaibles, since template fusion doen't share variable + with kernel.prologue_buffer_group.as_local(): + _, (group, reduction_group) = max( + [prologue_nodes[-1]], key=lambda x: int(x.is_reduction()) + ).group + prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True) + kernel.kernel_group.set_tile_info(prologue_tile_desc) + vars, reduction_vars = kernel.set_ranges(group, reduction_group) + for node in prologue_nodes: + # Reuse created spad + read_list = sorted([i.name for i in node.read_writes.reads]) + candidate_found = False + # Why? 
There is a case that memdep.get_size() != data.get_size() + buf_dict = {} + buf_dict.update({val.name : val for val in V.graph.buffers}) + buf_dict.update(V.graph.graph_inputs) + for candidate_read in read_list: + if candidate_read in buf_dict and reduce(operator.mul, buf_dict[candidate_read].get_size(), 1) == node.node.get_numel(): + prologue_input_arg = candidate_read + candidate_found = True + break + assert(candidate_found) + assert(len(node.read_writes.writes)==1) + prologue_output_arg = list(node.read_writes.writes)[0].name + template_buf = self.kernel_group.args.input_buffers[prologue_output_arg] + target_buf = f"{template_buf}_buffer" # FIXME. How to pass spad buffer name? + + # To skip the dma code gen + kernel.buffer_names[prologue_input_arg] = target_buf + kernel.buffer_names[prologue_output_arg] = target_buf + + # Edge delete + kernel.kernel_group.args.input_buffers = { + (arg if buf != template_buf else prologue_input_arg): buf + for arg, buf in kernel.kernel_group.args.input_buffers.items() + } + node.codegen((vars, reduction_vars)) + + # Codegen epilogue nodes + tile_desc = kernel.set_tile_size(kernel.epilogue_info) + kernel.kernel_group.set_tile_info(tile_desc) + kernel.call_ranges = None + if epilogue_nodes: + with kernel.epilogue_buffer_group.as_local(): + _, (group, reduction_group) = max( + epilogue_nodes, key=lambda x: int(x.is_reduction()) + ).group + vars, reduction_vars = kernel.set_ranges(group, reduction_group) + for node in epilogue_nodes: + node.codegen((vars, reduction_vars)) + + with V.set_kernel_handler(kernel): + src_code = ( + partial_code + if isinstance(partial_code, str) + else partial_code.finalize() + ) + + # For consistency, white space could make wrong write_path + buffer = IndentedBuffer() + buffer.splice(src_code) + src_code = buffer.getvalue() + self._prepare_simulator_headers(src_code) + return src_code + + def make_choices(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): + choices = [] + for 
tile_info in tile_candidates: + print(f"[Auto-tune] Trying tile size: {list(tile_info)}") + src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) + bench_runner = self.run_bench([template_node], self.kernel_name, src_code) + choices.append((bench_runner, src_code, tile_info, self.loop_size)) + self.reset(reason=None) + return choices + + def _log_autotune_result(self, best_choice, best_cycle): + tile_size = best_choice[2] + print( + f"[Auto-tune] Optimal tile size: {list(tile_size)}, " + f"cycles: {best_cycle}" + ) + + def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): + if extension_config.CONFIG_AUTOTUNE_TEMPLATE and len(tile_candidates): + src_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + self.loop_size = loop_size + else: + tile_info = tile_candidates[0] if tile_candidates else None + src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) + + with V.set_kernel_handler(self): + self.meta_kernel() + return src_code + + def _prepare_simulator_headers(self, src_code): + spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n" + spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));" + + write_path = extension_codecache.get_write_path(src_code) + if not os.path.exists(write_path): + os.makedirs(write_path, exist_ok=True) + spike_write_path = os.path.join(write_path, "global_var.h") + gem5_write_path = os.path.join(write_path, "gem5_global_var.h") + if not os.path.exists(spike_write_path): + write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol) + if not os.path.exists(gem5_write_path): + write_atomic(gem5_write_path, self.gem5_header.getvalue()) + def codegen_prologue_body(self): body = IndentedBuffer() with 
self.prologue_buffer_group.as_local(): @@ -685,8 +809,8 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com sram_var = tile_desc.get_name() tile_shape = tile_desc.get_mlir_shape(mlir_dtype) tile_stride = tile_desc.get_tile_stride() - vlane_split_axis = tile_desc.vlane_split_axis - vlane_stride = tile_desc.vlane_stride + vlane_split_axis = tile_desc.vmap.vlane_split_axis + vlane_stride = tile_desc.vmap.vlane_stride zero_cse = self.get_const_cse(0, "index") sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim()) @@ -734,8 +858,8 @@ def load_epilogue(self, name: str, index: sympy.Expr): # Want to use tile_desc from epilogue_info index_var = self.parse_indices(index) dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] - vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - vlane_stride = self.kernel_group.tile_desc.vlane_stride + vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) tile_stride = self.kernel_group.tile_desc.get_tile_stride() @@ -773,7 +897,7 @@ def load_epilogue(self, name: str, index: sympy.Expr): vshape = f"vector<{vsize}x{mlir_dtype}>" if compute_vec_size > 1: - offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.reduction_axis_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})") + offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})") compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"]) operation = "affine.vector_load" line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" @@ -793,8 +917,8 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): index_var 
= self.parse_indices(index) dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] - vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - vlane_stride = self.kernel_group.tile_desc.vlane_stride + vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) tile_stride = self.kernel_group.tile_desc.get_tile_stride() @@ -859,8 +983,8 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value): vec_size = self.compute_body_loop.step type_name = mlir_common.DTYPE_TO_MLIR[dtype] new_tile_size = self.kernel_group.tile_desc.get_tile_size()[:-1] + [vec_size] - new_vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - new_vlane_stride = self.kernel_group.tile_desc.vlane_stride + new_vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + new_vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride local_tile_desc = mlir_common.MLIRMultiDimTile(new_tile_size, self.vector_lane, new_vlane_split_axis, new_vlane_stride, vec_size) tile_shape = local_tile_desc.get_mlir_shape(type_name) @@ -906,8 +1030,8 @@ def store_reduction_epilogue(self, name, index, value): index_var = self.parse_indices(index, self.reductions_suffix, comments="// Store reduction") dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()][:-1] # Assume that there is only one reduction axis - vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - vlane_stride = self.kernel_group.tile_desc.vlane_stride + vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride # Create final buffer descriptor nr_outer_loop = self.reduction_nr_outer_loop @@ -958,12 +1082,7 @@ def store_reduction_epilogue(self, name, index, value): if self.welford_reduce_out is not None: # NOTE: It not a real 
welford algorithm... We just used E(X^2) - E(X)^2 - divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.reduction_axis_size)} : f32") - if self.reduction_axis_size - 1 > 0: - divider2 = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.reduction_axis_size-1)} : f32") - else: - divider2 = divider - + divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.r_dim_size)} : f32") if self.buffer_types[name][1] > 1: divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to {new_reduced_shape}") else: @@ -1002,19 +1121,20 @@ def set_tile_size(self, template_fusion_info, prologue=False): if 'nr_rdim' in template_fusion_info and template_fusion_info['nr_rdim']==1: tile_desc.nr_rdim = 1 numel_per_lane = tile_desc.get_numel_per_lane() - reduction_axis_size = tile_desc.get_tile_size()[-1] - nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size - tile_desc.vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... + r_tile_size = tile_desc.get_tile_size()[-1] + nr_outer_loop = (numel_per_lane + r_tile_size-1) // r_tile_size + tile_desc.vmap.forced_vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... 
self.reduction_fusion = True - self.reduction_axis_size = tile_desc.get_tile_size()[-1] + self.r_tile_size = tile_desc.get_tile_size()[-1] + self.r_dim_size = template_fusion_info['r_dim_size'] self.reduction_nr_outer_loop = nr_outer_loop self.reduction_loop_idx = "reduce_loop_idx" - self.compute_body_loop.size = reduction_axis_size + self.compute_body_loop.size = r_tile_size self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop) else: - tile_desc.vec_size=64 + tile_desc.vmap.forced_vec_size = 64 if prologue: self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane() @@ -1110,7 +1230,8 @@ def make_kernel_render( template=self, kwargs=kwargs ) - return kernel, render, self.codegen_header + tile_candidates = self.get_tile_candidates(**kwargs)[:extension_config.CONFIG_AUTOTUNE_TEMPLATE_TOPK] + return kernel, tile_candidates, render return MLIRTemplateCaller( kernel_hash_name, @@ -1122,5 +1243,8 @@ def make_kernel_render( self, ) + def get_tile_candidates(self, **kwargs): + return [] + def render(self, **kwargs) -> str: raise NotImplementedError \ No newline at end of file diff --git a/README.md b/README.md index 56b58b28..dbfdf2e8 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ # PyTorchSim: A Comprehensive, Fast, and Accurate NPU Simulation Framework [![Docker Image CI](https://github.com/PSAL-POSTECH/PyTorchSim/actions/workflows/docker-image.yml/badge.svg)](https://github.com/PSAL-POSTECH/PyTorchSim/actions/workflows/docker-image.yml) -PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framework -- We define a RISC-V-based NPU architecture and implement PyTorch compiler backend to run inference & training for PyTorch models -- Achieved high speed and accuracy with our novel Tile-Level Simulation (TLS) with compiler-generated Tile-Operation Graph (TOG), exploiting deterministic tile compute latency -- A generic 
and extensible NPU architecture based on RISC-V vector extension -- The functional simulator supports code correctness validation and data-dependent timing simulation +PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framework. +- We define a RISC-V-based NPU architecture and implement PyTorch compiler backend to run inference & training for PyTorch models. +- Achieved high speed and accuracy with our novel Tile-Level Simulation (TLS) with compiler-generated Tile-Operation Graph (TOG), exploiting deterministic tile compute latency. +- A generic and extensible NPU architecture based on RISC-V vector extension. +- The functional simulator supports code correctness validation and data-dependent timing simulation. For more details, please refer to our [paper](https://doi.org/10.1145/3725843.3756045)! @@ -31,6 +31,7 @@ PyTorchSim **supports**: - [Multi-tenancy](#multi-tenancy) - [Compiler optimizations](#compiler-optimizations) - [Mapping](#mapping) +- [L2 Cache](#l2-cache) (persistent cache) ## Model Zoo | Model | Source | Status | Note | @@ -87,12 +88,17 @@ To download the latest Docker image and set up the environment, use the followin # Run the Docker container docker run -it --ipc=host --name torchsim -w /workspace/PyTorchSim ghcr.io/psal-postech/torchsim-ci:latest bash ``` +### Manual Setting (Optional) +This script provides building [Gem5](https://github.com/PSAL-POSTECH/gem5.git), [LLVM](https://github.com/PSAL-POSTECH/llvm-project.git), and [Spike](https://github.com/PSAL-POSTECH/riscv-isa-sim.git) simulator from source code for specific experts. +```bash +bash script/build_from_source.sh +``` ### Run Examples The `tests` directory contains several AI workloads examples. ```bash python tests/test_matmul.py ``` -The result is stored to `TORCHSIM_DUMP_PATH`/`hash`/backendsim_result/. The log file contains detailed core, memory, and interconnect stats. +The result is stored to `TORCHSIM_DUMP_PATH/hash/togsim_result/`. 
The log file contains detailed core, memory, and interconnect stats. ### Run Your Own Model on PyTorchSim You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device. @@ -125,9 +131,9 @@ Wrapper Codegen Path = /tmp/torchinductor_root/yd/cyda7nhzv5mtakfhfcxtmmhtsv6kg7 [Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/fy6nnyudtno/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/fy6nnyudtno/cycle_bin --vlane 128 [Gem5Simulator] Simulation is still running... [SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=0000000000010400:10846 --base-path=/tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/fy6nnyudtno/validation_binary /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg0_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg1_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/buf0/0.raw -[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 -[BackendSimulator] Simulation is still running.. 
-[BackendSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/backendsim_result/0" +[TOGSimulator] cmd> /root/workspace/PyTorchSim/TOGSim/build/bin/Simulator --config /root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 +[TOGSimulator] Simulation is still running.. +[TOGSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0" ---------------------------- |Matmul Forward Test Passed| ---------------------------- @@ -137,25 +143,25 @@ Simulation consists of three steps 1. `Gem5Simulator` obatins compute latency for TOG. 2. `SpikeSimulator` verifies the output code. -3. `BackendSimulator` simulates a NPU architecture. +3. `TOGSimulator` simulates a NPU architecture. If you want to turn off the `SpikeSimulator` for fast simulation, you can set as below. ```bash -export TORCHSIM_VALIDATION_MODE=False +export TORCHSIM_FUNCTIONAL_MODE=False ``` Log contains memory & core stats. 
```bash [info] HBM2-CH_0: avg BW utilization 37% (255 reads, 128 writes) [info] Row hits: 359, Row misses: 26, Row conflicts: 0 [info] ========= Core stat ========= -[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active cycle 0, idle cycle 1014 -[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active cycle 128, idle cycle 886 -[info] Core [0] : TMA active cycle 3 TMA idle cycle 1011 DRAM BW 182.000 GB/s (6144) -[info] Core [0] : Vector Unit Utilization(%) 4.34, active cycle 44, idle_cycle 0 -[info] Core [0] : Numa hit count : 0, Numa miss count : 0 -[info] Core [0] : Total cycle 1014 -[info] Total execution cycle: 1014 -[info] Simulation time: 0.039296 seconds +[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active_cycles 0, idle_cycles 1014 +[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active_cycles 128, idle_cycles 886 +[info] Core [0] : DMA active_cycles 3 DMA idle_cycles 1011 DRAM BW 182.000 GB/s (6144) +[info] Core [0] : Vector Unit Utilization(%) 4.34, active_cycles 44, idle_cycle 0 +[info] Core [0] : NUMA local memory: 34 requests, remote memory: 0 requests +[info] Core [0] : Total_cycles 1014 +[info] Total execution cycles: 1014 +[info] Wall-clock time for simulation: 0.039296 seconds ``` The log is dumped in `TORCHSIM_DUMP_PATH` and you can set the path as below. ```bash @@ -175,61 +181,96 @@ opt_step() `tests/test_mlp.py` provides an example of MLP training. ## Multi-tenancy -Load generator supports multi-tenancy experiments. You can simply run `tests/test_scheduler.py` +Our load generator supports multi-tenancy experiments. You can run a simple example by executing `tests/test_scheduler.py`. ```bash python tests/test_scheduler.py ``` -Below is an example code of multi-tenancy -`target_model1` and `target_model2` is your own PyTorch model. -You can set the request arrival time and request queue index. 
Request queue is used for scheduling and you can set the number of queue to each core in [TOGSim configuration](#togsim-configuration) -```python -# Init scheduler -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`. +In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.json`). The compiled PyTorch models are then registered with a unique model id. + +```python3 +import os +import sys +import torch +from torchvision.models import resnet18 +from test_transformer import EncoderBlock +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' + +sys.path.append(base_path) +from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) + # Register compiled model -opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) -opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) -SchedulerDNNModel.register_model("resnet18", opt_model1) -SchedulerDNNModel.register_model("bert", opt_model2) - -# Init input data -model_input1 = torch.randn(1, 3, 224, 224) -model_input2 = torch.randn(128, 768) - -# Init request -new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request2 = Request("bert", [model_input2], [], request_queue_idx=1) -new_request3 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request4 = Request("bert", [model_input2], [], request_queue_idx=1) - -# Add request to scheduler -scheduler.add_request(new_request1, request_time=0) -scheduler.add_request(new_request2, 
request_time=0) -scheduler.add_request(new_request3, request_time=0) -scheduler.add_request(new_request4, request_time=0) +target_model0 = resnet18().eval() +target_model1 = EncoderBlock(768, 12).eval() +opt_model0 = torch.compile(target_model0.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) +opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device())) +SchedulerDNNModel.register_model("model0", opt_model0) +SchedulerDNNModel.register_model("model1", opt_model1) +``` + +The config file(`.json`) specifies two key items: +- `num_partition`: The total number of independent request queues to create. +- `partition`: Defines the hardware mapping, assigning each queue (identified by its index) to a specific physical core. +For example, the configuration below creates two scheduling queues (`0` and `1`) and maps `core_0` to queue `0` and `core_1` to queue `1`: +``` + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +``` +Next, DNN model requests are generated and submitted. We provide a `poisson_request_generator` utility, which generates request arrival times. +Each `Request` is created with its model name, data, and a request_queue_idx to specify its target queue, then added via `scheduler.add_request`. +As shown in the code, `model0` requests are queued to `request_queue_idx=0`, while `model1` requests are queued to `request_queue_idx=1`. 
+```python3 +# Load Generation +model0_lambda = 5.0 +model1_lambda = 3.0 +max_time = 1000.0 # [s] + +# Generate Possion distribution requests for model0 +for model0_request_time in poisson_request_generator(model0_lambda, total_time=max_time): + x = torch.randn(1, 3, 224, 224) + new_request = Request("model0", [x], [], request_queue_idx=0) + scheduler.add_request(new_request, request_time=model0_request_time) + +# Generate Possion distribution requests for model1 +for model1_request_time in poisson_request_generator(model1_lambda, total_time=max_time): + x = torch.randn(128, 768) + new_request = Request("model1", [x], [], request_queue_idx=1) + scheduler.add_request(new_request, request_time=model1_request_time) +``` + +Finally, `scheduler.schedule()` is called in a loop until all requests are processed. +```python3 # Run scheduler while not scheduler.is_finished(): scheduler.schedule() ``` + ## Compiler Optimizations -PyTorchSim compiler supports fusions +PyTorchSim compiler supports several fusion optimizations: - GEMM prologue fusion - GEMM epilogue fusion - GEMM reduction fusion - CONV epilogue fusion -Depending on tensor shape, use different convolution template +Depending on tensor shape, use different convolution template: - Single batch optimization - Multi-channel optimization ## Mapping -PyTorchSim provids three mapping strategies +PyTorchSim provides three mapping strategies. ### Heuristic-based mapping We adopt and modified heuristic-based mapping of [GEMMINI](https://github.com/ucb-bar/gemmini) by default, which maximizes the utilization of scratchpad memory. ### Auto-tuning Heuristic method is not optimal for some cases. PyTorchSim provides auto-tuning to find best mapping for GEMM, CONV, and vector operations. It reduces searching space by sorting of scratchpad memory utilization and pick top-k candiates. Searching parameters are tile shape and vector lane stride. 
```bash export AUTOTUNE=True +export AUTOTUNE_TEMPLATE=True ``` ### Manunal setting User can exploit third-party(e.g. Timeloop) mapping. Set the cheatsheet path and write down their own mapping. @@ -264,8 +305,27 @@ export TORCHSIM_TILE_M=512 export TORCHSIM_TILE_N=512 export TORCHSIM_TILE_K=512 ``` +## L2 Cache +It supports L2 cache as persistent cache. User can provide software-managed allocation/eviction strategy for tensors with persistent cache. + +Common Memory (CMEM) is a new feature introduced in the latest TPUs (newer than TPUv3). Multiple cores share this memory, which provides high bandwidth. Reusable tensors are stored and loaded from CMEM to avoid off-chip traffic. Our L2 cache can work like as CMEM + +To allocate a tensor in L2 cache, set the environment variable as shown below. The `tpuv4` directory provides example plans for L2 cache obtained from TPUv4 profiling. +```bash +export SRAM_BUFFER_PLAN_PATH=tpuv4/gemm_plan.py +``` +The L2 cache strategy file is composed as follows: +``` +plan = { + "arg0_1" +} +``` +In this example, only one input tensor is registered in L2 cache. You can refer to the tensor name from the wrapper code. After running the code, you can find the wrapper codegen path in the [result](#result) section. + +Last but not least, you must set `l2d_type` and `l2d_config` in the [TOGSim config](#togsim-configuration) to use L2 cache. The `l2d_config` follows the same configuration method as [AccelSim](https://github.com/accel-sim/accel-sim-framework). + ## Compiler Configuration -`PyTorchSimFrontend/extension_config.py` contains target hardware configuration to compile +`PyTorchSimFrontend/extension_config.py` contains target hardware configuration to compile. You can configure these options using environment variables. 
```bash @@ -284,23 +344,27 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ## TOGSim Configuration ![NPU_Core](./docs/npu_core.jpg) -`PyTorchSimBackend/configs` directory contains example NPU configuration files in the JSON format. +`TOGSim/configs` directory contains example NPU configuration files in the JSON format. ``` "num_cores" : 2, // Number of NPU cores - "core_freq" : 940, // Core's frequency (MHz) + "core_freq_mhz" : 940, // Core's frequency (MHz) "num_systolic_array_per_core" : 2, // Number of systolic array per core "dram_type" : "ramulator2", // DRAM type (ex. ramulator2, simple) - "dram_freq" : 940, // DRAM frequency (MHz) + "dram_freq_mhz" : 940, // DRAM frequency (MHz) "dram_channels": 32, // Number of DRAM channels "dram_req_size": 32, // DRAM request size (B) "dram_latency" : 10, // DRAM latency (cycle) "dram_nbl" : 2, // DRAM burst length size "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", // Ramulator2 config file path - "icnt_type" : "simple", // Interconnect type (ex. booksim, simple) - "icnt_latency" : 7, // Interconnect latency (cycle) - "icnt_freq" : 28000, // Interconnect frequency (MHz) + "l2d_type" : "datacache", + "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", // Interconnect type (ex. booksim, simple) + "icnt_latency" : 7, // Interconnect latency (cycle) + "icnt_freq_mhz" : 940, // Interconnect frequency (MHz) + "icnt_injection_ports_per_core" : 16 // Interconnect injection ports per core "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", // Booksim2 config file path "precision" : 4, // Element's precision in tensor (Byte) @@ -313,7 +377,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ``` You can set TOGSim config path as below. 
```bash -export TORCHSIM_CONFIG=/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json ``` ## Future Works Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon. @@ -346,11 +410,10 @@ If you use PyTorchSim for your research, please cite the following paper. @INPROCEEDINGS{yang2025pytorchsim, author={Yang, Wonhyuk and Shin, Yunseon and Woo, Okkyun and Park, Geonwoo and Ham, Hyungkyu and Kang, Jeehoon and Park, Jongse and Kim, Gwangsun}, title={PyTorchSim: A Comprehensive, Fast, and Accurate NPU Simulation Framework}, - booktitle={2025 58th IEEE/ACM International Symposium on Microarchitecture (MICRO)}, - volume={}, - number={}, - pages={}, + booktitle={Proceedings of the 58th IEEE/ACM International Symposium on Microarchitecture}, + pages={1363–1380}, year={2025}, - doi={10.1145/3725843.3756045} + doi={10.1145/3725843.3756045}, + series={MICRO '25} } -``` \ No newline at end of file +``` diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 834698a6..0b633fa9 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -5,7 +5,7 @@ from pathlib import Path import importlib.util from PyTorchSimFrontend.extension_codecache import hash_prefix -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def import_module_from_path(module_name, path): @@ -140,11 +140,11 @@ def __str__(self): def register_model(model_name : str, compiled_model): SchedulerDNNModel.MODEL_MAP[model_name] = compiled_model -class ExecutionEngine: +class PyTorchSimRunner: PARTITION_BUSY = 0 PARTITION_IDLE = 1 SELECT_NOTHING = 2 - def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: + def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: self.module = self.setup_device() 
self.num_partion = num_partion self.launch_model_dicts = [] @@ -156,11 +156,11 @@ def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: self.partition_state.append(self.PARTITION_IDLE) self.finish_req_dict = {} - self.backend_simulator = backend_simulator + self.tog_simulator = tog_simulator # Dry run for compile and create generator - os.environ["BACKENDSIM_DRYRUN"] = "1" - os.environ["BACKENDSIM_EAGER_MODE"] = "1" + os.environ["TOGSIM_DRYRUN"] = "1" + os.environ["TOGSIM_EAGER_MODE"] = "1" @staticmethod def setup_device(): @@ -171,7 +171,7 @@ def setup_device(): import torch.utils.cpp_extension module = torch.utils.cpp_extension.load( - name="extension_device", + name="npu", sources=[ str(source_file), ], @@ -179,7 +179,7 @@ def setup_device(): verbose=True, ) - torch.utils.rename_privateuse1_backend("extension_device") + torch.utils.rename_privateuse1_backend("npu") from torch._inductor.codegen.common import ( get_scheduling_for_device, get_wrapper_codegen_for_device, @@ -192,13 +192,13 @@ def setup_device(): MLIRScheduling ) register_backend_for_device( - "extension_device", MLIRScheduling, ExtensionWrapperCodegen + "npu", MLIRScheduling, ExtensionWrapperCodegen ) assert( - get_scheduling_for_device("extension_device") == MLIRScheduling + get_scheduling_for_device("npu") == MLIRScheduling ) assert( - get_wrapper_codegen_for_device("extension_device") + get_wrapper_codegen_for_device("npu") == ExtensionWrapperCodegen ) return module @@ -222,7 +222,7 @@ def is_all_idle(self): return all([self.is_partition_idle(i) for i in range(self.num_partion)]) def prepare_model(self, req_model: SchedulerDNNModel): - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "backend_result", req_model.model_name) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "togsim_result", req_model.model_name) os.makedirs(result_path, exist_ok=True) index = str(len(os.listdir(result_path))) @@ -244,7 +244,7 @@ def 
prepare_launch_kernel(self, kernel, inputs): onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - attribute_path = self.backend_simulator.create_attribute_file(attribute_path, inputs) + attribute_path = self.tog_simulator.create_attribute_file(attribute_path, inputs) return onnx_path, attribute_path def launch_kernel(self, current_cycle, partion_idx=0): @@ -260,11 +260,11 @@ def launch_kernel(self, current_cycle, partion_idx=0): else: onnx_path, attribute_path = kernel, inputs self.partition_state[partion_idx] = self.PARTITION_BUSY - return self.backend_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) + return self.tog_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) -class FIFOExecutionEngine(ExecutionEngine): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) +class FIFORunner(PyTorchSimRunner): + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) def select_kernel(self, partition_idx): while len(self.nested_launch_model_dicts[partition_idx]) or len(self.launch_model_dicts[partition_idx]): @@ -297,9 +297,9 @@ def select_kernel(self, partition_idx): # No proper kernel now return self.SELECT_NOTHING -class RRExecutionEngine(ExecutionEngine): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) +class RoundRobinRunner(PyTorchSimRunner): + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) self.next_pointer = None def select_kernel(self, partition_idx): @@ -347,7 +347,7 @@ class Scheduler: FIFO_ENGINE = 0 RR_ENGINE = 1 - def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, backend_config=extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) -> 
None: + def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG) -> None: self.current_cycle = 0 self.max_batch = max_batch self.num_request_queue = num_request_queue @@ -356,13 +356,13 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, self.request_queue.append([]) self.finish_queue : List[Request] = [] - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - self.backend_simulator = BackendSimulator(backend_path, backend_config) - self.backend_simulator.interactive_simulation() + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + self.tog_simulator = TOGSimulator(togsim_path, togsim_config) + self.tog_simulator.interactive_simulation() if engine_select == Scheduler.FIFO_ENGINE: - self.execution_engine = FIFOExecutionEngine(self.backend_simulator, self.num_request_queue) + self.execution_engine = FIFORunner(self.tog_simulator, self.num_request_queue) elif engine_select == Scheduler.RR_ENGINE: - self.execution_engine = RRExecutionEngine(self.backend_simulator, self.num_request_queue) + self.execution_engine = RoundRobinRunner(self.tog_simulator, self.num_request_queue) else: print(f"Not supporetd engine type {engine_select}") exit(1) @@ -469,8 +469,8 @@ def schedule(self): # Need to forward the time until next_arrival_time if self.execution_engine.is_all_idle(): - reason = self.backend_simulator.until(self.msec_to_cycle(next_time)) - self.current_cycle = self.backend_simulator.cycle() + reason = self.tog_simulator.until(self.msec_to_cycle(next_time)) + self.current_cycle = self.tog_simulator.cycle() else: self.run(next_time) return @@ -480,7 +480,7 @@ def run(self, until_time): def execute_cycle(): launch_ret_info = [] for i in range(self.execution_engine.num_partion): - if self.execution_engine.partition_state[i] == ExecutionEngine.PARTITION_IDLE: + if self.execution_engine.partition_state[i] == 
PyTorchSimRunner.PARTITION_IDLE: ret = self.execution_engine.launch_kernel(self.current_cycle, i) launch_ret_info.append(ret) @@ -490,12 +490,12 @@ def execute_cycle(): return [] # Schedule jobs and update the current time - result_list = self.backend_simulator.until(self.msec_to_cycle(until_time)) - self.current_cycle = self.backend_simulator.cycle() + result_list = self.tog_simulator.until(self.msec_to_cycle(until_time)) + self.current_cycle = self.tog_simulator.cycle() for core_idx in result_list: # Kernel is finished. So set idle state - self.execution_engine.partition_state[core_idx] = ExecutionEngine.PARTITION_IDLE + self.execution_engine.partition_state[core_idx] = PyTorchSimRunner.PARTITION_IDLE return result_list @@ -526,7 +526,7 @@ def is_request_queue_empty(self): def is_finished(self): if self.is_request_queue_empty() and self.execution_engine.is_all_idle(): - self.backend_simulator.wait() + self.tog_simulator.wait() return True return False @@ -534,7 +534,7 @@ def current_time(self): return self.cycle_to_msec(self.current_cycle) def cycle_to_msec(self, cycle): - freq = self.backend_simulator.get_core_freq() + freq = self.tog_simulator.get_core_freq() return cycle / (freq / 1000) def msec_to_cycle(self, msec): @@ -542,5 +542,5 @@ def msec_to_cycle(self, msec): if (msec == -1): return msec - freq = self.backend_simulator.get_core_freq() - return int(msec * (freq / 1000)) \ No newline at end of file + freq = self.tog_simulator.get_core_freq() + return int(msec * (freq / 1000)) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 81970cbe..c586c2fd 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -12,7 +12,7 @@ import torch import numpy as np -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs from PyTorchSimFrontend import extension_config TORCH_TO_NUMPY = { @@ -64,10 +64,10 @@ def dump_args(self, args, arg_attributes, load_path, dump_path): for 
(arg_name, arg_attribute), arg in zip(arg_attributes, args): size = arg_attribute[2] if arg_attribute[1] != torch.bool else (arg_attribute[2] + 7) // 8 array_size.append(size) - if LLVMKernelArgs.is_llvm_arg_in(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_in(arg_attribute[0]): index = self.write_arg(arg, load_path, arg_name) file_path.append(os.path.join(load_path, arg_name, f'{index}.raw')) - elif LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + elif MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): path = os.path.join(dump_path, arg_name) os.makedirs(path, exist_ok=True) file_path.append(os.path.join(path, f'{self.get_biggest_filename(path)}.raw')) @@ -101,15 +101,17 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True) os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True) run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}' - if not silent_mode: - print("[SpikeSimulator] cmd> ", run) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[Spike] cmd> ", run) + print("[Spike] Running Spike simulator") run_cmd = shlex.split(run) try: stdout_setting = subprocess.DEVNULL if silent_mode else None stderr_setting = subprocess.DEVNULL if silent_mode else None subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting) except subprocess.CalledProcessError as e: - print("[SpikeSimulator] Command failed with exit code", e.returncode) + if not silent_mode: + print("[Spike] Command failed with exit code", e.returncode) error_msg = "" if e.returncode == 200: error_msg = "INVALID_SPAD_ACCESS" @@ -120,7 +122,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= raise RuntimeError(f"{error_msg}") for (arg_name, arg_attribute), arg, path in zip(arg_attributes, args, 
file_path): - if LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): self.load_tensor(arg, arg_name, arg_attribute, path) if cleanup: @@ -154,7 +156,7 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[Gem5Simulator] Simulation is still running." + tail) + sys.stdout.write("\r[Gem5] Gem5 is running." + tail) time.sleep(1) print("") @@ -162,9 +164,10 @@ def show_progress(): gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)] try: # Create progress thread - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) or silent_mode + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) or silent_mode if not is_dryrun: - print("[Gem5Simulator] cmd> ", " ".join(gem5_cmd)) + if extension_config.CONFIG_DEBUG_MODE: + print("[Gem5] cmd> ", " ".join(gem5_cmd)) finished = False progress_thread = threading.Thread(target=show_progress) progress_thread.start() @@ -174,11 +177,11 @@ def show_progress(): else: output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) except subprocess.CalledProcessError as e: - print("[Gem5Simulator] Command failed with exit code", e.returncode) - print("[Gem5Simulator] Error output:", e.output) - finished = True - progress_thread.join() - assert(0) + print(f"[Gem5] Gem5 simulation failed with error: \"{e.output.decode()}\"") + if not is_dryrun: + finished = True + progress_thread.join() + raise RuntimeError(f"Gem5 Simulation Failed: \"{e.output.decode()}\"") with open(f"{dir_path}/stats.txt", "r") as stat_file: raw_list = stat_file.readlines() @@ -187,18 +190,18 @@ def show_progress(): cycle_list = cycle_list[:-1] return cycle_list -class BackendSimulator(): - BACKEND_RESULT_PATH_KEY = "BACKEND_RESULT_PATH" - FINISH_STR = "Simulation Finished" +class TOGSimulator(): + 
TOGSIM_RESULT_PATH_KEY = "TOGSIM_RESULT_PATH" + FINISH_STR = "Simulation finished" ALLOC_POOL = dict() # For eagermode buffer plan - def __init__(self, backend_path, config_path, vectorlane_size=-1) -> None: - self.base_dir = backend_path + def __init__(self, togsim_path, config_path, vectorlane_size=-1) -> None: + self.base_dir = togsim_path self.config_path = config_path self.config_json = self.load_json(self.config_path) self.process = None self.vectorlane_size = vectorlane_size - def get_backend_command(self): + def get_togsim_command(self): bin = os.path.join(self.base_dir, "build/bin/Simulator") config = os.path.join(self.base_dir, self.config_path) cmd = f"{bin} --config {config}" @@ -210,16 +213,16 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[BackendSimulator] Simulation is still running." + tail) + sys.stdout.write("\r[TOGSim] TOGSim is running." + tail) time.sleep(1) print("") - cmd = f"{self.get_backend_command()} --models_list {model_path}" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --models_list {model_path}" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" if attribute_path: cmd = f"{cmd} --attributes_list {attribute_path}" - if not silent_mode: - print("[BackendSimulator] cmd> ", cmd) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) # Create progress thread if not silent_mode: @@ -235,28 +238,26 @@ def show_progress(): if not silent_mode: finished = True progress_thread.join() - print("[BackendSimulator] Command failed with exit code", e.returncode) - print("[BackendSimulator] Error output:", e.output) + print("[TOGSim] Command failed with exit code", e.returncode) + print("[TOGSim] Error output:", e.output) assert 0 - result_path = 
extension_config.CONFIG_BACKEND_RESULT_PATH_KEY - if result_path is None: - result_path = os.path.join(os.path.dirname(model_path), "backendsim_result") - # Save result to result_path + result_path = os.path.join(os.path.dirname(model_path), "togsim_result") os.makedirs(result_path, exist_ok=True) file_name = str(len(os.listdir(result_path))) result_path = os.path.join(result_path, file_name) with open(result_path, "w") as f: f.write(result.decode()) - print(f'[BackendSimulator] Simulation of "{model_path}" is stored to "{result_path}"') + print(f'[TOGSim] Simulation of "{model_path}" is stored to "{result_path}"') return result_path def interactive_simulation(self): - cmd = f"{self.get_backend_command()} --mode interactive" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --mode interactive" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" - print("[BackendSimulator] cmd> ", cmd) + if extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) if self.process is None: self.process = subprocess.Popen( shlex.split(cmd), @@ -265,27 +266,27 @@ def interactive_simulation(self): universal_newlines=True ) else: - print("[BackendSimulator] Simulator is already running.") + print("[TOGSim] Simulator is already running.") def stop(self): if self.process: self.process.terminate() self.process.wait() self.process = None - print("[BackendSimulator] Simulator stopped.") + print("[TOGSim] Simulator stopped.") def wait(self): if self.process: - print("[BackendSimulator] Waiting for simulation to complete...") + print("[TOGSim] Waiting for simulation to complete...") self.quit() self.process.wait() self.process = None - print("[BackendSimulator] Simulation completed.") + print("[TOGSim] Simulation completed.") def send_command(self, command): if self.process: try: - if not 
extension_config.CONFIG_BACKENDSIM_DRYRUN: + if not extension_config.CONFIG_TOGSIM_DRYRUN: print(command, flush=True) self.process.stdin.write(command + '\n') self.process.stdin.flush() @@ -352,6 +353,8 @@ def create_attribute_file(self, attribute_path, inputs, **kwargs): with open(attribute_path, "w") as f: json.dump(json_content, f, indent=4) + f.flush() + os.fsync(f.fileno()) # There could be a race condition. return attribute_path def load_json(self, config_path): @@ -367,8 +370,8 @@ def load_json(self, config_path): raise ValueError(f"Invalid JSON format: {e}") def get_core_freq(self): - if "core_freq" in self.config_json: - return self.config_json["core_freq"] * 1000 * 1000 # MHz + if "core_freq_mhz" in self.config_json: + return self.config_json["core_freq_mhz"] * 1000 * 1000 # MHz else: raise KeyError("Key 'core_freq' not found in JSON.") @@ -403,13 +406,13 @@ def get_result_from_file(result_path): simulation_finished_idx = -1 simulation_finished = False for idx, line in enumerate(lines): - if BackendSimulator.FINISH_STR in line: + if TOGSimulator.FINISH_STR in line: simulation_finished = True simulation_finished_idx = idx break if simulation_finished_idx == -1: - print("[BackendSimulator] Treid to parsing wrong formated output file!") + print("[TOGSim] Tried to parsing wrong formated output file!") return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time total_stat_lines = lines[simulation_finished_idx:] @@ -440,15 +443,15 @@ def get_result_from_file(result_path): if 'DRAM: AVG BW Util' in line: avg_dram_bw = float(re.search(r'AVG BW Util (\d+\.?\d*)%', line).group(1)) - if 'Total execution cycle' in line: - total_cycle = int(re.search(r'Total execution cycle: (\d+)', line).group(1)) + if 'Total execution cycles' in line: + total_cycle = int(re.search(r'Total execution cycles: (\d+)', line).group(1)) # Parse total simulation time - if 'Simulation time' in line: - simulation_time = float(re.search(r'Simulation time: (\d+\.?\d*) seconds', 
line).group(1)) + if 'Wall-clock time for simulation' in line: + simulation_time = float(re.search(r'Wall-clock time for simulation: (\d+\.?\d*) seconds', line).group(1)) return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": - sim = BackendSimulator("/workspace/PyTorchSim/PyTorchSimBackend", "/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json") + sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") sim.interactive_simulation() sim.until(4000) \ No newline at end of file diff --git a/PyTorchSimBackend/CMakeLists.txt b/TOGSim/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/CMakeLists.txt rename to TOGSim/CMakeLists.txt diff --git a/PyTorchSimBackend/conanfile.txt b/TOGSim/conanfile.txt similarity index 100% rename from PyTorchSimBackend/conanfile.txt rename to TOGSim/conanfile.txt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet.icnt b/TOGSim/configs/booksim2_configs/anynet.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet.icnt rename to TOGSim/configs/booksim2_configs/anynet.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet_file b/TOGSim/configs/booksim2_configs/anynet_file similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet_file rename to TOGSim/configs/booksim2_configs/anynet_file diff --git a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt similarity index 75% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt index d18ff6e7..3102fecc 100644 --- a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt +++ b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt @@ -2,7 +2,7 @@ use_map = 0 
flit_size = 32 topology = anynet -network_file = /workspace/PyTorchSim/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net +network_file = /workspace/PyTorchSim/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net routing_function = min subnets = 1 routing_delay = 4 diff --git a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.net diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m16.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m16.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m1.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m1.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m2.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m2.icnt diff --git 
a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c2_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c2_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m4.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m4.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m2.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m2.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt 
b/TOGSim/configs/booksim2_configs/fly_c4_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py b/TOGSim/configs/booksim2_configs/make_anynet_topology.py similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py rename to TOGSim/configs/booksim2_configs/make_anynet_topology.py diff --git a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt rename to TOGSim/configs/booksim2_configs/mesh_sif-age.icnt diff --git 
a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt diff --git a/TOGSim/configs/heterogeneous_c2_simple_noc.json b/TOGSim/configs/heterogeneous_c2_simple_noc.json new file mode 100644 index 00000000..60f160a8 --- /dev/null +++ b/TOGSim/configs/heterogeneous_c2_simple_noc.json @@ -0,0 +1,29 @@ +{ + "core_type" : ["stonne", "ws_mesh"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 8, + "num_stonne_port" : 64, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml b/TOGSim/configs/ramulator2_configs/DDR4.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml rename to TOGSim/configs/ramulator2_configs/DDR4.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml b/TOGSim/configs/ramulator2_configs/HBM2.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml rename to TOGSim/configs/ramulator2_configs/HBM2.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml b/TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml similarity index 100% rename from 
PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml rename to TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg b/TOGSim/configs/ramulator_configs/ALDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/ALDRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg b/TOGSim/configs/ramulator_configs/DDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg rename to TOGSim/configs/ramulator_configs/DDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg b/TOGSim/configs/ramulator_configs/DDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg rename to TOGSim/configs/ramulator_configs/DDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg b/TOGSim/configs/ramulator_configs/DSARP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg rename to TOGSim/configs/ramulator_configs/DSARP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg b/TOGSim/configs/ramulator_configs/GDDR5-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg rename to TOGSim/configs/ramulator_configs/GDDR5-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg b/TOGSim/configs/ramulator_configs/HBM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg rename to TOGSim/configs/ramulator_configs/HBM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg b/TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg similarity index 100% rename from 
PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg rename to 
TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg b/TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg b/TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg b/TOGSim/configs/ramulator_configs/PCM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg rename to TOGSim/configs/ramulator_configs/PCM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg b/TOGSim/configs/ramulator_configs/SALP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg rename to TOGSim/configs/ramulator_configs/SALP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg b/TOGSim/configs/ramulator_configs/STTMRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg rename 
to TOGSim/configs/ramulator_configs/STTMRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg b/TOGSim/configs/ramulator_configs/TLDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/TLDRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg b/TOGSim/configs/ramulator_configs/WideIO-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg b/TOGSim/configs/ramulator_configs/WideIO2-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO2-config.cfg diff --git a/TOGSim/configs/stonne_big_c1_simple_noc.json b/TOGSim/configs/stonne_big_c1_simple_noc.json new file mode 100644 index 00000000..5d563fbe --- /dev/null +++ b/TOGSim/configs/stonne_big_c1_simple_noc.json @@ -0,0 +1,22 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 8, + "num_stonne_port" : 64, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycless": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/stonne_single_c1_simple_noc.json b/TOGSim/configs/stonne_single_c1_simple_noc.json new file mode 100644 index 00000000..304e84b3 --- 
/dev/null +++ b/TOGSim/configs/stonne_single_c1_simple_noc.json @@ -0,0 +1,22 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 1, + "num_stonne_port" : 8, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 700, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 8 +} \ No newline at end of file diff --git a/TOGSim/configs/stonne_validation_c1_simple_noc.json b/TOGSim/configs/stonne_validation_c1_simple_noc.json new file mode 100644 index 00000000..38d4244c --- /dev/null +++ b/TOGSim/configs/stonne_validation_c1_simple_noc.json @@ -0,0 +1,23 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 1, + "num_stonne_port" : 32, + + "dram_type" : "simple", + "dram_freq_mhz" : 1000, + "dram_channels": 1, + "dram_req_size_byte": 32, + "dram_latency" : 100, + "dram_stats_print_period_cycles": 10000, + "l2d_type" : "datacache", + "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 8 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json new file mode 100644 index 00000000..58519aad --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json @@ -0,0 +1,19 @@ +{ + "num_cores" 
: 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :700, + "dram_channels": 16, + "dram_req_size_byte": 32, + + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json new file mode 100644 index 00000000..1257891c --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json @@ -0,0 +1,18 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycless": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json new file mode 100644 index 00000000..b92d8029 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + 
"icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json new file mode 100644 index 00000000..34896fc7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json new file mode 100644 index 00000000..59be9fd4 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1050, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 4, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :1200, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "l2d_type" : "datacache", + "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1050, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json new file mode 100644 index 
00000000..271e7e1c --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt" +} diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json similarity index 70% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json rename to TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json index d51e9c5f..7382c4c8 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json +++ b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json @@ -1,26 +1,25 @@ { "num_cores" : 2, - "core_freq" : 940, + "core_freq_mhz" : 940, "sram_size" : 65536, "core_print_interval" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", "dram_freq" : 940, - "dram_channels": 32, + "dram_channels": 8, "dram_req_size": 32, "dram_latency" : 10, - "dram_size" : 32, "dram_nbl" : 2, "dram_print_interval": 10000, "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - + "icnt_type" : "booksim2", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt", - + "icnt_latency" : 1, + "icnt_freq" : 940, + "icnt_injection_ports_per_core" : 16, + "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m8.icnt", + "precision" : 4, "scheduler" : "simple", 
"num_partition" : 2, diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json new file mode 100644 index 00000000..6561ffc0 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "dram_num_partitions" : 2, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", + "icnt_stats_print_period_cycles" : 10000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json new file mode 100644 index 00000000..fad63cc3 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json @@ -0,0 +1,20 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "dram_num_partitions" : 1, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json new 
file mode 100644 index 00000000..2207f2b9 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json @@ -0,0 +1,18 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json new file mode 100644 index 00000000..76f51b40 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json new file mode 100644 index 00000000..42e003c7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json @@ -0,0 +1,25 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + 
"dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +} diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json new file mode 100644 index 00000000..44ec72fe --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1050, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 4, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :1200, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "l2d_type" : "datacache", + "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1050, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json new file mode 100644 index 00000000..045407b7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git 
a/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json new file mode 100644 index 00000000..d8f95d70 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 2, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json new file mode 100644 index 00000000..a5fa9585 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 4, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_booksim.json b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json new file mode 100644 index 00000000..cf560171 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + 
"icnt_type" : "booksim2", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json new file mode 100644 index 00000000..8da61d72 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json new file mode 100644 index 00000000..c5f429f9 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json @@ -0,0 +1,18 @@ +{ + "core_type" : ["ws_mesh","ws_mesh"], + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json new file mode 100644 index 00000000..254520be --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 2, + 
"dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json new file mode 100644 index 00000000..e39867a7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 4, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/PyTorchSimBackend/extern/booksim b/TOGSim/extern/booksim similarity index 100% rename from PyTorchSimBackend/extern/booksim rename to TOGSim/extern/booksim diff --git a/PyTorchSimBackend/extern/onnx b/TOGSim/extern/onnx similarity index 100% rename from PyTorchSimBackend/extern/onnx rename to TOGSim/extern/onnx diff --git a/PyTorchSimBackend/extern/protobuf b/TOGSim/extern/protobuf similarity index 100% rename from PyTorchSimBackend/extern/protobuf rename to TOGSim/extern/protobuf diff --git a/PyTorchSimBackend/extern/ramulator2 b/TOGSim/extern/ramulator2 similarity index 100% rename from PyTorchSimBackend/extern/ramulator2 rename to TOGSim/extern/ramulator2 diff --git a/PyTorchSimBackend/extern/ramulator_custom/.gitignore b/TOGSim/extern/ramulator_custom/.gitignore similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/.gitignore rename to TOGSim/extern/ramulator_custom/.gitignore diff --git a/PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt 
b/TOGSim/extern/ramulator_custom/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt rename to TOGSim/extern/ramulator_custom/CMakeLists.txt diff --git a/PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp b/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp rename to TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp b/TOGSim/extern/ramulator_custom/src/Config.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp rename to TOGSim/extern/ramulator_custom/src/Config.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.h b/TOGSim/extern/ramulator_custom/src/Config.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.h rename to TOGSim/extern/ramulator_custom/src/Config.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Controller.h b/TOGSim/extern/ramulator_custom/src/Controller.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Controller.h rename to TOGSim/extern/ramulator_custom/src/Controller.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp b/TOGSim/extern/ramulator_custom/src/DDR4.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp rename to TOGSim/extern/ramulator_custom/src/DDR4.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h b/TOGSim/extern/ramulator_custom/src/DDR4.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h rename to TOGSim/extern/ramulator_custom/src/DDR4.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h b/TOGSim/extern/ramulator_custom/src/DRAM.h similarity index 100% rename from 
PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h rename to TOGSim/extern/ramulator_custom/src/DRAM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp b/TOGSim/extern/ramulator_custom/src/HBM.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp rename to TOGSim/extern/ramulator_custom/src/HBM.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.h b/TOGSim/extern/ramulator_custom/src/HBM.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.h rename to TOGSim/extern/ramulator_custom/src/HBM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Memory.h b/TOGSim/extern/ramulator_custom/src/Memory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Memory.h rename to TOGSim/extern/ramulator_custom/src/Memory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp b/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h b/TOGSim/extern/ramulator_custom/src/MemoryFactory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp b/TOGSim/extern/ramulator_custom/src/Ramulator.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp rename to TOGSim/extern/ramulator_custom/src/Ramulator.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp b/TOGSim/extern/ramulator_custom/src/Refresh.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp rename to TOGSim/extern/ramulator_custom/src/Refresh.cpp diff --git 
a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h b/TOGSim/extern/ramulator_custom/src/Refresh.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h rename to TOGSim/extern/ramulator_custom/src/Refresh.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp b/TOGSim/extern/ramulator_custom/src/Request.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp rename to TOGSim/extern/ramulator_custom/src/Request.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.h b/TOGSim/extern/ramulator_custom/src/Request.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.h rename to TOGSim/extern/ramulator_custom/src/Request.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h b/TOGSim/extern/ramulator_custom/src/Scheduler.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h rename to TOGSim/extern/ramulator_custom/src/Scheduler.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h b/TOGSim/extern/ramulator_custom/src/SpeedyController.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h rename to TOGSim/extern/ramulator_custom/src/SpeedyController.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp b/TOGSim/extern/ramulator_custom/src/StatType.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp rename to TOGSim/extern/ramulator_custom/src/StatType.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.h b/TOGSim/extern/ramulator_custom/src/StatType.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.h rename to TOGSim/extern/ramulator_custom/src/StatType.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h 
b/TOGSim/extern/ramulator_custom/src/Statistics.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h rename to TOGSim/extern/ramulator_custom/src/Statistics.h diff --git a/PyTorchSimBackend/extern/stonneCore b/TOGSim/extern/stonneCore similarity index 100% rename from PyTorchSimBackend/extern/stonneCore rename to TOGSim/extern/stonneCore diff --git a/PyTorchSimBackend/include/Cache.h b/TOGSim/include/Cache.h similarity index 100% rename from PyTorchSimBackend/include/Cache.h rename to TOGSim/include/Cache.h diff --git a/PyTorchSimBackend/include/Cache_defs.h b/TOGSim/include/Cache_defs.h similarity index 100% rename from PyTorchSimBackend/include/Cache_defs.h rename to TOGSim/include/Cache_defs.h diff --git a/PyTorchSimBackend/include/Cache_stats.h b/TOGSim/include/Cache_stats.h similarity index 100% rename from PyTorchSimBackend/include/Cache_stats.h rename to TOGSim/include/Cache_stats.h diff --git a/PyTorchSimBackend/include/Common.h b/TOGSim/include/Common.h similarity index 100% rename from PyTorchSimBackend/include/Common.h rename to TOGSim/include/Common.h diff --git a/PyTorchSimBackend/include/Core.h b/TOGSim/include/Core.h similarity index 85% rename from PyTorchSimBackend/include/Core.h rename to TOGSim/include/Core.h index a3d55fa2..e4d2f30a 100644 --- a/PyTorchSimBackend/include/Core.h +++ b/TOGSim/include/Core.h @@ -9,7 +9,7 @@ #include "Dram.h" #include "Tile.h" #include "SimulationConfig.h" -#include "TMA.h" +#include "DMA.h" class Core { public: @@ -27,9 +27,9 @@ class Core { virtual void pop_memory_request(); virtual mem_fetch* top_memory_request() { return _request_queue.front(); } virtual void push_memory_response(mem_fetch* response); - void check_tag() { _tma.check_table(); } - void inc_numa_hit() { _stat_numa_hit++; } - void inc_numa_miss() { _stat_numa_miss++; } + void check_tag() { _dma.check_table(); } + void inc_numa_local_access() { _stat_numa_local_access++; } + void inc_numa_remote_access() { 
_stat_numa_remote_access++; } std::queue>& get_compute_pipeline(int compute_type); enum { @@ -50,20 +50,18 @@ class Core { /* Core id & config file */ const uint32_t _id; const SimulationConfig _config; - size_t _sram_size; - size_t _used_sram_size; uint32_t _num_systolic_array_per_core; uint32_t _systolic_array_rr = 0; - /* TMA Unit */ - TMA _tma; + /* DMA Unit */ + DMA _dma; /* cycle */ cycle_type _core_cycle; cycle_type _stat_tot_vu_compute_cycle = 0; std::vector _stat_tot_sa_compute_cycle; - cycle_type _stat_tot_tma_cycle = 0; - cycle_type _stat_tot_tma_idle_cycle = 0; + cycle_type _stat_tot_dma_cycle = 0; + cycle_type _stat_tot_dma_idle_cycle = 0; cycle_type _stat_tot_vu_compute_idle_cycle = 0; std::vector _stat_tot_sa_compute_idle_cycle; std::vector _stat_inst_count; @@ -71,13 +69,13 @@ class Core { uint64_t _stat_tot_mem_response = 0; uint64_t _stat_gemm_inst = 0; uint64_t _stat_skip_dma = 0; - uint64_t _stat_numa_hit = 0; - uint64_t _stat_numa_miss = 0; + uint64_t _stat_numa_local_access = 0; + uint64_t _stat_numa_remote_access = 0; cycle_type _stat_vu_compute_cycle = 0; std::vector _stat_sa_compute_cycle; - cycle_type _stat_tma_cycle = 0; - cycle_type _stat_tma_idle_cycle = 0; + cycle_type _stat_dma_cycle = 0; + cycle_type _stat_dma_idle_cycle = 0; cycle_type _stat_vu_compute_idle_cycle = 0; std::vector _stat_sa_compute_idle_cycle; uint64_t _stat_mem_response = 0; diff --git a/PyTorchSimBackend/include/TMA.h b/TOGSim/include/DMA.h similarity index 94% rename from PyTorchSimBackend/include/TMA.h rename to TOGSim/include/DMA.h index f8355470..2f41c6f3 100644 --- a/PyTorchSimBackend/include/TMA.h +++ b/TOGSim/include/DMA.h @@ -1,8 +1,9 @@ -#ifndef TMA_H -#define TMA_H +#ifndef DMA_H +#define DMA_H #include #include +#include #include #include #include "Instruction.h" @@ -16,9 +17,9 @@ struct VectorCompare { } }; -class TMA { +class DMA { public: - TMA(uint32_t id, uint32_t dram_req_size); + DMA(uint32_t id, uint32_t dram_req_size); void 
issue_tile(std::shared_ptr inst); bool is_finished() { return _finished; } @@ -114,7 +115,7 @@ class TMA { } std::shared_ptr& get_current_inst() { return _current_inst; } - std::shared_ptr> get_memory_access(); + std::shared_ptr> get_memory_access(cycle_type core_cycle, int nr_req); uint32_t generate_mem_access_id(); const uint32_t get_max_dim() { return _max_dim; } @@ -130,5 +131,7 @@ class TMA { bool _finished=true; std::map, uint32_t>> tag_table; std::map, std::vector>>> waiters; + std::queue _pending_accesses; + bool _generated_once = false; }; #endif \ No newline at end of file diff --git a/PyTorchSimBackend/include/DelayQueue.h b/TOGSim/include/DelayQueue.h similarity index 100% rename from PyTorchSimBackend/include/DelayQueue.h rename to TOGSim/include/DelayQueue.h diff --git a/PyTorchSimBackend/include/Dram.h b/TOGSim/include/Dram.h similarity index 99% rename from PyTorchSimBackend/include/Dram.h rename to TOGSim/include/Dram.h index 5e51b96d..d28ac25f 100644 --- a/PyTorchSimBackend/include/Dram.h +++ b/TOGSim/include/Dram.h @@ -6,7 +6,7 @@ #include #include "Common.h" -#include "TMA.h" +#include "DMA.h" #include "ramulator2.hh" #include "Hashing.h" #include "Cache.h" diff --git a/PyTorchSimBackend/include/Hashing.h b/TOGSim/include/Hashing.h similarity index 100% rename from PyTorchSimBackend/include/Hashing.h rename to TOGSim/include/Hashing.h diff --git a/PyTorchSimBackend/include/Instruction.h b/TOGSim/include/Instruction.h similarity index 96% rename from PyTorchSimBackend/include/Instruction.h rename to TOGSim/include/Instruction.h index 4c14dd81..9fad13f4 100644 --- a/PyTorchSimBackend/include/Instruction.h +++ b/TOGSim/include/Instruction.h @@ -60,9 +60,7 @@ class Instruction : public std::enable_shared_from_this { std::vector get_trace_address() { return _trace_address; } bool load_indirect_index(const std::string& path, uint64_t*& indirect_index, const std::vector& tile_size); void set_trace_address(std::vector& trace_address) { _trace_address = 
trace_address; } - size_t get_free_sram_size() { return _free_sram_size; } addr_type get_base_dram_address() { return dram_addr; } - void set_free_sram_size(size_t sram_size) { _free_sram_size=sram_size; } void* get_owner() { return _owner; } void set_owner(void *owner) { _owner = owner;} void set_owner_ready_queue(std::list>* q) { _owner_ready_queue_ref = q; } @@ -103,7 +101,6 @@ class Instruction : public std::enable_shared_from_this { size_t _tile_numel; size_t _nr_waiting_request=0; size_t _precision=0; - size_t _free_sram_size=0; addr_type dram_addr; uint32_t _numa_id = 0; // For DMA instruction int _compute_type = 0; diff --git a/PyTorchSimBackend/include/Interconnect.h b/TOGSim/include/Interconnect.h similarity index 95% rename from PyTorchSimBackend/include/Interconnect.h rename to TOGSim/include/Interconnect.h index 8467b7aa..e6b325d0 100644 --- a/PyTorchSimBackend/include/Interconnect.h +++ b/TOGSim/include/Interconnect.h @@ -1,6 +1,6 @@ #ifndef INTERCONNECT_H #define INTERCONNECT_H -#include "TMA.h" +#include "DMA.h" #include "booksim2/Interconnect.hpp" #include #include @@ -51,8 +51,9 @@ class SimpleInterconnect : public Interconnect { mem_fetch* access; }; - std::vector> _in_buffers; + std::vector>> _in_buffers; std::vector> _out_buffers; + std::vector _rr_next_src; std::vector _busy_node; }; diff --git a/PyTorchSimBackend/include/IntervalTree.h b/TOGSim/include/IntervalTree.h similarity index 100% rename from PyTorchSimBackend/include/IntervalTree.h rename to TOGSim/include/IntervalTree.h diff --git a/PyTorchSimBackend/include/L2Cache.h b/TOGSim/include/L2Cache.h similarity index 100% rename from PyTorchSimBackend/include/L2Cache.h rename to TOGSim/include/L2Cache.h diff --git a/PyTorchSimBackend/include/Memfetch.h b/TOGSim/include/Memfetch.h similarity index 100% rename from PyTorchSimBackend/include/Memfetch.h rename to TOGSim/include/Memfetch.h diff --git a/PyTorchSimBackend/include/Model.h b/TOGSim/include/Model.h similarity index 100% rename from 
PyTorchSimBackend/include/Model.h rename to TOGSim/include/Model.h diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h similarity index 80% rename from PyTorchSimBackend/include/SimulationConfig.h rename to TOGSim/include/SimulationConfig.h index 8f011d00..64cfa223 100644 --- a/PyTorchSimBackend/include/SimulationConfig.h +++ b/TOGSim/include/SimulationConfig.h @@ -18,8 +18,7 @@ struct SimulationConfig { std::vector core_type; std::string stonne_config_path; uint32_t num_cores; - uint32_t core_freq; - uint32_t sram_size; + uint32_t core_freq_mhz; uint32_t core_print_interval = 0; uint32_t num_systolic_array_per_core = 1; uint32_t num_stonne_per_core = 1; @@ -28,7 +27,8 @@ struct SimulationConfig { /* DRAM config */ DramType dram_type; uint32_t dram_num_partitions = 1; - uint32_t dram_freq; + uint32_t dram_channels_per_partitions = 0; + uint32_t dram_freq_mhz; uint32_t dram_channels; uint32_t dram_req_size; uint32_t dram_latency; @@ -43,21 +43,20 @@ struct SimulationConfig { /* ICNT config */ IcntType icnt_type; - uint32_t icnt_node_per_core = 1; + uint32_t icnt_injection_ports_per_core = 1; std::string icnt_config_path; - uint32_t icnt_freq; + uint32_t icnt_freq_mhz; uint32_t icnt_latency; - uint32_t icnt_print_interval=0; + uint32_t icnt_stats_print_period_cycles=0; /* Sheduler config */ - uint32_t num_patition=1; + uint32_t num_partition=1; std::string scheduler_type; /* Core id, Partiton id mapping */ std::map partiton_map; /* Other configs */ - uint32_t precision; std::string layout; uint64_t align_address(uint64_t addr) { @@ -65,6 +64,6 @@ struct SimulationConfig { } float max_dram_bandwidth() { - return dram_freq * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s + return dram_freq_mhz * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s } }; \ No newline at end of file diff --git a/PyTorchSimBackend/include/Simulator.h b/TOGSim/include/Simulator.h similarity index 100% rename from 
PyTorchSimBackend/include/Simulator.h rename to TOGSim/include/Simulator.h diff --git a/PyTorchSimBackend/include/SparseCore.h b/TOGSim/include/SparseCore.h similarity index 100% rename from PyTorchSimBackend/include/SparseCore.h rename to TOGSim/include/SparseCore.h diff --git a/PyTorchSimBackend/include/Tile.h b/TOGSim/include/Tile.h similarity index 100% rename from PyTorchSimBackend/include/Tile.h rename to TOGSim/include/Tile.h diff --git a/PyTorchSimBackend/include/TileGraph.h b/TOGSim/include/TileGraph.h similarity index 100% rename from PyTorchSimBackend/include/TileGraph.h rename to TOGSim/include/TileGraph.h diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h similarity index 100% rename from PyTorchSimBackend/include/TileGraphParser.h rename to TOGSim/include/TileGraphParser.h diff --git a/PyTorchSimBackend/include/scheduler/Scheduler.h b/TOGSim/include/scheduler/Scheduler.h similarity index 100% rename from PyTorchSimBackend/include/scheduler/Scheduler.h rename to TOGSim/include/scheduler/Scheduler.h diff --git a/PyTorchSimBackend/src/CMakeLists.txt b/TOGSim/src/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/src/CMakeLists.txt rename to TOGSim/src/CMakeLists.txt diff --git a/PyTorchSimBackend/src/Cache.cc b/TOGSim/src/Cache.cc similarity index 100% rename from PyTorchSimBackend/src/Cache.cc rename to TOGSim/src/Cache.cc diff --git a/PyTorchSimBackend/src/Cache_stats.cc b/TOGSim/src/Cache_stats.cc similarity index 100% rename from PyTorchSimBackend/src/Cache_stats.cc rename to TOGSim/src/Cache_stats.cc diff --git a/PyTorchSimBackend/src/Common.cc b/TOGSim/src/Common.cc similarity index 73% rename from PyTorchSimBackend/src/Common.cc rename to TOGSim/src/Common.cc index 5581f8bd..b5c092b3 100644 --- a/PyTorchSimBackend/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -39,15 +39,14 @@ SimulationConfig initialize_config(json config) { for (int i=0; i(config, "core_print_interval"); + 
parsed_config.core_print_interval = get_config_value(config, "core_stats_print_period_cycles"); /* Stonne config */ if (config.contains("stonne_config_path")) @@ -63,20 +62,27 @@ SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented dram type {} ", (std::string)config["dram_type"])); - parsed_config.dram_freq = config["dram_freq"]; + parsed_config.dram_freq_mhz = config["dram_freq_mhz"]; if (config.contains("dram_latency")) parsed_config.dram_latency = config["dram_latency"]; - if (config.contains("dram_config_path")) - parsed_config.dram_config_path = config["dram_config_path"]; + if (config.contains("ramulator_config_path")) + parsed_config.dram_config_path = config["ramulator_config_path"]; parsed_config.dram_channels = config["dram_channels"]; - if (config.contains("dram_req_size")) - parsed_config.dram_req_size = config["dram_req_size"]; - if (config.contains("dram_print_interval")) - parsed_config.dram_print_interval = config["dram_print_interval"]; - if(config.contains("dram_nbl")) - parsed_config.dram_nbl = config["dram_nbl"]; - if (config.contains("dram_num_partitions")) + if (config.contains("dram_req_size_byte")) + parsed_config.dram_req_size = config["dram_req_size_byte"]; + if (config.contains("dram_stats_print_period_cycles")) + parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"]; + if(config.contains("dram_num_burst_length")) + parsed_config.dram_nbl = config["dram_num_burst_length"]; + if (config.contains("dram_num_partitions")) { parsed_config.dram_num_partitions = config["dram_num_partitions"]; + if (parsed_config.dram_channels % parsed_config.dram_num_partitions != 0) { + throw std::runtime_error("[Config] DRAM channels must be divisible by dram_num_partitions"); + } + } + parsed_config.dram_channels_per_partitions = + parsed_config.dram_channels / parsed_config.dram_num_partitions; + /* L2D config */ if (config.contains("l2d_type")) { @@ -104,19 +110,20 @@ 
SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented icnt type {} ", (std::string)config["icnt_type"])); - parsed_config.icnt_freq = config["icnt_freq"]; + parsed_config.icnt_freq_mhz = config["icnt_freq_mhz"]; if (config.contains("icnt_latency")) parsed_config.icnt_latency = config["icnt_latency"]; - if (config.contains("icnt_config_path")) - parsed_config.icnt_config_path = config["icnt_config_path"]; - if (config.contains("icnt_print_interval")) - parsed_config.icnt_print_interval = config["icnt_print_interval"]; - if (config.contains("icnt_node_per_core")) - parsed_config.icnt_node_per_core = config["icnt_node_per_core"]; + if (config.contains("booksim_config_path")) + parsed_config.icnt_config_path = config["booksim_config_path"]; + if (config.contains("icnt_stats_print_period_cycles")) + parsed_config.icnt_stats_print_period_cycles = config["icnt_stats_print_period_cycles"]; + if (config.contains("icnt_injection_ports_per_core")) + parsed_config.icnt_injection_ports_per_core = config["icnt_injection_ports_per_core"]; - parsed_config.scheduler_type = config["scheduler"]; + if (config.contains("scheduler")) + parsed_config.scheduler_type = config["scheduler"]; if (config.contains("num_partition")) - parsed_config.num_patition = config["num_partition"]; + parsed_config.num_partition = config["num_partition"]; if (config.contains("partition")) { for (int i=0; i& op) { void Core::issue(std::shared_ptr op) { if (op->get_instructions().size()){ - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}, Free size: {}", - _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size(), - op->get_instructions().back()->get_free_sram_size()); - } else { - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}", - _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size()); + spdlog::trace("[{}][Core {}][TILE_SCHEDULED]", + 
_core_cycle, _id); } - //_used_sram_size += op->get_required_sram_size(); for (const auto& inst : op->get_instructions()) { if (inst->is_ready()) op->enqueue_ready(inst); @@ -125,39 +118,38 @@ void Core::dma_cycle() { /* Set tag table of async dma load */ if (instruction->is_dma_read() && instruction->is_async_dma()) { auto& key = instruction->get_tag_id(); - assert(!_tma.get_tag_finish(instruction->subgraph_id, key)); - _tma.set_tag_finish(instruction->subgraph_id, key); - spdlog::trace("[Core {}][{}] {} ASYNC FINISHED, Used sram: {}, Release sram: {}, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(instruction->get_opcode()), - _used_sram_size, instruction->get_free_sram_size(), + assert(!_dma.get_tag_finish(instruction->subgraph_id, key)); + _dma.set_tag_finish(instruction->subgraph_id, key); + spdlog::trace("[{}][Core {}] {} ASYNC FINISHED, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(instruction->get_opcode()), instruction->subgraph_id, instruction->get_addr_name(), fmt::format("[{}]", fmt::join(instruction->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", "))); - for (auto & wait_inst : _tma.get_tag_waiter(instruction->subgraph_id, key)) { - _tma.mark_tag_used(instruction->subgraph_id, key); + for (auto & wait_inst : _dma.get_tag_waiter(instruction->subgraph_id, key)) { + _dma.mark_tag_used(instruction->subgraph_id, key); finish_instruction(wait_inst); } } _dma_finished_queue.erase(_dma_finished_queue.begin()); } - if (_tma.is_finished()) { + if (_dma.is_finished()) { /* Finish instruction when it is DMA store */ - if (_tma.get_current_inst() != nullptr) { - std::shared_ptr finished_inst = std::move(_tma.get_current_inst()); + if (_dma.get_current_inst() != nullptr) { + std::shared_ptr finished_inst = 
std::move(_dma.get_current_inst()); if (finished_inst->is_dma_write()) { /* Only DMA write operation is finished! */ finish_instruction(finished_inst); } else if (finished_inst->is_dma_read() && finished_inst->is_async_dma()) { /* Register tag table for async dma load */ - _tma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); + _dma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); finish_instruction(finished_inst); } else if(!finished_inst->is_dma_read()) { - spdlog::error("[Core {}][{}] TMA instruction in not valid", _id, _core_cycle); + spdlog::error("[{}][Core {}] DMA instruction is not valid", _core_cycle, _id); exit(EXIT_FAILURE); } else if (finished_inst->get_opcode() == Opcode::BAR) { - spdlog::trace("[Core {}][{}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(), fmt::format("[{}]", fmt::join(finished_inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(finished_inst->get_tag_idx_list(), ", ")), @@ -170,27 +162,27 @@ void Core::dma_cycle() { /* Issue new DMA operation */ if (!_ld_inst_queue.empty()) { std::shared_ptr inst = _ld_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _ld_inst_queue.pop(); } else if (!_st_inst_queue.empty()) { std::shared_ptr inst = _st_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _st_inst_queue.pop(); } else { - /* TMA is idle */ - _stat_tma_idle_cycle++; + /* DMA is idle */ + _stat_dma_idle_cycle++; return; } } /* Generate memfetch */ - auto access_vec = _tma.get_memory_access(); + auto access_vec = _dma.get_memory_access(_core_cycle, _config.icnt_injection_ports_per_core); for (auto access : *access_vec) { access->set_start_cycle(_core_cycle); _request_queue.push(access); } - /* Increase tma 
stat cycle */ - _stat_tma_cycle++; + /* Increase dma stat cycle */ + _stat_dma_cycle++; } void Core::cycle() { @@ -218,20 +210,20 @@ void Core::cycle() { /* Check another MOVIN with same tag is issued */ auto& key = inst->get_tag_id(); if (inst->is_sparse_inst()) { - _tma.register_tag(inst->subgraph_id, key); - _tma.set_tag_sparse(inst->subgraph_id, key); + _dma.register_tag(inst->subgraph_id, key); + _dma.set_tag_sparse(inst->subgraph_id, key); finish_instruction(inst); issued = true; _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; - } else if (inst->is_async_dma() && _tma.tag_key_exist(inst->subgraph_id, key)) { - bool finished = _tma.get_tag_finish(inst->subgraph_id, key); + } else if (inst->is_async_dma() && _dma.tag_key_exist(inst->subgraph_id, key)) { + bool finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished) finish_instruction(inst); else - _tma.register_tag_waiter(inst->subgraph_id, key, inst); - spdlog::trace("[Core {}][{}] {} SKIPPED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + _dma.register_tag_waiter(inst->subgraph_id, key, inst); + spdlog::trace("[{}][Core {}][SIKIPPED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -240,8 +232,8 @@ void Core::cycle() { _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; } else { - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + 
opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -252,8 +244,12 @@ void Core::cycle() { } } case Opcode::MOVOUT: - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), + inst->get_addr_name(), + fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); _st_inst_queue.push(inst); issued = true; break; @@ -269,13 +265,14 @@ void Core::cycle() { inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle; inst->bubble_cycle = bubble_cycle; } + if (inst->get_compute_cycle() == 0) { inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; instructions.erase(it); } else { - spdlog::trace("[Core {}][SA {}][{}] {}-{} ISSUED, finsh at {}", _id, _systolic_array_rr, _core_cycle, + spdlog::trace("[{}][Core {}][INST_ISSUED][SA {}] {}-{}, finsh at {}", _core_cycle, _id, _systolic_array_rr, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -288,7 +285,7 @@ void Core::cycle() { case Opcode::BAR: { auto& key = inst->get_tag_id(); - uint32_t finished = _tma.get_tag_finish(inst->subgraph_id, key); + uint32_t finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished == -1) { for (auto child_inst : inst->get_child_inst()) { if (child_inst->get_opcode() == Opcode::COMP && child_inst->get_compute_type() == MATMUL) { @@ -297,12 +294,12 @@ 
void Core::cycle() { } finish_instruction(inst); } else if (finished != 0) { - _tma.mark_tag_used(inst->subgraph_id, key); + _dma.mark_tag_used(inst->subgraph_id, key); finish_instruction(inst); } else { - _tma.register_tag_waiter(inst->subgraph_id, key, inst); + _dma.register_tag_waiter(inst->subgraph_id, key, inst); } - spdlog::trace("[Core {}][{}] {} ISSUED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -344,31 +341,26 @@ void Core::cycle() { } void Core::finish_instruction(std::shared_ptr& inst) { - size_t free_sram_size = inst->get_free_sram_size(); if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already finished!!", _id, _core_cycle, + spdlog::error("[{}][Core {}][ERROR] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::trace("[Core {}][{}] {}-{} FINISHED, Used sram: {}, Release sram: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), - _used_sram_size, inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}][INST_FINISHED] {}-{}", + _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_compute_type()); } else if (inst->get_opcode() != Opcode::BAR && inst->is_async_dma()){ - spdlog::trace("[Core {}][{}] {} ASYNC REGISTERED, Used sram: {}, Release sram: {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), _used_sram_size, - inst->get_free_sram_size(), 
inst->subgraph_id, inst->get_addr_name(), + spdlog::trace("[{}][Core {}][ASYNC] {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->subgraph_id, inst->get_addr_name(), inst->get_tag_id(), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); } else if ((inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) && !inst->is_async_dma()) { - spdlog::trace("[Core {}][{}] {} FINISHED, free_sram_size: {} addr_name: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), - inst->get_addr_name()); + spdlog::trace("[{}][Core {}][INST_FINISHED] {} addr_name: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name()); } - //_used_sram_size -= free_sram_size; } bool Core::running() { @@ -378,7 +370,7 @@ bool Core::running() { for (int i=0; i<_num_systolic_array_per_core;i++) running = running || !_sa_compute_pipeline.at(i).empty(); running = running || !_dma_waiting_queue.empty() || !_dma_finished_queue.empty(); - running = running || !_tma.empty(); + running = running || !_dma.empty(); running = running || !_ld_inst_queue.empty(); running = running || !_st_inst_queue.empty(); return running; @@ -419,43 +411,62 @@ void Core::print_stats() { std::vector sa_utilization; update_stats(); spdlog::info("===== Instructions count ====="); - for (int i=0; i < static_cast(Opcode::COUNT); i++) { - if (i == static_cast(Opcode::COMP)) - spdlog::info("Core [{}] : {} inst count {} (GEMM: {}, Vector: {}), skipped inst count {}", _id, opcode_to_string(static_cast(i)), _stat_inst_count.at(i), _stat_gemm_inst, _stat_inst_count.at(i) - _stat_gemm_inst, _stat_tot_skipped_inst.at(i)); - else - spdlog::info("Core [{}] : {} inst count {}, skipped inst count {}", _id, opcode_to_string(static_cast(i)), _stat_inst_count.at(i), 
_stat_tot_skipped_inst.at(i)); + for (int i = 0; i < static_cast(Opcode::COUNT); i++) { + auto opcode = static_cast(i); + auto inst = _stat_inst_count.at(i); + auto skipped = _stat_tot_skipped_inst.at(i); + auto name = opcode_to_string(opcode); + + if (opcode == Opcode::COMP) { + auto gemm = _stat_gemm_inst; + auto vector = inst - gemm; + if (skipped) + spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {}), skipped inst_count {}", + _id, name, inst, gemm, vector, skipped); + else + spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {})", + _id, name, inst, gemm, vector); + } + else { + if (skipped) + spdlog::info("Core [{}] : {:8} inst_count {}, skipped inst_count {}", + _id, name, inst, skipped); + else + spdlog::info("Core [{}] : {:8} inst_count {}", + _id, name, inst); + } } spdlog::info("========= Core stat ========="); for (int i=0; i<_num_systolic_array_per_core; i++) sa_utilization.push_back(static_cast(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i), _stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i)); - float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq / (_core_cycle * 1000); // B/cycle - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle, dram_bw, _stat_tot_mem_response); - spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, + float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq_mhz / (_core_cycle * 1000); // B/cycle + spdlog::info("Core [{}] : DMA active_cycles, {} DMA idle_cycles {}, 
DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response); + spdlog::info("Core [{}] : Vector unit utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, static_cast(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Numa hit count : {}, Numa miss count : {}", _id, _stat_numa_hit, _stat_numa_miss); - spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("Core [{}] : NUMA local memory: {} requests, remote memory: {} requests", _id, _stat_numa_local_access, _stat_numa_remote_access); + spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); } void Core::print_current_stats() { std::vector sa_utilization; for (int i=0; i<_num_systolic_array_per_core; i++) sa_utilization.push_back(static_cast(_stat_sa_compute_cycle.at(i) * 100) / _config.core_print_interval); - float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq / (_config.core_print_interval * 1000); // B/cycle + float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq_mhz / (_config.core_print_interval * 1000); // B/cycle auto level = spdlog::level::info; if(_id != 0) level = spdlog::level::debug; spdlog::info("========= Core stat ========="); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i), _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i)); - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tma_cycle, _stat_tma_idle_cycle, dram_bw, _stat_mem_response); - spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", 
_id, + spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response); + spdlog::info("Core [{}] : Vector unit Utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, static_cast(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); update_stats(); } @@ -468,13 +479,13 @@ void Core::update_stats() { } _stat_tot_vu_compute_cycle += _stat_vu_compute_cycle; - _stat_tot_tma_cycle += _stat_tma_cycle; - _stat_tot_tma_idle_cycle += _stat_tma_idle_cycle; + _stat_tot_dma_cycle += _stat_dma_cycle; + _stat_tot_dma_idle_cycle += _stat_dma_idle_cycle; _stat_tot_mem_response += +_stat_mem_response; _stat_vu_compute_cycle = 0; - _stat_tma_cycle = 0; - _stat_tma_idle_cycle = 0; + _stat_dma_cycle = 0; + _stat_dma_idle_cycle = 0; _stat_vu_compute_idle_cycle = 0; _stat_mem_response = 0; } \ No newline at end of file diff --git a/TOGSim/src/DMA.cc b/TOGSim/src/DMA.cc new file mode 100644 index 00000000..f8f21025 --- /dev/null +++ b/TOGSim/src/DMA.cc @@ -0,0 +1,83 @@ +#include "DMA.h" +#include "TileGraph.h" + +DMA::DMA(uint32_t id, uint32_t dram_req_size) { + _id = id; + _dram_req_size = dram_req_size; + _current_inst = nullptr; + _finished = true; +} + +void DMA::issue_tile(std::shared_ptr inst) { + _current_inst = std::move(inst); + std::vector& tile_size = _current_inst->get_tile_size(); + if (tile_size.size() <= 0 || tile_size.size() > get_max_dim()) { + spdlog::error("[DMA {}] issued tile is not supported format..", _id); + exit(EXIT_FAILURE); + } + _finished = false; +} + +std::shared_ptr> DMA::get_memory_access(cycle_type core_cycle, int nr_req) { + + if (!_generated_once) { + std::shared_ptr> addr_set = + _current_inst->get_dram_address(_dram_req_size); + + Tile* owner = 
(Tile*)_current_inst->get_owner(); + std::shared_ptr owner_subgraph = owner->get_owner(); + unsigned long long base_daddr = _current_inst->get_base_dram_address(); + + bool is_cacheable = + owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); + + spdlog::trace("[{}][Core {}][SRAM] Address: 0x{:016x}, Is_cacheable: {}", + core_cycle, _id, base_daddr, is_cacheable); + spdlog::trace("[{}][Core {}][NUMA] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", + core_cycle, _id, owner_subgraph->get_core_id(), + _current_inst->get_numa_id(), _current_inst->get_addr_name(), + _current_inst->is_dma_write()); + for (const auto& addr : *addr_set) { + mem_access_type acc_type = + _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W + : mem_access_type::GLOBAL_ACC_R; + mf_type type = + _current_inst->is_dma_write() ? mf_type::WRITE_REQUEST + : mf_type::READ_REQUEST; + + mem_fetch* access = new mem_fetch( + addr, acc_type, type, _dram_req_size, + _current_inst->get_numa_id(), + static_cast(_current_inst.get())); + + access->set_cacheable(is_cacheable); + _current_inst->inc_waiting_request(); + _pending_accesses.push(access); + } + _generated_once = true; + } + + if (nr_req == -1) + nr_req = _pending_accesses.size(); + + // Return pending accesses up to nr_req + auto access_vec = std::make_shared>(); + for (int i = 0; i < nr_req; i++) { + if (_pending_accesses.empty()) + break; + access_vec->push_back(_pending_accesses.front()); + _pending_accesses.pop(); + } + + if (_pending_accesses.empty()) { + _finished = true; + _generated_once = false; + } + + return access_vec; +} + +uint32_t DMA::generate_mem_access_id() { + static uint32_t id_counter{0}; + return id_counter++; +} \ No newline at end of file diff --git a/PyTorchSimBackend/src/DelayQueue.cc b/TOGSim/src/DelayQueue.cc similarity index 100% rename from PyTorchSimBackend/src/DelayQueue.cc rename to TOGSim/src/DelayQueue.cc diff --git a/PyTorchSimBackend/src/Dram.cc b/TOGSim/src/Dram.cc similarity 
index 97% rename from PyTorchSimBackend/src/Dram.cc rename to TOGSim/src/Dram.cc index ab074bda..089c582e 100644 --- a/PyTorchSimBackend/src/Dram.cc +++ b/TOGSim/src/Dram.cc @@ -17,10 +17,10 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _n_bl = config.dram_nbl; _req_size = config.dram_req_size; _n_partitions = config.dram_num_partitions; - _n_ch_per_partition = _n_ch / _n_partitions; + _n_ch_per_partition = config.dram_channels_per_partitions; _config = config; - spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}", config.max_dram_bandwidth(), config.dram_freq, _n_ch, _req_size); + spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size); /* Initialize DRAM Channels */ for (int ch = 0; ch < _n_ch; ch++) { m_to_crossbar_queue.push_back(std::queue()); diff --git a/PyTorchSimBackend/src/Hashing.cc b/TOGSim/src/Hashing.cc similarity index 100% rename from PyTorchSimBackend/src/Hashing.cc rename to TOGSim/src/Hashing.cc diff --git a/PyTorchSimBackend/src/Instruction.cc b/TOGSim/src/Instruction.cc similarity index 100% rename from PyTorchSimBackend/src/Instruction.cc rename to TOGSim/src/Instruction.cc diff --git a/PyTorchSimBackend/src/Interconnect.cc b/TOGSim/src/Interconnect.cc similarity index 77% rename from PyTorchSimBackend/src/Interconnect.cc rename to TOGSim/src/Interconnect.cc index 8a684ff7..ab2d5d89 100644 --- a/PyTorchSimBackend/src/Interconnect.cc +++ b/TOGSim/src/Interconnect.cc @@ -4,12 +4,15 @@ SimpleInterconnect::SimpleInterconnect(SimulationConfig config) : _latency(config.icnt_latency) { _cycles = 0; _config = config; - _n_nodes = config.num_cores + config.dram_channels; + _n_nodes = config.num_cores * _config.icnt_injection_ports_per_core + config.dram_channels; _in_buffers.resize(_n_nodes); _out_buffers.resize(_n_nodes); _busy_node.resize(_n_nodes); + 
_rr_next_src.resize(_n_nodes); for(int node = 0; node < _n_nodes; node++) { _busy_node[node] = false; + _in_buffers.at(node).resize(_n_nodes); + _rr_next_src[node] = 0; } } @@ -19,35 +22,36 @@ bool SimpleInterconnect::running() { } void SimpleInterconnect::cycle() { - for(int node = 0; node < _n_nodes; node++) { - int src_node = (_rr_start + node ) % _n_nodes; - if(!_in_buffers[src_node].empty() && _in_buffers[src_node].front().finish_cycle <= _cycles) { - uint32_t dest = _in_buffers[src_node].front().dest; - if(!_busy_node[dest]) { - _out_buffers[dest].push(_in_buffers[src_node].front().access); - _in_buffers[src_node].pop(); - _busy_node[dest] = true; - // spdlog::trace("PUSH TO OUTBUFFER {} {}", src_node, dest); + for(int dest = 0; dest < _n_nodes; dest++) { + int src_start = _rr_next_src[dest]; + bool pushed = false; + + for(int i = 0; i < _n_nodes; i++) { + int src = (src_start + i) % _n_nodes; + + if (!_in_buffers[src][dest].empty() && + _in_buffers[src][dest].front().finish_cycle <= _cycles) { + + _out_buffers[dest].push(_in_buffers[src][dest].front().access); + _in_buffers[src][dest].pop(); + _rr_next_src[dest] = (src + 1) % _n_nodes; + pushed = true; + break; } } } - - for(int node = 0; node < _n_nodes; node++) { - _busy_node[node] = false; - } - _rr_start = (_rr_start + 1) % _n_nodes; _cycles++; } void SimpleInterconnect::push(uint32_t src, uint32_t dest, mem_fetch* request) { SimpleInterconnect::Entity entity; - if(_in_buffers[src].empty()) + if(_in_buffers[src][dest].empty()) entity.finish_cycle = _cycles + _latency; else - entity.finish_cycle = _in_buffers[src].back().finish_cycle + 1; + entity.finish_cycle = _in_buffers[src][dest].back().finish_cycle + 1; entity.dest = dest; entity.access = request; - _in_buffers[src].push(entity); + _in_buffers[src][dest].push(entity); } bool SimpleInterconnect::is_full(uint32_t nid, mem_fetch* request) { @@ -72,11 +76,11 @@ void SimpleInterconnect::pop(uint32_t nid) { 
Booksim2Interconnect::Booksim2Interconnect(SimulationConfig config) { _config = config; - _n_nodes = config.num_cores * _config.icnt_node_per_core + config.dram_channels; - spdlog::info("Initialize Booksim2"); + _n_nodes = config.num_cores * _config.icnt_injection_ports_per_core + config.dram_channels; + spdlog::info("Initialize Booksim2"); char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); std::string onnxim_path = onnxim_path_env != NULL? - std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); _config_path = fs::path(onnxim_path).append("configs").append((std::string)config.icnt_config_path).string(); spdlog::info("Config path : {}", _config_path); diff --git a/PyTorchSimBackend/src/L2Cache.cc b/TOGSim/src/L2Cache.cc similarity index 100% rename from PyTorchSimBackend/src/L2Cache.cc rename to TOGSim/src/L2Cache.cc diff --git a/PyTorchSimBackend/src/Simulator.cc b/TOGSim/src/Simulator.cc similarity index 89% rename from PyTorchSimBackend/src/Simulator.cc rename to TOGSim/src/Simulator.cc index 6bc80286..41a2c7a5 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -3,9 +3,9 @@ Simulator::Simulator(SimulationConfig config) : _config(config), _core_cycles(0) { // Create dram object - _core_period = 1000000 / (config.core_freq); - _icnt_period = 1000000 / (config.icnt_freq); - _dram_period = 1000000 / (config.dram_freq); + _core_period = 1000000 / (config.core_freq_mhz); + _icnt_period = 1000000 / (config.icnt_freq_mhz); + _dram_period = 1000000 / (config.dram_freq_mhz); _core_time = 0; _dram_time = 0; _icnt_time = 0; @@ -14,20 +14,20 @@ Simulator::Simulator(SimulationConfig config) _n_cores = config.num_cores; _n_memories = config.dram_channels; _memory_req_size = config.dram_req_size; - _noc_node_per_core = config.icnt_node_per_core; + _noc_node_per_core = config.icnt_injection_ports_per_core; char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); 
std::string onnxim_path = onnxim_path_env != NULL? - std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); // Create core objects _cores.resize(_n_cores); for (int core_index = 0; core_index < _n_cores; core_index++) { if (config.core_type[core_index] == CoreType::WS_MESH) { - spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB, Systolic array per core: {}", - core_index, config.core_freq , config.sram_size, config.num_systolic_array_per_core); + spdlog::info("[Config/Core] Core {}: {} MHz, Systolic array per core: {}", + core_index, config.core_freq_mhz, config.num_systolic_array_per_core); _cores.at(core_index) = std::make_unique(core_index, _config); } else if(config.core_type[core_index] == CoreType::STONNE) { - spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq); + spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq_mhz); _cores.at(core_index) = std::make_unique(core_index, _config); } else { throw std::runtime_error(fmt::format("Not implemented Core type {} ", @@ -51,7 +51,7 @@ Simulator::Simulator(SimulationConfig config) } // Create interconnect object - spdlog::info("[Config/Interconnect] Inerconnect freq: {} MHz", config.icnt_freq); + spdlog::info("[Config/Interconnect] Interconnect freq: {} MHz", config.icnt_freq_mhz); if (config.icnt_type == IcntType::SIMPLE) { spdlog::info("[Config/Interconnect] SimpleInerconnect selected"); _icnt = std::make_unique(config); @@ -62,10 +62,10 @@ Simulator::Simulator(SimulationConfig config) spdlog::error("[Configuration] Invalid interconnect type...!"); exit(EXIT_FAILURE); } - _icnt_interval = config.icnt_print_interval; + _icnt_interval = config.icnt_stats_print_period_cycles; // Initialize Scheduler - for (int i=0; i(Scheduler(config, &_core_cycles, &_core_time, i))); } @@ -117,11 +117,11 @@ void Simulator::icnt_cycle() { mem_fetch 
*front = _cores[core_id]->top_memory_request(); front->set_core_id(core_id); if (!_icnt->is_full(port_id, front)) { - //int node_id = _dram->get_channel_id(front) / 16; - //if (core_id == node_id) - // _cores[core_id]->inc_numa_hit(); - //else - // _cores[core_id]->inc_numa_miss(); + int node_id = _dram->get_channel_id(front) / _config.dram_channels_per_partitions; + if (core_id == node_id) + _cores[core_id]->inc_numa_local_access(); + else + _cores[core_id]->inc_numa_remote_access(); _icnt->push(port_id , get_dest_node(front), front); _cores[core_id]->pop_memory_request(); _nr_from_core++; @@ -229,7 +229,7 @@ void Simulator::cycle() { if (IS_ICNT_CYCLE(_cycle_mask)) icnt_cycle(); } - spdlog::info("Simulation Finished"); + spdlog::info("Simulation finished"); for (auto &core: _cores) { core->check_tag(); } @@ -291,5 +291,5 @@ void Simulator::print_core_stat() for (int core_id = 0; core_id < _n_cores; core_id++) { _cores[core_id]->print_stats(); } - spdlog::info("Total execution cycle: {}", _core_cycles); -} \ No newline at end of file + spdlog::info("Total execution cycles: {}", _core_cycles); +} diff --git a/PyTorchSimBackend/src/SparseCore.cc b/TOGSim/src/SparseCore.cc similarity index 86% rename from PyTorchSimBackend/src/SparseCore.cc rename to TOGSim/src/SparseCore.cc index 64d3da55..d5629b9c 100644 --- a/PyTorchSimBackend/src/SparseCore.cc +++ b/TOGSim/src/SparseCore.cc @@ -27,14 +27,14 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) } Config stonneConfig = stonneCores.at(0)->getStonneConfig(); - unsigned int core_freq = config.core_freq; // MHz; + unsigned int core_freq_mhz = config.core_freq_mhz; // MHz; num_ms = stonneConfig.m_MSNetworkCfg.ms_size; r_port_nr = config.num_stonne_port; w_port_nr = config.num_stonne_port; - double compute_throughput = static_cast(num_ms) * core_freq / 1e3; // FLOPs/sec - double dn_bandwidth = static_cast(r_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s - double 
rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s + double compute_throughput = static_cast(num_ms) * core_freq_mhz / 1e3; // FLOPs/sec + double dn_bandwidth = static_cast(r_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s + double rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s for (int i=0; i tile) { } } if (selected_core_idx == -1) { - spdlog::error("[StonneCore {}] Faield to issue tile", _id); + spdlog::error("[StonneCore {}] Failed to issue tile", _id); exit(1); } stonneCores.at(selected_core_idx)->init(1); @@ -84,7 +84,7 @@ void SparseCore::issue(std::shared_ptr tile) { setTraceMode(selected_core_idx, is_trace_mode); percore_tiles.at(selected_core_idx).push_back(tile); coreBusy.at(selected_core_idx) = true; - spdlog::info("[StonneCore {}][{}] issued new tile (trace_mode: {})", _id, selected_core_idx, is_trace_mode); + spdlog::info("[{}][StonneCore {}/{}][Launch] New operation (trace_mode: {})", _core_cycle, _id, selected_core_idx, is_trace_mode); }; bool SparseCore::can_issue(const std::shared_ptr& op) { @@ -100,8 +100,8 @@ void SparseCore::checkStatus(uint32_t subcore_id) { int new_status = stonneCore->getMCFSMStats(); int compute_cycle = stonneCore->getMSStats().n_multiplications; if (traceCoreStatus.at(subcore_id) != new_status) { - spdlog::trace("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}", - _id, _core_cycle, traceCoreStatus.at(subcore_id), new_status, + spdlog::trace("[{}][StonneCore {}/{}][Transition] status {} -> {}, Load/Store: {}/{}, compute_cycle: {}", + _core_cycle, _id, subcore_id, traceCoreStatus.at(subcore_id), new_status, traceLoadTraffic.at(subcore_id).size(), traceStoreTraffic.at(subcore_id).size(), (compute_cycle - traceCoreCycle.at(subcore_id))/num_ms); if (traceLoadTraffic.at(subcore_id).size()) { TraceNode load_node = TraceNode(traceNodeList.at(subcore_id).size()+2, 
"load", TraceNode::StonneTraceLoad); @@ -151,14 +151,14 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { traceStoreTraffic.at(subcore_id).insert(target_addr); break; default: - spdlog::error("[SparseCore] Invalid request type from core"); + spdlog::error("[StonneCore] Invalid request type from core"); return; } req->request_time = _core_cycle; req->stonneId = subcore_id; std::tuple key = std::make_tuple(target_addr, acc_type, type, allocTrafficID()); registerMemfetch(key, [this, req, acc_type, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + spdlog::trace("[{}][StonneCore][DRAM Response] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ _core_cycle, _core_cycle - req->request_time, req->getAddress(), int(req->getcmd()), _config.dram_req_size); req->setReply(); stonneCores.at(req->stonneId)->pushResponse(req); @@ -168,7 +168,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { /* Finish stonne core */ if (coreBusy.at(subcore_id) && stonneCore->isFinished()) { stonneCore->finish(); - spdlog::info("[SparseCore][{}] Operation finished at {}", _id, _core_cycle); + spdlog::info("[{}][StonneCore {}/{}][Finish] Operation done", _core_cycle, _id, subcore_id); std::shared_ptr target_tile = percore_tiles.at(subcore_id).front(); SST_STONNE::StonneOpDesc *opDesc = static_cast(target_tile->get_custom_data()); if (opDesc->trace_path != "") @@ -239,7 +239,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_R; auto type = mf_type::READ_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -247,8 +247,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { 
std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -260,7 +260,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_W; auto type = mf_type::WRITE_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -268,8 +268,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -285,7 +285,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { inst->finish_cycle = _core_cycle + inst->get_compute_cycle(); else inst->finish_cycle = 
target_pipeline.back()->finish_cycle + inst->get_compute_cycle(); - spdlog::trace("[Core {}][{}][{}] {} ISSUED, finsh at {}", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}, finsh at {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode()), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -313,7 +313,7 @@ void SparseCore::cycle() { for (auto& req_pair : request_merge_table) { _request_queue.push(req_pair.second); request_merge_table.erase(req_pair.first); - spdlog::debug("[SparseCore][{}][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ + spdlog::debug("[{}][StonneCore][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ _core_cycle, _id, req_pair.second->get_addr(), int(req_pair.second->get_access_type()), int(req_pair.second->get_type()), _config.dram_req_size, nr_request); nr_request++; @@ -366,9 +366,9 @@ void SparseCore::print_current_stats() { } cycle_type nr_mul = percore_stat.at(i).n_multiplications; percore_stat.at(i).reset(); - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } void SparseCore::print_stats() { @@ -383,9 +383,9 @@ void SparseCore::print_stats() { percore_total_stat.at(i) += percore_stat.at(i); } cycle_type nr_mul = percore_total_stat.at(i).n_multiplications; - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } std::shared_ptr SparseCore::pop_finished_tile() { @@ -399,18 +399,18 @@ 
std::shared_ptr SparseCore::pop_finished_tile() { void SparseCore::finish_instruction(std::shared_ptr& inst) { if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already finished!!", _id, _core_cycle, + spdlog::error("[{}][StonneCore {}][Error] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::info("[StonneCore {}][{}] {} FINISHED", - _id, _core_cycle, opcode_to_string(inst->get_opcode())); + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", + _core_cycle, _id, opcode_to_string(inst->get_opcode())); } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) { - spdlog::info("[StonneCore {}][{}] {} FINISHED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode())); } } @@ -460,5 +460,5 @@ void SparseCore::dumpTrace(int stonne_core_id, const std::string& path) { outFile << traceNodeList.at(stonne_core_id)[i]; } outFile << "\n}" << std::endl; - spdlog::info("[StonneCore] Success to save trace dump file to \"{}\"", path); + spdlog::info("[{}][StonneCore] Success to save trace dump file to \"{}\"", _core_cycle, path); } diff --git a/PyTorchSimBackend/src/Tile.cc b/TOGSim/src/Tile.cc similarity index 100% rename from PyTorchSimBackend/src/Tile.cc rename to TOGSim/src/Tile.cc diff --git a/PyTorchSimBackend/src/TileGraph.cc b/TOGSim/src/TileGraph.cc similarity index 96% rename from PyTorchSimBackend/src/TileGraph.cc rename to TOGSim/src/TileGraph.cc index 33e995e9..120d49e2 100644 --- a/PyTorchSimBackend/src/TileGraph.cc +++ b/TOGSim/src/TileGraph.cc @@ -111,7 +111,6 @@ void TileGraph::allocate_subgraph(int core_id, int slot_id) { for (auto it = _subgraph_vec.begin(); it 
!= _subgraph_vec.end(); ++it) { if ((*it)->get_core_id() == -1 || (*it)->get_core_id() == core_id) { - spdlog::trace("[TileGraph] Core {} allocated new subgraph(affinity={}) (remains: {})", core_id, (*it)->get_core_id(), _subgraph_vec.size()-1); std::shared_ptr subgraph = *it; _cpu_graph_map[core_id][slot_id] = subgraph; _subgraph_vec.erase(it); diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc similarity index 98% rename from PyTorchSimBackend/src/TileGraphParser.cc rename to TOGSim/src/TileGraphParser.cc index 4a562724..42776a51 100644 --- a/PyTorchSimBackend/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -627,9 +627,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa } } } - /* Set last instruction's free sram size */ - if(parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); parent->append_child(child); /* Create new tile */ @@ -682,11 +679,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa tile_vec.back()->inc_required_sram_size(inst->get_tile_numel() * inst->get_precision()); } - /* Set last instruction's free sram size */ - std::shared_ptr parent = tile_vec.back(); - if (parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); - return tile_vec; } diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.cc b/TOGSim/src/helper/CommandLineParser.cc similarity index 100% rename from PyTorchSimBackend/src/helper/CommandLineParser.cc rename to TOGSim/src/helper/CommandLineParser.cc diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.h b/TOGSim/src/helper/CommandLineParser.h similarity index 100% rename from PyTorchSimBackend/src/helper/CommandLineParser.h rename to TOGSim/src/helper/CommandLineParser.h diff --git a/PyTorchSimBackend/src/main.cc b/TOGSim/src/main.cc similarity index 95% rename from PyTorchSimBackend/src/main.cc rename to 
TOGSim/src/main.cc index 214e7131..1af11257 100644 --- a/PyTorchSimBackend/src/main.cc +++ b/TOGSim/src/main.cc @@ -9,7 +9,7 @@ namespace fs = std::filesystem; namespace po = boost::program_options; -const char* env_value = std::getenv("BACKENDSIM_DRYRUN"); +const char* env_value = std::getenv("TOGSIM_DRYRUN"); bool isDryRun = (env_value != nullptr && std::string(env_value) == "1"); void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) { @@ -38,7 +38,7 @@ int until(Simulator *simulator, cycle_type until_cycle) { void interactive_mode(Simulator* simulator) { std::string command; - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; while (std::getline(std::cin, command)) { std::istringstream iss(command); @@ -79,7 +79,7 @@ void interactive_mode(Simulator* simulator) { spdlog::error("Error: unknown command {} Available commands are: launch, until, quit.", token); } if (isDryRun) - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; } simulator->cycle(); if (simulator->get_core_cycle()==0) @@ -149,6 +149,6 @@ int main(int argc, char** argv) { /* Simulation time measurement */ auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; - spdlog::info("Simulation time: {:2f} seconds", duration.count()); + spdlog::info("Wall-clock time for simulation: {:2f} seconds", duration.count()); return 0; } diff --git a/PyTorchSimBackend/src/scheduler/Scheduler.cc b/TOGSim/src/scheduler/Scheduler.cc similarity index 100% rename from PyTorchSimBackend/src/scheduler/Scheduler.cc rename to TOGSim/src/scheduler/Scheduler.cc diff --git a/experiments/BERT.py b/experiments/BERT.py index 3534505d..c5bb454e 100644 --- a/experiments/BERT.py +++ b/experiments/BERT.py @@ -9,7 +9,7 
@@ def run_BERT(size, input_seq, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request # from tests.test_transformer import EncoderBlock from tests.Fusion.test_transformer_fusion import EncoderBlock - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() hidden_dim = {'base': 768, 'large': 1024, 'xlarge': 2048} @@ -36,7 +36,7 @@ def run_BERT(size, input_seq, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def run_BERT(size, input_seq, config): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_BERT(size, input_seq, config) diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index a32cd0a6..28e6ad5e 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -export TORCHSIM_CONFIG=$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export 
TORCHSIM_CONFIG=$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR diff --git a/experiments/artifact/cycle_validation/summary_cycle.py b/experiments/artifact/cycle_validation/summary_cycle.py index 529d0161..c0f48ac3 100644 --- a/experiments/artifact/cycle_validation/summary_cycle.py +++ b/experiments/artifact/cycle_validation/summary_cycle.py @@ -88,7 +88,7 @@ def compute_mae(errors): name = file[:-4] with open(full_path, errors="ignore") as f: for line in f: - match = re.search(r"Total execution cycle:\s*([0-9]+)", line) + match = re.search(r"Total execution cycles:\s*([0-9]+)", line) if match: cycle_map[name] = int(match.group(1)) break diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh index 7d0c0da2..2b9625e9 100755 --- a/experiments/artifact/speedup/run_speedup.sh +++ b/experiments/artifact/speedup/run_speedup.sh @@ -1,7 +1,7 @@ #!/bin/bash LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs -CONFIG_DIR="$TORCHSIM_DIR/PyTorchSimBackend/configs" -SIMULATOR_BIN="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +CONFIG_DIR="$TORCHSIM_DIR/TOGSim/configs" +SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" configs=( "systolic_ws_128x128_c2_simple_noc_tpuv3.json" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh index 66829f02..4055b355 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh @@ -26,7 +26,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git 
a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh index 2f9718f1..83b3798a 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh @@ -27,7 +27,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh index 8ff7e2b6..f1467614 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh @@ -25,7 +25,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh index aa35735c..2ed3ca2a 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh @@ -33,7 +33,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/attention.py b/experiments/attention.py index e8f89dac..5a8c5f45 100644 --- 
a/experiments/attention.py +++ b/experiments/attention.py @@ -14,7 +14,7 @@ def attention(query, key, value): p_attn = scores.softmax(dim=-2) return torch.matmul(value.transpose(-1, -2), p_attn) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() query = torch.randn(size).to(device=device) key = torch.randn(size).to(device=device) @@ -36,7 +36,7 @@ def attention(query, key, value): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -50,7 +50,7 @@ def attention(query, key, value): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_attention(size, config) diff --git a/experiments/conv.py b/experiments/conv.py index e8b97906..c8ca9a37 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -15,7 +15,7 @@ def custom_conv2d(a, b, bias): conv2d.weight = torch.nn.Parameter(b) # conv2d.bias = torch.nn.Parameter(bias) return conv2d(a) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, 
engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() conv_input = torch.randn(batch_size, i_c, i_h, i_w).to(memory_format=torch.channels_last, device=device) conv_kernel = torch.randn(o_c, i_c, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device) @@ -37,7 +37,7 @@ def custom_conv2d(a, b, bias): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def custom_conv2d(a, b, bias): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_conv2d(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], config) \ No newline at end of file diff --git a/experiments/gemm.py b/experiments/gemm.py index a1fdcff6..67dc4f79 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -10,7 +10,7 @@ def run_matmul(input_size, hidden_size, output_size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request def custom_matmul(a, b): return torch.matmul(a, b) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() torch.manual_seed(0) 
input = torch.randn(input_size, hidden_size).to(device=device) @@ -31,7 +31,7 @@ def custom_matmul(a, b): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -45,10 +45,10 @@ def custom_matmul(a, b): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() run_matmul(size[0], size[1], size[2], config) diff --git a/experiments/layernorm.py b/experiments/layernorm.py index f149394e..0beaac6c 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -8,7 +8,7 @@ def run_layernorm(size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) opt_fn = torch.compile(dynamic=False)(torch.nn.LayerNorm(size[-1]).to(device=device)) @@ -27,7 +27,7 @@ def run_layernorm(size, config): import os import sys base_dir = 
os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -42,7 +42,7 @@ def run_layernorm(size, config): os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_layernorm(size, config) diff --git a/experiments/resnet18.py b/experiments/resnet18.py index 5d9dcf86..23d62e40 100644 --- a/experiments/resnet18.py +++ b/experiments/resnet18.py @@ -8,7 +8,7 @@ def run_resnet(batch, config): from torchvision.models import resnet18 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet18().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] 
# extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/resnet50.py b/experiments/resnet50.py index bd52afc1..60a46071 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -8,7 +8,7 @@ def run_resnet(batch, config): from torchvision.models import resnet50 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet50().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: 
+ del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/softmax.py b/experiments/softmax.py index 14d28fee..532ef091 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -8,7 +8,7 @@ def run_softmax(size, config, dim=1): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) opt_fn = torch.compile(dynamic=False)(torch.nn.Softmax(dim=dim).to(device=device)) @@ -27,7 +27,7 @@ def run_softmax(size, config, dim=1): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -41,7 +41,7 @@ def run_softmax(size, config, dim=1): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_softmax(size, config) diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh index 469cf766..22118b1e 100644 --- a/scripts/CompilerOpt_experiment/DMAopt.sh +++ b/scripts/CompilerOpt_experiment/DMAopt.sh @@ -1,5 +1,5 @@ #!/bin/bash -export 
TORCHSIM_CONFIG="/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" +export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" # None FG DMA export TORCHSIM_SUBTILE=0 diff --git a/scripts/ILS_experiment/test_matmul.py b/scripts/ILS_experiment/test_matmul.py index 09cc407d..667dfc66 100644 --- a/scripts/ILS_experiment/test_matmul.py +++ b/scripts/ILS_experiment/test_matmul.py @@ -60,7 +60,7 @@ def custom_matmul(bias, a, b): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul(device, *shape) diff --git a/scripts/build_from_source.sh b/scripts/build_from_source.sh new file mode 100644 index 00000000..fb9e82e3 --- /dev/null +++ b/scripts/build_from_source.sh @@ -0,0 +1,22 @@ +#!/bin/bash +home="/workspace" +cd $home + +# Gem5 +apt -y update && apt -y upgrade && apt -y install scons +git clone https://github.com/PSAL-POSTECH/gem5.git +cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) +export GEM5_PATH=$home/gem5/build/RISCV/gem5.opt +cd $home + +# LLVM +git clone https://github.com/PSAL-POSTECH/llvm-project.git +cd llvm-project && mkdir build && cd build && \ + cmake -DLLVM_ENABLE_PROJECTS=mlir -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/riscv-llvm -DLLVM_TARGETS_TO_BUILD=RISCV -G "Unix Makefiles" ../llvm && \ + make -j && make install +cd $home + +# Spike Simulator +git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \ + ../configure --prefix=$RISCV && make -j && make install +cd $home \ No newline at end of file diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh index 3dfba3d9..2989e4fd 100755 --- a/scripts/chiplet.sh +++ 
b/scripts/chiplet.sh @@ -14,16 +14,16 @@ fi GEMM_PATH="$1" INDEX_NAME="$2" -SIMULATOR_PATH="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +SIMULATOR_PATH="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" GEMM_DIR_NAME=$(basename "$GEMM_PATH") echo "GEMM Directory Name: $GEMM_DIR_NAME" CONFIG_LIST=( - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" ) CONFIG_LIST2=( - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" ) shift shift @@ -51,7 +51,7 @@ for CONFIG in "${CONFIG_LIST[@]}"; do # Run Simulator echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done done @@ -65,6 +65,6 @@ for CONFIG in "${CONFIG_LIST2[@]}"; do # Run Simulator # echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done wait \ No newline at end of file diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py index 168532f1..32f7ad50 100644 --- a/scripts/chiplet_prep.py +++ 
b/scripts/chiplet_prep.py @@ -61,8 +61,8 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() parser = argparse.ArgumentParser(description='Process folder argument.') parser.add_argument('size', type=int, help='Folder value', default=256) diff --git a/scripts/end2end.sh b/scripts/end2end.sh index 7ca5c93d..579b8c14 100755 --- a/scripts/end2end.sh +++ b/scripts/end2end.sh @@ -7,34 +7,34 @@ BASE_PATH=$1 # Input as the first argument total_sum=0 total_core=0 total_vector=0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - # echo "Processing folder: $backend_folder" +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + # echo "Processing folder: $togsim_folder" - # Find all files within the backendsim_result folder - mapfile -t files < <(find "$backend_folder" -type f) + # Find all files within the togsim_result folder + mapfile -t files < <(find "$togsim_folder" -type f) for file in "${files[@]}"; do # echo "Processing $file" - # Extract the last line containing "Total cycle" - total_cycle=$(grep "Total cycle" "$file" | tail -n 1 | sed -E 's/.*Total cycle ([0-9]+).*/\1/') + # Extract the last line containing "Total_cycles" + total_cycle=$(grep "Total_cycles" "$file" | tail -n 1 | sed -E 's/.*Total_cycles ([0-9]+).*/\1/') # echo "total_cycle: $total_cycle" - active_cycles=($(grep -o 'active cycle [0-9]*' "$file" | awk 
'{print $3}')) + active_cycles=($(grep -o 'active_cycles [0-9]*' "$file" | awk '{print $3}')) num_cycles=${#active_cycles[@]} if [ "$num_cycles" -ge 3 ]; then core_cycle=${active_cycles[$((num_cycles-3))]} else - echo "Error: cannot find core active cycle" + echo "Error: cannot find core active_cycles" fi if [[ "$num_cycles" -ge 1 ]]; then - # Extract the last two active cycles + # Extract the last two active_cycless vector_core_cycle=${active_cycles[$((num_cycles-1))]} else - echo "Error: cannot find vector core active cycle" + echo "Error: cannot find vector core active_cycles" fi echo "file: $file total_cycle: $total_cycle SA core_cycle: $core_cycle vector_core_cycle: $vector_core_cycle" diff --git a/scripts/get_tog_result.sh b/scripts/get_tog_result.sh index 9359e1e5..6fd399e0 100755 --- a/scripts/get_tog_result.sh +++ b/scripts/get_tog_result.sh @@ -3,8 +3,8 @@ total_cycles=0 # Read through input stream line by line while IFS= read -r line; do - # Check if the line contains both "[BackendSimulator]" and "stored" - if [[ "$line" == *"[BackendSimulator]"* && "$line" == *"stored"* ]]; then + # Check if the line contains both "[TOGSimulator]" and "stored" + if [[ "$line" == *"[TOGSimulator]"* && "$line" == *"stored"* ]]; then # Extract the file path from the line file_path=$(echo "$line" | sed -n 's/.*stored to "\(.*\)"$/\1/p') diff --git a/scripts/sim_time.sh b/scripts/sim_time.sh index 15c60736..95df5982 100755 --- a/scripts/sim_time.sh +++ b/scripts/sim_time.sh @@ -6,15 +6,15 @@ BASE_PATH=$1 # Input as the first argument # Initialize total_sum as string for awk processing total_sum=0.0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - mapfile -t files < <(find "$backend_folder" 
-type f) +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + mapfile -t files < <(find "$togsim_folder" -type f) for file in "${files[@]}"; do - sim_time=$(grep "Simulation time:" "$file" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+(\.[0-9]+)?).*/\1/') + sim_time=$(grep "Wall-clock time for simulation:" "$file" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+(\.[0-9]+)?).*/\1/') echo "file: $file total_cycle: $sim_time" if [[ -n "$sim_time" ]]; then diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh index 0b7bc6f5..94e00527 100755 --- a/scripts/sparsity_experiment/run.sh +++ b/scripts/sparsity_experiment/run.sh @@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8 export TORCHSIM_FORCE_TIME_N=8 OUTPUT_DIR="12GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -13,7 +13,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -21,7 +21,7 @@ python3 
/workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="12GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py 
--sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -45,7 +45,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py index 2f184f4c..be30795b 100644 --- a/scripts/stonne_experiment2/tog_gen.py +++ b/scripts/stonne_experiment2/tog_gen.py @@ -5,7 +5,7 @@ from collections import defaultdict sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) from AsmParser.tog_generator import tog_generator -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def extract_simulation_stats(result_path): @@ -19,9 +19,9 @@ def extract_simulation_stats(result_path): for line in lines: if "nr_multiplications" in line: nr_multiplications = line.strip().split(":")[-1].strip() - elif "Total execution cycle" in line: + elif "Total execution cycles" in line: total_cycle = line.strip().split(":")[-1].strip() - elif "Simulation time" in line: + elif "Wall-clock time for simulation" in line: sim_time = line.strip().split(":")[-1].replace("seconds", "").strip() return nr_multiplications, total_cycle, sim_time @@ -71,9 +71,9 @@ def extract_simulation_stats(result_path): if "outerPro" in path: continue tog_path = os.path.join(path, "tile_graph.onnx") - backend_path = 
os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json' - backsim = BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_validation_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = backsim.simulation(tog_path) nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path) sim_time, total_cycle = float(sim_time), int(total_cycle) diff --git a/test_extension_backend.py b/test_extension_backend.py index f0a9353a..5e6427ef 100644 --- a/test_extension_backend.py +++ b/test_extension_backend.py @@ -22,8 +22,8 @@ from tests.Fusion.test_matmul_activation import test_matmul_activation if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_vectoradd(device, (47, 10)) #test_vector_scalar_add(device, (10, 10)) diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py index 03d1b721..c5170209 100644 --- a/tests/Diffusion/test_diffusion.py +++ b/tests/Diffusion/test_diffusion.py @@ -553,8 +553,8 @@ def test_upsample2d( args = parser.parse_args() sys.path.append(os.environ.get("TORCHSIM_DIR", "/workspace/PyTorchSim")) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_upsample2d(device) diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py index a5e05182..ef753a67 100644 --- 
a/tests/Fusion/test_addmm_residual.py +++ b/tests/Fusion/test_addmm_residual.py @@ -43,8 +43,8 @@ def addmm_residual(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_addmm_residual(device, 32, 32, 32) test_addmm_residual(device, 128, 128, 128) diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py index 95bdf165..123376d1 100644 --- a/tests/Fusion/test_attention_fusion.py +++ b/tests/Fusion/test_attention_fusion.py @@ -75,8 +75,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_MHA(device) # test_Attention(device, head=16, seq=512, d_k=64) diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/Fusion/test_bmm_reduction.py index 42e38095..4f4d3ad6 100644 --- a/tests/Fusion/test_bmm_reduction.py +++ b/tests/Fusion/test_bmm_reduction.py @@ -42,8 +42,8 @@ def bmm(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_bmm_reduce(device) test_bmm_reduce(device, 12, 512) diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py index 42210b13..694f3bb9 100644 --- a/tests/Fusion/test_conv_fusion.py +++ b/tests/Fusion/test_conv_fusion.py @@ -101,8 
+101,8 @@ def custom_conv_bn_relu(a, b, bias, c, d, e, f): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() # Vanila test diff --git a/tests/Fusion/test_matmul_activation.py b/tests/Fusion/test_matmul_activation.py index 2381bd8c..2f1d014f 100644 --- a/tests/Fusion/test_matmul_activation.py +++ b/tests/Fusion/test_matmul_activation.py @@ -73,8 +73,8 @@ def test_matmul_activation(device, batch_size=16, input_size=32, output_size=8, import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul_activation(device) test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid") diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py index 31ea1b0d..df8cf969 100644 --- a/tests/Fusion/test_matmul_reduction.py +++ b/tests/Fusion/test_matmul_reduction.py @@ -89,8 +89,8 @@ def matmul_fused(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul_reduce(device, 3072, 512, 768) test_matmul_var_mean(device) diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/Fusion/test_matmul_scalar.py index 0dcb54f9..0815bb90 100644 --- a/tests/Fusion/test_matmul_scalar.py +++ b/tests/Fusion/test_matmul_scalar.py @@ -39,7 +39,7 @@ 
def matmul_fused(a, b, c): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul_scalar(device) diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py index 797f9e76..b27312a9 100644 --- a/tests/Fusion/test_prologue_fusion.py +++ b/tests/Fusion/test_prologue_fusion.py @@ -88,8 +88,8 @@ def bmm(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_elem_broadcast_fusion(device) test_elem_fusion(device) diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py index 0e500b5b..b1cceb2c 100644 --- a/tests/Fusion/test_transformer_fusion.py +++ b/tests/Fusion/test_transformer_fusion.py @@ -203,8 +203,8 @@ def test_EncoderBlock_validation(head=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_MHA(device) test_EncoderBlock(device) diff --git a/tests/MLP/test_mlp.py b/tests/MLP/test_mlp.py index 6f6c9444..31bcefdf 100644 --- a/tests/MLP/test_mlp.py +++ b/tests/MLP/test_mlp.py @@ -281,9 +281,9 @@ def train(model, device): return if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine + from Scheduler.scheduler import PyTorchSimRunner 
torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = ExecutionEngine.setup_device() + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_mlp(device) diff --git a/tests/MLP/test_mlp_cpu.py b/tests/MLP/test_mlp_cpu.py index 49f44650..112f5d07 100644 --- a/tests/MLP/test_mlp_cpu.py +++ b/tests/MLP/test_mlp_cpu.py @@ -399,7 +399,6 @@ def train(model, device): if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine # torch.set_printoptions(threshold=float('inf'), linewidth=600) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py index aa1af651..6a7747f7 100644 --- a/tests/Mixtral_8x7B/test_attention.py +++ b/tests/Mixtral_8x7B/test_attention.py @@ -163,8 +163,8 @@ def test_rmsnorm(device, seq=32): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_rmsnorm(device, seq=1) test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2) diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index c5ab8107..ae16f0b0 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -783,9 +783,9 @@ def evaluation(model, evaluation_loader): train(opt_model, train_loader) if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine + from Scheduler.scheduler import PyTorchSimRunner torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = ExecutionEngine.setup_device() + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_moe(device) diff --git a/tests/test_activation.py b/tests/test_activation.py index de3542c3..575fc7e8 100644 --- a/tests/test_activation.py +++ 
b/tests/test_activation.py @@ -88,8 +88,8 @@ def test_SwiGLU(device, size=(128, 128)): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_ReLU(device, (47, 10)) test_ReLU(device, (128, 128)) diff --git a/tests/test_add.py b/tests/test_add.py index 5e1ab15e..118632d5 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -58,8 +58,8 @@ def vectoradd(a, b): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_vectoradd(device, (1, 1)) test_vectoradd(device, (47, 10)) diff --git a/tests/test_batchnorm.py b/tests/test_batchnorm.py index f7abacf5..251805f5 100644 --- a/tests/test_batchnorm.py +++ b/tests/test_batchnorm.py @@ -37,8 +37,8 @@ def test_BatchNorm(device, size=(1, 16, 64, 64)): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_BatchNorm(device) test_BatchNorm(device, size=(1,64, 32, 32)) diff --git a/tests/test_bmm.py b/tests/test_bmm.py index 6d9279aa..d90410db 100644 --- a/tests/test_bmm.py +++ b/tests/test_bmm.py @@ -46,8 +46,8 @@ def bmm(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + 
module = PyTorchSimRunner.setup_device() device = module.custom_device() test_BMM(device) test_BMM(device, 2, 256, 128, 256) diff --git a/tests/test_cnn.py b/tests/test_cnn.py index aaad2836..54225747 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -53,7 +53,7 @@ def test_CNN(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_CNN(device) diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py index cf0dc1bb..c32b4364 100644 --- a/tests/test_compile_overhead.py +++ b/tests/test_compile_overhead.py @@ -21,7 +21,7 @@ # shutil.rmtree("/tmp/torchinductor") #except FileNotFoundError: # print("no cache") - scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py index 21bbfec7..e964319d 100644 --- a/tests/test_conv2d.py +++ b/tests/test_conv2d.py @@ -40,8 +40,8 @@ def custom_conv2d(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = 
PyTorchSimRunner.setup_device() device = module.custom_device() torch._dynamo.config.cache_size_limit = 64 test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0) diff --git a/tests/test_exponent.py b/tests/test_exponent.py index c95823cb..e60f8407 100644 --- a/tests/test_exponent.py +++ b/tests/test_exponent.py @@ -31,7 +31,7 @@ def exponent(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_exponent(device, size=(32, 32)) diff --git a/tests/test_hetro.py b/tests/test_hetro.py index 5e36d730..557ea5d6 100644 --- a/tests/test_hetro.py +++ b/tests/test_hetro.py @@ -26,7 +26,7 @@ def custom_matmul(a, b): K = args.K sparsity = args.sparsity mode = args.mode - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" print("M: ", M) print("N: ", N) @@ -36,7 +36,7 @@ def custom_matmul(a, b): with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - backend_config=config_path) + togsim_config=config_path) # Register compiled model opt_model1 = torch.compile(custom_matmul) diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py index b7b20074..c6afaf86 100644 --- a/tests/test_indirect_access.py +++ b/tests/test_indirect_access.py @@ -48,8 +48,8 @@ def test_embedding(device, vocab_size, dim): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = 
module.custom_device() test_indirect_vectoradd(device) #test_embedding(device, 1024, 2048) \ No newline at end of file diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py index 1cea9d9f..28e38d37 100644 --- a/tests/test_layernorm.py +++ b/tests/test_layernorm.py @@ -41,8 +41,8 @@ def test_LayerNorm(device, size=(64, 64)): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_LayerNorm(device) test_LayerNorm(device, shape) diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 6f41468b..cd30bd30 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -94,8 +94,8 @@ def custom_linear(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul(device, 32, 32, 32) test_matmul(device, 128, 128, 128) diff --git a/tests/test_mlp.py b/tests/test_mlp.py index b8118aa3..423d6e8e 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -109,8 +109,8 @@ def test_optimizer(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_mlp(device) test_mlp_inf(device, batch_size=1, input_size=256, hidden_size=512, output_size=256) diff --git a/tests/test_pool.py b/tests/test_pool.py index 304a5e7c..f5505dba 100644 --- a/tests/test_pool.py +++ b/tests/test_pool.py @@ -47,8 
+47,8 @@ def avgpool(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_maxpool(device, b=1, c=8, h=16, w=16) #test_maxpool(device, b=1, c=8, h=112, w=112) diff --git a/tests/test_reduce.py b/tests/test_reduce.py index e1a84b7f..4781112d 100644 --- a/tests/test_reduce.py +++ b/tests/test_reduce.py @@ -47,8 +47,8 @@ def reduce_sum(a, dim, keepdim): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_reduce_sum(device, (29, 47), 1, keepdim=True) test_reduce_sum(device, (17, 68), 0, keepdim=True) diff --git a/tests/test_resnet.py b/tests/test_resnet.py index 97c60528..c83f13ba 100644 --- a/tests/test_resnet.py +++ b/tests/test_resnet.py @@ -49,7 +49,7 @@ def test_resnet(device, batch=1, model_type='resnet18'): args = args.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_resnet(device, model_type=args.model_type) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index c64093a0..91bf0ad8 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -7,13 +7,13 @@ base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -config = 
f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' target_model1 = model1().eval() target_model2 = model2(768, 12).eval() # Init scheduler -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py index f3b54159..5a34d161 100644 --- a/tests/test_scheduler_batching.py +++ b/tests/test_scheduler_batching.py @@ -17,7 +17,7 @@ target_model1 = model1().eval() # Init scheduler - scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py index c7fdca06..beab1c54 100644 --- a/tests/test_single_perceptron.py +++ b/tests/test_single_perceptron.py @@ -82,7 +82,7 @@ def weight_update(a, b, lr): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from 
Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_single_perceptron(device) diff --git a/tests/test_softmax.py b/tests/test_softmax.py index 9fba41dd..e6e8cc1e 100644 --- a/tests/test_softmax.py +++ b/tests/test_softmax.py @@ -58,8 +58,8 @@ def test_softmax(device, size=(128, 128), dim=1): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_softmax(device, size=(64, 128)) test_softmax(device, size=(64, 128), dim=0) diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py index b2b16818..72eda0c8 100644 --- a/tests/test_sparse_core.py +++ b/tests/test_sparse_core.py @@ -80,9 +80,9 @@ def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, outp import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine + from Scheduler.scheduler import PyTorchSimRunner - module = ExecutionEngine.setup_device() + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_sparse_mlp(device, batch_size=8, input_size=16, hidden_size=32, output_size=64) diff --git a/tests/test_sparsity.py b/tests/test_sparsity.py index 3e079f83..a2493673 100644 --- a/tests/test_sparsity.py +++ b/tests/test_sparsity.py @@ -96,8 +96,8 @@ def test_mlp_inf(device, batch_size=64, input_size=64, hidden_size=32, output_si ) args = parser.parse_args() - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = 
module.custom_device() #test_dec_inf(device, sparsity=args.sparsity, block=args.block) diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py index 1cf0d3b3..c7abf0ae 100644 --- a/tests/test_spmm_scheduler.py +++ b/tests/test_spmm_scheduler.py @@ -25,7 +25,7 @@ output_size = args.output_size w1_sparsity = args.w1_sparsity w2_sparsity = args.w2_sparsity - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" print("batch_size: ", batch_size) print("input_size: ", input_size) @@ -37,7 +37,7 @@ with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - backend_config=config_path) + togsim_config=config_path) target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval() target_model2 = model2(768, 12).eval() diff --git a/tests/test_stonne.py b/tests/test_stonne.py index 5e4fe5fb..04ad05a8 100644 --- a/tests/test_stonne.py +++ b/tests/test_stonne.py @@ -54,7 +54,7 @@ def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128, spa args = parser.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_sparse_mm(device, args.sz, args.sz, args.sz, args.sparsity) \ No newline at end of file diff --git a/tests/test_topk.py b/tests/test_topk.py new file mode 100644 index 00000000..0d5c08ec --- /dev/null +++ b/tests/test_topk.py @@ -0,0 +1,54 @@ +import torch +import torch._dynamo +import torch.utils.cpp_extension + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = 
f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + exit(1) + +def test_topk(device, size=(128, 128), k=5, dim=-1, largest=True, sorted=True): + # dim 해석을 위해 양수 인덱스로 변환 + dim_ = dim if dim >= 0 else (len(size) + dim) + assert 0 <= dim_ < len(size), "dim이 텐서 차원 범위를 벗어났습니다." + assert k <= size[dim_], f"k(={k})는 size[dim](={size[dim_]}) 이하여야 합니다." + + def topk_fn(a): + return torch.topk(a, k, dim=dim, largest=largest, sorted=sorted) + + x = torch.randn(size) + x = x.to(device=device) + + opt_topk = torch.compile(dynamic=False)(topk_fn) + res_values, res_indices = opt_topk(x) + + ref_values, ref_indices = torch.topk(x.cpu(), k, dim=dim, largest=largest, sorted=sorted) + + test_result("TopK/values", res_values, ref_values) + test_result("TopK/indices", res_indices, ref_indices) + +if __name__ == "__main__": + import os + import sys + import argparse + sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) + + parser = argparse.ArgumentParser(description="Run TopK test with dynamic shape") + parser.add_argument('--shape', type=str, default="(512,768)") + args = parser.parse_args() + shape = tuple(map(int, args.shape.strip('()').split(','))) + + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() + device = module.custom_device() + test_topk(device, (128, 128), k=2, dim=-1) \ No newline at end of file diff --git a/tests/test_transcendental.py b/tests/test_transcendental.py index 5f296581..38c2f4f6 100644 --- a/tests/test_transcendental.py +++ b/tests/test_transcendental.py @@ -73,8 +73,8 @@ def cos(a): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + 
from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_tanh(device) test_exp(device) diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 4d45707e..a3ac55d7 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -119,8 +119,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_EncoderBlock(device) # test_Attention(device, head=16, seq=512, d_k=64) diff --git a/tests/test_transpose2D.py b/tests/test_transpose2D.py index 14f16fbb..af5aacf7 100644 --- a/tests/test_transpose2D.py +++ b/tests/test_transpose2D.py @@ -46,8 +46,8 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_Transpose2D(device, [64, 156]) test_Transpose2D_2(device, [16, 64]) diff --git a/tests/test_transpose3D.py b/tests/test_transpose3D.py index 937948c4..d6c1092d 100644 --- a/tests/test_transpose3D.py +++ b/tests/test_transpose3D.py @@ -61,8 +61,8 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_Transpose3D_1(device, [62, 34, 44]) test_Transpose3D_1(device, [62, 134, 144]) diff --git 
a/tests/test_vectorops.py b/tests/test_vectorops.py index 0677b7ae..ed895171 100644 --- a/tests/test_vectorops.py +++ b/tests/test_vectorops.py @@ -6,8 +6,8 @@ import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() # Target shape diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py index a5a31a85..148fe8fa 100644 --- a/tests/test_view3D_2D.py +++ b/tests/test_view3D_2D.py @@ -44,8 +44,8 @@ def view2D_3D(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_view3D_2D(device) test_view3D_2D(device, [12, 512, 64]) diff --git a/tests/test_vit.py b/tests/test_vit.py index 6f587127..aeb4f148 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -202,8 +202,8 @@ def test_encoder_block_with_class_token( shape = tuple(map(int, args.shape.strip('()').split(','))) sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_multihead_attention(device) #test_encoder_block(device, seq_len=197) diff --git a/tutorial/session1/HelloPyTorchSim.ipynb b/tutorial/session1/HelloPyTorchSim.ipynb new file mode 100644 index 00000000..dfb086a4 --- /dev/null +++ b/tutorial/session1/HelloPyTorchSim.ipynb @@ -0,0 +1,1216 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hello, 
PyTorchSim!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import os\n", + "import sys\n", + "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "sys.path.append(base_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One Touch Simulation\n", + "### Normal Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "cpu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/ro/croutbd6yxrzgdstfcplx7yrpn2do5frwhyx2md5r7rvrubdhdgd.py\n", + "[Gem5] Gem5 is running... \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0\"\n" + ] + } + ], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", + " if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", + " message = f\"|{name} Test Passed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " else:\n", + " message = f\"|{name} Test Failed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " print(\"npu out: \", npu_out.cpu())\n", + " print(\"cpu out: \", cpu_out)\n", + " exit(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------\n", + "|MatMul Test Passed|\n", + "--------------------\n" + ] + } + ], + "source": [ + "test_result(\"MatMul\", npu_out, cpu_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# from Scheduler.scheduler import PyTorchSimRunner\n", + "# npu_device = PyTorchSimRunner.setup_device().custom_device()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normal Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + 
"source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "torch.manual_seed(0)\n", + "cpu_input = torch.randn(128, 128).to(device)\n", + "cpu_weight = torch.randn(128, 128).to(device)\n", + "cpu_target = torch.randn(128, 128).to(device)\n", + "cpu_input.requires_grad = True\n", + "cpu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "cpu_out = opt_fn(cpu_input, cpu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "cpu_loss = loss_fn(cpu_out, cpu_target)\n", + "cpu_loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/5i/c5isqyualxbaqsmuhsux7oubvkypfmh4kvamqvgref6z3ypnrpw5.py\n", + "[Gem5] Gem5 is running... \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/19\"\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. 
", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 15\u001b[0m\n\u001b[1;32m 13\u001b[0m loss_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mCrossEntropyLoss()\n\u001b[1;32m 14\u001b[0m npu_loss \u001b[38;5;241m=\u001b[39m loss_fn(npu_out, npu_target)\n\u001b[0;32m---> 15\u001b[0m \u001b[43mnpu_loss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:266\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 261\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 263\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 266\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 267\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + 
"\u001b[0;31mRuntimeError\u001b[0m: 0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. " + ] + } + ], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "npu_device = PyTorchSimRunner.setup_device().custom_device()\n", + "torch.manual_seed(0)\n", + "npu_input = torch.randn(128, 128).to(npu_device)\n", + "npu_weight = torch.randn(128, 128).to(npu_device)\n", + "npu_target = torch.randn(128, 128).to(npu_device)\n", + "npu_input.requires_grad = True\n", + "npu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(npu_input, npu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "npu_loss = loss_fn(npu_out, npu_target)\n", + "npu_loss.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'test_result' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtest_result\u001b[49m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Input Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_input\u001b[38;5;241m.\u001b[39mgrad, cpu_input\u001b[38;5;241m.\u001b[39mgrad)\n\u001b[1;32m 2\u001b[0m test_result(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Weight Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_weight\u001b[38;5;241m.\u001b[39mgrad, cpu_weight\u001b[38;5;241m.\u001b[39mgrad)\n", + "\u001b[0;31mNameError\u001b[0m: name 'test_result' is not defined" + ] + } + ], + "source": [ + "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n", + 
"test_result(\"MatMul Weight Grad\", npu_weight.grad, cpu_weight.grad)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mapping\n", + "\n", + "Default mapping is based on heuristic." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/5z/c5z4ur2k2svn2gaawn776ev3t6gsa7esgu36la63523cqpbbt56d.py\n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n" + ] + } + ], + "source": [ + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:53:14.002] [info] Total execution cycle: 47158\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual Mapping\n", + "User can set tile size manually." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/mv/cmv6cp7oo3wwndv76iv3sib7r74tnbvodfwxi3rw33k7grlh3h4h.py\n", + "[Gem5] Gem5 is running. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running... \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/75hiq5mugpq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "\n", + "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"1\"\n", + "os.environ['TORCHSIM_TILE_M']=\"512\"\n", + "os.environ['TORCHSIM_TILE_N']=\"512\"\n", + "os.environ['TORCHSIM_TILE_K']=\"512\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:54:00.878] [info] Total execution cycle: 53704\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Autotune" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Auto-tune] Trying tile size: [1024, 1024, 256, 128, 1024, 256]\n", + "[Auto-tune] Trying tile size: [256, 1024, 1024, 128, 1024, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 256, 1024, 128, 256, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 1024, 128, 128, 1024, 128]\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to 
\"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/0\"\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/7j33rcic2qn/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/7j33rcic2qn/togsim_result/0\"\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/vsaamplubl5/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/vsaamplubl5/togsim_result/0\"\n", + "[Auto-tune] Optimal tile size: [1024, 1024, 128, 128, 1024, 128], cycles: 46423\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/3b/c3bebp4b4rp73grbvhbaq4xdxny7f5m7fgqkgpflp2cjn3x5uugr.py\n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"0\"\n", + "os.environ['AUTOTUNE_TEMPLATE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:54:53.051] [info] Total execution cycle: 46422\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Execution Mode\n", + "### Functional & Timing mode (Default)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py\n", + "[Gem5] Gem5 is running.. 
\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/4\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "os.environ['AUTOTUNE_TEMPLATE']=\"0\"\n", + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Functional only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"0\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Timing only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 7\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 8\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m 
\u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour 
compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, 
inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m 
\u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"0\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim Configuration\n", + "### Single Core" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[22], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m 
\u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour 
compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, 
inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m 
\u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:32:01.843] [info] Total execution cycle: 47126\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/11 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multi-Core" + ] + }, + { + 
"cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12\"\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:34:48.969] [info] Total execution cycle: 40736\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim log level\n", + "### log level info" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 
7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in 
skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which 
autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 
885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in 
\u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, 
arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_DUMP_PATH']=\"/workspace/PyTorchSim\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### log level trace" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/togsim_result/1\"\n" + ] + } + ], + "source": [ + "os.environ['BACKENDSIM_DEBUG_LEVEL']=\"trace\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torchvision.models import resnet18\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n", + "from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_BACKEND_CONFIG\n", + "\n", + "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=CONFIG_TORCHSIM_BACKEND_CONFIG)\n", + "device = scheduler.execution_engine.module.custom_device()\n", + "\n", + "model = resnet18().eval()\n", + "input = torch.randn(1, 3, 224, 224).to(device=device)\n", + "opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))\n", + "\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_fn)\n", + "request = Request(\"resnet18\", [input], [], request_queue_idx=0)\n", + "scheduler.add_request(request, request_time=0)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " with torch.no_grad():\n", + " scheduler.schedule()\n", + "\n", + "print(\"ResNet18 Simulation Done\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch 
extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 13:05:13.597] [info] [LoadConfig] Success to open \"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 0: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 1: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 0: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 1: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] Ramulator2 config: /root/workspace/PyTorchSim/PyTorchSimBackend/configs/../configs/ramulator2_configs/HBM2.yaml\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] DRAM Bandwidth 716 GB/s, Freq: 700 MHz, Channels: 32, Request_size: 32B\n", + "[2025-11-30 13:05:13.597] [info] [Config/L2Cache] No L2 cache\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] Interconnect freq: 20000 MHz\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] SimpleInerconnect selected\n", + "[0] BackendSim> [Reqest] Resnet18 request time: 0\n", + "[Request issue] partition: 0 batch size: 1\n", + "[Request-0 issue] partition: 0 arrival_time: 0 start_time: 0.0\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py\n", + "launch /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx /tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0 0 0\n", + "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open \"/tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0\"\n", + "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open 
\"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg0 address: 0xa3056c0\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg1 address: 0xc4a3d40\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"systolic_size\": \"128\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"stonneGraph\": \"0\"\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Register graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 at 0\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Tile Graph FIFO Scheduled\n", + "until -1\n", + "[2025-11-30 13:05:22.117] [info] HBM2-CH_0: BW utilization 0% (0 reads, 0 writes)\n", + "[2025-11-30 13:05:22.319] [info] [Scheduler 0] Graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 finish at 2424\n", + "[2025-11-30 13:05:22.319] [info] Total compute time 2424\n", + "cycle\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 33\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# Run scheduler\u001b[39;00m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m scheduler\u001b[38;5;241m.\u001b[39mis_finished():\n\u001b[0;32m---> 33\u001b[0m \u001b[43mscheduler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:475\u001b[0m, in \u001b[0;36mScheduler.schedule\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 473\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcurrent_cycle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbackend_simulator\u001b[38;5;241m.\u001b[39mcycle()\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 475\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnext_time\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:507\u001b[0m, in \u001b[0;36mScheduler.run\u001b[0;34m(self, until_time)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m until_time \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mis_any_idle(req_empty_info):\n\u001b[0;32m--> 507\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mexecute_cycle\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 508\u001b[0m req_empty_info \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest_empty(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion)]\n\u001b[1;32m 509\u001b[0m \u001b[38;5;66;03m# if result is not -1, schedule new request\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:484\u001b[0m, in \u001b[0;36mScheduler.run..execute_cycle\u001b[0;34m()\u001b[0m\n\u001b[1;32m 482\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m 
\u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion):\n\u001b[1;32m 483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mpartition_state[i] \u001b[38;5;241m==\u001b[39m PyTorchSimRunner\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[0;32m--> 484\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_cycle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 485\u001b[0m launch_ret_info\u001b[38;5;241m.\u001b[39mappend(ret)\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcheck_finish_request()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:254\u001b[0m, in \u001b[0;36mPyTorchSimRunner.launch_kernel\u001b[0;34m(self, current_cycle, partion_idx)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[1;32m 253\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx]\n\u001b[0;32m--> 254\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartion_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;241m==\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING:\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:290\u001b[0m, in \u001b[0;36mFIFORunner.select_kernel\u001b[0;34m(self, partition_idx)\u001b[0m\n\u001b[1;32m 287\u001b[0m nested_gen \u001b[38;5;241m=\u001b[39m kernel(\u001b[38;5;241m*\u001b[39minputs)\n\u001b[1;32m 288\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnested_launch_model_dicts[partition_idx] \u001b[38;5;241m=\u001b[39m {req : nested_gen}\n\u001b[1;32m 289\u001b[0m kernel, inputs \u001b[38;5;241m=\u001b[39m \\\n\u001b[0;32m--> 290\u001b[0m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnested_launch_model_dicts\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_idx\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m kernel, inputs\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 293\u001b[0m \u001b[38;5;66;03m# Retry\u001b[39;00m\n", + "File \u001b[0;32m/tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py:227\u001b[0m, in \u001b[0;36mConv2D_1_3_224_22464_3_7_7_2_2_3_3_1_1_3\u001b[0;34m(X, W, Y)\u001b[0m\n\u001b[1;32m 224\u001b[0m W \u001b[38;5;241m=\u001b[39m W\u001b[38;5;241m.\u001b[39mpermute(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mcontiguous() \u001b[38;5;66;03m# (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;66;03m# Launch 
kernel\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[43mmlir_kernel_1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mW\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m (mlir_kernel_1, (X, W, Y))\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:307\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dryrun_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdryrun_simulator\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 307\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfilelock\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FileLock\n\u001b[1;32m 309\u001b[0m lock_dir \u001b[38;5;241m=\u001b[39m get_lock_dir()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/concurrent/futures/_base.py:453\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 453\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 455\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/threading.py:320\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 321\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "from torchvision.models import resnet18\n", + "\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator\n", + "CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "\n", + "lambda_requests = 10\n", + "max_time = 30\n", + "\n", + "target_model1 = resnet18().eval()\n", + "\n", + "# Init scheduler\n", + "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f\"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\")\n", + "# Register compiled model\n", + "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n", + "\n", + "# Generate time stamp\n", + "for 
request_time in poisson_request_generator(lambda_requests, max_time):\n", + " # Init input data\n", + " model_input1 = torch.randn(1, 3, 224, 224)\n", + "\n", + " # Init request\n", + " new_request1 = Request(\"resnet18\", [model_input1], [], request_queue_idx=0)\n", + "\n", + " # Add request to scheduler\n", + " print(\"[Reqest] Resnet18 request time: \", request_time, flush=True)\n", + " scheduler.add_request(new_request1, request_time=request_time)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " scheduler.schedule()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compiler Optimization\n", + "### GeMM + ReLU fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/vr/cvrlybtkuzkk6pmnlfxu7o55375z24tajmiow6mszaen5t4ra6zo.py\n", + "[Gem5] Gem5 is running. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/5o2xythi5z3/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/5o2xythi5z3/togsim_result/0\"\n" + ] + } + ], + "source": [ + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cat: /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0: No such file or directory\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Disable fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/tl/ctlqjsvukam6d4kteerml7exwbt4paw7cjtjbxcwdlsd7e4koriq.py\n", + "[Gem5] Gem5 is running... \n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/37dfo4nczcq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/37dfo4nczcq/togsim_result/0\"\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_COMPILER_OPTIMIZATION']=\"none\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 12:52:49.376] [info] Total execution cycle: 47164\n", + "[2025-11-30 12:52:52.444] [info] Total execution cycle: 58510\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/2 | grep \"Total execution cycle\"\n", + "!cat /tmp/torchinductor/tmp/37dfo4nczcq/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Single kernel mode (TODO: remove it?)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/sympy/core/assumptions.py:499\u001b[0m, in \u001b[0;36mmake_property..getit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_assumptions\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfact\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n", + "\u001b[0;31mKeyError\u001b[0m: 'extended_negative'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m model \u001b[38;5;241m=\u001b[39m resnet18()\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 9\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(model)\n\u001b[0;32m---> 10\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m 
\u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:655\u001b[0m, in \u001b[0;36mcatch_errors_wrapper..catch_errors\u001b[0;34m(frame, cache_entry, frame_state)\u001b[0m\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m hijacked_callback(frame, cache_entry, hooks, frame_state)\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_lock, _disable_current_modes():\n\u001b[0;32m--> 655\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mcallback\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:727\u001b[0m, in \u001b[0;36mconvert_frame.._convert_frame\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 725\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43minner_convert\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 729\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:383\u001b[0m, in \u001b[0;36mconvert_frame_assert.._convert_frame_assert\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 370\u001b[0m signpost_event(\n\u001b[1;32m 371\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 372\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_convert_frame_assert._compile\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 379\u001b[0m },\n\u001b[1;32m 380\u001b[0m )\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config\u001b[38;5;241m.\u001b[39mpatch(_patch_config_if_changed()):\n\u001b[0;32m--> 383\u001b[0m compiled_product \u001b[38;5;241m=\u001b[39m \u001b[43m_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_globals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_locals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_builtins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiler_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport_constraints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mframe_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mframe_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcompile_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_product\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:646\u001b[0m, in \u001b[0;36m_compile\u001b[0;34m(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_context(CompileContext(compile_id)):\n\u001b[1;32m 645\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 646\u001b[0m guarded_code \u001b[38;5;241m=\u001b[39m \u001b[43mcompile_inner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m guarded_code\n\u001b[1;32m 648\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 649\u001b[0m Unsupported,\n\u001b[1;32m 650\u001b[0m TorchRuntimeError,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 657\u001b[0m BisectValidationException,\n\u001b[1;32m 658\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:562\u001b[0m, in \u001b[0;36m_compile..compile_inner\u001b[0;34m(code, one_graph, hooks, transform)\u001b[0m\n\u001b[1;32m 560\u001b[0m CompileContext\u001b[38;5;241m.\u001b[39mget()\u001b[38;5;241m.\u001b[39mattempt \u001b[38;5;241m=\u001b[39m attempt\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 562\u001b[0m out_code \u001b[38;5;241m=\u001b[39m \u001b[43mtransform_code_object\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mRestartAnalysis \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py:1033\u001b[0m, in \u001b[0;36mtransform_code_object\u001b[0;34m(code, transformations, 
safe)\u001b[0m\n\u001b[1;32m 1030\u001b[0m instructions \u001b[38;5;241m=\u001b[39m cleaned_instructions(code, safe)\n\u001b[1;32m 1031\u001b[0m propagate_line_nums(instructions)\n\u001b[0;32m-> 1033\u001b[0m \u001b[43mtransformations\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstructions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcode_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1034\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m clean_and_assemble_instructions(instructions, keys, code_options)[\u001b[38;5;241m1\u001b[39m]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:151\u001b[0m, in \u001b[0;36mpreserve_global_state.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m cleanup \u001b[38;5;241m=\u001b[39m setup_compile_debug()\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 153\u001b[0m cleanup\u001b[38;5;241m.\u001b[39mclose()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:527\u001b[0m, in \u001b[0;36m_compile..transform\u001b[0;34m(instructions, code_options)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 526\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m tracing(tracer\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mtracing_context), tracer\u001b[38;5;241m.\u001b[39mset_current_tx():\n\u001b[0;32m--> 527\u001b[0m \u001b[43mtracer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 528\u001b[0m 
\u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mUnspecializeRestartAnalysis:\n\u001b[1;32m 529\u001b[0m speculation_log\u001b[38;5;241m.\u001b[39mclear()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2128\u001b[0m, in \u001b[0;36mInstructionTranslator.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 2127\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m-> 2128\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:818\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mpush_tx(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m (\n\u001b[1;32m 816\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minstruction_pointer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mshould_exit\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 819\u001b[0m ):\n\u001b[1;32m 820\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m BackendCompilerFailed:\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:781\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.step\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 777\u001b[0m unimplemented(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmissing: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minst\u001b[38;5;241m.\u001b[39mopname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 778\u001b[0m TracingContext\u001b[38;5;241m.\u001b[39mset_current_loc(\n\u001b[1;32m 779\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_filename, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlineno, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\n\u001b[1;32m 780\u001b[0m )\n\u001b[0;32m--> 781\u001b[0m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minst\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopname\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minst\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 783\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inst\u001b[38;5;241m.\u001b[39mopname \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 784\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m Unsupported:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2243\u001b[0m, in \u001b[0;36mInstructionTranslator.RETURN_VALUE\u001b[0;34m(self, inst)\u001b[0m\n\u001b[1;32m 2238\u001b[0m _step_logger()(\n\u001b[1;32m 2239\u001b[0m logging\u001b[38;5;241m.\u001b[39mINFO,\n\u001b[1;32m 2240\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorchdynamo done tracing 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (RETURN_VALUE)\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 2241\u001b[0m )\n\u001b[1;32m 2242\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE triggered compile\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 2243\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_subgraph\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2245\u001b[0m \u001b[43m \u001b[49m\u001b[43mreason\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mGraphCompileReason\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2246\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreturn_value\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mframe_summary\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph_break\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 2247\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2248\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_return_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 2249\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2250\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39madd_output_instructions([create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m)])\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:919\u001b[0m, in \u001b[0;36mOutputGraph.compile_subgraph\u001b[0;34m(self, tx, partial_convert, reason, compile_return_value)\u001b[0m\n\u001b[1;32m 916\u001b[0m append_prefix_insts()\n\u001b[1;32m 917\u001b[0m \u001b[38;5;66;03m# optimization to generate better code in a common case\u001b[39;00m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_output_instructions(\n\u001b[0;32m--> 919\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_and_call_fx_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mreversed\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstack_values\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroot\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 920\u001b[0m \u001b[38;5;241m+\u001b[39m [create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUNPACK_SEQUENCE\u001b[39m\u001b[38;5;124m\"\u001b[39m, arg\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(stack_values))]\n\u001b[1;32m 921\u001b[0m )\n\u001b[1;32m 922\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 923\u001b[0m graph_output_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_var(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgraph_out\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m 
\u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1087\u001b[0m, in \u001b[0;36mOutputGraph.compile_and_call_fx_graph\u001b[0;34m(self, tx, rv, root)\u001b[0m\n\u001b[1;32m 1084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtracing_context\u001b[38;5;241m.\u001b[39mfake_mode \u001b[38;5;241m=\u001b[39m backend_fake_mode\n\u001b[1;32m 1086\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrestore_global_state():\n\u001b[0;32m-> 1087\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_user_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1088\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m disable(compiled_fn)\n\u001b[1;32m 1090\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstats\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munique_graphs\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in 
\u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1140\u001b[0m, in \u001b[0;36mOutputGraph.call_user_compiler\u001b[0;34m(self, gm)\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mverify_correctness:\n\u001b[1;32m 1139\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m WrapperBackend(compiler_fn)\n\u001b[0;32m-> 1140\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1141\u001b[0m _step_logger()(logging\u001b[38;5;241m.\u001b[39mINFO, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdone 
compiler function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1142\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(compiled_fn), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompiler_fn did not return callable\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py:117\u001b[0m, in \u001b[0;36mwrap_backend_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 117\u001b[0m compiled_gm \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_gm\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/__init__.py:1662\u001b[0m, in \u001b[0;36m_TorchCompileInductorWrapper.__call__\u001b[0;34m(self, model_, inputs_)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_, inputs_):\n\u001b[1;32m 1660\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_inductor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompile_fx\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compile_fx\n\u001b[0;32m-> 1662\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompile_fx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mconfig_patches\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1168\u001b[0m, in \u001b[0;36mcompile_fx\u001b[0;34m(model_, example_inputs_, inner_compile, config_patches, decompositions)\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inference_compiler(unlifted_gm, example_inputs_)\n\u001b[1;32m 1165\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_fake_mode(fake_mode), torch\u001b[38;5;241m.\u001b[39m_guards\u001b[38;5;241m.\u001b[39mtracing(\n\u001b[1;32m 1166\u001b[0m tracing_context\n\u001b[1;32m 1167\u001b[0m ), compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m-> 1168\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43maot_autograd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[43mfw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43mbw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1172\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecompositions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecompositions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1173\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_fn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1174\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mkeep_inference_input_mutations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1175\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs_\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/backends/common.py:55\u001b[0m, in \u001b[0;36maot_autograd..compiler_fn\u001b[0;34m(gm, example_inputs)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# NB: NOT cloned!\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m enable_aot_logging(), patch_config:\n\u001b[0;32m---> 55\u001b[0m cg \u001b[38;5;241m=\u001b[39m \u001b[43maot_module_simplified\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot_autograd\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m disable(cg)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:887\u001b[0m, in \u001b[0;36maot_module_simplified\u001b[0;34m(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)\u001b[0m\n\u001b[1;32m 871\u001b[0m aot_config \u001b[38;5;241m=\u001b[39m AOTConfig(\n\u001b[1;32m 872\u001b[0m 
fw_compiler\u001b[38;5;241m=\u001b[39mfw_compiler,\n\u001b[1;32m 873\u001b[0m bw_compiler\u001b[38;5;241m=\u001b[39mbw_compiler,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 883\u001b[0m no_tangents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 884\u001b[0m )\n\u001b[1;32m 886\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m--> 887\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_aot_dispatcher_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 888\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunctional_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 889\u001b[0m \u001b[43m \u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 890\u001b[0m \u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 891\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;66;03m# TODO: There is something deeply wrong here; compiled_fn running with\u001b[39;00m\n\u001b[1;32m 894\u001b[0m \u001b[38;5;66;03m# the boxed calling convention, but aot_module_simplified somehow\u001b[39;00m\n\u001b[1;32m 895\u001b[0m \u001b[38;5;66;03m# historically returned a function that was not the boxed calling\u001b[39;00m\n\u001b[1;32m 896\u001b[0m \u001b[38;5;66;03m# convention. 
This should get fixed...\u001b[39;00m\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39mruntime_args):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:600\u001b[0m, in \u001b[0;36mcreate_aot_dispatcher_function\u001b[0;34m(flat_fn, flat_args, aot_config)\u001b[0m\n\u001b[1;32m 597\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m partial(aot_wrapper_dedupe, compiler_fn\u001b[38;5;241m=\u001b[39mcompiler_fn)\n\u001b[1;32m 598\u001b[0m \u001b[38;5;66;03m# You can put more passes here\u001b[39;00m\n\u001b[0;32m--> 600\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mfake_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m aot_config\u001b[38;5;241m.\u001b[39mis_export:\n\u001b[1;32m 602\u001b[0m mutated_user_inp_locs \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 603\u001b[0m idx \u001b[38;5;241m-\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 604\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m fw_metadata\u001b[38;5;241m.\u001b[39mmutated_inp_runtime_indices\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m idx \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 606\u001b[0m ]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:425\u001b[0m, in \u001b[0;36maot_wrapper_dedupe\u001b[0;34m(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ok:\n\u001b[0;32m--> 425\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mleaf_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(leaf_flat_args, fw_metadata):\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 429\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\\\u001b[39;00m\n\u001b[1;32m 430\u001b[0m \u001b[38;5;124;03mEncountered duplicate inputs that are mutated in the graph, but at least one input/output\u001b[39;00m\n\u001b[1;32m 431\u001b[0m \u001b[38;5;124;03mto the graph is a tensor subclass. This is not supported today. You can try to\u001b[39;00m\n\u001b[1;32m 432\u001b[0m \u001b[38;5;124;03mremove the aliasing yourself as a workaround, or otherwise file an issue on github.\"\"\"\u001b[39;00m\n\u001b[1;32m 433\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:630\u001b[0m, in \u001b[0;36maot_wrapper_synthetic_base\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)\u001b[0m\n\u001b[1;32m 628\u001b[0m \u001b[38;5;66;03m# Happy path: we don't need synthetic bases\u001b[39;00m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m synthetic_base_info \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 630\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 632\u001b[0m \u001b[38;5;66;03m# export path: ban synthetic bases for now, add later if requested.\u001b[39;00m\n\u001b[1;32m 633\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(flat_args, fw_metadata):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:295\u001b[0m, in 
\u001b[0;36maot_dispatch_autograd\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata)\u001b[0m\n\u001b[1;32m 292\u001b[0m tracing_context\u001b[38;5;241m.\u001b[39mfw_metadata \u001b[38;5;241m=\u001b[39m inner_meta\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m TracingContext\u001b[38;5;241m.\u001b[39mreport_output_strides() \u001b[38;5;28;01mas\u001b[39;00m fwd_output_strides:\n\u001b[0;32m--> 295\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m \u001b[43maot_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfw_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43madjusted_flat_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(compiled_fw_func, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 297\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m make_boxed_func(compiled_fw_func)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1100\u001b[0m, in \u001b[0;36mcompile_fx..fw_compiler_base\u001b[0;34m(model, example_inputs, is_inference)\u001b[0m\n\u001b[1;32m 1092\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m orig_output_end_idx \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m num_model_outputs\n\u001b[1;32m 1094\u001b[0m user_visible_outputs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 1095\u001b[0m n\u001b[38;5;241m.\u001b[39mname\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m n \u001b[38;5;129;01min\u001b[39;00m model_outputs[original_output_start_index:orig_output_end_idx]\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(n, torch\u001b[38;5;241m.\u001b[39mfx\u001b[38;5;241m.\u001b[39mNode)\n\u001b[1;32m 1098\u001b[0m }\n\u001b[0;32m-> 1100\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1101\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1102\u001b[0m \u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1103\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_fixed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfixed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1104\u001b[0m \u001b[43m \u001b[49m\u001b[43mcudagraphs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcudagraphs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1105\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mgraph_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgraph_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1106\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_inference\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_inference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1107\u001b[0m \u001b[43m \u001b[49m\u001b[43mboxed_forward_device_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforward_device\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1108\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_visible_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_visible_outputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1109\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py:83\u001b[0m, in \u001b[0;36mwrap_compiler_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 81\u001b[0m \u001b[38;5;66;03m# Call the compiler_fn - which is either aot_autograd or inductor\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;66;03m# with fake inputs\u001b[39;00m\n\u001b[0;32m---> 83\u001b[0m inner_compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 85\u001b[0m \u001b[38;5;66;03m# 
TODO: Failures here are troublesome because no real inputs,\u001b[39;00m\n\u001b[1;32m 86\u001b[0m \u001b[38;5;66;03m# need a different serialization strategy\u001b[39;00m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/debug.py:305\u001b[0m, in \u001b[0;36mDebugContext.wrap..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m DebugContext():\n\u001b[0;32m--> 305\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:320\u001b[0m, in \u001b[0;36mcompile_fx_inner\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 316\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m FxGraphCache\u001b[38;5;241m.\u001b[39mload(\n\u001b[1;32m 317\u001b[0m fx_codegen_and_compile, gm, example_inputs, graph_kwargs\n\u001b[1;32m 318\u001b[0m )\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m \u001b[43mfx_codegen_and_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 321\u001b[0m \u001b[43m \u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mgraph_kwargs\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[arg-type]\u001b[39;49;00m\n\u001b[1;32m 322\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 324\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFX codegen and compilation took \u001b[39m\u001b[38;5;132;01m%.3f\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m, time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start)\n\u001b[1;32m 326\u001b[0m \u001b[38;5;66;03m# Return the output strides to the caller via TracingContext\u001b[39;00m\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:535\u001b[0m, in \u001b[0;36mfx_codegen_and_compile\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 519\u001b[0m graph \u001b[38;5;241m=\u001b[39m GraphLowering(\n\u001b[1;32m 520\u001b[0m gm,\n\u001b[1;32m 521\u001b[0m \u001b[38;5;66;03m# example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 532\u001b[0m is_inference\u001b[38;5;241m=\u001b[39mis_inference,\n\u001b[1;32m 533\u001b[0m )\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_graph_handler(graph):\n\u001b[0;32m--> 535\u001b[0m \u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 536\u001b[0m output_strides: List[Optional[Tuple[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m]]] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m graph\u001b[38;5;241m.\u001b[39mgraph_outputs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;66;03m# We'll put the output strides in the compiled graph so we\u001b[39;00m\n\u001b[1;32m 539\u001b[0m \u001b[38;5;66;03m# can later return them to the caller via TracingContext\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:519\u001b[0m, in \u001b[0;36mGraphLowering.run\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;129m@dynamo_timed\u001b[39m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m--> 519\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/fx/interpreter.py:138\u001b[0m, in \u001b[0;36mInterpreter.run\u001b[0;34m(self, initial_env, enable_io_processing, *args)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 138\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv[node] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_node\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mextra_traceback:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:814\u001b[0m, in \u001b[0;36mGraphLowering.run_node\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 812\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayout_constraints\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 813\u001b[0m args, kwargs \u001b[38;5;241m=\u001b[39m layout_constraints[n\u001b[38;5;241m.\u001b[39mtarget](n, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 814\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_magic_method(n\u001b[38;5;241m.\u001b[39mtarget):\n\u001b[1;32m 816\u001b[0m \u001b[38;5;66;03m# TODO: this is sus, it probably should be handled in the\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;66;03m# lowerings themselves similarly to sym_size/sym-stride\u001b[39;00m\n\u001b[1;32m 818\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_magic_method\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:691\u001b[0m, in \u001b[0;36mGraphLowering.call_function\u001b[0;34m(self, target, args, kwargs)\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 690\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m via \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, lowerings[target])\n\u001b[0;32m--> 691\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mlowerings\u001b[49m\u001b[43m[\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 692\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 693\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_lowering.py:117\u001b[0m, in \u001b[0;36mconvolution\u001b[0;34m(x, weight, bias, stride, padding, dilation, transposed, output_padding, groups)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 116\u001b[0m mlir_template \u001b[38;5;241m=\u001b[39m MLIRConvTemplate([x, weight, bias], layout, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmlir_template\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39moutput_node()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_template.py:1189\u001b[0m, in \u001b[0;36mMLIRTemplate.generate\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 1184\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m patch\u001b[38;5;241m.\u001b[39mobject(V\u001b[38;5;241m.\u001b[39mgraph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mget_dtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fake_get_dtype(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node)):\n\u001b[1;32m 1185\u001b[0m kernel \u001b[38;5;241m=\u001b[39m MLIRTemplateKernel(kernel_name\u001b[38;5;241m=\u001b[39mkernel_name, input_nodes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minput_nodes, call_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayout\u001b[38;5;241m.\u001b[39msize, kernel_group\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1186\u001b[0m outer_func_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunction_name \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfunction_name\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1187\u001b[0m outer_func_render\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mouter_func_render \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mouter_func_render\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1188\u001b[0m kernel_arg_attributes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_arg_attributes() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mget_arg_attributes\u001b[39m\u001b[38;5;124m'\u001b[39m) 
\u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1189\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1191\u001b[0m kernel_hash_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmlir_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex_counter)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1192\u001b[0m extra_args \u001b[38;5;241m=\u001b[39m []\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py:238\u001b[0m, in \u001b[0;36mMLIRConvSingleBatchTemplate.render\u001b[0;34m(self, kernel, template_buffer_node, epilogue_nodes, tile_info, **kwargs)\u001b[0m\n\u001b[1;32m 229\u001b[0m kernel\u001b[38;5;241m.\u001b[39mepilogue_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\n\u001b[1;32m 230\u001b[0m output_node \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node\u001b[38;5;241m.\u001b[39mname,\n\u001b[1;32m 231\u001b[0m sram_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput_buffer\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 235\u001b[0m dim_aliasing \u001b[38;5;241m=\u001b[39m 
{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex0\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mc0\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex1\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_n\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex2\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mo_h\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex3\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_m\u001b[39m\u001b[38;5;124m\"\u001b[39m}\n\u001b[1;32m 236\u001b[0m )\n\u001b[1;32m 237\u001b[0m kernel\u001b[38;5;241m.\u001b[39mexception_nodes[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumel\u001b[39m\u001b[38;5;124m\"\u001b[39m : (I_W\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_W)\u001b[38;5;241m*\u001b[39m(I_H\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_H)\u001b[38;5;241m*\u001b[39mI_C\u001b[38;5;241m*\u001b[39mBATCH}\n\u001b[0;32m--> 238\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_template_from_string\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconv_template\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 239\u001b[0m 
kernel\u001b[38;5;241m.\u001b[39madd_loop_info([kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBATCH\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_C\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mI_C\u001b[39m\u001b[38;5;124m\"\u001b[39m]], [kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_M\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_N\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_K\u001b[39m\u001b[38;5;124m\"\u001b[39m]])\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m code\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/jinja2/environment.py:1299\u001b[0m, in \u001b[0;36mTemplate.render\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1296\u001b[0m ctx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_context(\u001b[38;5;28mdict\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n\u001b[1;32m 1298\u001b[0m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1299\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menvironment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconcat\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroot_render_func\u001b[49m\u001b[43m(\u001b[49m\u001b[43mctx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 1300\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1301\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menvironment\u001b[38;5;241m.\u001b[39mhandle_exception()\n", + "File \u001b[0;32m