diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 61eb96e1..eba48da2 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -6,7 +6,7 @@ on: jobs: build-and-test: - runs-on: ubuntu-latest + runs-on: self-hosted permissions: contents: read @@ -35,6 +35,7 @@ jobs: context: . file: ./Dockerfile push: true + no-cache: true tags: ghcr.io/psal-postech/torchsim-test:${{ github.sha }} # Step 4: Wait for GHCR propagation diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index bc356d85..32d6543c 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -674,6 +674,9 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Prepare volume directory + run: mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} + - name: Run run_cycle.sh run: | echo "Running run_cycle.sh" @@ -682,4 +685,14 @@ jobs: -e TORCHSIM_DUMP_PATH=/dump \ -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ - ${{ inputs.image_name }} PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh + ${{ inputs.image_name }} bash -c \ + "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && \ + cp PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out" + ls /tmp/torchsim-ci/${GITHUB_SHA} + + - name: Upload Accuracy Report Artifact + uses: actions/upload-artifact@v4 + with: + name: accuracy-report + path: /tmp/torchsim-ci/${{ github.sha }}/summary_cycle.out + if-no-files-found: error diff --git a/.gitignore b/.gitignore index 88eb2fb8..9decced5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ __pycache__/ -PyTorchSimBackend/build/ +TOGSim/build/ .vscode diff --git a/.gitmodules b/.gitmodules index f65e5f2b..24f9ccaf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,18 +1,15 @@ -[submodule "PyTorchSimBackend/extern/onnx"] 
- path = PyTorchSimBackend/extern/onnx +[submodule "TOGSim/extern/onnx"] + path = TOGSim/extern/onnx url = https://github.com/onnx/onnx.git -[submodule "PyTorchSimBackend/extern/protobuf"] - path = PyTorchSimBackend/extern/protobuf +[submodule "TOGSim/extern/protobuf"] + path = TOGSim/extern/protobuf url = https://github.com/protocolbuffers/protobuf.git -[submodule "PyTorchSimBackend/extern/booksim"] - path = PyTorchSimBackend/extern/booksim +[submodule "TOGSim/extern/booksim"] + path = TOGSim/extern/booksim url = https://github.com/PSAL-POSTECH/booksim.git -[submodule "PyTorchSimBackend/extern/torch2timeloop"] - path = PyTorchSimBackend/extern/torch2timeloop - url = https://github.com/Accelergy-Project/pytorch2timeloop-converter.git -[submodule "PyTorchSimBackend/extern/ramulator2"] - path = PyTorchSimBackend/extern/ramulator2 +[submodule "TOGSim/extern/ramulator2"] + path = TOGSim/extern/ramulator2 url = https://github.com/PSAL-POSTECH/ramulator2 -[submodule "PyTorchSimBackend/extern/stonneCore"] - path = PyTorchSimBackend/extern/stonneCore +[submodule "TOGSim/extern/stonneCore"] + path = TOGSim/extern/stonneCore url = https://github.com/PSAL-POSTECH/stonne_core.git diff --git a/Dockerfile b/Dockerfile index 293dcb60..37721940 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ FROM ghcr.io/psal-postech/torchsim_base:latest # Prepare PyTorchSim project COPY . /workspace/PyTorchSim -RUN cd PyTorchSim/PyTorchSimBackend && \ +RUN cd PyTorchSim/TOGSim && \ mkdir -p build && \ cd build && \ conan install .. --build=missing && \ diff --git a/Dockerfile.ksc2025 b/Dockerfile.ksc2025 new file mode 100644 index 00000000..2ac210e0 --- /dev/null +++ b/Dockerfile.ksc2025 @@ -0,0 +1,90 @@ +# Copyright (c) 2020 The Regents of the University of California +# All Rights Reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime + +# Copied from Gem5 Docker file +ENV DEBIAN_FRONTEND=noninteractive +RUN apt -y update && apt -y upgrade && \ + apt -y install build-essential git m4 scons zlib1g zlib1g-dev \ + libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \ + python3-dev python-is-python3 doxygen libboost-all-dev \ + libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \ + python3-venv black libssl-dev libasan5 libubsan1 +RUN pip install mypy pre-commit jupyter + +# Pass Access Token securely +ENV PATH=$PATH:/root/.local/bin +ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH + +# Build Gem5 +RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch TorchSim +RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) +ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt + +# Build LLVM RISC-V +RUN git clone https://github.com/PSAL-POSTECH/llvm-project.git --branch torchsim --depth 1 +RUN cd llvm-project && mkdir build && cd build && \ + cmake -DLLVM_ENABLE_PROJECTS=mlir -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/riscv-llvm -DLLVM_TARGETS_TO_BUILD=RISCV -G "Unix Makefiles" ../llvm && \ + make -j && make install + +# Store RISC-V LLVM for TorchSim +ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin +ENV TORCHSIM_LLVM_INCLUDE_PATH=/riscv-llvm/include +ENV TORCHSIM_DIR=/workspace/PyTorchSim +ENV LLVM_DIR=/riscv-llvm + +# Download RISC-V tool chain +RUN apt install -y wget && \ + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ + tar -zxvf riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && tar -zxvf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz
&& \ + rm *.tar.gz + +ENV RISCV=/workspace/riscv +ENV PATH=$RISCV/bin:$PATH + +# Install Spike simulator +RUN apt -y install device-tree-compiler +RUN git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \ + ../configure --prefix=$RISCV && make -j && make install + +# Install Proxy kernel +RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ + cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 && mkdir build && cd build && \ + ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install + +# Install torchsim dependency +RUN apt install -y ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 + +# Prepare ONNXim project +RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial +RUN cd PyTorchSim/TOGSim && \ + git submodule update --recursive --init && \ + mkdir -p build && \ + cd build && \ + conan install .. --build=missing && \ + cmake ..
&& \ + make -j$(nproc) \ No newline at end of file diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json deleted file mode 100644 index 8f196e81..00000000 --- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "core_type" : ["stonne", "ws_mesh"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json deleted file mode 100644 index c7ef15f7..00000000 --- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - 
"dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json deleted file mode 100644 index 2293e197..00000000 --- a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 8, - - "dram_type" : "ramulator2", - "dram_freq" : 700, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json deleted file mode 100644 index 08548638..00000000 --- a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 65536, - 
"core_print_interval" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 32, - - "dram_type" : "simple", - "dram_freq" : 1000, - "dram_channels": 1, - "dram_req_size": 32, - "dram_latency" : 100, - "dram_print_interval": 10000, - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json deleted file mode 100644 index 5d7b0d35..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - - "dram_type" : "ramulator2", - "dram_freq" :700, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json deleted file mode 100644 index 38acafc0..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 
10000, - - "dram_type" : "ramulator2", - "dram_freq" : 700, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 10000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json deleted file mode 100644 index 7348d5bc..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0": 0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json deleted file mode 100644 index 69ec8bd0..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - 
"num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0": 0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json deleted file mode 100644 index bff4e224..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1050, - "sram_size" : 16777216, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 4, - - "dram_type" : "ramulator2", - "dram_freq" :1200, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 19200, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json deleted file mode 100644 index b2661894..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "num_cores" : 2, - 
"core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_num_partitions" : 2, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - "icnt_print_interval" : 10000, - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json deleted file mode 100644 index 922ede5b..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_num_partitions" : 1, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json deleted file mode 100644 index 034542fe..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - - "dram_type" : "ramulator2", - "dram_freq" :700, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 20000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json deleted file mode 100644 index 82f42c00..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json deleted file mode 100644 index 132a52e6..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json deleted file mode 100644 index a93e8ae2..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1050, - "sram_size" : 32768, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 4, - - "dram_type" : "ramulator2", - "dram_freq" :1200, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 48000, - "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": 
{ - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json deleted file mode 100644 index e9a64f2e..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json deleted file mode 100644 index 37e18b35..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json deleted file mode 100644 index 
49225d77..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json deleted file mode 100644 index 4ea2c6ff..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json deleted file mode 100644 index 8aee751b..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 
100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json deleted file mode 100644 index f76fec32..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "core_type" : ["ws_mesh","ws_mesh"], - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m4.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json deleted file mode 100644 index 7571b830..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - 
"dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m8.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json deleted file mode 100644 index be163336..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/extern/torch2timeloop b/PyTorchSimBackend/extern/torch2timeloop deleted file mode 160000 index 62aa1754..00000000 --- a/PyTorchSimBackend/extern/torch2timeloop +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 62aa175421165cc9cd7dfb182a02fc3e26c01e3a diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc deleted file mode 100644 index 7744b0f5..00000000 --- a/PyTorchSimBackend/src/TMA.cc +++ /dev/null @@ -1,48 +0,0 @@ -#include "TMA.h" -#include "TileGraph.h" - -TMA::TMA(uint32_t id, uint32_t dram_req_size) { - _id = id; - _dram_req_size = dram_req_size; - _current_inst = nullptr; - _finished = true; -} - -void TMA::issue_tile(std::shared_ptr inst) { - _current_inst = std::move(inst); - std::vector& tile_size = _current_inst->get_tile_size(); - if (tile_size.size() <= 0 || tile_size.size() 
> get_max_dim()) { - spdlog::error("[TMA {}] issued tile is not supported format..", _id); - exit(EXIT_FAILURE); - } - _finished = false; -} - -std::shared_ptr> TMA::get_memory_access() { - auto addr_set = _current_inst->get_dram_address(_dram_req_size); - auto access_vec = std::make_shared>(); - Tile* owner = (Tile*)_current_inst->get_owner(); - std::shared_ptr owner_subgraph = owner->get_owner(); - unsigned long long base_daddr = _current_inst->get_base_dram_address(); - // Todo. We use a ternsor level buffer allocation, so we don't need to check all memfetch - bool is_cacheable = owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); - spdlog::trace("[SRAM Trace] Core-{}, Address: 0x{:016x}, Is_cacheable: {}", _id, base_daddr, is_cacheable); - spdlog::trace("[NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", - _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write()); - - for (auto addr: *addr_set) { - mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R; - mf_type type = _current_inst->is_dma_write() ? 
mf_type::WRITE_REQUEST : mf_type::READ_REQUEST; - mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, _current_inst->get_numa_id(), static_cast(_current_inst.get())); - access->set_cacheable(is_cacheable); - _current_inst->inc_waiting_request(); - access_vec->push_back(access); - } - _finished = true; - return access_vec; -} - -uint32_t TMA::generate_mem_access_id() { - static uint32_t id_counter{0}; - return id_counter++; -} \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index ca669361..577c45e9 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -7,7 +7,7 @@ from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen from PyTorchSimFrontend import extension_config -from Simulator.simulator import FunctionalSimulator, CycleSimulator, BackendSimulator +from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator LOCK_TIMEOUT = 600 @@ -27,21 +27,6 @@ def dump_metadata(args, arg_attributes, path): file.write(f'{arg_name}=({arg_attribute[0]}, {arg.dtype}, {arg.shape})\n') return -def parse_stack_sizes(file_path): - meta_path = file_path.split(".")[0]+".meta" - cmd = ["riscv64-unknown-elf-objcopy", "--dump-section", f".stack_sizes={meta_path}", file_path, "/dev/null"] - subprocess.run(cmd, check=True) - - with open(meta_path, 'rb') as f: - stack_sizes_data = list(f.read()) - if len(stack_sizes_data) <= 17: - raise ValueError("Invalid .stack_sizes section size") - - stack_size_bytes = stack_sizes_data[8:-9] - stack_size = int.from_bytes(stack_size_bytes, byteorder='little') - return stack_size - - def llvm_compile_command(input, output): opt_output = f"{input[:-3]}_opt.ll" return [re.sub(r"[ \n]+", " ", @@ -180,7 +165,7 @@ def load(cls, source_code, else: link_option = "" # Generate LLVM kernel calller and binary for validation - 
if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: + if extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: # Use custom malloc to avoid size error new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free" cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen) @@ -197,7 +182,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, arg_attributes) + val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, arg_attributes) val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name, new_link_option) @@ -228,7 +213,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return key # Generate MLIR kernel calller and binary for cycle calculation @@ -299,23 +284,23 @@ def dummy_simulator(*args, **kwargs): # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if not autotune and (extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate): + if not autotune and (extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE or validate): funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, vectorlane_size=vectorlane_size, spad_info=spad_info, cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS, silent_mode=silent_mode) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - backend_path = 
os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + backsim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG) backsim.vectorlane_size = vectorlane_size attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size) result_path = backsim.simulation(onnx_path, attribute_path, silent_mode=silent_mode) - result = BackendSimulator.get_result_from_file(result_path) + result = TOGSimulator.get_result_from_file(result_path) return result def dryrun_simulator(*args, **kwargs): @@ -329,11 +314,11 @@ def dryrun_simulator(*args, **kwargs): # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return # Todo. 
Support valude dependent mode for graph mode - if False: # extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: + if False: # extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, @@ -341,7 +326,7 @@ def dryrun_simulator(*args, **kwargs): cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) return result_path, runtime_path, None - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) and not autotune + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) and not autotune target_simulator = dryrun_simulator if is_dryrun else dummy_simulator target_simulator.arg_attributes = arg_attributes target_simulator.future = future diff --git a/PyTorchSimFrontend/extension_codegen_backend.py b/PyTorchSimFrontend/extension_codegen_backend.py deleted file mode 100644 index e569d251..00000000 --- a/PyTorchSimFrontend/extension_codegen_backend.py +++ /dev/null @@ -1,216 +0,0 @@ -import dataclasses -import contextlib -from typing import List -from typing import Dict -from torch._inductor.codegen import cpp, wrapper, common -from torch._inductor.scheduler import BaseScheduling -from torch._inductor.virtualized import V -from torch._inductor.utils import IndentedBuffer -import sympy - -cexpr = cpp.CppPrinter().doprint - -class ExtensionWrapperCodegen(wrapper.WrapperCodeGen): - def __init__(self): - super().__init__() - -class ExtensionOverrides(common.OpOverrides): - pass - -class ExtensionKernel(common.Kernel): - overrides = ExtensionOverrides - newvar_prefix = "auto " - suffix = ";" - - def __init__(self, args=None): - super().__init__(args) - self.call_ranges = None - self.ranges = None - self.itervars = None - self.reduction_depth = None - self.reduction_prefix = IndentedBuffer() - self.reduction_suffix = IndentedBuffer() - self.reduction_vars = {} - self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, 
name_prefix="tmp_acc") - - def load(self, name: str, index: sympy.Expr): - index = self.rename_indexing(index) - var = self.args.input(name) - line = f"{var}[{index}]" - dtype = V.graph.get_dtype(name) - self.cse.prefix = cpp.DTYPE_TO_CPP[dtype] + " " - return self.cse.generate(self.loads, line) - - def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): - index = self.rename_indexing(index) - var = self.args.output(name) - line = f"{var}[{index}] = {value}" - self.cse.generate(self.stores, line, assignment = False) - - def reduction(self, dtype, src_dtype, reduction_type, value): - argmax_or_argmin = reduction_type in {"argmax", "argmin"} - if argmax_or_argmin: - raise NotImplementedError() #TODO: argmin, argmax - else: - reduction_key = src_dtype, reduction_type, value - acc = self.reduction_cse.generate( - self.loads, f"reduction {reduction_key}", write=False - ) - self.reduction_vars[acc] = reduction_type - acc_type = cpp.reduction_acc_type(reduction_type, dtype) - self.reduction_prefix.writeline(f"{acc_type} {acc} = {cpp.reduction_init(reduction_type, dtype)};") - line = f"{acc} = {cpp.reduction_combine(reduction_type, acc, value)}" - self.cse.generate(self.stores, line, assignment = False) - self.reduction_cse.reduction_cache[reduction_key] = acc - return acc - - def store_reduction(self, name, index, value): - index = self.rename_indexing(index) - var = self.args.output(name) - self.reduction_suffix.writeline(f"{var}[{index}] = {value};")\ - - def codegen_loops(self): - code = common.BracesBuffer() - # Loop body part - loops = [LoopLevel(var, size) for var, size in zip(self.itervars, self.ranges)] - loops, reductions = [LoopNest(loops[: self.reduction_depth]), - LoopNest(loops[self.reduction_depth :])] - reductions.mark_reduction(self.reduction_vars) - - with contextlib.ExitStack() as stack: - loops.codegen(code, stack) - with contextlib.ExitStack() as stack_outer: - if self.reduction_prefix: - stack_outer.enter_context(code.indent()) - 
code.splice(self.reduction_prefix) - - with contextlib.ExitStack() as stack: - reductions.codegen(code, stack) - code.splice(self.loads) - code.splice(self.compute) - code.splice(self.stores) - code.splice(self.reduction_suffix) - return code - - def define_kernel(self, wrapper, src_code, kernel_name): - if src_code in wrapper.src_to_kernel: - kernel_name = wrapper.src_to_kernel[src_code] - else: - wrapper.src_to_kernel[src_code] = kernel_name - wrapper.define_kernel(kernel_name, src_code, cuda=False) - - def codegen_kernel(self, wrapper): - arg_defs, call_args, arg_types = self.args.cpp_argdefs() - arg_defs = ",\n".ljust(25).join(arg_defs) - arg_types = ",".join(arg_types) - code = common.BracesBuffer() - - # Todo. kernel name custom - kernel_name = f"Extensin_Kernel" - kernel_decl_name = kernel_name if V.graph.cpp_wrapper else "kernel" - code.writeline(f'extern "C" void {kernel_decl_name}({arg_defs})') - with code.indent(): - for old, new in self.args.aliases(): - code.writeline(f"auto {old} = {new};") - # Loop body part - code.splice(self.codegen_loops()) - - codecache_def = IndentedBuffer() - if not V.graph.cpp_wrapper: - codecache_def.writeline("async_compile.cpp('''") - codecache_def.splice(code) - if not V.graph.cpp_wrapper: - codecache_def.writeline("''')") - - self.define_kernel(wrapper, codecache_def.getvalue(), kernel_name) - # generate the code to call this - wrapper.generate_kernel_call(kernel_name, call_args, cuda=False) - print(code.getvalue()) - return code.getvalue() - - def set_ranges(self, lengths, reduction_lengths): - if self.call_ranges: - assert self.call_ranges == tuple(lengths) + tuple( - reduction_lengths - ), f"{self.call_ranges} == {tuple(lengths)} + {tuple(reduction_lengths)}" - assert self.reduction_depth == len(lengths) - else: - self.call_ranges = tuple(lengths) + tuple(reduction_lengths) - self.ranges = [self.rename_indexing(x) for x in self.call_ranges] - self.itervars = [sympy.Symbol(f"i{n}") for n in range(len(self.ranges))] - 
self.reduction_depth = len(lengths) - return ( - self.itervars[: self.reduction_depth], - self.itervars[self.reduction_depth :], - ) - -@dataclasses.dataclass -class LoopLevel: - var: sympy.Expr - size: sympy.Expr - reduction_vars: Dict[str, str] = None - - # Todo. Type change for reduction - INDEX_TYPE = "long" - def lines(self): - line = f"for({self.INDEX_TYPE} {self.var}=0; {self.var}<{cexpr(self.size)}; ++{self.var})" - return [line] - -@dataclasses.dataclass -class LoopNest: - loops: List[LoopLevel] - - def __bool__(self): - return bool(self.loops) - - def mark_reduction(self, reduction_vars): - for loop in self.loops: - loop.reduction_vars = reduction_vars - - def mark_parallel(self, par_depth): - loops = self.loops - loops[0].parallel = par_depth - for i in range(1, par_depth): - loops[i].collapsed = True - loops[0].simd = loops[par_depth - 1].simd - - def codegen(self, code, stack): - for loop in self.loops: - code.writelines(loop.lines()) - stack.enter_context(code.indent()) - -class ExtensionScheduling(BaseScheduling): - count = 0 - def __init__(self, scheduler): - self.scheduler = scheduler - self._scheduling = cpp.CppScheduling(scheduler) - - def can_fuse_vertical(self, node1, node2): - return False - - def can_fuse_horizontal(self, node1, node2): - return False - - def group_fn(self, sizes): - return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes) - - def codegen_nodes(self, nodes): - _, (group, reduction_group) = max( - nodes, key=lambda x: int(x.is_reduction()) - ).group - - ex_kernel = ExtensionKernel() - for node in nodes: - vars, reduction_vars = ex_kernel.set_ranges(group, reduction_group) - with ex_kernel: - node.run(vars, reduction_vars) - - wrapper = V.graph.wrapper_code - ex_kernel.codegen_kernel(wrapper) - pass - - def codegen_sync(self): - pass - - def flush(self): - self._scheduling.flush() \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 
fa5d22b5..3d6fbb76 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -3,73 +3,124 @@ import tempfile import importlib -# Hardware info config -CONFIG_VECTOR_LANE = int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128)) -CONFIG_VECTOR_LANE_STRIDE = int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2)) -CONFIG_SPAD_INFO = { - "spad_vaddr" : 0xD0000000, - "spad_paddr" : 0x2000000000, - "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane -} -CONFIG_PRECISION = 4 # 32bit -CONFIG_NUM_CORES = 1 -CONFIG_VLEN = 256 # 256bits / 32bits = 8 [elements] - -# Tile size config -CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - -CONFIG_TORCHSIM_DUMP_PATH = os.environ.get('TORCHSIM_DUMP_PATH', - default = f"{tempfile.gettempdir()}/torchinductor") -CONFIG_TORCHSIM_DUMP_FILE = int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) -CONFIG_TORCHSIM_VALIDATION_MODE = int(os.environ.get('TORCHSIM_VALIDATION_MODE', default=True)) -CONFIG_CLEANUP_DUMP_ARGS = int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) - -# LLVM PATH -CONFIG_TORCHSIM_LLVM_PATH = os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") -CONFIG_TORCHSIM_CUSTOM_PASS_PATH = os.environ.get('TORCHSIM_CUSTOM_PASS_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/GemminiLowerPass/build") -CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) -CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) - -# Backendsim config -CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG', - default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') -CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False)) -CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False)) -CONFIG_BACKENDSIM_DRYRUN = 
int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) -CONFIG_BACKENDSIM_DEBUG_LEVEL = os.environ.get("BACKENDSIM_DEBUG_LEVEL", "") - -# GEM5 config -CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") -CONFIG_GEM5_SCRIPT_PATH = os.environ.get('GEM5_SCRIPT_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/gem5_script/script_systolic.py") - -# AUTOTUNE config -CONFIG_AUTOTUNE = int(os.environ.get('AUTOTUNE', default=True)) -CONFIG_AUTOTUNE_TEMPLATE = int(os.environ.get('AUTOTUNE_TEMPLATE', default=True)) -CONFIG_MAX_AUTOTUNE_TRY = int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) -CONFIG_AUTOTUNE_TEMPLATE_TOPK = int(os.environ.get('AUTOTUNE_TEMPLATE_TOPK', default=4)) - -# For block sparse -CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0)) - -# For GEMM tile size -CONFIG_MANUAL_TILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False)) -CONFIG_TILE_M = int(os.environ.get('TORCHSIM_TILE_M', default=CONFIG_VECTOR_LANE)) -CONFIG_TILE_N = int(os.environ.get('TORCHSIM_TILE_N', default=CONFIG_VECTOR_LANE)) -CONFIG_TILE_K = int(os.environ.get('TORCHSIM_TILE_K', default=CONFIG_VECTOR_LANE)) -CONFIG_GEMM_CHEATSHEET_PATH = os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/validation/gemm_tpuv3_cheatsheet.json") -CONFIG_SUBTILE = int(os.environ.get('TORCHSIM_SUBTILE', default=True)) -CONFIG_MANUAL_SUBTILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False)) -CONFIG_SUBTILE_M = int(os.environ.get('TORCHSIM_SUBTILE_M', default=CONFIG_VECTOR_LANE)) -CONFIG_SUBTILE_N = int(os.environ.get('TORCHSIM_SUBTILE_N', default=CONFIG_VECTOR_LANE)) -CONFIG_SUBTILE_K = int(os.environ.get('TORCHSIM_SUBTILE_K', default=CONFIG_VECTOR_LANE)) - -# Advanced fusion options -CONFIG_FUSION_REDUCTION_EPILOGUE = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_EPILOGUE', default=True)) -CONFIG_FUSION_REDUCTION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_REDUCTION', 
default=True)) -CONFIG_FUSION_PROLOGUE = int(os.environ.get('TORCHSIM_FUSION_PROLOGUE', default=True)) +def __getattr__(name): + + # Hardware info config + if name == "CONFIG_VECTOR_LANE": + return int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128)) + if name == "CONFIG_VECTOR_LANE_STRIDE": + return int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2)) + if name == "CONFIG_SPAD_INFO": + return { + "spad_vaddr" : 0xD0000000, + "spad_paddr" : 0x2000000000, + "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane + } + if name == "CONFIG_PRECISION": + return 4 # 32bit + if name == "CONFIG_NUM_CORES": + return 1 + if name == "CONFIG_VLEN": + return 256 # 256bits / 32bits = 8 [elements] + + # Tile size config + if name == "CONFIG_TORCHSIM_DIR": + return os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') + + if name == "CONFIG_TORCHSIM_DUMP_PATH": + return os.environ.get('TORCHSIM_DUMP_PATH', default = f"{tempfile.gettempdir()}/torchinductor") + if name == "CONFIG_TORCHSIM_DUMP_FILE": + return int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) + if name == "CONFIG_TORCHSIM_FUNCTIONAL_MODE": + return int(os.environ.get('TORCHSIM_FUNCTIONAL_MODE', default=True)) + if name == "CONFIG_TORCHSIM_TIMING_MODE": + return int(os.environ.get("TORCHSIM_TIMING_MODE", True)) + if name == "CONFIG_CLEANUP_DUMP_ARGS": + return int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) + + # LLVM PATH + if name == "CONFIG_TORCHSIM_LLVM_PATH": + return os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") + if name == "CONFIG_TORCHSIM_CUSTOM_PASS_PATH": + return os.environ.get('TORCHSIM_CUSTOM_PASS_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/GemminiLowerPass/build") + if name == "CONFIG_TORCHSIM_DUMP_MLIR_IR": + return int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) + if name == "CONFIG_TORCHSIM_DUMP_LLVM_IR": + return int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) + + # 
TOGSim config + if name == "CONFIG_TOGSIM_CONFIG": + return os.environ.get('TORCHSIM_CONFIG', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json") + if name == "CONFIG_TOGSIM_EAGER_MODE": + return int(os.environ.get("TOGSIM_EAGER_MODE", default=False)) + if name == "CONFIG_TOGSIM_DRYRUN": + return int(os.environ.get('TOGSIM_DRYRUN', default=False)) + if name == "CONFIG_TOGSIM_DEBUG_LEVEL": + return os.environ.get("TOGSIM_DEBUG_LEVEL", "") + + # GEM5 config + if name == "CONFIG_GEM5_PATH": + return os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") + if name == "CONFIG_GEM5_SCRIPT_PATH": + return os.environ.get('GEM5_SCRIPT_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/gem5_script/script_systolic.py") + + # AUTOTUNE config + if name == "CONFIG_AUTOTUNE": + return int(os.environ.get('AUTOTUNE', default=False)) + if name == "CONFIG_AUTOTUNE_TEMPLATE": + return int(os.environ.get('AUTOTUNE_TEMPLATE', default=False)) + if name == "CONFIG_MAX_AUTOTUNE_TRY": + return int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) + if name == "CONFIG_AUTOTUNE_TEMPLATE_TOPK": + return int(os.environ.get('AUTOTUNE_TEMPLATE_TOPK', default=4)) + + # For block sparse + if name == "CONFIG_BLOCK_SPARSE": + return int(os.environ.get('BLOCK_SPARSE', default=0)) + + # For GEMM tile size + if name == "CONFIG_MANUAL_TILE_SIZE": + return int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False)) + if name == "CONFIG_TILE_M": + return int(os.getenv("TORCHSIM_TILE_M", __getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_TILE_N": + return int(os.getenv("TORCHSIM_TILE_N", __getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_TILE_K": + return int(os.getenv("TORCHSIM_TILE_K", __getattr__("CONFIG_VECTOR_LANE"))) + + if name == "CONFIG_SUBTILE": + return int(os.environ.get('TORCHSIM_SUBTILE', default=True)) + if name == "CONFIG_MANUAL_SUBTILE_SIZE": + return 
int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False)) + if name == "CONFIG_SUBTILE_M": + return int(os.environ.get('TORCHSIM_SUBTILE_M', default=__getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_SUBTILE_N": + return int(os.environ.get('TORCHSIM_SUBTILE_N', default=__getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_SUBTILE_K": + return int(os.environ.get('TORCHSIM_SUBTILE_K', default=__getattr__("CONFIG_VECTOR_LANE"))) + + if name == "CONFIG_GEMM_CHEATSHEET_PATH": + return os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/validation/gemm_tpuv3_cheatsheet.json") + # Compiler Optimization + if name == "CONFIG_COMPILER_OPTIMIZATION": + return os.environ.get('TORCHSIM_COMPILER_OPTIMIZATION', default="all") # options: all, none, custom + # Advanced fusion options + if name == "CONFIG_FUSION": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "fusion" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_REDUCTION_EPILOGUE": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "reduction_epliogue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_REDUCTION_REDUCTION": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "reduction_reduction" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_PROLOGUE": + return True if ((__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all") or ("prologue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION"))) else False + if name == "CONFIG_SINGLE_BATCH_CONV": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "single_batch_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_MULTI_TILE_CONV": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "multi_tile_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False # SRAM 
Buffer allocation plan def load_plan_from_module(module_path): @@ -96,3 +147,5 @@ def load_plan_from_module(module_path): CONFIG_TLS_MODE = int(os.environ.get('TORCHSIM_TLS_MODE', default=1)) CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0)) + +CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0)) \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py index 22a727c5..167544f2 100644 --- a/PyTorchSimFrontend/extension_op.py +++ b/PyTorchSimFrontend/extension_op.py @@ -13,7 +13,7 @@ from torch._inductor.codecache import write from PyTorchSimFrontend.extension_codecache import get_write_path from PyTorchSimFrontend import extension_config -from Simulator.simulator import BackendSimulator, TORCH_TO_NUMPY +from Simulator.simulator import TOGSimulator, TORCH_TO_NUMPY graph_template = { 0: { @@ -46,7 +46,7 @@ class MLIRExternKernelChoice(ExternKernelChoice): def call_name(self): - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) if is_dryrun: return f"yield from sparse_mm_dummy_stonne_outer" return f"torch.ops.extension_op.{self.name}" @@ -275,11 +275,11 @@ def prepare_outer_product_matrix(a, b, out): def sparse_mm_stonne_outer(a, b, out): onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out) - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json' - backsim = BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_single_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = 
backsim.simulation(onnx_path) - BackendSimulator.get_result_from_file(result_path) + TOGSimulator.get_result_from_file(result_path) # Load result data #with open(c_result_path, 'rb') as f: diff --git a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py deleted file mode 100644 index 3690f533..00000000 --- a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py +++ /dev/null @@ -1,236 +0,0 @@ -import os -import subprocess -import shlex -import re - -from torch._inductor.utils import IndentedBuffer -from torch._inductor.codegen import cpp -from torch._inductor.codecache import write_atomic - -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs - -class LLVMKernelCallerCodeGen(): - """ - Generate C that calls the llvm kernel. - """ - - def __init__(self, validation, arg_attributes): - super().__init__() - self.code = IndentedBuffer() - self.ending = ";" - self.open_bracket = "{" - self.closed_bracket = "}" - self.newline = "\n" - self.kernel_name = "kernel" - self.validation = validation - self.n_arg = len(arg_attributes) - self.arg_attributes = arg_attributes - self.arg_use_count = 1 - self.load_args = {} - self.kernel_start_addr = "" - self.kernel_end_addr = "" - - def get_argv_idx(self): - self.arg_use_count += 1 - return self.arg_use_count-1 - - def write_header(self): - self.writeline('#include ') - self.writeline('#include ') - self.writeline("#include ") - if self.validation: - self.writeline("#include ") - self.writeline('#include ') - self.writeline('#include ') - - def is_in_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_in(value) - - def is_out_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_out(value) - - def load_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_in_arg(arg_name): - argv_idx = self.get_argv_idx() if arg_name not in self.load_args else 
self.load_args[arg_name] - self.load_args[arg_name] = argv_idx - self.writeline(f'if(load_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def dump_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_out_arg(arg_name): - argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name] - self.writeline(f'if(dump_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def write_exit(self): - self.writeline(f'return 0{self.ending}') - - def generate_kernel_declare(self): - args_type_p = [f'{cpp.DTYPE_TO_CPP[arg_type[1]]}*' for arg_type in self.arg_attributes.values()] - - self.writeline(f"void {self.kernel_name}({', '.join(args_type_p)}){self.ending}{self.newline}") - - def generate_args_define(self): - for arg_name, (_, arg_type, arg_shape) in self.arg_attributes.items(): - self.writeline(f'{cpp.DTYPE_TO_CPP[arg_type]} {arg_name}[atoi(argv[{self.get_argv_idx()}])] __attribute__ ((aligned (4096))){self.ending}') - self.writeline(self.newline) - - def generate_load_dump_fn(self): - self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000000){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - 
self.writeline(self.closed_bracket) - - self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - self.writeline(self.closed_bracket) - - def generate_main(self): - self.writeline(f'{self.newline}int main(int argc, char *argv[]) {self.open_bracket}{self.newline}') - with self.code.indent(): - if self.validation: - self.load_arg() - self.writeline(self.newline) - - self.writeline(f"{self.kernel_name}({', '.join(list(self.arg_attributes))}){self.ending}{self.newline}") - - if self.validation: - self.dump_arg() - - self.write_exit() - self.writeline(self.closed_bracket) - - def writeline(self, line): - self.code.writeline(line) - - def generate_wrapper_file(self, path, name): - self.dump_path = path - - self.write_header() - self.generate_kernel_declare() - - if self.validation: - self.generate_load_dump_fn() - self.generate_main() - - write_path = os.path.join(path, name+".c",) - write_atomic(write_path, self.code.getvalue()) - return - - def add_extention(self, name, extension): - return name + "." 
+ extension - - def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): - main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) - main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) - kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) - kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) - - main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' - kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' - - target = os.path.join(write_path, binary_name) - link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' - - main_compile_cmd = shlex.split(main_compile) - kernel_compile_cmd = shlex.split(kernel_compile) - link_cmd = shlex.split(link) - - try: - subprocess.check_call(main_compile_cmd) - subprocess.check_call(kernel_compile_cmd) - subprocess.check_call(link_cmd) - except subprocess.CalledProcessError as e: - print("Command failed with exit code", e.returncode) - print("Error output:", e.output) - assert(0) - - def parse_stack_sizes(self, file_path, vlenb=256): - with open(file_path, 'r') as f: - stack_sizes_data = f.readlines() - - in_proc = False - stack_base = None - dynamic_expr = None - max_offset = 0 - - for line in stack_sizes_data: - line = line.strip() - if line.startswith(".cfi_startproc"): - in_proc = True - continue - elif line.startswith(".cfi_endproc") and in_proc: - if dynamic_expr: - total_stack = eval(dynamic_expr, {"vlenb": vlenb}) - return total_stack - elif stack_base: - return stack_base - else: - return max_offset - - # Skip outer function - if not in_proc: - continue - - if line.startswith(".cfi_def_cfa_offset"): - stack_base = int(line.split()[-1]) - - if ".cfi_escape" in line and "#" in line: - comment = line.split("#")[-1].strip() - m = re.search(r"sp \+ 
(\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) - if m: - base, scale = int(m.group(1)), int(m.group(2)) - dynamic_expr = f"{base} + {scale} * vlenb" - - def get_spad_size(self, binary_path): - cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if result.returncode != 0: - raise RuntimeError(f"Readelf error: {result.stderr}") - - output = result.stdout - spad_start = None - spad_end = None - for line in output.splitlines(): - if '.spad' in line and 'SECTION' in line: - parts = line.split() - spad_start = int(parts[1], 16) - elif 'spad_end' in line: - parts = line.split() - spad_end = int(parts[1], 16) - - if spad_start is None or spad_end is None: - return 0 - spad_size = spad_end - spad_start - return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_common.py b/PyTorchSimFrontend/llvm/llvm_common.py deleted file mode 100644 index 1c76b826..00000000 --- a/PyTorchSimFrontend/llvm/llvm_common.py +++ /dev/null @@ -1,304 +0,0 @@ -import torch -from torch._inductor.codegen import common -from torch._inductor.virtualized import V -import sympy - -from typing import Callable - -import sympy - -import torch.fx -from torch.utils._sympy.value_ranges import ValueRanges - -from torch._inductor.utils import ( - free_symbol_startswith, - get_sympy_Expr_dtype, - IndentedBuffer, - sympy_subs, - unique, -) - -schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") - -DTYPE_TO_LLVM = { - torch.float32: "float", - torch.float64: "double", - torch.float16: "half", - torch.int64: "i64", - torch.int32: "i32", - torch.int16: "i16", - torch.int8: "i8", - torch.uint8: "i8", - torch.bool: "i8", - torch.bfloat16: "bfloat", -} - -DTYPE_SIZE = { - torch.float32: 4, - torch.float64: 8, - torch.float16: 2, - torch.int64: 8, - torch.int32: 4, - torch.int16: 2, - torch.int8: 1, - torch.uint8: 1, - torch.bool: 1, - torch.bfloat16: 2, -} - -DTYPE_LOWP_FP = [ - 
torch.bfloat16, - torch.float16, -] - -class LLVMKernelArgs(common.KernelArgs): - LLVM_ARGS_IN = 0x01 - LLVM_ARGS_OUT = 0x02 - LLVM_ARGS_INOUT = 0x04 - LLVM_ARGS_VAR = 0x08 - - @staticmethod - def is_llvm_arg_in(value): - return (LLVMKernelArgs.LLVM_ARGS_IN & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - @staticmethod - def is_llvm_arg_out(value): - return (LLVMKernelArgs.LLVM_ARGS_OUT & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - def llvm_argdefs(self, only_args=False): - buffer_types = {x.get_name(): [x.get_dtype(), x.get_numel()] for x in V.graph.buffers} - for name, val in V.graph.graph_inputs.items(): - if isinstance(val, sympy.Expr): - buffer_types[name] = [get_sympy_Expr_dtype(val), 1] - else: - buffer_types[name] = [val.get_dtype(), val.get_numel()] - buffer_types.update( - {name: val.dtype for name, val in V.graph.constants.items()} - ) - - call_args = [] - arg_defs = [] - arg_attributes = {} - for inplaced in unique(self.inplace_buffers.values()): - if self._buffer_is_marked_removed(inplaced): - continue - outer = inplaced.other_names[-1] - inner = inplaced.inner_name - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_INOUT] + buffer_types[outer] - for outer, inner in self.input_buffers.items(): - if outer in self.inplace_buffers: - continue - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_IN] + buffer_types[outer] - for outer, inner in self.output_buffers.items(): - if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner): - continue - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_OUT] + buffer_types[outer] - for outer, inner in self.sizevars.items(): - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_VAR] + 
buffer_types[outer] - return arg_defs, call_args, arg_attributes - -class BaseLLVMKernel(common.Kernel): - newvar_prefix = "%" - name_prefix = "body" - vector_prefix = "vector_body" - suffix = "" - overrides = None - load_format = None - store_format = None - - def __init__(self, args=None): - super().__init__(args) - self.vector_compute = IndentedBuffer() - self.reductions_suffix = IndentedBuffer() - self.cse = common.CSE(self.newvar_prefix, self.suffix, self.name_prefix) - self.vector_cse = common.CSE(self.newvar_prefix, self.suffix, self.vector_prefix) - self.tile_size = None - self.tile_shape = {} - - def load(self, name: str, index: sympy.Expr): - raise NotImplementedError() - - def store_reduction(self, name, index, value): - raise NotImplementedError() - - def store(self, name, index, value, mode=None): - raise NotImplementedError() - - def reduction(self, dtype, src_dtype, reduction_type, value): - raise NotImplementedError() - - def widening(self, args, buf_bounds): - if not args[0] in self.tile_shape or not args[1] in self.tile_shape: - return args, [1, 1] - tile_shape0 = self.tile_shape[args[0]] - tile_shape1 = self.tile_shape[args[1]] - vec_len0 = tile_shape0[0] * tile_shape0[1] - vec_len1 = tile_shape1[0] * tile_shape1[1] - if tile_shape0 != tile_shape1: - temp = list(args) - idx = 0 if tile_shape0[0] != tile_shape1[0] else 1 - if tile_shape0[idx] > tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape1[idx-1]}" for i in range(vec_len0)] - else: - indexes = [f"i32 {i//tile_shape1[idx-1]}" for i in range(vec_len0)] - line = f"shufflevector <{vec_len1} x float> %{args[1]}, <{vec_len1} x float> undef, <{vec_len0} x i32> <{', '.join(indexes)}>" - temp[1] = self.cse.generate(self.compute, line, bounds=buf_bounds) - elif tile_shape0[idx] < tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape0[idx-1]}" for i in range(vec_len1)] - else: - indexes = [f"i32 {i//tile_shape0[idx-1]}" for i in range(vec_len1)] - line = f"shufflevector 
<{vec_len0} x float> %{args[0]}, <{vec_len0} x float> undef, <{vec_len1} x i32> <{', '.join(indexes)}>" - temp[0] = self.cse.generate(self.compute, line, bounds=buf_bounds) - args = tuple(temp) - return args, max(tile_shape0, tile_shape1) - - def __enter__(self): - class CSEProxy: - self.name = "CSEProxy" - - @staticmethod - def __getattr__(name: str) -> Callable[..., common.CSEVariable]: # type: ignore[misc] - def inner(*args, **kwargs): - # TritonTemplateKernel has no current_node - buf_bounds = ValueRanges.unknown() - if hasattr(V.interpreter, "current_node"): - fx_node = V.interpreter.current_node - assert isinstance(self.node_to_bounds, dict) - buf_bounds = self.node_to_bounds.get( - fx_node, ValueRanges.unknown() - ) - - vector_csevar = None - if isinstance(args[0], list): - vector_args = (args[0][0], args[1][0]) - vector_csevar = self.vector_cse.generate( - self.vector_compute, - getattr(parent_handler, "vector_" + name)(*vector_args, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - vector_csevar.update_on_args(name, vector_args, kwargs) - args = (args[0][1], args[1][1]) - if len(args) == 2: - args, tile_shape = self.widening(args, buf_bounds) - elif len(args) == 1: - tile_shape = self.tile_shape[args[0]] - else: - assert(0) # not implemented yet. 
- vec_len = tile_shape[0] * tile_shape[1] - csevar = self.cse.generate( - self.compute, - getattr(parent_handler, name)(*args, tile_size=vec_len, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - self.tile_shape[csevar] = tile_shape - csevar.update_on_args(name, args, kwargs) - if vector_csevar is not None: - return [vector_csevar, csevar] - return csevar - - return inner - - @staticmethod - def indirect_indexing(index_var, size, check=True): - # Skip CSE since this doesn't return an expression - return self.indirect_indexing(index_var, size, check) # type: ignore[attr-defined] - - @staticmethod - def load(name: str, index: sympy.Expr): - if name in self.cse.invalidated_stores: - # A load from an invalidated store requires us to - # keep the actual buffer around - V.kernel.must_keep_buffers.add(name) - if free_symbol_startswith(index, "%"): - return self.indirect_load(name, index) - store_cache = self.cse.store_cache - if name in store_cache: - return store_cache[name] - return self.load(name, index) - - @staticmethod - def store(name, index, value, mode=None): - self.store_buffer_names.add(name) - if mode is None: - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - if name not in V.graph.removed_buffers: - return self.store(name, index, value, mode=mode) - - @staticmethod - def store_reduction(name, index, value): - self.store_buffer_names.add(name) - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - - if name not in V.graph.removed_buffers: - return self.store_reduction(name, index, value) - - @staticmethod - def reduction(dtype, src_dtype, reduction_type, value): - return self.reduction(dtype, src_dtype, reduction_type, value) - - @staticmethod - def bucketize( - values, - offsets_name: str, - offsets_size: sympy.Expr, - 
indexing_dtype: torch.dtype, - right: bool, - ): - """ - [Note: Inductor bucketize op] - - Given values (tensor) and offsets_name (reference to the name of a 1D - tensor), calculate the bucket that each value belongs to. - - e.g. for values [-1, 0, 1, 2, 3, 4, 5, 9], offsets [0, 4, 4, 8], right=True - return = [ 0, 1, 1, 1, 1, 3, 3, 4]. - - When right == False, bucket i refers to range (offsets[i], offsets[i+1]]. - When right == True, bucket i refers to range [offsets[i], offsets[i+1]). - - Offsets must be non-decreasing or the result is undefined. - """ - return self.bucketize( - values, offsets_name, offsets_size, indexing_dtype, right - ) - - super().__enter__() - assert self.overrides - parent_handler = self.overrides(V.get_ops_handler()) - self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) - self.exit_stack.enter_context(V.set_kernel_handler(self)) - return self - - def rename_indexing(self, index) -> sympy.Expr: - # adds the necessary kernel args for index expressions - # and renames variables in index expressions to kernel arg names - if isinstance(index, (list, tuple)): - return [self.rename_indexing(x) for x in index] - index = V.graph.sizevars.simplify(index) - sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name) - replacements = { - x: self.args.size(x) - for x in sorted_symbols - if x.name.startswith("s") or x.name.startswith("ps") - } - return sympy_subs(index, replacements) diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index 537809de..e52d6cff 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -2,21 +2,17 @@ import torch import os import dataclasses -from torch._inductor.autotune_process import BenchmarkRequest from torch._inductor.autotune_process import TensorMeta from torch._inductor.codecache import get_hash, write from PyTorchSimFrontend import extension_config -from Simulator.simulator import BackendSimulator +from 
Simulator.simulator import TOGSimulator from typing import ( Any, Callable, - Dict, Iterable, List, Optional, - Sequence, - TYPE_CHECKING, Union, ) @@ -62,9 +58,9 @@ def make_run_fn( # Check already cached result. write_path = get_write_path(self.source_code) key, _ = write(self.source_code, "mlir", specified_dir=write_path) - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "backendsim_result/0") + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "togsim_result/0") if os.path.exists(result_path): - result = BackendSimulator.get_result_from_file(result_path) + result = TOGSimulator.get_result_from_file(result_path) def cached_run_fn(*args, **kwargs): return result return cached_run_fn diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py index 3fff9958..dff6b0fd 100644 --- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py +++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py @@ -1,16 +1,46 @@ +import os +import subprocess +import shlex +import re import torch -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.llvm.llvm_caller_codegen import LLVMKernelCallerCodeGen -from PyTorchSimFrontend.mlir.mlir_common import DTYPE_TO_C +from torch._inductor.utils import IndentedBuffer +from torch._inductor.codecache import write_atomic +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs, DTYPE_TO_C -class MLIRKernelCallerCodeGen(LLVMKernelCallerCodeGen): +class MLIRKernelCallerCodeGen(): + """ + Generate C that calls the llvm kernel. 
+ """ def __init__(self, validation, arg_attributes, cycle_sim=False): - super().__init__(validation, arg_attributes) + super().__init__() + self.code = IndentedBuffer() + self.ending = ";" + self.open_bracket = "{" + self.closed_bracket = "}" + self.newline = "\n" + self.kernel_name = "kernel" + self.validation = validation + self.n_arg = len(arg_attributes) + self.arg_attributes = arg_attributes + self.arg_use_count = 1 + self.load_args = {} + self.kernel_start_addr = "" + self.kernel_end_addr = "" self.cycle_sim = cycle_sim + def get_argv_idx(self): + self.arg_use_count += 1 + return self.arg_use_count-1 + def write_header(self): - super().write_header() + self.writeline('#include ') + self.writeline('#include ') + self.writeline("#include ") + if self.validation: + self.writeline("#include ") + self.writeline('#include ') + self.writeline('#include ') global_var_header = "gem5_global_var.h" if self.cycle_sim else "global_var.h" self.writeline(f"#include \"{global_var_header}\"") @@ -42,6 +72,9 @@ def dump_arg(self): self.writeline(f'return -1{self.ending}') self.writeline(self.closed_bracket) + def write_exit(self): + self.writeline(f'return 0{self.ending}') + def generate_kernel_declare(self): # memref to llvm arguments (memref -> ptr, ptr, i64, , ) allocated pointer, aligned pointer, offset, size, stride args_type_p = [f'{DTYPE_TO_C[arg_type[1]]}*, {DTYPE_TO_C[arg_type[1]]}*, int64_t, int64_t, int64_t' for (_, arg_type) in self.arg_attributes] @@ -86,4 +119,142 @@ def generate_main(self): self.dump_arg() self.write_exit() - self.writeline(self.closed_bracket) \ No newline at end of file + self.writeline(self.closed_bracket) + + def generate_load_dump_fn(self): + self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000000){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + 
self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + + def writeline(self, line): + self.code.writeline(line) + + def generate_wrapper_file(self, path, name): + self.dump_path = path + + self.write_header() + self.generate_kernel_declare() + + if self.validation: + self.generate_load_dump_fn() + self.generate_main() + + write_path = os.path.join(path, name+".c",) + write_atomic(write_path, self.code.getvalue()) + return + + def add_extention(self, name, extension): + return name + "." 
+ extension + + def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): + main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) + main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) + kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) + kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) + + main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' + kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' + + target = os.path.join(write_path, binary_name) + link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' + + main_compile_cmd = shlex.split(main_compile) + kernel_compile_cmd = shlex.split(kernel_compile) + link_cmd = shlex.split(link) + + try: + subprocess.check_call(main_compile_cmd) + subprocess.check_call(kernel_compile_cmd) + subprocess.check_call(link_cmd) + except subprocess.CalledProcessError as e: + print("Command failed with exit code", e.returncode) + print("Error output:", e.output) + assert(0) + + def parse_stack_sizes(self, file_path, vlenb=256): + with open(file_path, 'r') as f: + stack_sizes_data = f.readlines() + + in_proc = False + stack_base = None + dynamic_expr = None + max_offset = 0 + + for line in stack_sizes_data: + line = line.strip() + if line.startswith(".cfi_startproc"): + in_proc = True + continue + elif line.startswith(".cfi_endproc") and in_proc: + if dynamic_expr: + total_stack = eval(dynamic_expr, {"vlenb": vlenb}) + return total_stack + elif stack_base: + return stack_base + else: + return max_offset + + # Skip outer function + if not in_proc: + continue + + if line.startswith(".cfi_def_cfa_offset"): + stack_base = int(line.split()[-1]) + + if ".cfi_escape" in line and "#" in line: + comment = line.split("#")[-1].strip() + m = re.search(r"sp \+ 
(\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) + if m: + base, scale = int(m.group(1)), int(m.group(2)) + dynamic_expr = f"{base} + {scale} * vlenb" + + def get_spad_size(self, binary_path): + cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Readelf error: {result.stderr}") + + output = result.stdout + spad_start = None + spad_end = None + for line in output.splitlines(): + if '.spad' in line and 'SECTION' in line: + parts = line.split() + spad_start = int(parts[1], 16) + elif 'spad_end' in line: + parts = line.split() + spad_end = int(parts[1], 16) + + if spad_start is None or spad_end is None: + return 0 + spad_size = spad_end - spad_start + return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index b3352ea6..c24260ce 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -2,15 +2,18 @@ import sympy import re import os +import math from functools import reduce from operator import mul import torch from collections import defaultdict from concurrent.futures import ThreadPoolExecutor +from torch._dynamo.testing import rand_strided +from torch._inductor.autotune_process import TensorMeta from torch._dynamo.utils import dynamo_timed from torch._inductor.codegen import cpp, wrapper, common, memory_planning from torch._inductor.virtualized import V, _ops as ops -from torch._inductor.codecache import write_atomic, write +from torch._inductor.codecache import write_atomic from torch._inductor.utils import ( IndentedBuffer, is_welford_reduction, @@ -21,6 +24,7 @@ from PyTorchSimFrontend import extension_config from . 
import mlir_common from .mlir_common import LoopLevel, LoopNest +from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest def reduction_init(reduction_type, dtype): if dtype in cpp.DTYPE_LOWP_FP: @@ -95,8 +99,8 @@ def write_header(self): from torch import device, empty, empty_strided from {extension_codecache.__name__} import CustomAsyncCompile - from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_BACKENDSIM_EAGER_MODE - from Simulator.simulator import BackendSimulator + from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE + from Simulator.simulator import TOGSimulator from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer from torch._inductor.select_algorithm import extern_kernels @@ -118,7 +122,7 @@ def sram_plan_prefix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Alloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_alloc(buffer_name, [start, end]) + TOGSimulator.sram_alloc(buffer_name, [start, end]) def sram_plan_postfix(buffer_name, buffer): if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN): @@ -127,7 +131,7 @@ def sram_plan_postfix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Dealloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_dealloc(buffer_name, [start, end]) + TOGSimulator.sram_dealloc(buffer_name, [start, end]) def host2device_memcopy(buffer): pass @@ -420,6 +424,10 @@ def exp(operand, *args, var_info=None, **kwargs): shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f'math.exp %{operand} : {shape}', [tile_size, dtype] + @staticmethod + def exp2(operand, *args, var_info=None, **kwargs): + raise NotImplementedError() + @staticmethod def erf(operand, *args, var_info=None, **kwargs): # Check scalar @@ -1287,7 +1295,7 @@ def store_reduction(self, name, index, value): # mean 
reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1) divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(reduction_numel)} : f32") - if self.buffer_types[name][1] > 1: + if compute_vec_size > 1: divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>") else: divider_vec = divider @@ -1627,15 +1635,40 @@ def get_cycle(choice): choices = self.make_choices(*args) if len(choices) == 0: # can't autotune - return None + return [None, None] with ThreadPoolExecutor(max_workers=8) as executor: results = list(executor.map(get_cycle, choices)) max_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") self._log_autotune_result(choices[max_idx], results[max_idx]) - optimal_src_code = choices[max_idx][1] - return optimal_src_code + optimal_src_code, loop_size = choices[max_idx][1], choices[max_idx][-1] + return optimal_src_code, loop_size + + def run_bench(self, nodes, kernel_name, src_code): + _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() + input_call_args = tuple(self.args.input_buffers.keys()) + output_call_args = tuple(self.args.output_buffers.keys()) + full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) + full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) + + bmreq = MLIRBenchmarkRequest( + kernel_name=kernel_name, + input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), + output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), + extra_args={ + "vector_lane" : self.vector_lane, + "spad_info": self.spad_info, + "vlen" : self.vlen, + "arg_attributes" : arg_attributes, + "validate" : extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + "autotune" : True, + }, + source_code=src_code, + ) + dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, 
extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] + dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] + return bmreq.make_run_fn(dummy_inputs, dummy_outputs) def _log_autotune_result(self, best_choice, best_cycle): print( @@ -1647,8 +1680,8 @@ def _log_autotune_result(self, best_choice, best_cycle): def codegen_nodes(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) - if extension_config.CONFIG_AUTOTUNE and not extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: - optimal_src_code = self.autotune(nodes, kernel_name) + if extension_config.CONFIG_AUTOTUNE and extension_config.CONFIG_TORCHSIM_TIMING_MODE: + optimal_src_code = self.autotune(nodes, kernel_name)[0] if optimal_src_code is not None: return optimal_src_code return src_code diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 2644f125..c655dde3 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -1,15 +1,12 @@ import dataclasses import math from dataclasses import dataclass -from typing import Optional, Iterable from typing import Dict from typing import List from collections import defaultdict from functools import reduce from operator import mul import torch -from torch._dynamo.testing import rand_strided -from torch._inductor.autotune_process import TensorMeta from torch._inductor.codegen import common from torch._inductor.codegen import cpp from torch._inductor.virtualized import V @@ -35,7 +32,6 @@ ) from PyTorchSimFrontend import extension_config from PyTorchSimFrontend import extension_codecache -from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") DTYPE_TO_MLIR = { @@ -776,31 +772,6 @@ def 
codegen_nodes(self, nodes, kernel_name): self.meta_kernel() return src_code - def run_bench(self, nodes, kernel_name, src_code): - _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() - input_call_args = tuple(self.args.input_buffers.keys()) - output_call_args = tuple(self.args.output_buffers.keys()) - full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) - full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) - - bmreq = MLIRBenchmarkRequest( - kernel_name=kernel_name, - input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), - output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), - extra_args={ - "vector_lane" : self.vector_lane, - "spad_info": self.spad_info, - "vlen" : self.vlen, - "arg_attributes" : arg_attributes, - "validate" : extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - "autotune" : True, - }, - source_code=src_code, - ) - dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] - dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] - return bmreq.make_run_fn(dummy_inputs, dummy_outputs) - def codegen_kernel(self, kernel_name): arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs() arg_defs = ",\n".ljust(25).join(arg_defs) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index 52979d73..77826730 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -82,7 +82,7 @@ def outer_func_render(self, kernel_name, input_args): Y = self.output_node Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) options 
= dict( kernel=self.kernel, KERNEL_NAME=kernel_name, @@ -93,8 +93,8 @@ def outer_func_render(self, kernel_name, input_args): OUTPUT=Y, PADDING_H=self.padding[0], PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, + VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + TOGSIM_EAGER_MODE=eager_mode, input_reorder=self.input_reorder ) code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 26018a94..0bf01421 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -5,7 +5,6 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode from PyTorchSimFrontend.mlir import mlir_common -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Multi Channel Tile Conv2D kernel @@ -121,7 +120,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index a2959b4d..92b9a525 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -121,7 +121,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index afbe9289..ab124852 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -121,7 +121,7 @@ def {{ FUNC_NAME 
}}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 777d0a7b..66aa0a27 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -125,7 +125,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index c2120e7b..6271b548 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -1,4 +1,3 @@ -import os import json from pathlib import Path from torch import empty_strided diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index 6508ea86..af59d88f 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -15,7 +15,7 @@ from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate -from PyTorchSimFrontend.extension_config import CONFIG_VECTOR_LANE, CONFIG_USE_TIMING_POOLING +from PyTorchSimFrontend import extension_config aten = torch.ops.aten aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm") @@ -106,11 +106,11 @@ def convolution( layout = conv_layout(x, weight, None, **kwargs) # Select conv kernel - if BATCH == 1 and stride[0] == 1: + if BATCH == 1 and stride[0] == 1 and extension_config.CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchTemplate([x, weight, bias], layout, 
**kwargs) - elif BATCH == 1 and stride[0] != 1: + elif BATCH == 1 and stride[0] != 1 and extension_config.CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchStridedTemplate([x, weight, bias], layout, **kwargs) - elif I_C < CONFIG_VECTOR_LANE // 8: # 8 is hard-coded for now. This should be changed to a better heuristic. + elif I_C < extension_config.CONFIG_VECTOR_LANE // 8 and extension_config.CONFIG_MULTI_TILE_CONV: # 8 is hard-coded for now. This should be changed to a better heuristic. mlir_template = MLIRConvMultiTileTemplate([x, weight, bias], layout, **kwargs) else: mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs) @@ -187,5 +187,5 @@ def custom_unsafe_index(x, indices): lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()}) lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()}) lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()}) -if CONFIG_USE_TIMING_POOLING: +if extension_config.CONFIG_USE_TIMING_POOLING: lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py index 2cca36b6..3658f992 100644 --- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py +++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py @@ -1,11 +1,9 @@ -import os from typing import List, Optional, cast from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import Buffer from torch._inductor.ir import IRNode -from torch._inductor.ir import ReinterpretView from PyTorchSimFrontend.mlir import mlir_common import sympy 
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 26b90401..38603319 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -3,12 +3,9 @@ import sympy from functools import reduce import operator -from sympy import symbols, sympify, Symbol -from collections import OrderedDict -from concurrent.futures import ThreadPoolExecutor +from sympy import symbols, sympify from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel -from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest from torch._inductor import config from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode, BaseSchedulerNode @@ -97,6 +94,8 @@ def can_fuse_vertical(self, node1, node2): return self.can_fuse_horizontal(node1, node2) def can_fuse_horizontal(self, node1, node2): + if not extension_config.CONFIG_FUSION: + return False if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size: return False _, (vars1, reduce1) = node1.group @@ -217,7 +216,7 @@ def codegen_nodes(self, nodes): ex_kernel.call_kernel(kernel_name) _, args, _, _ = ex_kernel.args.mlir_argdefs() args = ", ".join(args) - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): V.graph.wrapper_code.writeline( f"yield ({kernel_name}, ({args}))" @@ -288,7 +287,7 @@ def codegen_template(self, template_node, epilogue_nodes): kernel.call_kernel(kernel_name) V.graph.removed_buffers |= kernel.removed_buffers _, args, _, _ = self.kernel_group.args.mlir_argdefs() - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name + 
f"_{len(args)}" args = ", ".join(args) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index e6e9dd0c..df3621eb 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -13,8 +13,8 @@ from typing import List, Optional from unittest.mock import patch -from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE, DeferredLine -from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, View +from torch._inductor.codegen.common import KernelTemplate, ChoiceCaller, CSE, DeferredLine +from torch._inductor.ir import Buffer, IRNode, TemplateBuffer from torch._inductor.select_algorithm import PartialRender from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller from torch._inductor.autotune_process import TensorMeta @@ -29,7 +29,7 @@ from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common -from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR, CONFIG_AUTOTUNE_TEMPLATE_TOPK +from PyTorchSimFrontend import extension_config from . 
import mlir_common class IndentedBufferGroup: @@ -234,7 +234,7 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) if check_spad_size: - dir_path = f"{CONFIG_TORCHSIM_DIR}/validation/gemm_candidates" + dir_path = f"{extension_config.CONFIG_TORCHSIM_DIR}/validation/gemm_candidates" os.makedirs(dir_path, exist_ok=True) file_path = f"{dir_path}/gemm_{M}_{K}_{N}.txt" line_to_write = f"{tile_M} {tile_K} {tile_N}\n" @@ -494,7 +494,7 @@ def make_choices(self, tile_candidates, render, template_node, prologue_nodes, e print(f"[Auto-tune] Trying tile size: {list(tile_info)}") src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) bench_runner = self.run_bench([template_node], self.kernel_name, src_code) - choices.append((bench_runner, src_code, tile_info)) + choices.append((bench_runner, src_code, tile_info, self.loop_size)) self.reset(reason=None) return choices @@ -506,7 +506,12 @@ def _log_autotune_result(self, best_choice, best_cycle): ) def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): - src_code = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + if extension_config.CONFIG_AUTOTUNE_TEMPLATE and len(tile_candidates): + src_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + self.loop_size = loop_size + else: + tile_info = tile_candidates[0] if tile_candidates else None + src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) with V.set_kernel_handler(self): self.meta_kernel() @@ -1118,7 +1123,7 @@ def set_tile_size(self, template_fusion_info, prologue=False): numel_per_lane = 
tile_desc.get_numel_per_lane() r_tile_size = tile_desc.get_tile_size()[-1] nr_outer_loop = (numel_per_lane + r_tile_size-1) // r_tile_size - tile_desc.vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... + tile_desc.vmap.forced_vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... self.reduction_fusion = True self.r_tile_size = tile_desc.get_tile_size()[-1] @@ -1129,7 +1134,7 @@ def set_tile_size(self, template_fusion_info, prologue=False): self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop) else: - tile_desc.vec_size=64 + tile_desc.vmap.forced_vec_size = 64 if prologue: self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane() @@ -1225,7 +1230,7 @@ def make_kernel_render( template=self, kwargs=kwargs ) - tile_candidates = self.get_tile_candidates(**kwargs)[:CONFIG_AUTOTUNE_TEMPLATE_TOPK] + tile_candidates = self.get_tile_candidates(**kwargs)[:extension_config.CONFIG_AUTOTUNE_TEMPLATE_TOPK] return kernel, tile_candidates, render return MLIRTemplateCaller( diff --git a/README.md b/README.md index 4289195e..dbfdf2e8 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ The `tests` directory contains several AI workloads examples. ```bash python tests/test_matmul.py ``` -The result is stored to `TORCHSIM_DUMP_PATH/hash/backendsim_result/`. The log file contains detailed core, memory, and interconnect stats. +The result is stored to `TORCHSIM_DUMP_PATH/hash/togsim_result/`. The log file contains detailed core, memory, and interconnect stats. ### Run Your Own Model on PyTorchSim You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device. 
@@ -131,9 +131,9 @@ Wrapper Codegen Path = /tmp/torchinductor_root/yd/cyda7nhzv5mtakfhfcxtmmhtsv6kg7 [Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/fy6nnyudtno/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/fy6nnyudtno/cycle_bin --vlane 128 [Gem5Simulator] Simulation is still running... [SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=0000000000010400:10846 --base-path=/tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/fy6nnyudtno/validation_binary /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg0_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg1_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/buf0/0.raw -[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 -[BackendSimulator] Simulation is still running.. -[BackendSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/backendsim_result/0" +[TOGSimulator] cmd> /root/workspace/PyTorchSim/TOGSim/build/bin/Simulator --config /root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 +[TOGSimulator] Simulation is still running.. 
+[TOGSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0" ---------------------------- |Matmul Forward Test Passed| ---------------------------- @@ -143,25 +143,25 @@ Simulation consists of three steps 1. `Gem5Simulator` obatins compute latency for TOG. 2. `SpikeSimulator` verifies the output code. -3. `BackendSimulator` simulates a NPU architecture. +3. `TOGSimulator` simulates a NPU architecture. If you want to turn off the `SpikeSimulator` for fast simulation, you can set as below. ```bash -export TORCHSIM_VALIDATION_MODE=False +export TORCHSIM_FUNCTIONAL_MODE=False ``` Log contains memory & core stats. ```bash [info] HBM2-CH_0: avg BW utilization 37% (255 reads, 128 writes) [info] Row hits: 359, Row misses: 26, Row conflicts: 0 [info] ========= Core stat ========= -[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active cycle 0, idle cycle 1014 -[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active cycle 128, idle cycle 886 -[info] Core [0] : TMA active cycle 3 TMA idle cycle 1011 DRAM BW 182.000 GB/s (6144) -[info] Core [0] : Vector Unit Utilization(%) 4.34, active cycle 44, idle_cycle 0 -[info] Core [0] : Numa hit count : 0, Numa miss count : 0 -[info] Core [0] : Total cycle 1014 -[info] Total execution cycle: 1014 -[info] Simulation time: 0.039296 seconds +[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active_cycles 0, idle_cycles 1014 +[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active_cycles 128, idle_cycles 886 +[info] Core [0] : DMA active_cycles 3 DMA idle_cycles 1011 DRAM BW 182.000 GB/s (6144) +[info] Core [0] : Vector Unit Utilization(%) 4.34, active_cycles 44, idle_cycle 0 +[info] Core [0] : NUMA local memory: 34 requests, remote memory: 0 requests +[info] Core [0] : Total_cycles 1014 +[info] Total execution cycles: 1014 +[info] Wall-clock time for simulation: 0.039296 seconds ``` The log is dumped in 
`TORCHSIM_DUMP_PATH` and you can set the path as below. ```bash @@ -186,7 +186,7 @@ Our load generator supports multi-tenancy experiments. You can run a simple exam python tests/test_scheduler.py ``` Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`. -In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSim config file(`.json`). The compiled PyTorch models are then registered with a unique model id. +In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.json`). The compiled PyTorch models are then registered with a unique model id. ```python3 import os @@ -195,11 +195,11 @@ import torch from torchvision.models import resnet18 from test_transformer import EncoderBlock base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') -config = f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) # Register compiled model target_model0 = resnet18().eval() @@ -344,14 +344,14 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ## TOGSim Configuration ![NPU_Core](./docs/npu_core.jpg) -`PyTorchSimBackend/configs` directory contains example NPU configuration files in the JSON format. +`TOGSim/configs` directory contains example NPU configuration files in the JSON format. 
``` "num_cores" : 2, // Number of NPU cores - "core_freq" : 940, // Core's frequency (MHz) + "core_freq_mhz" : 940, // Core's frequency (MHz) "num_systolic_array_per_core" : 2, // Number of systolic array per core "dram_type" : "ramulator2", // DRAM type (ex. ramulator2, simple) - "dram_freq" : 940, // DRAM frequency (MHz) + "dram_freq_mhz" : 940, // DRAM frequency (MHz) "dram_channels": 32, // Number of DRAM channels "dram_req_size": 32, // DRAM request size (B) "dram_latency" : 10, // DRAM latency (cycle) @@ -361,9 +361,10 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing "l2d_type" : "datacache", "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", - "icnt_type" : "simple", // Interconnect type (ex. booksim, simple) - "icnt_latency" : 7, // Interconnect latency (cycle) - "icnt_freq" : 28000, // Interconnect frequency (MHz) + "icnt_type" : "simple", // Interconnect type (ex. booksim, simple) + "icnt_latency" : 7, // Interconnect latency (cycle) + "icnt_freq_mhz" : 940, // Interconnect frequency (MHz) + "icnt_injection_ports_per_core" : 16 // Interconnect injection ports per core "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", // Booksim2 config file path "precision" : 4, // Element's precision in tensor (Byte) @@ -376,7 +377,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ``` You can set TOGSim config path as below. ```bash -export TORCHSIM_CONFIG=/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json ``` ## Future Works Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon. 
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 10358321..0b633fa9 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -5,7 +5,7 @@ from pathlib import Path import importlib.util from PyTorchSimFrontend.extension_codecache import hash_prefix -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def import_module_from_path(module_name, path): @@ -144,7 +144,7 @@ class PyTorchSimRunner: PARTITION_BUSY = 0 PARTITION_IDLE = 1 SELECT_NOTHING = 2 - def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: + def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: self.module = self.setup_device() self.num_partion = num_partion self.launch_model_dicts = [] @@ -156,11 +156,11 @@ def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: self.partition_state.append(self.PARTITION_IDLE) self.finish_req_dict = {} - self.backend_simulator = backend_simulator + self.tog_simulator = tog_simulator # Dry run for compile and create generator - os.environ["BACKENDSIM_DRYRUN"] = "1" - os.environ["BACKENDSIM_EAGER_MODE"] = "1" + os.environ["TOGSIM_DRYRUN"] = "1" + os.environ["TOGSIM_EAGER_MODE"] = "1" @staticmethod def setup_device(): @@ -171,7 +171,7 @@ def setup_device(): import torch.utils.cpp_extension module = torch.utils.cpp_extension.load( - name="extension_device", + name="npu", sources=[ str(source_file), ], @@ -179,7 +179,7 @@ def setup_device(): verbose=True, ) - torch.utils.rename_privateuse1_backend("extension_device") + torch.utils.rename_privateuse1_backend("npu") from torch._inductor.codegen.common import ( get_scheduling_for_device, get_wrapper_codegen_for_device, @@ -192,13 +192,13 @@ def setup_device(): MLIRScheduling ) register_backend_for_device( - "extension_device", MLIRScheduling, ExtensionWrapperCodegen + "npu", MLIRScheduling, ExtensionWrapperCodegen ) assert( - 
get_scheduling_for_device("extension_device") == MLIRScheduling + get_scheduling_for_device("npu") == MLIRScheduling ) assert( - get_wrapper_codegen_for_device("extension_device") + get_wrapper_codegen_for_device("npu") == ExtensionWrapperCodegen ) return module @@ -222,7 +222,7 @@ def is_all_idle(self): return all([self.is_partition_idle(i) for i in range(self.num_partion)]) def prepare_model(self, req_model: SchedulerDNNModel): - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "backend_result", req_model.model_name) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "togsim_result", req_model.model_name) os.makedirs(result_path, exist_ok=True) index = str(len(os.listdir(result_path))) @@ -244,7 +244,7 @@ def prepare_launch_kernel(self, kernel, inputs): onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - attribute_path = self.backend_simulator.create_attribute_file(attribute_path, inputs) + attribute_path = self.tog_simulator.create_attribute_file(attribute_path, inputs) return onnx_path, attribute_path def launch_kernel(self, current_cycle, partion_idx=0): @@ -260,11 +260,11 @@ def launch_kernel(self, current_cycle, partion_idx=0): else: onnx_path, attribute_path = kernel, inputs self.partition_state[partion_idx] = self.PARTITION_BUSY - return self.backend_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) + return self.tog_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) class FIFORunner(PyTorchSimRunner): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) def select_kernel(self, partition_idx): while len(self.nested_launch_model_dicts[partition_idx]) or len(self.launch_model_dicts[partition_idx]): @@ -298,8 
+298,8 @@ def select_kernel(self, partition_idx): return self.SELECT_NOTHING class RoundRobinRunner(PyTorchSimRunner): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) self.next_pointer = None def select_kernel(self, partition_idx): @@ -347,7 +347,7 @@ class Scheduler: FIFO_ENGINE = 0 RR_ENGINE = 1 - def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, backend_config=extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) -> None: + def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG) -> None: self.current_cycle = 0 self.max_batch = max_batch self.num_request_queue = num_request_queue @@ -356,13 +356,13 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, self.request_queue.append([]) self.finish_queue : List[Request] = [] - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - self.backend_simulator = BackendSimulator(backend_path, backend_config) - self.backend_simulator.interactive_simulation() + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + self.tog_simulator = TOGSimulator(togsim_path, togsim_config) + self.tog_simulator.interactive_simulation() if engine_select == Scheduler.FIFO_ENGINE: - self.execution_engine = FIFORunner(self.backend_simulator, self.num_request_queue) + self.execution_engine = FIFORunner(self.tog_simulator, self.num_request_queue) elif engine_select == Scheduler.RR_ENGINE: - self.execution_engine = RoundRobinRunner(self.backend_simulator, self.num_request_queue) + self.execution_engine = RoundRobinRunner(self.tog_simulator, self.num_request_queue) else: print(f"Not supporetd engine type {engine_select}") exit(1) @@ -469,8 +469,8 @@ def 
schedule(self): # Need to forward the time until next_arrival_time if self.execution_engine.is_all_idle(): - reason = self.backend_simulator.until(self.msec_to_cycle(next_time)) - self.current_cycle = self.backend_simulator.cycle() + reason = self.tog_simulator.until(self.msec_to_cycle(next_time)) + self.current_cycle = self.tog_simulator.cycle() else: self.run(next_time) return @@ -490,8 +490,8 @@ def execute_cycle(): return [] # Schedule jobs and update the current time - result_list = self.backend_simulator.until(self.msec_to_cycle(until_time)) - self.current_cycle = self.backend_simulator.cycle() + result_list = self.tog_simulator.until(self.msec_to_cycle(until_time)) + self.current_cycle = self.tog_simulator.cycle() for core_idx in result_list: # Kernel is finished. So set idle state @@ -526,7 +526,7 @@ def is_request_queue_empty(self): def is_finished(self): if self.is_request_queue_empty() and self.execution_engine.is_all_idle(): - self.backend_simulator.wait() + self.tog_simulator.wait() return True return False @@ -534,7 +534,7 @@ def current_time(self): return self.cycle_to_msec(self.current_cycle) def cycle_to_msec(self, cycle): - freq = self.backend_simulator.get_core_freq() + freq = self.tog_simulator.get_core_freq() return cycle / (freq / 1000) def msec_to_cycle(self, msec): @@ -542,5 +542,5 @@ def msec_to_cycle(self, msec): if (msec == -1): return msec - freq = self.backend_simulator.get_core_freq() + freq = self.tog_simulator.get_core_freq() return int(msec * (freq / 1000)) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index bd048538..c586c2fd 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -12,7 +12,7 @@ import torch import numpy as np -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs from PyTorchSimFrontend import extension_config TORCH_TO_NUMPY = { @@ -64,10 +64,10 @@ def dump_args(self, args, arg_attributes, load_path, dump_path): for 
(arg_name, arg_attribute), arg in zip(arg_attributes, args): size = arg_attribute[2] if arg_attribute[1] != torch.bool else (arg_attribute[2] + 7) // 8 array_size.append(size) - if LLVMKernelArgs.is_llvm_arg_in(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_in(arg_attribute[0]): index = self.write_arg(arg, load_path, arg_name) file_path.append(os.path.join(load_path, arg_name, f'{index}.raw')) - elif LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + elif MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): path = os.path.join(dump_path, arg_name) os.makedirs(path, exist_ok=True) file_path.append(os.path.join(path, f'{self.get_biggest_filename(path)}.raw')) @@ -101,8 +101,9 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True) os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True) run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}' - if not silent_mode: - print("[SpikeSimulator] cmd> ", run) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[Spike] cmd> ", run) + print("[Spike] Running Spike simulator") run_cmd = shlex.split(run) try: stdout_setting = subprocess.DEVNULL if silent_mode else None @@ -110,7 +111,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting) except subprocess.CalledProcessError as e: if not silent_mode: - print("[SpikeSimulator] Command failed with exit code", e.returncode) + print("[Spike] Command failed with exit code", e.returncode) error_msg = "" if e.returncode == 200: error_msg = "INVALID_SPAD_ACCESS" @@ -121,7 +122,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= raise RuntimeError(f"{error_msg}") for (arg_name, arg_attribute), arg, 
path in zip(arg_attributes, args, file_path): - if LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): self.load_tensor(arg, arg_name, arg_attribute, path) if cleanup: @@ -155,7 +156,7 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[Gem5Simulator] Simulation is still running." + tail) + sys.stdout.write("\r[Gem5] Gem5 is running." + tail) time.sleep(1) print("") @@ -163,9 +164,10 @@ def show_progress(): gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)] try: # Create progress thread - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) or silent_mode + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) or silent_mode if not is_dryrun: - print("[Gem5Simulator] cmd> ", " ".join(gem5_cmd)) + if extension_config.CONFIG_DEBUG_MODE: + print("[Gem5] cmd> ", " ".join(gem5_cmd)) finished = False progress_thread = threading.Thread(target=show_progress) progress_thread.start() @@ -175,11 +177,11 @@ def show_progress(): else: output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) except subprocess.CalledProcessError as e: - print(f"[Gem5Simulator] Gem5 simulation failed with error: \"{e.output.decode()}\"") + print(f"[Gem5] Gem5 simulation failed with error: \"{e.output.decode()}\"") if not is_dryrun: finished = True progress_thread.join() - raise RuntimeError(f"GEM5 Simulation Failed: \"{e.output.decode()}\"") + raise RuntimeError(f"Gem5 Simulation Failed: \"{e.output.decode()}\"") with open(f"{dir_path}/stats.txt", "r") as stat_file: raw_list = stat_file.readlines() @@ -188,18 +190,18 @@ def show_progress(): cycle_list = cycle_list[:-1] return cycle_list -class BackendSimulator(): - BACKEND_RESULT_PATH_KEY = "BACKEND_RESULT_PATH" - FINISH_STR = "Simulation Finished" +class 
TOGSimulator(): + TOGSIM_RESULT_PATH_KEY = "TOGSIM_RESULT_PATH" + FINISH_STR = "Simulation finished" ALLOC_POOL = dict() # For eagermode buffer plan - def __init__(self, backend_path, config_path, vectorlane_size=-1) -> None: - self.base_dir = backend_path + def __init__(self, togsim_path, config_path, vectorlane_size=-1) -> None: + self.base_dir = togsim_path self.config_path = config_path self.config_json = self.load_json(self.config_path) self.process = None self.vectorlane_size = vectorlane_size - def get_backend_command(self): + def get_togsim_command(self): bin = os.path.join(self.base_dir, "build/bin/Simulator") config = os.path.join(self.base_dir, self.config_path) cmd = f"{bin} --config {config}" @@ -211,16 +213,16 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[BackendSimulator] Simulation is still running." + tail) + sys.stdout.write("\r[TOGSim] TOGSim is running." + tail) time.sleep(1) print("") - cmd = f"{self.get_backend_command()} --models_list {model_path}" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --models_list {model_path}" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" if attribute_path: cmd = f"{cmd} --attributes_list {attribute_path}" - if not silent_mode: - print("[BackendSimulator] cmd> ", cmd) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) # Create progress thread if not silent_mode: @@ -236,25 +238,26 @@ def show_progress(): if not silent_mode: finished = True progress_thread.join() - print("[BackendSimulator] Command failed with exit code", e.returncode) - print("[BackendSimulator] Error output:", e.output) + print("[TOGSim] Command failed with exit code", e.returncode) + print("[TOGSim] Error output:", e.output) assert 0 # Save result to 
result_path - result_path = os.path.join(os.path.dirname(model_path), "backendsim_result") + result_path = os.path.join(os.path.dirname(model_path), "togsim_result") os.makedirs(result_path, exist_ok=True) file_name = str(len(os.listdir(result_path))) result_path = os.path.join(result_path, file_name) with open(result_path, "w") as f: f.write(result.decode()) - print(f'[BackendSimulator] Simulation of "{model_path}" is stored to "{result_path}"') + print(f'[TOGSim] Simulation of "{model_path}" is stored to "{result_path}"') return result_path def interactive_simulation(self): - cmd = f"{self.get_backend_command()} --mode interactive" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --mode interactive" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" - print("[BackendSimulator] cmd> ", cmd) + if extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) if self.process is None: self.process = subprocess.Popen( shlex.split(cmd), @@ -263,27 +266,27 @@ def interactive_simulation(self): universal_newlines=True ) else: - print("[BackendSimulator] Simulator is already running.") + print("[TOGSim] Simulator is already running.") def stop(self): if self.process: self.process.terminate() self.process.wait() self.process = None - print("[BackendSimulator] Simulator stopped.") + print("[TOGSim] Simulator stopped.") def wait(self): if self.process: - print("[BackendSimulator] Waiting for simulation to complete...") + print("[TOGSim] Waiting for simulation to complete...") self.quit() self.process.wait() self.process = None - print("[BackendSimulator] Simulation completed.") + print("[TOGSim] Simulation completed.") def send_command(self, command): if self.process: try: - if not extension_config.CONFIG_BACKENDSIM_DRYRUN: + if not extension_config.CONFIG_TOGSIM_DRYRUN: print(command, 
flush=True) self.process.stdin.write(command + '\n') self.process.stdin.flush() @@ -367,8 +370,8 @@ def load_json(self, config_path): raise ValueError(f"Invalid JSON format: {e}") def get_core_freq(self): - if "core_freq" in self.config_json: - return self.config_json["core_freq"] * 1000 * 1000 # MHz + if "core_freq_mhz" in self.config_json: + return self.config_json["core_freq_mhz"] * 1000 * 1000 # MHz else: raise KeyError("Key 'core_freq' not found in JSON.") @@ -403,13 +406,13 @@ def get_result_from_file(result_path): simulation_finished_idx = -1 simulation_finished = False for idx, line in enumerate(lines): - if BackendSimulator.FINISH_STR in line: + if TOGSimulator.FINISH_STR in line: simulation_finished = True simulation_finished_idx = idx break if simulation_finished_idx == -1: - print("[BackendSimulator] Tried to parsing wrong formated output file!") + print("[TOGSim] Tried to parsing wrong formated output file!") return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time total_stat_lines = lines[simulation_finished_idx:] @@ -440,15 +443,15 @@ def get_result_from_file(result_path): if 'DRAM: AVG BW Util' in line: avg_dram_bw = float(re.search(r'AVG BW Util (\d+\.?\d*)%', line).group(1)) - if 'Total execution cycle' in line: - total_cycle = int(re.search(r'Total execution cycle: (\d+)', line).group(1)) + if 'Total execution cycles' in line: + total_cycle = int(re.search(r'Total execution cycles: (\d+)', line).group(1)) # Parse total simulation time - if 'Simulation time' in line: - simulation_time = float(re.search(r'Simulation time: (\d+\.?\d*) seconds', line).group(1)) + if 'Wall-clock time for simulation' in line: + simulation_time = float(re.search(r'Wall-clock time for simulation: (\d+\.?\d*) seconds', line).group(1)) return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": - sim = BackendSimulator("/workspace/PyTorchSim/PyTorchSimBackend", 
"/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") + sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") sim.interactive_simulation() sim.until(4000) \ No newline at end of file diff --git a/PyTorchSimBackend/CMakeLists.txt b/TOGSim/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/CMakeLists.txt rename to TOGSim/CMakeLists.txt diff --git a/PyTorchSimBackend/conanfile.txt b/TOGSim/conanfile.txt similarity index 100% rename from PyTorchSimBackend/conanfile.txt rename to TOGSim/conanfile.txt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet.icnt b/TOGSim/configs/booksim2_configs/anynet.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet.icnt rename to TOGSim/configs/booksim2_configs/anynet.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet_file b/TOGSim/configs/booksim2_configs/anynet_file similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet_file rename to TOGSim/configs/booksim2_configs/anynet_file diff --git a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt similarity index 75% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt index d18ff6e7..3102fecc 100644 --- a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt +++ b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt @@ -2,7 +2,7 @@ use_map = 0 flit_size = 32 topology = anynet -network_file = /workspace/PyTorchSim/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net +network_file = /workspace/PyTorchSim/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net routing_function = min subnets = 1 routing_delay = 4 diff --git 
a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.net diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m16.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m16.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m1.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m1.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m2.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m2.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt 
b/TOGSim/configs/booksim2_configs/fly_c2_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c2_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m4.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m4.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m2.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m2.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m8.icnt similarity index 100% rename from 
PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py b/TOGSim/configs/booksim2_configs/make_anynet_topology.py similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py rename to TOGSim/configs/booksim2_configs/make_anynet_topology.py diff --git a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt rename to TOGSim/configs/booksim2_configs/mesh_sif-age.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt diff --git a/TOGSim/configs/heterogeneous_c2_simple_noc.json b/TOGSim/configs/heterogeneous_c2_simple_noc.json new file mode 100644 index 
00000000..60f160a8 --- /dev/null +++ b/TOGSim/configs/heterogeneous_c2_simple_noc.json @@ -0,0 +1,29 @@ +{ + "core_type" : ["stonne", "ws_mesh"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 8, + "num_stonne_port" : 64, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml b/TOGSim/configs/ramulator2_configs/DDR4.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml rename to TOGSim/configs/ramulator2_configs/DDR4.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml b/TOGSim/configs/ramulator2_configs/HBM2.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml rename to TOGSim/configs/ramulator2_configs/HBM2.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml b/TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml rename to TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg b/TOGSim/configs/ramulator_configs/ALDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/ALDRAM-config.cfg 
diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg b/TOGSim/configs/ramulator_configs/DDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg rename to TOGSim/configs/ramulator_configs/DDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg b/TOGSim/configs/ramulator_configs/DDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg rename to TOGSim/configs/ramulator_configs/DDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg b/TOGSim/configs/ramulator_configs/DSARP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg rename to TOGSim/configs/ramulator_configs/DSARP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg b/TOGSim/configs/ramulator_configs/GDDR5-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg rename to TOGSim/configs/ramulator_configs/GDDR5-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg b/TOGSim/configs/ramulator_configs/HBM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg rename to TOGSim/configs/ramulator_configs/HBM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg b/TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg 
diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg b/TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg 
b/TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg b/TOGSim/configs/ramulator_configs/PCM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg rename to TOGSim/configs/ramulator_configs/PCM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg b/TOGSim/configs/ramulator_configs/SALP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg rename to TOGSim/configs/ramulator_configs/SALP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg b/TOGSim/configs/ramulator_configs/STTMRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg rename to TOGSim/configs/ramulator_configs/STTMRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg b/TOGSim/configs/ramulator_configs/TLDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/TLDRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg 
b/TOGSim/configs/ramulator_configs/WideIO-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg b/TOGSim/configs/ramulator_configs/WideIO2-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO2-config.cfg diff --git a/TOGSim/configs/stonne_big_c1_simple_noc.json b/TOGSim/configs/stonne_big_c1_simple_noc.json new file mode 100644 index 00000000..5d563fbe --- /dev/null +++ b/TOGSim/configs/stonne_big_c1_simple_noc.json @@ -0,0 +1,22 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 8, + "num_stonne_port" : 64, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycless": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/stonne_single_c1_simple_noc.json b/TOGSim/configs/stonne_single_c1_simple_noc.json new file mode 100644 index 00000000..304e84b3 --- /dev/null +++ b/TOGSim/configs/stonne_single_c1_simple_noc.json @@ -0,0 +1,22 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 1, + "num_stonne_port" : 8, + + "dram_type" : "ramulator2", + 
"dram_freq_mhz" : 700, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 8 +} \ No newline at end of file diff --git a/TOGSim/configs/stonne_validation_c1_simple_noc.json b/TOGSim/configs/stonne_validation_c1_simple_noc.json new file mode 100644 index 00000000..38d4244c --- /dev/null +++ b/TOGSim/configs/stonne_validation_c1_simple_noc.json @@ -0,0 +1,23 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 1, + "num_stonne_port" : 32, + + "dram_type" : "simple", + "dram_freq_mhz" : 1000, + "dram_channels": 1, + "dram_req_size_byte": 32, + "dram_latency" : 100, + "dram_stats_print_period_cycles": 10000, + "l2d_type" : "datacache", + "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 8 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json new file mode 100644 index 00000000..58519aad --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :700, + "dram_channels": 16, + "dram_req_size_byte": 32, + + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 700, + 
"icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json new file mode 100644 index 00000000..1257891c --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json @@ -0,0 +1,18 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycless": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json new file mode 100644 index 00000000..b92d8029 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json new file mode 100644 index 00000000..34896fc7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json @@ 
-0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json new file mode 100644 index 00000000..59be9fd4 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1050, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 4, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :1200, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "l2d_type" : "datacache", + "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1050, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json new file mode 100644 index 00000000..271e7e1c --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + 
"dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt" +} diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json similarity index 70% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json rename to TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json index d51e9c5f..7382c4c8 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json +++ b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json @@ -1,26 +1,25 @@ { "num_cores" : 2, - "core_freq" : 940, + "core_freq_mhz" : 940, "sram_size" : 65536, "core_print_interval" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", "dram_freq" : 940, - "dram_channels": 32, + "dram_channels": 8, "dram_req_size": 32, "dram_latency" : 10, - "dram_size" : 32, "dram_nbl" : 2, "dram_print_interval": 10000, "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - + "icnt_type" : "booksim2", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt", - + "icnt_latency" : 1, + "icnt_freq" : 940, + "icnt_injection_ports_per_core" : 16, + "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m8.icnt", + "precision" : 4, "scheduler" : "simple", "num_partition" : 2, diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json new file mode 100644 index 00000000..6561ffc0 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + 
"num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "dram_num_partitions" : 2, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", + "icnt_stats_print_period_cycles" : 10000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json new file mode 100644 index 00000000..fad63cc3 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json @@ -0,0 +1,20 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "dram_num_partitions" : 1, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json new file mode 100644 index 00000000..2207f2b9 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json @@ -0,0 +1,18 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + 
"dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json new file mode 100644 index 00000000..76f51b40 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json new file mode 100644 index 00000000..42e003c7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json @@ -0,0 +1,25 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +} diff --git 
a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json new file mode 100644 index 00000000..44ec72fe --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1050, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 4, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :1200, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "l2d_type" : "datacache", + "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1050, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json new file mode 100644 index 00000000..045407b7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json new file mode 100644 index 00000000..d8f95d70 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" 
:800, + "dram_channels": 2, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json new file mode 100644 index 00000000..a5fa9585 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 4, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_booksim.json b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json new file mode 100644 index 00000000..cf560171 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "booksim2", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json new file mode 100644 index 00000000..8da61d72 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz"
: 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json new file mode 100644 index 00000000..c5f429f9 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json @@ -0,0 +1,18 @@ +{ + "core_type" : ["ws_mesh","ws_mesh"], + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json new file mode 100644 index 00000000..254520be --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 2, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json new
file mode 100644 index 00000000..e39867a7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 4, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/PyTorchSimBackend/extern/booksim b/TOGSim/extern/booksim similarity index 100% rename from PyTorchSimBackend/extern/booksim rename to TOGSim/extern/booksim diff --git a/PyTorchSimBackend/extern/onnx b/TOGSim/extern/onnx similarity index 100% rename from PyTorchSimBackend/extern/onnx rename to TOGSim/extern/onnx diff --git a/PyTorchSimBackend/extern/protobuf b/TOGSim/extern/protobuf similarity index 100% rename from PyTorchSimBackend/extern/protobuf rename to TOGSim/extern/protobuf diff --git a/PyTorchSimBackend/extern/ramulator2 b/TOGSim/extern/ramulator2 similarity index 100% rename from PyTorchSimBackend/extern/ramulator2 rename to TOGSim/extern/ramulator2 diff --git a/PyTorchSimBackend/extern/ramulator_custom/.gitignore b/TOGSim/extern/ramulator_custom/.gitignore similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/.gitignore rename to TOGSim/extern/ramulator_custom/.gitignore diff --git a/PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt b/TOGSim/extern/ramulator_custom/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt rename to TOGSim/extern/ramulator_custom/CMakeLists.txt diff --git a/PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp b/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp similarity index 100% rename from
PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp rename to TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp b/TOGSim/extern/ramulator_custom/src/Config.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp rename to TOGSim/extern/ramulator_custom/src/Config.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.h b/TOGSim/extern/ramulator_custom/src/Config.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.h rename to TOGSim/extern/ramulator_custom/src/Config.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Controller.h b/TOGSim/extern/ramulator_custom/src/Controller.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Controller.h rename to TOGSim/extern/ramulator_custom/src/Controller.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp b/TOGSim/extern/ramulator_custom/src/DDR4.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp rename to TOGSim/extern/ramulator_custom/src/DDR4.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h b/TOGSim/extern/ramulator_custom/src/DDR4.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h rename to TOGSim/extern/ramulator_custom/src/DDR4.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h b/TOGSim/extern/ramulator_custom/src/DRAM.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h rename to TOGSim/extern/ramulator_custom/src/DRAM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp b/TOGSim/extern/ramulator_custom/src/HBM.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp rename to TOGSim/extern/ramulator_custom/src/HBM.cpp diff --git 
a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.h b/TOGSim/extern/ramulator_custom/src/HBM.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.h rename to TOGSim/extern/ramulator_custom/src/HBM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Memory.h b/TOGSim/extern/ramulator_custom/src/Memory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Memory.h rename to TOGSim/extern/ramulator_custom/src/Memory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp b/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h b/TOGSim/extern/ramulator_custom/src/MemoryFactory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp b/TOGSim/extern/ramulator_custom/src/Ramulator.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp rename to TOGSim/extern/ramulator_custom/src/Ramulator.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp b/TOGSim/extern/ramulator_custom/src/Refresh.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp rename to TOGSim/extern/ramulator_custom/src/Refresh.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h b/TOGSim/extern/ramulator_custom/src/Refresh.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h rename to TOGSim/extern/ramulator_custom/src/Refresh.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp b/TOGSim/extern/ramulator_custom/src/Request.cpp 
similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp rename to TOGSim/extern/ramulator_custom/src/Request.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.h b/TOGSim/extern/ramulator_custom/src/Request.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.h rename to TOGSim/extern/ramulator_custom/src/Request.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h b/TOGSim/extern/ramulator_custom/src/Scheduler.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h rename to TOGSim/extern/ramulator_custom/src/Scheduler.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h b/TOGSim/extern/ramulator_custom/src/SpeedyController.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h rename to TOGSim/extern/ramulator_custom/src/SpeedyController.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp b/TOGSim/extern/ramulator_custom/src/StatType.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp rename to TOGSim/extern/ramulator_custom/src/StatType.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.h b/TOGSim/extern/ramulator_custom/src/StatType.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.h rename to TOGSim/extern/ramulator_custom/src/StatType.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h b/TOGSim/extern/ramulator_custom/src/Statistics.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h rename to TOGSim/extern/ramulator_custom/src/Statistics.h diff --git a/PyTorchSimBackend/extern/stonneCore b/TOGSim/extern/stonneCore similarity index 100% rename from PyTorchSimBackend/extern/stonneCore rename to TOGSim/extern/stonneCore diff --git 
a/PyTorchSimBackend/include/Cache.h b/TOGSim/include/Cache.h similarity index 100% rename from PyTorchSimBackend/include/Cache.h rename to TOGSim/include/Cache.h diff --git a/PyTorchSimBackend/include/Cache_defs.h b/TOGSim/include/Cache_defs.h similarity index 100% rename from PyTorchSimBackend/include/Cache_defs.h rename to TOGSim/include/Cache_defs.h diff --git a/PyTorchSimBackend/include/Cache_stats.h b/TOGSim/include/Cache_stats.h similarity index 100% rename from PyTorchSimBackend/include/Cache_stats.h rename to TOGSim/include/Cache_stats.h diff --git a/PyTorchSimBackend/include/Common.h b/TOGSim/include/Common.h similarity index 100% rename from PyTorchSimBackend/include/Common.h rename to TOGSim/include/Common.h diff --git a/PyTorchSimBackend/include/Core.h b/TOGSim/include/Core.h similarity index 85% rename from PyTorchSimBackend/include/Core.h rename to TOGSim/include/Core.h index a3d55fa2..e4d2f30a 100644 --- a/PyTorchSimBackend/include/Core.h +++ b/TOGSim/include/Core.h @@ -9,7 +9,7 @@ #include "Dram.h" #include "Tile.h" #include "SimulationConfig.h" -#include "TMA.h" +#include "DMA.h" class Core { public: @@ -27,9 +27,9 @@ class Core { virtual void pop_memory_request(); virtual mem_fetch* top_memory_request() { return _request_queue.front(); } virtual void push_memory_response(mem_fetch* response); - void check_tag() { _tma.check_table(); } - void inc_numa_hit() { _stat_numa_hit++; } - void inc_numa_miss() { _stat_numa_miss++; } + void check_tag() { _dma.check_table(); } + void inc_numa_local_access() { _stat_numa_local_access++; } + void inc_numa_remote_access() { _stat_numa_remote_access++; } std::queue>& get_compute_pipeline(int compute_type); enum { @@ -50,20 +50,18 @@ class Core { /* Core id & config file */ const uint32_t _id; const SimulationConfig _config; - size_t _sram_size; - size_t _used_sram_size; uint32_t _num_systolic_array_per_core; uint32_t _systolic_array_rr = 0; - /* TMA Unit */ - TMA _tma; + /* DMA Unit */ + DMA _dma; /* cycle */ 
cycle_type _core_cycle; cycle_type _stat_tot_vu_compute_cycle = 0; std::vector _stat_tot_sa_compute_cycle; - cycle_type _stat_tot_tma_cycle = 0; - cycle_type _stat_tot_tma_idle_cycle = 0; + cycle_type _stat_tot_dma_cycle = 0; + cycle_type _stat_tot_dma_idle_cycle = 0; cycle_type _stat_tot_vu_compute_idle_cycle = 0; std::vector _stat_tot_sa_compute_idle_cycle; std::vector _stat_inst_count; @@ -71,13 +69,13 @@ class Core { uint64_t _stat_tot_mem_response = 0; uint64_t _stat_gemm_inst = 0; uint64_t _stat_skip_dma = 0; - uint64_t _stat_numa_hit = 0; - uint64_t _stat_numa_miss = 0; + uint64_t _stat_numa_local_access = 0; + uint64_t _stat_numa_remote_access = 0; cycle_type _stat_vu_compute_cycle = 0; std::vector _stat_sa_compute_cycle; - cycle_type _stat_tma_cycle = 0; - cycle_type _stat_tma_idle_cycle = 0; + cycle_type _stat_dma_cycle = 0; + cycle_type _stat_dma_idle_cycle = 0; cycle_type _stat_vu_compute_idle_cycle = 0; std::vector _stat_sa_compute_idle_cycle; uint64_t _stat_mem_response = 0; diff --git a/PyTorchSimBackend/include/TMA.h b/TOGSim/include/DMA.h similarity index 94% rename from PyTorchSimBackend/include/TMA.h rename to TOGSim/include/DMA.h index f8355470..2f41c6f3 100644 --- a/PyTorchSimBackend/include/TMA.h +++ b/TOGSim/include/DMA.h @@ -1,8 +1,9 @@ -#ifndef TMA_H -#define TMA_H +#ifndef DMA_H +#define DMA_H #include #include +#include #include #include #include "Instruction.h" @@ -16,9 +17,9 @@ struct VectorCompare { } }; -class TMA { +class DMA { public: - TMA(uint32_t id, uint32_t dram_req_size); + DMA(uint32_t id, uint32_t dram_req_size); void issue_tile(std::shared_ptr inst); bool is_finished() { return _finished; } @@ -114,7 +115,7 @@ class TMA { } std::shared_ptr& get_current_inst() { return _current_inst; } - std::shared_ptr> get_memory_access(); + std::shared_ptr> get_memory_access(cycle_type core_cycle, int nr_req); uint32_t generate_mem_access_id(); const uint32_t get_max_dim() { return _max_dim; } @@ -130,5 +131,7 @@ class TMA { bool 
_finished=true; std::map, uint32_t>> tag_table; std::map, std::vector>>> waiters; + std::queue _pending_accesses; + bool _generated_once = false; }; #endif \ No newline at end of file diff --git a/PyTorchSimBackend/include/DelayQueue.h b/TOGSim/include/DelayQueue.h similarity index 100% rename from PyTorchSimBackend/include/DelayQueue.h rename to TOGSim/include/DelayQueue.h diff --git a/PyTorchSimBackend/include/Dram.h b/TOGSim/include/Dram.h similarity index 99% rename from PyTorchSimBackend/include/Dram.h rename to TOGSim/include/Dram.h index 5e51b96d..d28ac25f 100644 --- a/PyTorchSimBackend/include/Dram.h +++ b/TOGSim/include/Dram.h @@ -6,7 +6,7 @@ #include #include "Common.h" -#include "TMA.h" +#include "DMA.h" #include "ramulator2.hh" #include "Hashing.h" #include "Cache.h" diff --git a/PyTorchSimBackend/include/Hashing.h b/TOGSim/include/Hashing.h similarity index 100% rename from PyTorchSimBackend/include/Hashing.h rename to TOGSim/include/Hashing.h diff --git a/PyTorchSimBackend/include/Instruction.h b/TOGSim/include/Instruction.h similarity index 96% rename from PyTorchSimBackend/include/Instruction.h rename to TOGSim/include/Instruction.h index 4c14dd81..9fad13f4 100644 --- a/PyTorchSimBackend/include/Instruction.h +++ b/TOGSim/include/Instruction.h @@ -60,9 +60,7 @@ class Instruction : public std::enable_shared_from_this { std::vector get_trace_address() { return _trace_address; } bool load_indirect_index(const std::string& path, uint64_t*& indirect_index, const std::vector& tile_size); void set_trace_address(std::vector& trace_address) { _trace_address = trace_address; } - size_t get_free_sram_size() { return _free_sram_size; } addr_type get_base_dram_address() { return dram_addr; } - void set_free_sram_size(size_t sram_size) { _free_sram_size=sram_size; } void* get_owner() { return _owner; } void set_owner(void *owner) { _owner = owner;} void set_owner_ready_queue(std::list>* q) { _owner_ready_queue_ref = q; } @@ -103,7 +101,6 @@ class Instruction : 
public std::enable_shared_from_this { size_t _tile_numel; size_t _nr_waiting_request=0; size_t _precision=0; - size_t _free_sram_size=0; addr_type dram_addr; uint32_t _numa_id = 0; // For DMA instruction int _compute_type = 0; diff --git a/PyTorchSimBackend/include/Interconnect.h b/TOGSim/include/Interconnect.h similarity index 95% rename from PyTorchSimBackend/include/Interconnect.h rename to TOGSim/include/Interconnect.h index 8467b7aa..e6b325d0 100644 --- a/PyTorchSimBackend/include/Interconnect.h +++ b/TOGSim/include/Interconnect.h @@ -1,6 +1,6 @@ #ifndef INTERCONNECT_H #define INTERCONNECT_H -#include "TMA.h" +#include "DMA.h" #include "booksim2/Interconnect.hpp" #include #include @@ -51,8 +51,9 @@ class SimpleInterconnect : public Interconnect { mem_fetch* access; }; - std::vector> _in_buffers; + std::vector>> _in_buffers; std::vector> _out_buffers; + std::vector _rr_next_src; std::vector _busy_node; }; diff --git a/PyTorchSimBackend/include/IntervalTree.h b/TOGSim/include/IntervalTree.h similarity index 100% rename from PyTorchSimBackend/include/IntervalTree.h rename to TOGSim/include/IntervalTree.h diff --git a/PyTorchSimBackend/include/L2Cache.h b/TOGSim/include/L2Cache.h similarity index 100% rename from PyTorchSimBackend/include/L2Cache.h rename to TOGSim/include/L2Cache.h diff --git a/PyTorchSimBackend/include/Memfetch.h b/TOGSim/include/Memfetch.h similarity index 100% rename from PyTorchSimBackend/include/Memfetch.h rename to TOGSim/include/Memfetch.h diff --git a/PyTorchSimBackend/include/Model.h b/TOGSim/include/Model.h similarity index 100% rename from PyTorchSimBackend/include/Model.h rename to TOGSim/include/Model.h diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h similarity index 82% rename from PyTorchSimBackend/include/SimulationConfig.h rename to TOGSim/include/SimulationConfig.h index 06a41c9f..64cfa223 100644 --- a/PyTorchSimBackend/include/SimulationConfig.h +++ 
b/TOGSim/include/SimulationConfig.h @@ -18,8 +18,7 @@ struct SimulationConfig { std::vector core_type; std::string stonne_config_path; uint32_t num_cores; - uint32_t core_freq; - uint32_t sram_size; + uint32_t core_freq_mhz; uint32_t core_print_interval = 0; uint32_t num_systolic_array_per_core = 1; uint32_t num_stonne_per_core = 1; @@ -28,7 +27,8 @@ struct SimulationConfig { /* DRAM config */ DramType dram_type; uint32_t dram_num_partitions = 1; - uint32_t dram_freq; + uint32_t dram_channels_per_partitions = 0; + uint32_t dram_freq_mhz; uint32_t dram_channels; uint32_t dram_req_size; uint32_t dram_latency; @@ -43,11 +43,11 @@ struct SimulationConfig { /* ICNT config */ IcntType icnt_type; - uint32_t icnt_node_per_core = 1; + uint32_t icnt_injection_ports_per_core = 1; std::string icnt_config_path; - uint32_t icnt_freq; + uint32_t icnt_freq_mhz; uint32_t icnt_latency; - uint32_t icnt_print_interval=0; + uint32_t icnt_stats_print_period_cycles=0; /* Sheduler config */ uint32_t num_partition=1; @@ -57,7 +57,6 @@ struct SimulationConfig { std::map partiton_map; /* Other configs */ - uint32_t precision; std::string layout; uint64_t align_address(uint64_t addr) { @@ -65,6 +64,6 @@ struct SimulationConfig { } float max_dram_bandwidth() { - return dram_freq * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s + return dram_freq_mhz * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s } }; \ No newline at end of file diff --git a/PyTorchSimBackend/include/Simulator.h b/TOGSim/include/Simulator.h similarity index 100% rename from PyTorchSimBackend/include/Simulator.h rename to TOGSim/include/Simulator.h diff --git a/PyTorchSimBackend/include/SparseCore.h b/TOGSim/include/SparseCore.h similarity index 100% rename from PyTorchSimBackend/include/SparseCore.h rename to TOGSim/include/SparseCore.h diff --git a/PyTorchSimBackend/include/Tile.h b/TOGSim/include/Tile.h similarity index 100% rename from PyTorchSimBackend/include/Tile.h rename to 
TOGSim/include/Tile.h diff --git a/PyTorchSimBackend/include/TileGraph.h b/TOGSim/include/TileGraph.h similarity index 100% rename from PyTorchSimBackend/include/TileGraph.h rename to TOGSim/include/TileGraph.h diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h similarity index 100% rename from PyTorchSimBackend/include/TileGraphParser.h rename to TOGSim/include/TileGraphParser.h diff --git a/PyTorchSimBackend/include/scheduler/Scheduler.h b/TOGSim/include/scheduler/Scheduler.h similarity index 100% rename from PyTorchSimBackend/include/scheduler/Scheduler.h rename to TOGSim/include/scheduler/Scheduler.h diff --git a/PyTorchSimBackend/src/CMakeLists.txt b/TOGSim/src/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/src/CMakeLists.txt rename to TOGSim/src/CMakeLists.txt diff --git a/PyTorchSimBackend/src/Cache.cc b/TOGSim/src/Cache.cc similarity index 100% rename from PyTorchSimBackend/src/Cache.cc rename to TOGSim/src/Cache.cc diff --git a/PyTorchSimBackend/src/Cache_stats.cc b/TOGSim/src/Cache_stats.cc similarity index 100% rename from PyTorchSimBackend/src/Cache_stats.cc rename to TOGSim/src/Cache_stats.cc diff --git a/PyTorchSimBackend/src/Common.cc b/TOGSim/src/Common.cc similarity index 74% rename from PyTorchSimBackend/src/Common.cc rename to TOGSim/src/Common.cc index 687f32f5..b5c092b3 100644 --- a/PyTorchSimBackend/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -39,15 +39,14 @@ SimulationConfig initialize_config(json config) { for (int i=0; i(config, "core_print_interval"); + parsed_config.core_print_interval = get_config_value(config, "core_stats_print_period_cycles"); /* Stonne config */ if (config.contains("stonne_config_path")) @@ -63,20 +62,27 @@ SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented dram type {} ", (std::string)config["dram_type"])); - parsed_config.dram_freq = config["dram_freq"]; + parsed_config.dram_freq_mhz = 
config["dram_freq_mhz"]; if (config.contains("dram_latency")) parsed_config.dram_latency = config["dram_latency"]; - if (config.contains("dram_config_path")) - parsed_config.dram_config_path = config["dram_config_path"]; + if (config.contains("ramulator_config_path")) + parsed_config.dram_config_path = config["ramulator_config_path"]; parsed_config.dram_channels = config["dram_channels"]; - if (config.contains("dram_req_size")) - parsed_config.dram_req_size = config["dram_req_size"]; - if (config.contains("dram_print_interval")) - parsed_config.dram_print_interval = config["dram_print_interval"]; - if(config.contains("dram_nbl")) - parsed_config.dram_nbl = config["dram_nbl"]; - if (config.contains("dram_num_partitions")) + if (config.contains("dram_req_size_byte")) + parsed_config.dram_req_size = config["dram_req_size_byte"]; + if (config.contains("dram_stats_print_period_cycles")) + parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"]; + if(config.contains("dram_num_burst_length")) + parsed_config.dram_nbl = config["dram_num_burst_length"]; + if (config.contains("dram_num_partitions")) { parsed_config.dram_num_partitions = config["dram_num_partitions"]; + if (parsed_config.dram_channels % parsed_config.dram_num_partitions != 0) { + throw std::runtime_error("[Config] DRAM channels must be divisible by dram_num_partitions"); + } + } + parsed_config.dram_channels_per_partitions = + parsed_config.dram_channels / parsed_config.dram_num_partitions; + /* L2D config */ if (config.contains("l2d_type")) { @@ -104,17 +110,18 @@ SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented icnt type {} ", (std::string)config["icnt_type"])); - parsed_config.icnt_freq = config["icnt_freq"]; + parsed_config.icnt_freq_mhz = config["icnt_freq_mhz"]; if (config.contains("icnt_latency")) parsed_config.icnt_latency = config["icnt_latency"]; - if (config.contains("icnt_config_path")) - 
parsed_config.icnt_config_path = config["icnt_config_path"]; - if (config.contains("icnt_print_interval")) - parsed_config.icnt_print_interval = config["icnt_print_interval"]; - if (config.contains("icnt_node_per_core")) - parsed_config.icnt_node_per_core = config["icnt_node_per_core"]; + if (config.contains("booksim_config_path")) + parsed_config.icnt_config_path = config["booksim_config_path"]; + if (config.contains("icnt_stats_print_period_cycles")) + parsed_config.icnt_stats_print_period_cycles = config["icnt_stats_print_period_cycles"]; + if (config.contains("icnt_injection_ports_per_core")) + parsed_config.icnt_injection_ports_per_core = config["icnt_injection_ports_per_core"]; - parsed_config.scheduler_type = config["scheduler"]; + if (config.contains("scheduler")) + parsed_config.scheduler_type = config["scheduler"]; if (config.contains("num_partition")) parsed_config.num_partition = config["num_partition"]; if (config.contains("partition")) { diff --git a/PyTorchSimBackend/src/Core.cc b/TOGSim/src/Core.cc similarity index 71% rename from PyTorchSimBackend/src/Core.cc rename to TOGSim/src/Core.cc index 4be41a70..30858193 100644 --- a/PyTorchSimBackend/src/Core.cc +++ b/TOGSim/src/Core.cc @@ -4,11 +4,9 @@ Core::Core(uint32_t id, SimulationConfig config) : _id(id), _config(config), _core_cycle(0), - _stat_tma_cycle(0), + _stat_dma_cycle(0), _num_systolic_array_per_core(config.num_systolic_array_per_core), - _tma(id, config.dram_req_size) { - _sram_size = _config.sram_size * 1024; - _used_sram_size = 0; + _dma(id, config.dram_req_size) { _sa_compute_pipeline.resize(_num_systolic_array_per_core); _stat_tot_sa_compute_cycle.resize(_num_systolic_array_per_core); _stat_sa_compute_cycle.resize(_num_systolic_array_per_core); @@ -25,14 +23,9 @@ bool Core::can_issue(const std::shared_ptr& op) { void Core::issue(std::shared_ptr op) { if (op->get_instructions().size()){ - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}, Free size: {}", 
- _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size(), - op->get_instructions().back()->get_free_sram_size()); - } else { - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}", - _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size()); + spdlog::trace("[{}][Core {}][TILE_SCHEDULED]", + _core_cycle, _id); } - //_used_sram_size += op->get_required_sram_size(); for (const auto& inst : op->get_instructions()) { if (inst->is_ready()) op->enqueue_ready(inst); @@ -125,39 +118,38 @@ void Core::dma_cycle() { /* Set tag table of async dma load */ if (instruction->is_dma_read() && instruction->is_async_dma()) { auto& key = instruction->get_tag_id(); - assert(!_tma.get_tag_finish(instruction->subgraph_id, key)); - _tma.set_tag_finish(instruction->subgraph_id, key); - spdlog::trace("[Core {}][{}] {} ASYNC FINISHED, Used sram: {}, Release sram: {}, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(instruction->get_opcode()), - _used_sram_size, instruction->get_free_sram_size(), + assert(!_dma.get_tag_finish(instruction->subgraph_id, key)); + _dma.set_tag_finish(instruction->subgraph_id, key); + spdlog::trace("[{}][Core {}] {} ASYNC FINISHED, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(instruction->get_opcode()), instruction->subgraph_id, instruction->get_addr_name(), fmt::format("[{}]", fmt::join(instruction->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", "))); - for (auto & wait_inst : _tma.get_tag_waiter(instruction->subgraph_id, key)) { - _tma.mark_tag_used(instruction->subgraph_id, key); + for (auto & wait_inst : _dma.get_tag_waiter(instruction->subgraph_id, key)) { + _dma.mark_tag_used(instruction->subgraph_id, key); finish_instruction(wait_inst); } 
} _dma_finished_queue.erase(_dma_finished_queue.begin()); } - if (_tma.is_finished()) { + if (_dma.is_finished()) { /* Finish instruction when it is DMA store */ - if (_tma.get_current_inst() != nullptr) { - std::shared_ptr finished_inst = std::move(_tma.get_current_inst()); + if (_dma.get_current_inst() != nullptr) { + std::shared_ptr finished_inst = std::move(_dma.get_current_inst()); if (finished_inst->is_dma_write()) { /* Only DMA write operation is finished! */ finish_instruction(finished_inst); } else if (finished_inst->is_dma_read() && finished_inst->is_async_dma()) { /* Register tag table for async dma load */ - _tma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); + _dma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); finish_instruction(finished_inst); } else if(!finished_inst->is_dma_read()) { - spdlog::error("[Core {}][{}] TMA instruction in not valid", _id, _core_cycle); + spdlog::error("[{}][Core {}] DMA instruction is not valid", _core_cycle, _id); exit(EXIT_FAILURE); } else if (finished_inst->get_opcode() == Opcode::BAR) { - spdlog::trace("[Core {}][{}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(), fmt::format("[{}]", fmt::join(finished_inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(finished_inst->get_tag_idx_list(), ", ")), @@ -170,27 +162,27 @@ void Core::dma_cycle() { /* Issue new DMA operation */ if (!_ld_inst_queue.empty()) { std::shared_ptr inst = _ld_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _ld_inst_queue.pop(); } else if (!_st_inst_queue.empty()) { std::shared_ptr inst = _st_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _st_inst_queue.pop(); } else { - /* TMA is idle */ - 
_stat_tma_idle_cycle++; + /* DMA is idle */ + _stat_dma_idle_cycle++; return; } } /* Generate memfetch */ - auto access_vec = _tma.get_memory_access(); + auto access_vec = _dma.get_memory_access(_core_cycle, _config.icnt_injection_ports_per_core); for (auto access : *access_vec) { access->set_start_cycle(_core_cycle); _request_queue.push(access); } - /* Increase tma stat cycle */ - _stat_tma_cycle++; + /* Increase dma stat cycle */ + _stat_dma_cycle++; } void Core::cycle() { @@ -218,20 +210,20 @@ void Core::cycle() { /* Check another MOVIN with same tag is issued */ auto& key = inst->get_tag_id(); if (inst->is_sparse_inst()) { - _tma.register_tag(inst->subgraph_id, key); - _tma.set_tag_sparse(inst->subgraph_id, key); + _dma.register_tag(inst->subgraph_id, key); + _dma.set_tag_sparse(inst->subgraph_id, key); finish_instruction(inst); issued = true; _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; - } else if (inst->is_async_dma() && _tma.tag_key_exist(inst->subgraph_id, key)) { - bool finished = _tma.get_tag_finish(inst->subgraph_id, key); + } else if (inst->is_async_dma() && _dma.tag_key_exist(inst->subgraph_id, key)) { + bool finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished) finish_instruction(inst); else - _tma.register_tag_waiter(inst->subgraph_id, key, inst); - spdlog::trace("[Core {}][{}] {} SKIPPED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + _dma.register_tag_waiter(inst->subgraph_id, key, inst); + spdlog::trace("[{}][Core {}][SKIPPED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), + inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -240,8 +232,8 @@ void Core::cycle() { 
_stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; } else { - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -252,8 +244,12 @@ void Core::cycle() { } } case Opcode::MOVOUT: - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), + inst->get_addr_name(), + fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); _st_inst_queue.push(inst); issued = true; break; @@ -269,13 +265,14 @@ void Core::cycle() { inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle; inst->bubble_cycle = bubble_cycle; } + if (inst->get_compute_cycle() == 0) { inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; instructions.erase(it); } else { - spdlog::trace("[Core {}][SA {}][{}] {}-{} ISSUED, finsh at {}", _id, _systolic_array_rr, _core_cycle, + spdlog::trace("[{}][Core {}][INST_ISSUED][SA {}] {}-{}, finish at {}", _core_cycle, _id, _systolic_array_rr, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle); target_pipeline.push(inst); issued =
true; @@ -288,7 +285,7 @@ void Core::cycle() { case Opcode::BAR: { auto& key = inst->get_tag_id(); - uint32_t finished = _tma.get_tag_finish(inst->subgraph_id, key); + uint32_t finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished == -1) { for (auto child_inst : inst->get_child_inst()) { if (child_inst->get_opcode() == Opcode::COMP && child_inst->get_compute_type() == MATMUL) { @@ -297,12 +294,12 @@ void Core::cycle() { } finish_instruction(inst); } else if (finished != 0) { - _tma.mark_tag_used(inst->subgraph_id, key); + _dma.mark_tag_used(inst->subgraph_id, key); finish_instruction(inst); } else { - _tma.register_tag_waiter(inst->subgraph_id, key, inst); + _dma.register_tag_waiter(inst->subgraph_id, key, inst); } - spdlog::trace("[Core {}][{}] {} ISSUED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -344,31 +341,26 @@ void Core::cycle() { } void Core::finish_instruction(std::shared_ptr& inst) { - size_t free_sram_size = inst->get_free_sram_size(); if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already finished!!", _id, _core_cycle, + spdlog::error("[{}][Core {}][ERROR] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::trace("[Core {}][{}] {}-{} FINISHED, Used sram: {}, Release sram: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), - _used_sram_size, inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}][INST_FINISHED] {}-{}", + _core_cycle, _id, 
opcode_to_string(inst->get_opcode()), inst->get_compute_type()); } else if (inst->get_opcode() != Opcode::BAR && inst->is_async_dma()){ - spdlog::trace("[Core {}][{}] {} ASYNC REGISTERED, Used sram: {}, Release sram: {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), _used_sram_size, - inst->get_free_sram_size(), inst->subgraph_id, inst->get_addr_name(), + spdlog::trace("[{}][Core {}][ASYNC] {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->subgraph_id, inst->get_addr_name(), inst->get_tag_id(), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); } else if ((inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) && !inst->is_async_dma()) { - spdlog::trace("[Core {}][{}] {} FINISHED, free_sram_size: {} addr_name: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), - inst->get_addr_name()); + spdlog::trace("[{}][Core {}][INST_FINISHED] {} addr_name: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name()); } - //_used_sram_size -= free_sram_size; } bool Core::running() { @@ -378,7 +370,7 @@ bool Core::running() { for (int i=0; i<_num_systolic_array_per_core;i++) running = running || !_sa_compute_pipeline.at(i).empty(); running = running || !_dma_waiting_queue.empty() || !_dma_finished_queue.empty(); - running = running || !_tma.empty(); + running = running || !_dma.empty(); running = running || !_ld_inst_queue.empty(); running = running || !_st_inst_queue.empty(); return running; @@ -419,43 +411,62 @@ void Core::print_stats() { std::vector sa_utilization; update_stats(); spdlog::info("===== Instructions count ====="); - for (int i=0; i < static_cast(Opcode::COUNT); i++) { - if (i == static_cast(Opcode::COMP)) - 
spdlog::info("Core [{}] : {} inst count {} (GEMM: {}, Vector: {}), skipped inst count {}", _id, opcode_to_string(static_cast(i)), _stat_inst_count.at(i), _stat_gemm_inst, _stat_inst_count.at(i) - _stat_gemm_inst, _stat_tot_skipped_inst.at(i)); - else - spdlog::info("Core [{}] : {} inst count {}, skipped inst count {}", _id, opcode_to_string(static_cast(i)), _stat_inst_count.at(i), _stat_tot_skipped_inst.at(i)); + for (int i = 0; i < static_cast(Opcode::COUNT); i++) { + auto opcode = static_cast(i); + auto inst = _stat_inst_count.at(i); + auto skipped = _stat_tot_skipped_inst.at(i); + auto name = opcode_to_string(opcode); + + if (opcode == Opcode::COMP) { + auto gemm = _stat_gemm_inst; + auto vector = inst - gemm; + if (skipped) + spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {}), skipped inst_count {}", + _id, name, inst, gemm, vector, skipped); + else + spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {})", + _id, name, inst, gemm, vector); + } + else { + if (skipped) + spdlog::info("Core [{}] : {:8} inst_count {}, skipped inst_count {}", + _id, name, inst, skipped); + else + spdlog::info("Core [{}] : {:8} inst_count {}", + _id, name, inst); + } } spdlog::info("========= Core stat ========="); for (int i=0; i<_num_systolic_array_per_core; i++) sa_utilization.push_back(static_cast(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i), _stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i)); - float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq / (_core_cycle * 1000); // B/cycle - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", 
_id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle, dram_bw, _stat_tot_mem_response); - spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, + float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq_mhz / (_core_cycle * 1000); // B/cycle + spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response); + spdlog::info("Core [{}] : Vector unit utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, static_cast(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Numa hit count : {}, Numa miss count : {}", _id, _stat_numa_hit, _stat_numa_miss); - spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("Core [{}] : NUMA local memory: {} requests, remote memory: {} requests", _id, _stat_numa_local_access, _stat_numa_remote_access); + spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); } void Core::print_current_stats() { std::vector sa_utilization; for (int i=0; i<_num_systolic_array_per_core; i++) sa_utilization.push_back(static_cast(_stat_sa_compute_cycle.at(i) * 100) / _config.core_print_interval); - float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq / (_config.core_print_interval * 1000); // B/cycle + float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq_mhz / (_config.core_print_interval * 1000); // B/cycle auto level = spdlog::level::info; if(_id != 0) level = spdlog::level::debug; spdlog::info("========= Core stat ========="); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles
{}, idle_cycles {}", _id, i, sa_utilization.at(i), _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i)); - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tma_cycle, _stat_tma_idle_cycle, dram_bw, _stat_mem_response); - spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, + spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response); + spdlog::info("Core [{}] : Vector unit Utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, static_cast(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); update_stats(); } @@ -468,13 +479,13 @@ void Core::update_stats() { } _stat_tot_vu_compute_cycle += _stat_vu_compute_cycle; - _stat_tot_tma_cycle += _stat_tma_cycle; - _stat_tot_tma_idle_cycle += _stat_tma_idle_cycle; + _stat_tot_dma_cycle += _stat_dma_cycle; + _stat_tot_dma_idle_cycle += _stat_dma_idle_cycle; _stat_tot_mem_response += +_stat_mem_response; _stat_vu_compute_cycle = 0; - _stat_tma_cycle = 0; - _stat_tma_idle_cycle = 0; + _stat_dma_cycle = 0; + _stat_dma_idle_cycle = 0; _stat_vu_compute_idle_cycle = 0; _stat_mem_response = 0; } \ No newline at end of file diff --git a/TOGSim/src/DMA.cc b/TOGSim/src/DMA.cc new file mode 100644 index 00000000..f8f21025 --- /dev/null +++ b/TOGSim/src/DMA.cc @@ -0,0 +1,83 @@ +#include "DMA.h" +#include "TileGraph.h" + +DMA::DMA(uint32_t id, uint32_t dram_req_size) { + _id = id; + _dram_req_size = dram_req_size; + _current_inst = nullptr; + _finished = true; +} + +void DMA::issue_tile(std::shared_ptr inst) { + _current_inst = std::move(inst); + std::vector& tile_size = _current_inst->get_tile_size(); + if 
(tile_size.size() <= 0 || tile_size.size() > get_max_dim()) { + spdlog::error("[DMA {}] issued tile is not supported format..", _id); + exit(EXIT_FAILURE); + } + _finished = false; +} + +std::shared_ptr> DMA::get_memory_access(cycle_type core_cycle, int nr_req) { + + if (!_generated_once) { + std::shared_ptr> addr_set = + _current_inst->get_dram_address(_dram_req_size); + + Tile* owner = (Tile*)_current_inst->get_owner(); + std::shared_ptr owner_subgraph = owner->get_owner(); + unsigned long long base_daddr = _current_inst->get_base_dram_address(); + + bool is_cacheable = + owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); + + spdlog::trace("[{}][Core {}][SRAM] Address: 0x{:016x}, Is_cacheable: {}", + core_cycle, _id, base_daddr, is_cacheable); + spdlog::trace("[{}][Core {}][NUMA] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", + core_cycle, _id, owner_subgraph->get_core_id(), + _current_inst->get_numa_id(), _current_inst->get_addr_name(), + _current_inst->is_dma_write()); + for (const auto& addr : *addr_set) { + mem_access_type acc_type = + _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W + : mem_access_type::GLOBAL_ACC_R; + mf_type type = + _current_inst->is_dma_write() ? 
mf_type::WRITE_REQUEST + : mf_type::READ_REQUEST; + + mem_fetch* access = new mem_fetch( + addr, acc_type, type, _dram_req_size, + _current_inst->get_numa_id(), + static_cast(_current_inst.get())); + + access->set_cacheable(is_cacheable); + _current_inst->inc_waiting_request(); + _pending_accesses.push(access); + } + _generated_once = true; + } + + if (nr_req == -1) + nr_req = _pending_accesses.size(); + + // Return pending accesses up to nr_req + auto access_vec = std::make_shared>(); + for (int i = 0; i < nr_req; i++) { + if (_pending_accesses.empty()) + break; + access_vec->push_back(_pending_accesses.front()); + _pending_accesses.pop(); + } + + if (_pending_accesses.empty()) { + _finished = true; + _generated_once = false; + } + + return access_vec; +} + +uint32_t DMA::generate_mem_access_id() { + static uint32_t id_counter{0}; + return id_counter++; +} \ No newline at end of file diff --git a/PyTorchSimBackend/src/DelayQueue.cc b/TOGSim/src/DelayQueue.cc similarity index 100% rename from PyTorchSimBackend/src/DelayQueue.cc rename to TOGSim/src/DelayQueue.cc diff --git a/PyTorchSimBackend/src/Dram.cc b/TOGSim/src/Dram.cc similarity index 97% rename from PyTorchSimBackend/src/Dram.cc rename to TOGSim/src/Dram.cc index ab074bda..089c582e 100644 --- a/PyTorchSimBackend/src/Dram.cc +++ b/TOGSim/src/Dram.cc @@ -17,10 +17,10 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _n_bl = config.dram_nbl; _req_size = config.dram_req_size; _n_partitions = config.dram_num_partitions; - _n_ch_per_partition = _n_ch / _n_partitions; + _n_ch_per_partition = config.dram_channels_per_partitions; _config = config; - spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}", config.max_dram_bandwidth(), config.dram_freq, _n_ch, _req_size); + spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size); /* Initialize DRAM 
Channels */ for (int ch = 0; ch < _n_ch; ch++) { m_to_crossbar_queue.push_back(std::queue()); diff --git a/PyTorchSimBackend/src/Hashing.cc b/TOGSim/src/Hashing.cc similarity index 100% rename from PyTorchSimBackend/src/Hashing.cc rename to TOGSim/src/Hashing.cc diff --git a/PyTorchSimBackend/src/Instruction.cc b/TOGSim/src/Instruction.cc similarity index 100% rename from PyTorchSimBackend/src/Instruction.cc rename to TOGSim/src/Instruction.cc diff --git a/PyTorchSimBackend/src/Interconnect.cc b/TOGSim/src/Interconnect.cc similarity index 77% rename from PyTorchSimBackend/src/Interconnect.cc rename to TOGSim/src/Interconnect.cc index 8a684ff7..ab2d5d89 100644 --- a/PyTorchSimBackend/src/Interconnect.cc +++ b/TOGSim/src/Interconnect.cc @@ -4,12 +4,15 @@ SimpleInterconnect::SimpleInterconnect(SimulationConfig config) : _latency(config.icnt_latency) { _cycles = 0; _config = config; - _n_nodes = config.num_cores + config.dram_channels; + _n_nodes = config.num_cores * _config.icnt_injection_ports_per_core + config.dram_channels; _in_buffers.resize(_n_nodes); _out_buffers.resize(_n_nodes); _busy_node.resize(_n_nodes); + _rr_next_src.resize(_n_nodes); for(int node = 0; node < _n_nodes; node++) { _busy_node[node] = false; + _in_buffers.at(node).resize(_n_nodes); + _rr_next_src[node] = 0; } } @@ -19,35 +22,36 @@ bool SimpleInterconnect::running() { } void SimpleInterconnect::cycle() { - for(int node = 0; node < _n_nodes; node++) { - int src_node = (_rr_start + node ) % _n_nodes; - if(!_in_buffers[src_node].empty() && _in_buffers[src_node].front().finish_cycle <= _cycles) { - uint32_t dest = _in_buffers[src_node].front().dest; - if(!_busy_node[dest]) { - _out_buffers[dest].push(_in_buffers[src_node].front().access); - _in_buffers[src_node].pop(); - _busy_node[dest] = true; - // spdlog::trace("PUSH TO OUTBUFFER {} {}", src_node, dest); + for(int dest = 0; dest < _n_nodes; dest++) { + int src_start = _rr_next_src[dest]; + bool pushed = false; + + for(int i = 0; i < _n_nodes; 
i++) { + int src = (src_start + i) % _n_nodes; + + if (!_in_buffers[src][dest].empty() && + _in_buffers[src][dest].front().finish_cycle <= _cycles) { + + _out_buffers[dest].push(_in_buffers[src][dest].front().access); + _in_buffers[src][dest].pop(); + _rr_next_src[dest] = (src + 1) % _n_nodes; + pushed = true; + break; } } } - - for(int node = 0; node < _n_nodes; node++) { - _busy_node[node] = false; - } - _rr_start = (_rr_start + 1) % _n_nodes; _cycles++; } void SimpleInterconnect::push(uint32_t src, uint32_t dest, mem_fetch* request) { SimpleInterconnect::Entity entity; - if(_in_buffers[src].empty()) + if(_in_buffers[src][dest].empty()) entity.finish_cycle = _cycles + _latency; else - entity.finish_cycle = _in_buffers[src].back().finish_cycle + 1; + entity.finish_cycle = _in_buffers[src][dest].back().finish_cycle + 1; entity.dest = dest; entity.access = request; - _in_buffers[src].push(entity); + _in_buffers[src][dest].push(entity); } bool SimpleInterconnect::is_full(uint32_t nid, mem_fetch* request) { @@ -72,11 +76,11 @@ void SimpleInterconnect::pop(uint32_t nid) { Booksim2Interconnect::Booksim2Interconnect(SimulationConfig config) { _config = config; - _n_nodes = config.num_cores * _config.icnt_node_per_core + config.dram_channels; - spdlog::info("Initialize Booksim2"); + _n_nodes = config.num_cores * _config.icnt_injection_ports_per_core + config.dram_channels; + spdlog::info("Initialize Booksim2"); char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); std::string onnxim_path = onnxim_path_env != NULL? 
- std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); _config_path = fs::path(onnxim_path).append("configs").append((std::string)config.icnt_config_path).string(); spdlog::info("Config path : {}", _config_path); diff --git a/PyTorchSimBackend/src/L2Cache.cc b/TOGSim/src/L2Cache.cc similarity index 100% rename from PyTorchSimBackend/src/L2Cache.cc rename to TOGSim/src/L2Cache.cc diff --git a/PyTorchSimBackend/src/Simulator.cc b/TOGSim/src/Simulator.cc similarity index 90% rename from PyTorchSimBackend/src/Simulator.cc rename to TOGSim/src/Simulator.cc index 63bd3146..41a2c7a5 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -3,9 +3,9 @@ Simulator::Simulator(SimulationConfig config) : _config(config), _core_cycles(0) { // Create dram object - _core_period = 1000000 / (config.core_freq); - _icnt_period = 1000000 / (config.icnt_freq); - _dram_period = 1000000 / (config.dram_freq); + _core_period = 1000000 / (config.core_freq_mhz); + _icnt_period = 1000000 / (config.icnt_freq_mhz); + _dram_period = 1000000 / (config.dram_freq_mhz); _core_time = 0; _dram_time = 0; _icnt_time = 0; @@ -14,20 +14,20 @@ Simulator::Simulator(SimulationConfig config) _n_cores = config.num_cores; _n_memories = config.dram_channels; _memory_req_size = config.dram_req_size; - _noc_node_per_core = config.icnt_node_per_core; + _noc_node_per_core = config.icnt_injection_ports_per_core; char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); std::string onnxim_path = onnxim_path_env != NULL? 
- std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); // Create core objects _cores.resize(_n_cores); for (int core_index = 0; core_index < _n_cores; core_index++) { if (config.core_type[core_index] == CoreType::WS_MESH) { - spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB, Systolic array per core: {}", - core_index, config.core_freq , config.sram_size, config.num_systolic_array_per_core); + spdlog::info("[Config/Core] Core {}: {} MHz, Systolic array per core: {}", + core_index, config.core_freq_mhz, config.num_systolic_array_per_core); _cores.at(core_index) = std::make_unique(core_index, _config); } else if(config.core_type[core_index] == CoreType::STONNE) { - spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq); + spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq_mhz); _cores.at(core_index) = std::make_unique(core_index, _config); } else { throw std::runtime_error(fmt::format("Not implemented Core type {} ", @@ -51,7 +51,7 @@ Simulator::Simulator(SimulationConfig config) } // Create interconnect object - spdlog::info("[Config/Interconnect] Inerconnect freq: {} MHz", config.icnt_freq); + spdlog::info("[Config/Interconnect] Interconnect freq: {} MHz", config.icnt_freq_mhz); if (config.icnt_type == IcntType::SIMPLE) { spdlog::info("[Config/Interconnect] SimpleInerconnect selected"); _icnt = std::make_unique(config); @@ -62,7 +62,7 @@ Simulator::Simulator(SimulationConfig config) spdlog::error("[Configuration] Invalid interconnect type...!"); exit(EXIT_FAILURE); } - _icnt_interval = config.icnt_print_interval; + _icnt_interval = config.icnt_stats_print_period_cycles; // Initialize Scheduler for (int i=0; itop_memory_request(); front->set_core_id(core_id); if (!_icnt->is_full(port_id, front)) { - //int node_id = _dram->get_channel_id(front) / 16; - //if (core_id == node_id) - // 
_cores[core_id]->inc_numa_hit(); - //else - // _cores[core_id]->inc_numa_miss(); + int node_id = _dram->get_channel_id(front) / _config.dram_channels_per_partitions; + if (core_id == node_id) + _cores[core_id]->inc_numa_local_access(); + else + _cores[core_id]->inc_numa_remote_access(); _icnt->push(port_id , get_dest_node(front), front); _cores[core_id]->pop_memory_request(); _nr_from_core++; @@ -229,7 +229,7 @@ void Simulator::cycle() { if (IS_ICNT_CYCLE(_cycle_mask)) icnt_cycle(); } - spdlog::info("Simulation Finished"); + spdlog::info("Simulation finished"); for (auto &core: _cores) { core->check_tag(); } @@ -291,5 +291,5 @@ void Simulator::print_core_stat() for (int core_id = 0; core_id < _n_cores; core_id++) { _cores[core_id]->print_stats(); } - spdlog::info("Total execution cycle: {}", _core_cycles); -} \ No newline at end of file + spdlog::info("Total execution cycles: {}", _core_cycles); +} diff --git a/PyTorchSimBackend/src/SparseCore.cc b/TOGSim/src/SparseCore.cc similarity index 86% rename from PyTorchSimBackend/src/SparseCore.cc rename to TOGSim/src/SparseCore.cc index 64d3da55..d5629b9c 100644 --- a/PyTorchSimBackend/src/SparseCore.cc +++ b/TOGSim/src/SparseCore.cc @@ -27,14 +27,14 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) } Config stonneConfig = stonneCores.at(0)->getStonneConfig(); - unsigned int core_freq = config.core_freq; // MHz; + unsigned int core_freq_mhz = config.core_freq_mhz; // MHz; num_ms = stonneConfig.m_MSNetworkCfg.ms_size; r_port_nr = config.num_stonne_port; w_port_nr = config.num_stonne_port; - double compute_throughput = static_cast(num_ms) * core_freq / 1e3; // FLOPs/sec - double dn_bandwidth = static_cast(r_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s - double rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s + double compute_throughput = static_cast(num_ms) * core_freq_mhz / 1e3; // FLOPs/sec + double dn_bandwidth 
= static_cast(r_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s + double rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s for (int i=0; i tile) { } } if (selected_core_idx == -1) { - spdlog::error("[StonneCore {}] Faield to issue tile", _id); + spdlog::error("[StonneCore {}] Failed to issue tile", _id); exit(1); } stonneCores.at(selected_core_idx)->init(1); @@ -84,7 +84,7 @@ void SparseCore::issue(std::shared_ptr tile) { setTraceMode(selected_core_idx, is_trace_mode); percore_tiles.at(selected_core_idx).push_back(tile); coreBusy.at(selected_core_idx) = true; - spdlog::info("[StonneCore {}][{}] issued new tile (trace_mode: {})", _id, selected_core_idx, is_trace_mode); + spdlog::info("[{}][StonneCore {}/{}][Launch] New operation (trace_mode: {})", _core_cycle, _id, selected_core_idx, is_trace_mode); }; bool SparseCore::can_issue(const std::shared_ptr& op) { @@ -100,8 +100,8 @@ void SparseCore::checkStatus(uint32_t subcore_id) { int new_status = stonneCore->getMCFSMStats(); int compute_cycle = stonneCore->getMSStats().n_multiplications; if (traceCoreStatus.at(subcore_id) != new_status) { - spdlog::trace("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}", - _id, _core_cycle, traceCoreStatus.at(subcore_id), new_status, + spdlog::trace("[{}][StonneCore {}/{}][Transition] status {} -> {}, Load/Store: {}/{}, compute_cycle: {}", + _core_cycle, _id, subcore_id, traceCoreStatus.at(subcore_id), new_status, traceLoadTraffic.at(subcore_id).size(), traceStoreTraffic.at(subcore_id).size(), (compute_cycle - traceCoreCycle.at(subcore_id))/num_ms); if (traceLoadTraffic.at(subcore_id).size()) { TraceNode load_node = TraceNode(traceNodeList.at(subcore_id).size()+2, "load", TraceNode::StonneTraceLoad); @@ -151,14 +151,14 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { traceStoreTraffic.at(subcore_id).insert(target_addr); break; default: - 
spdlog::error("[SparseCore] Invalid request type from core"); + spdlog::error("[StonneCore] Invalid request type from core"); return; } req->request_time = _core_cycle; req->stonneId = subcore_id; std::tuple key = std::make_tuple(target_addr, acc_type, type, allocTrafficID()); registerMemfetch(key, [this, req, acc_type, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + spdlog::trace("[{}][StonneCore][DRAM Response] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ _core_cycle, _core_cycle - req->request_time, req->getAddress(), int(req->getcmd()), _config.dram_req_size); req->setReply(); stonneCores.at(req->stonneId)->pushResponse(req); @@ -168,7 +168,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { /* Finish stonne core */ if (coreBusy.at(subcore_id) && stonneCore->isFinished()) { stonneCore->finish(); - spdlog::info("[SparseCore][{}] Operation finished at {}", _id, _core_cycle); + spdlog::info("[{}][StonneCore {}/{}][Finish] Operation done", _core_cycle, _id, subcore_id); std::shared_ptr target_tile = percore_tiles.at(subcore_id).front(); SST_STONNE::StonneOpDesc *opDesc = static_cast(target_tile->get_custom_data()); if (opDesc->trace_path != "") @@ -239,7 +239,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_R; auto type = mf_type::READ_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -247,8 +247,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - 
spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -260,7 +260,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_W; auto type = mf_type::WRITE_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -268,8 +268,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -285,7 +285,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { inst->finish_cycle = _core_cycle + inst->get_compute_cycle(); else inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle(); - spdlog::trace("[Core {}][{}][{}] {} ISSUED, finsh at {}", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore 
{}/{}][INST_ISSUED] {}, finish at {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode()), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -313,7 +313,7 @@ void SparseCore::cycle() { for (auto& req_pair : request_merge_table) { _request_queue.push(req_pair.second); request_merge_table.erase(req_pair.first); - spdlog::debug("[SparseCore][{}][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ + spdlog::debug("[{}][StonneCore][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ _core_cycle, _id, req_pair.second->get_addr(), int(req_pair.second->get_access_type()), int(req_pair.second->get_type()), _config.dram_req_size, nr_request); nr_request++; @@ -366,9 +366,9 @@ void SparseCore::print_current_stats() { } cycle_type nr_mul = percore_stat.at(i).n_multiplications; percore_stat.at(i).reset(); - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } void SparseCore::print_stats() { @@ -383,9 +383,9 @@ void SparseCore::print_stats() { percore_total_stat.at(i) += percore_stat.at(i); } cycle_type nr_mul = percore_total_stat.at(i).n_multiplications; - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } std::shared_ptr SparseCore::pop_finished_tile() { @@ -399,18 +399,18 @@ std::shared_ptr SparseCore::pop_finished_tile() { void SparseCore::finish_instruction(std::shared_ptr& inst) { if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already
finished!!", _id, _core_cycle, + spdlog::error("[{}][StonneCore {}][Error] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::info("[StonneCore {}][{}] {} FINISHED", - _id, _core_cycle, opcode_to_string(inst->get_opcode())); + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", + _core_cycle, _id, opcode_to_string(inst->get_opcode())); } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) { - spdlog::info("[StonneCore {}][{}] {} FINISHED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode())); } } @@ -460,5 +460,5 @@ void SparseCore::dumpTrace(int stonne_core_id, const std::string& path) { outFile << traceNodeList.at(stonne_core_id)[i]; } outFile << "\n}" << std::endl; - spdlog::info("[StonneCore] Success to save trace dump file to \"{}\"", path); + spdlog::info("[{}][StonneCore] Success to save trace dump file to \"{}\"", _core_cycle, path); } diff --git a/PyTorchSimBackend/src/Tile.cc b/TOGSim/src/Tile.cc similarity index 100% rename from PyTorchSimBackend/src/Tile.cc rename to TOGSim/src/Tile.cc diff --git a/PyTorchSimBackend/src/TileGraph.cc b/TOGSim/src/TileGraph.cc similarity index 96% rename from PyTorchSimBackend/src/TileGraph.cc rename to TOGSim/src/TileGraph.cc index 33e995e9..120d49e2 100644 --- a/PyTorchSimBackend/src/TileGraph.cc +++ b/TOGSim/src/TileGraph.cc @@ -111,7 +111,6 @@ void TileGraph::allocate_subgraph(int core_id, int slot_id) { for (auto it = _subgraph_vec.begin(); it != _subgraph_vec.end(); ++it) { if ((*it)->get_core_id() == -1 || (*it)->get_core_id() == core_id) { - spdlog::trace("[TileGraph] Core {} allocated new subgraph(affinity={}) (remains: {})", 
core_id, (*it)->get_core_id(), _subgraph_vec.size()-1); std::shared_ptr subgraph = *it; _cpu_graph_map[core_id][slot_id] = subgraph; _subgraph_vec.erase(it); diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc similarity index 98% rename from PyTorchSimBackend/src/TileGraphParser.cc rename to TOGSim/src/TileGraphParser.cc index 4a562724..42776a51 100644 --- a/PyTorchSimBackend/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -627,9 +627,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa } } } - /* Set last instruction's free sram size */ - if(parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); parent->append_child(child); /* Create new tile */ @@ -682,11 +679,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa tile_vec.back()->inc_required_sram_size(inst->get_tile_numel() * inst->get_precision()); } - /* Set last instruction's free sram size */ - std::shared_ptr parent = tile_vec.back(); - if (parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); - return tile_vec; } diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.cc b/TOGSim/src/helper/CommandLineParser.cc similarity index 100% rename from PyTorchSimBackend/src/helper/CommandLineParser.cc rename to TOGSim/src/helper/CommandLineParser.cc diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.h b/TOGSim/src/helper/CommandLineParser.h similarity index 100% rename from PyTorchSimBackend/src/helper/CommandLineParser.h rename to TOGSim/src/helper/CommandLineParser.h diff --git a/PyTorchSimBackend/src/main.cc b/TOGSim/src/main.cc similarity index 95% rename from PyTorchSimBackend/src/main.cc rename to TOGSim/src/main.cc index 214e7131..1af11257 100644 --- a/PyTorchSimBackend/src/main.cc +++ b/TOGSim/src/main.cc @@ -9,7 +9,7 @@ namespace fs = std::filesystem; namespace po = 
boost::program_options; -const char* env_value = std::getenv("BACKENDSIM_DRYRUN"); +const char* env_value = std::getenv("TOGSIM_DRYRUN"); bool isDryRun = (env_value != nullptr && std::string(env_value) == "1"); void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) { @@ -38,7 +38,7 @@ int until(Simulator *simulator, cycle_type until_cycle) { void interactive_mode(Simulator* simulator) { std::string command; - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; while (std::getline(std::cin, command)) { std::istringstream iss(command); @@ -79,7 +79,7 @@ void interactive_mode(Simulator* simulator) { spdlog::error("Error: unknown command {} Available commands are: launch, until, quit.", token); } if (isDryRun) - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; } simulator->cycle(); if (simulator->get_core_cycle()==0) @@ -149,6 +149,6 @@ int main(int argc, char** argv) { /* Simulation time measurement */ auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; - spdlog::info("Simulation time: {:2f} seconds", duration.count()); + spdlog::info("Wall-clock time for simulation: {:2f} seconds", duration.count()); return 0; } diff --git a/PyTorchSimBackend/src/scheduler/Scheduler.cc b/TOGSim/src/scheduler/Scheduler.cc similarity index 100% rename from PyTorchSimBackend/src/scheduler/Scheduler.cc rename to TOGSim/src/scheduler/Scheduler.cc diff --git a/experiments/BERT.py b/experiments/BERT.py index 3534505d..c5bb454e 100644 --- a/experiments/BERT.py +++ b/experiments/BERT.py @@ -9,7 +9,7 @@ def run_BERT(size, input_seq, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request # from tests.test_transformer import EncoderBlock from 
tests.Fusion.test_transformer_fusion import EncoderBlock - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() hidden_dim = {'base': 768, 'large': 1024, 'xlarge': 2048} @@ -36,7 +36,7 @@ def run_BERT(size, input_seq, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def run_BERT(size, input_seq, config): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_BERT(size, input_seq, config) diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index a32cd0a6..28e6ad5e 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -export TORCHSIM_CONFIG=$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR diff --git 
a/experiments/artifact/cycle_validation/summary_cycle.py b/experiments/artifact/cycle_validation/summary_cycle.py index 529d0161..c0f48ac3 100644 --- a/experiments/artifact/cycle_validation/summary_cycle.py +++ b/experiments/artifact/cycle_validation/summary_cycle.py @@ -88,7 +88,7 @@ def compute_mae(errors): name = file[:-4] with open(full_path, errors="ignore") as f: for line in f: - match = re.search(r"Total execution cycle:\s*([0-9]+)", line) + match = re.search(r"Total execution cycles:\s*([0-9]+)", line) if match: cycle_map[name] = int(match.group(1)) break diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh index 7d0c0da2..2b9625e9 100755 --- a/experiments/artifact/speedup/run_speedup.sh +++ b/experiments/artifact/speedup/run_speedup.sh @@ -1,7 +1,7 @@ #!/bin/bash LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs -CONFIG_DIR="$TORCHSIM_DIR/PyTorchSimBackend/configs" -SIMULATOR_BIN="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +CONFIG_DIR="$TORCHSIM_DIR/TOGSim/configs" +SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" configs=( "systolic_ws_128x128_c2_simple_noc_tpuv3.json" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh index 66829f02..4055b355 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh @@ -26,7 +26,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh index 2f9718f1..83b3798a 100755 --- 
a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh @@ -27,7 +27,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh index 8ff7e2b6..f1467614 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh @@ -25,7 +25,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh index aa35735c..2ed3ca2a 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh @@ -33,7 +33,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/attention.py b/experiments/attention.py index e8f89dac..5a8c5f45 100644 --- a/experiments/attention.py +++ b/experiments/attention.py @@ -14,7 +14,7 @@ def attention(query, key, value): p_attn = scores.softmax(dim=-2) return torch.matmul(value.transpose(-1, 
-2), p_attn) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() query = torch.randn(size).to(device=device) key = torch.randn(size).to(device=device) @@ -36,7 +36,7 @@ def attention(query, key, value): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -50,7 +50,7 @@ def attention(query, key, value): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_attention(size, config) diff --git a/experiments/conv.py b/experiments/conv.py index e8b97906..c8ca9a37 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -15,7 +15,7 @@ def custom_conv2d(a, b, bias): conv2d.weight = torch.nn.Parameter(b) # conv2d.bias = torch.nn.Parameter(bias) return conv2d(a) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() conv_input = torch.randn(batch_size, i_c, i_h, 
i_w).to(memory_format=torch.channels_last, device=device) conv_kernel = torch.randn(o_c, i_c, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device) @@ -37,7 +37,7 @@ def custom_conv2d(a, b, bias): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def custom_conv2d(a, b, bias): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_conv2d(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], config) \ No newline at end of file diff --git a/experiments/gemm.py b/experiments/gemm.py index e7a639ad..67dc4f79 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -10,7 +10,7 @@ def run_matmul(input_size, hidden_size, output_size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request def custom_matmul(a, b): return torch.matmul(a, b) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() torch.manual_seed(0) input = torch.randn(input_size, hidden_size).to(device=device) @@ -31,7 +31,7 @@ def custom_matmul(a, b): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', 
default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -45,8 +45,8 @@ def custom_matmul(a, b): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] from Scheduler.scheduler import PyTorchSimRunner module = PyTorchSimRunner.setup_device() diff --git a/experiments/layernorm.py b/experiments/layernorm.py index f149394e..0beaac6c 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -8,7 +8,7 @@ def run_layernorm(size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) opt_fn = torch.compile(dynamic=False)(torch.nn.LayerNorm(size[-1]).to(device=device)) @@ -27,7 +27,7 @@ def run_layernorm(size, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = 
config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -42,7 +42,7 @@ def run_layernorm(size, config): os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_layernorm(size, config) diff --git a/experiments/resnet18.py b/experiments/resnet18.py index 5d9dcf86..23d62e40 100644 --- a/experiments/resnet18.py +++ b/experiments/resnet18.py @@ -8,7 +8,7 @@ def run_resnet(batch, config): from torchvision.models import resnet18 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet18().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del 
os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/resnet50.py b/experiments/resnet50.py index bd52afc1..60a46071 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -8,7 +8,7 @@ def run_resnet(batch, config): from torchvision.models import resnet50 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet50().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/softmax.py b/experiments/softmax.py index 14d28fee..532ef091 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -8,7 +8,7 @@ def run_softmax(size, config, dim=1): from Scheduler.scheduler import 
Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) opt_fn = torch.compile(dynamic=False)(torch.nn.Softmax(dim=dim).to(device=device)) @@ -27,7 +27,7 @@ def run_softmax(size, config, dim=1): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -41,7 +41,7 @@ def run_softmax(size, config, dim=1): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_softmax(size, config) diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh index 469cf766..22118b1e 100644 --- a/scripts/CompilerOpt_experiment/DMAopt.sh +++ b/scripts/CompilerOpt_experiment/DMAopt.sh @@ -1,5 +1,5 @@ #!/bin/bash -export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" +export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" # None FG DMA export TORCHSIM_SUBTILE=0 diff --git a/scripts/build_from_source.sh b/scripts/build_from_source.sh index 
d9806069..fb9e82e3 100644 --- a/scripts/build_from_source.sh +++ b/scripts/build_from_source.sh @@ -6,7 +6,7 @@ cd $home apt -y update && apt -y upgrade && apt -y install scons git clone https://github.com/PSAL-POSTECH/gem5.git cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) -export GEM5_PATH=$home/gem5/release/gem5.opt +export GEM5_PATH=$home/gem5/build/RISCV/gem5.opt cd $home # LLVM diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh index 3dfba3d9..2989e4fd 100755 --- a/scripts/chiplet.sh +++ b/scripts/chiplet.sh @@ -14,16 +14,16 @@ fi GEMM_PATH="$1" INDEX_NAME="$2" -SIMULATOR_PATH="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +SIMULATOR_PATH="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" GEMM_DIR_NAME=$(basename "$GEMM_PATH") echo "GEMM Directory Name: $GEMM_DIR_NAME" CONFIG_LIST=( - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" ) CONFIG_LIST2=( - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" ) shift shift @@ -51,7 +51,7 @@ for CONFIG in "${CONFIG_LIST[@]}"; do # Run Simulator echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done done @@ -65,6 +65,6 @@ for CONFIG in "${CONFIG_LIST2[@]}"; do # Run Simulator # echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list 
"$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done wait \ No newline at end of file diff --git a/scripts/end2end.sh b/scripts/end2end.sh index 7ca5c93d..579b8c14 100755 --- a/scripts/end2end.sh +++ b/scripts/end2end.sh @@ -7,34 +7,34 @@ BASE_PATH=$1 # Input as the first argument total_sum=0 total_core=0 total_vector=0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - # echo "Processing folder: $backend_folder" +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + # echo "Processing folder: $togsim_folder" - # Find all files within the backendsim_result folder - mapfile -t files < <(find "$backend_folder" -type f) + # Find all files within the togsim_result folder + mapfile -t files < <(find "$togsim_folder" -type f) for file in "${files[@]}"; do # echo "Processing $file" - # Extract the last line containing "Total cycle" - total_cycle=$(grep "Total cycle" "$file" | tail -n 1 | sed -E 's/.*Total cycle ([0-9]+).*/\1/') + # Extract the last line containing "Total_cycles" + total_cycle=$(grep "Total_cycles" "$file" | tail -n 1 | sed -E 's/.*Total_cycles ([0-9]+).*/\1/') # echo "total_cycle: $total_cycle" - active_cycles=($(grep -o 'active cycle [0-9]*' "$file" | awk '{print $3}')) + active_cycles=($(grep -o 'active_cycles [0-9]*' "$file" | awk '{print $3}')) num_cycles=${#active_cycles[@]} if [ "$num_cycles" -ge 3 ]; then core_cycle=${active_cycles[$((num_cycles-3))]} else - 
echo "Error: cannot find core active cycle" + echo "Error: cannot find core active_cycles" fi if [[ "$num_cycles" -ge 1 ]]; then - # Extract the last two active cycles + # Extract the last two active_cycless vector_core_cycle=${active_cycles[$((num_cycles-1))]} else - echo "Error: cannot find vector core active cycle" + echo "Error: cannot find vector core active_cycles" fi echo "file: $file total_cycle: $total_cycle SA core_cycle: $core_cycle vector_core_cycle: $vector_core_cycle" diff --git a/scripts/get_tog_result.sh b/scripts/get_tog_result.sh index 9359e1e5..6fd399e0 100755 --- a/scripts/get_tog_result.sh +++ b/scripts/get_tog_result.sh @@ -3,8 +3,8 @@ total_cycles=0 # Read through input stream line by line while IFS= read -r line; do - # Check if the line contains both "[BackendSimulator]" and "stored" - if [[ "$line" == *"[BackendSimulator]"* && "$line" == *"stored"* ]]; then + # Check if the line contains both "[TOGSimulator]" and "stored" + if [[ "$line" == *"[TOGSimulator]"* && "$line" == *"stored"* ]]; then # Extract the file path from the line file_path=$(echo "$line" | sed -n 's/.*stored to "\(.*\)"$/\1/p') diff --git a/scripts/sim_time.sh b/scripts/sim_time.sh index 15c60736..95df5982 100755 --- a/scripts/sim_time.sh +++ b/scripts/sim_time.sh @@ -6,15 +6,15 @@ BASE_PATH=$1 # Input as the first argument # Initialize total_sum as string for awk processing total_sum=0.0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - mapfile -t files < <(find "$backend_folder" -type f) +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + mapfile -t files < <(find "$togsim_folder" -type f) for file in "${files[@]}"; do - sim_time=$(grep "Simulation 
time:" "$file" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+(\.[0-9]+)?).*/\1/') + sim_time=$(grep "Wall-clock time for simulation:" "$file" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+(\.[0-9]+)?).*/\1/') echo "file: $file total_cycle: $sim_time" if [[ -n "$sim_time" ]]; then diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh index 0b7bc6f5..94e00527 100755 --- a/scripts/sparsity_experiment/run.sh +++ b/scripts/sparsity_experiment/run.sh @@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8 export TORCHSIM_FORCE_TIME_N=8 OUTPUT_DIR="12GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -13,7 +13,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -21,7 +21,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB" -export 
TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="12GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -45,7 +45,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py 
--sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py index 2f184f4c..be30795b 100644 --- a/scripts/stonne_experiment2/tog_gen.py +++ b/scripts/stonne_experiment2/tog_gen.py @@ -5,7 +5,7 @@ from collections import defaultdict sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) from AsmParser.tog_generator import tog_generator -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def extract_simulation_stats(result_path): @@ -19,9 +19,9 @@ def extract_simulation_stats(result_path): for line in lines: if "nr_multiplications" in line: nr_multiplications = line.strip().split(":")[-1].strip() - elif "Total execution cycle" in line: + elif "Total execution cycles" in line: total_cycle = line.strip().split(":")[-1].strip() - elif "Simulation time" in line: + elif "Wall-clock time for simulation" in line: sim_time = line.strip().split(":")[-1].replace("seconds", "").strip() return nr_multiplications, total_cycle, sim_time @@ -71,9 +71,9 @@ def extract_simulation_stats(result_path): if "outerPro" in path: continue tog_path = os.path.join(path, "tile_graph.onnx") - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json' - backsim = 
BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_validation_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = backsim.simulation(tog_path) nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path) sim_time, total_cycle = float(sim_time), int(total_cycle) diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py index cf0dc1bb..c32b4364 100644 --- a/tests/test_compile_overhead.py +++ b/tests/test_compile_overhead.py @@ -21,7 +21,7 @@ # shutil.rmtree("/tmp/torchinductor") #except FileNotFoundError: # print("no cache") - scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_hetro.py b/tests/test_hetro.py index 5e36d730..557ea5d6 100644 --- a/tests/test_hetro.py +++ b/tests/test_hetro.py @@ -26,7 +26,7 @@ def custom_matmul(a, b): K = args.K sparsity = args.sparsity mode = args.mode - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" print("M: ", M) print("N: ", N) @@ -36,7 +36,7 @@ def custom_matmul(a, b): with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - 
backend_config=config_path) + togsim_config=config_path) # Register compiled model opt_model1 = torch.compile(custom_matmul) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index c64093a0..91bf0ad8 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -7,13 +7,13 @@ base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -config = f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' target_model1 = model1().eval() target_model2 = model2(768, 12).eval() # Init scheduler -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py index f3b54159..5a34d161 100644 --- a/tests/test_scheduler_batching.py +++ b/tests/test_scheduler_batching.py @@ -17,7 +17,7 @@ target_model1 = model1().eval() # Init scheduler - scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = 
torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py index 1cf0d3b3..c7abf0ae 100644 --- a/tests/test_spmm_scheduler.py +++ b/tests/test_spmm_scheduler.py @@ -25,7 +25,7 @@ output_size = args.output_size w1_sparsity = args.w1_sparsity w2_sparsity = args.w2_sparsity - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" print("batch_size: ", batch_size) print("input_size: ", input_size) @@ -37,7 +37,7 @@ with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - backend_config=config_path) + togsim_config=config_path) target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval() target_model2 = model2(768, 12).eval() diff --git a/tutorial/session1/HelloPyTorchSim.ipynb b/tutorial/session1/HelloPyTorchSim.ipynb new file mode 100644 index 00000000..dfb086a4 --- /dev/null +++ b/tutorial/session1/HelloPyTorchSim.ipynb @@ -0,0 +1,1216 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hello, PyTorchSim!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import os\n", + "import sys\n", + "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "sys.path.append(base_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One Touch Simulation\n", + "### Normal Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "cpu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/ro/croutbd6yxrzgdstfcplx7yrpn2do5frwhyx2md5r7rvrubdhdgd.py\n", + "[Gem5] Gem5 is running... \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0\"\n" + ] + } + ], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", + " if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", + " message = f\"|{name} Test Passed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " else:\n", + " message = f\"|{name} Test Failed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " print(\"npu out: \", npu_out.cpu())\n", + " print(\"cpu out: \", cpu_out)\n", + " exit(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------\n", + "|MatMul Test Passed|\n", + "--------------------\n" + ] + } + ], + "source": [ + "test_result(\"MatMul\", npu_out, cpu_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# from Scheduler.scheduler import PyTorchSimRunner\n", + "# npu_device = PyTorchSimRunner.setup_device().custom_device()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normal Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + 
"source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "torch.manual_seed(0)\n", + "cpu_input = torch.randn(128, 128).to(device)\n", + "cpu_weight = torch.randn(128, 128).to(device)\n", + "cpu_target = torch.randn(128, 128).to(device)\n", + "cpu_input.requires_grad = True\n", + "cpu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "cpu_out = opt_fn(cpu_input, cpu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "cpu_loss = loss_fn(cpu_out, cpu_target)\n", + "cpu_loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/5i/c5isqyualxbaqsmuhsux7oubvkypfmh4kvamqvgref6z3ypnrpw5.py\n", + "[Gem5] Gem5 is running... \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/19\"\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. 
", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 15\u001b[0m\n\u001b[1;32m 13\u001b[0m loss_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mCrossEntropyLoss()\n\u001b[1;32m 14\u001b[0m npu_loss \u001b[38;5;241m=\u001b[39m loss_fn(npu_out, npu_target)\n\u001b[0;32m---> 15\u001b[0m \u001b[43mnpu_loss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:266\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 261\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 263\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 266\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 267\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + 
"\u001b[0;31mRuntimeError\u001b[0m: 0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. " + ] + } + ], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "npu_device = PyTorchSimRunner.setup_device().custom_device()\n", + "torch.manual_seed(0)\n", + "npu_input = torch.randn(128, 128).to(npu_device)\n", + "npu_weight = torch.randn(128, 128).to(npu_device)\n", + "npu_target = torch.randn(128, 128).to(npu_device)\n", + "npu_input.requires_grad = True\n", + "npu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(npu_input, npu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "npu_loss = loss_fn(npu_out, npu_target)\n", + "npu_loss.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'test_result' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtest_result\u001b[49m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Input Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_input\u001b[38;5;241m.\u001b[39mgrad, cpu_input\u001b[38;5;241m.\u001b[39mgrad)\n\u001b[1;32m 2\u001b[0m test_result(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Weight Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_weight\u001b[38;5;241m.\u001b[39mgrad, cpu_weight\u001b[38;5;241m.\u001b[39mgrad)\n", + "\u001b[0;31mNameError\u001b[0m: name 'test_result' is not defined" + ] + } + ], + "source": [ + "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n", + 
"test_result(\"MatMul Weight Grad\", npu_weight.grad, cpu_weight.grad)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mapping\n", + "\n", + "Default mapping is based on heuristic." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/5z/c5z4ur2k2svn2gaawn776ev3t6gsa7esgu36la63523cqpbbt56d.py\n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n" + ] + } + ], + "source": [ + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:53:14.002] [info] Total execution cycle: 47158\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual Mapping\n", + "User can set tile size manually." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/mv/cmv6cp7oo3wwndv76iv3sib7r74tnbvodfwxi3rw33k7grlh3h4h.py\n", + "[Gem5] Gem5 is running. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running... \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/75hiq5mugpq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "\n", + "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"1\"\n", + "os.environ['TORCHSIM_TILE_M']=\"512\"\n", + "os.environ['TORCHSIM_TILE_N']=\"512\"\n", + "os.environ['TORCHSIM_TILE_K']=\"512\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:54:00.878] [info] Total execution cycle: 53704\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Autotune" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Auto-tune] Trying tile size: [1024, 1024, 256, 128, 1024, 256]\n", + "[Auto-tune] Trying tile size: [256, 1024, 1024, 128, 1024, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 256, 1024, 128, 256, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 1024, 128, 128, 1024, 128]\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to 
\"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/0\"\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/7j33rcic2qn/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/7j33rcic2qn/togsim_result/0\"\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/vsaamplubl5/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/vsaamplubl5/togsim_result/0\"\n", + "[Auto-tune] Optimal tile size: [1024, 1024, 128, 128, 1024, 128], cycles: 46423\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/3b/c3bebp4b4rp73grbvhbaq4xdxny7f5m7fgqkgpflp2cjn3x5uugr.py\n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"0\"\n", + "os.environ['AUTOTUNE_TEMPLATE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:54:53.051] [info] Total execution cycle: 46422\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Execution Mode\n", + "### Functional & Timing mode (Default)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py\n", + "[Gem5] Gem5 is running.. 
\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/4\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "os.environ['AUTOTUNE_TEMPLATE']=\"0\"\n", + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Functional only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"0\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Timing only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 7\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 8\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m 
\u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour 
compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, 
inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m 
\u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"0\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim Configuration\n", + "### Single Core" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[22], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m 
\u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour 
compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, 
inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m 
\u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:32:01.843] [info] Total execution cycle: 47126\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/11 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multi-Core" + ] + }, + { + 
"cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12\"\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:34:48.969] [info] Total execution cycle: 40736\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim log level\n", + "### log level info" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 
7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in 
skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which 
autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 
885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in 
\u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, 
arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_DUMP_PATH']=\"/workspace/PyTorchSim\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### log level trace" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/togsim_result/1\"\n" + ] + } + ], + "source": [ + "os.environ['BACKENDSIM_DEBUG_LEVEL']=\"trace\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torchvision.models import resnet18\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n", + "from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_BACKEND_CONFIG\n", + "\n", + "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=CONFIG_TORCHSIM_BACKEND_CONFIG)\n", + "device = scheduler.execution_engine.module.custom_device()\n", + "\n", + "model = resnet18().eval()\n", + "input = torch.randn(1, 3, 224, 224).to(device=device)\n", + "opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))\n", + "\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_fn)\n", + "request = Request(\"resnet18\", [input], [], request_queue_idx=0)\n", + "scheduler.add_request(request, request_time=0)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " with torch.no_grad():\n", + " scheduler.schedule()\n", + "\n", + "print(\"ResNet18 Simulation Done\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch 
extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 13:05:13.597] [info] [LoadConfig] Success to open \"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 0: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 1: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 0: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 1: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] Ramulator2 config: /root/workspace/PyTorchSim/PyTorchSimBackend/configs/../configs/ramulator2_configs/HBM2.yaml\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] DRAM Bandwidth 716 GB/s, Freq: 700 MHz, Channels: 32, Request_size: 32B\n", + "[2025-11-30 13:05:13.597] [info] [Config/L2Cache] No L2 cache\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] Interconnect freq: 20000 MHz\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] SimpleInerconnect selected\n", + "[0] BackendSim> [Reqest] Resnet18 request time: 0\n", + "[Request issue] partition: 0 batch size: 1\n", + "[Request-0 issue] partition: 0 arrival_time: 0 start_time: 0.0\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py\n", + "launch /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx /tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0 0 0\n", + "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open \"/tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0\"\n", + "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open 
\"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg0 address: 0xa3056c0\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg1 address: 0xc4a3d40\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"systolic_size\": \"128\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"stonneGraph\": \"0\"\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Register graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 at 0\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Tile Graph FIFO Scheduled\n", + "until -1\n", + "[2025-11-30 13:05:22.117] [info] HBM2-CH_0: BW utilization 0% (0 reads, 0 writes)\n", + "[2025-11-30 13:05:22.319] [info] [Scheduler 0] Graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 finish at 2424\n", + "[2025-11-30 13:05:22.319] [info] Total compute time 2424\n", + "cycle\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 33\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# Run scheduler\u001b[39;00m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m scheduler\u001b[38;5;241m.\u001b[39mis_finished():\n\u001b[0;32m---> 33\u001b[0m \u001b[43mscheduler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:475\u001b[0m, in \u001b[0;36mScheduler.schedule\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 473\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcurrent_cycle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbackend_simulator\u001b[38;5;241m.\u001b[39mcycle()\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 475\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnext_time\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:507\u001b[0m, in \u001b[0;36mScheduler.run\u001b[0;34m(self, until_time)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m until_time \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mis_any_idle(req_empty_info):\n\u001b[0;32m--> 507\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mexecute_cycle\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 508\u001b[0m req_empty_info \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest_empty(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion)]\n\u001b[1;32m 509\u001b[0m \u001b[38;5;66;03m# if result is not -1, schedule new request\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:484\u001b[0m, in \u001b[0;36mScheduler.run..execute_cycle\u001b[0;34m()\u001b[0m\n\u001b[1;32m 482\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m 
\u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion):\n\u001b[1;32m 483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mpartition_state[i] \u001b[38;5;241m==\u001b[39m PyTorchSimRunner\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[0;32m--> 484\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_cycle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 485\u001b[0m launch_ret_info\u001b[38;5;241m.\u001b[39mappend(ret)\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcheck_finish_request()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:254\u001b[0m, in \u001b[0;36mPyTorchSimRunner.launch_kernel\u001b[0;34m(self, current_cycle, partion_idx)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[1;32m 253\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx]\n\u001b[0;32m--> 254\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartion_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;241m==\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING:\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:290\u001b[0m, in \u001b[0;36mFIFORunner.select_kernel\u001b[0;34m(self, partition_idx)\u001b[0m\n\u001b[1;32m 287\u001b[0m nested_gen \u001b[38;5;241m=\u001b[39m kernel(\u001b[38;5;241m*\u001b[39minputs)\n\u001b[1;32m 288\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnested_launch_model_dicts[partition_idx] \u001b[38;5;241m=\u001b[39m {req : nested_gen}\n\u001b[1;32m 289\u001b[0m kernel, inputs \u001b[38;5;241m=\u001b[39m \\\n\u001b[0;32m--> 290\u001b[0m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnested_launch_model_dicts\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_idx\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m kernel, inputs\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 293\u001b[0m \u001b[38;5;66;03m# Retry\u001b[39;00m\n", + "File \u001b[0;32m/tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py:227\u001b[0m, in \u001b[0;36mConv2D_1_3_224_22464_3_7_7_2_2_3_3_1_1_3\u001b[0;34m(X, W, Y)\u001b[0m\n\u001b[1;32m 224\u001b[0m W \u001b[38;5;241m=\u001b[39m W\u001b[38;5;241m.\u001b[39mpermute(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mcontiguous() \u001b[38;5;66;03m# (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;66;03m# Launch 
kernel\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[43mmlir_kernel_1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mW\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m (mlir_kernel_1, (X, W, Y))\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:307\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dryrun_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdryrun_simulator\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 307\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfilelock\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FileLock\n\u001b[1;32m 309\u001b[0m lock_dir \u001b[38;5;241m=\u001b[39m get_lock_dir()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/concurrent/futures/_base.py:453\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 453\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 455\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/threading.py:320\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 321\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "from torchvision.models import resnet18\n", + "\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator\n", + "CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "\n", + "lambda_requests = 10\n", + "max_time = 30\n", + "\n", + "target_model1 = resnet18().eval()\n", + "\n", + "# Init scheduler\n", + "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f\"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\")\n", + "# Register compiled model\n", + "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n", + "\n", + "# Generate time stamp\n", + "for 
request_time in poisson_request_generator(lambda_requests, max_time):\n", + " # Init input data\n", + " model_input1 = torch.randn(1, 3, 224, 224)\n", + "\n", + " # Init request\n", + " new_request1 = Request(\"resnet18\", [model_input1], [], request_queue_idx=0)\n", + "\n", + " # Add request to scheduler\n", + " print(\"[Reqest] Resnet18 request time: \", request_time, flush=True)\n", + " scheduler.add_request(new_request1, request_time=request_time)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " scheduler.schedule()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compiler Optimization\n", + "### GeMM + ReLU fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/vr/cvrlybtkuzkk6pmnlfxu7o55375z24tajmiow6mszaen5t4ra6zo.py\n", + "[Gem5] Gem5 is running. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/5o2xythi5z3/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/5o2xythi5z3/togsim_result/0\"\n" + ] + } + ], + "source": [ + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cat: /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0: No such file or directory\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Disable fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/tl/ctlqjsvukam6d4kteerml7exwbt4paw7cjtjbxcwdlsd7e4koriq.py\n", + "[Gem5] Gem5 is running... \n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/37dfo4nczcq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/37dfo4nczcq/togsim_result/0\"\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_COMPILER_OPTIMIZATION']=\"none\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 12:52:49.376] [info] Total execution cycle: 47164\n", + "[2025-11-30 12:52:52.444] [info] Total execution cycle: 58510\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/2 | grep \"Total execution cycle\"\n", + "!cat /tmp/torchinductor/tmp/37dfo4nczcq/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Single kernel mode (TODO: remove it?)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/sympy/core/assumptions.py:499\u001b[0m, in \u001b[0;36mmake_property..getit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_assumptions\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfact\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n", + "\u001b[0;31mKeyError\u001b[0m: 'extended_negative'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m model \u001b[38;5;241m=\u001b[39m resnet18()\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 9\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(model)\n\u001b[0;32m---> 10\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m 
\u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:655\u001b[0m, in \u001b[0;36mcatch_errors_wrapper..catch_errors\u001b[0;34m(frame, cache_entry, frame_state)\u001b[0m\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m hijacked_callback(frame, cache_entry, hooks, frame_state)\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_lock, _disable_current_modes():\n\u001b[0;32m--> 655\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mcallback\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:727\u001b[0m, in \u001b[0;36mconvert_frame.._convert_frame\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 725\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43minner_convert\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 729\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:383\u001b[0m, in \u001b[0;36mconvert_frame_assert.._convert_frame_assert\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 370\u001b[0m signpost_event(\n\u001b[1;32m 371\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 372\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_convert_frame_assert._compile\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 379\u001b[0m },\n\u001b[1;32m 380\u001b[0m )\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config\u001b[38;5;241m.\u001b[39mpatch(_patch_config_if_changed()):\n\u001b[0;32m--> 383\u001b[0m compiled_product \u001b[38;5;241m=\u001b[39m \u001b[43m_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_globals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_locals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_builtins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiler_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport_constraints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mframe_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mframe_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcompile_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_product\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:646\u001b[0m, in \u001b[0;36m_compile\u001b[0;34m(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_context(CompileContext(compile_id)):\n\u001b[1;32m 645\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 646\u001b[0m guarded_code \u001b[38;5;241m=\u001b[39m \u001b[43mcompile_inner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m guarded_code\n\u001b[1;32m 648\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 649\u001b[0m Unsupported,\n\u001b[1;32m 650\u001b[0m TorchRuntimeError,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 657\u001b[0m BisectValidationException,\n\u001b[1;32m 658\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:562\u001b[0m, in \u001b[0;36m_compile..compile_inner\u001b[0;34m(code, one_graph, hooks, transform)\u001b[0m\n\u001b[1;32m 560\u001b[0m CompileContext\u001b[38;5;241m.\u001b[39mget()\u001b[38;5;241m.\u001b[39mattempt \u001b[38;5;241m=\u001b[39m attempt\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 562\u001b[0m out_code \u001b[38;5;241m=\u001b[39m \u001b[43mtransform_code_object\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mRestartAnalysis \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py:1033\u001b[0m, in \u001b[0;36mtransform_code_object\u001b[0;34m(code, transformations, 
safe)\u001b[0m\n\u001b[1;32m 1030\u001b[0m instructions \u001b[38;5;241m=\u001b[39m cleaned_instructions(code, safe)\n\u001b[1;32m 1031\u001b[0m propagate_line_nums(instructions)\n\u001b[0;32m-> 1033\u001b[0m \u001b[43mtransformations\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstructions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcode_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1034\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m clean_and_assemble_instructions(instructions, keys, code_options)[\u001b[38;5;241m1\u001b[39m]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:151\u001b[0m, in \u001b[0;36mpreserve_global_state.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m cleanup \u001b[38;5;241m=\u001b[39m setup_compile_debug()\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 153\u001b[0m cleanup\u001b[38;5;241m.\u001b[39mclose()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:527\u001b[0m, in \u001b[0;36m_compile..transform\u001b[0;34m(instructions, code_options)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 526\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m tracing(tracer\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mtracing_context), tracer\u001b[38;5;241m.\u001b[39mset_current_tx():\n\u001b[0;32m--> 527\u001b[0m \u001b[43mtracer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 528\u001b[0m 
\u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mUnspecializeRestartAnalysis:\n\u001b[1;32m 529\u001b[0m speculation_log\u001b[38;5;241m.\u001b[39mclear()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2128\u001b[0m, in \u001b[0;36mInstructionTranslator.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 2127\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m-> 2128\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:818\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mpush_tx(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m (\n\u001b[1;32m 816\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minstruction_pointer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mshould_exit\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 819\u001b[0m ):\n\u001b[1;32m 820\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m BackendCompilerFailed:\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:781\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.step\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 777\u001b[0m unimplemented(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmissing: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minst\u001b[38;5;241m.\u001b[39mopname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 778\u001b[0m TracingContext\u001b[38;5;241m.\u001b[39mset_current_loc(\n\u001b[1;32m 779\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_filename, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlineno, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\n\u001b[1;32m 780\u001b[0m )\n\u001b[0;32m--> 781\u001b[0m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minst\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopname\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minst\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 783\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inst\u001b[38;5;241m.\u001b[39mopname \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 784\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m Unsupported:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2243\u001b[0m, in \u001b[0;36mInstructionTranslator.RETURN_VALUE\u001b[0;34m(self, inst)\u001b[0m\n\u001b[1;32m 2238\u001b[0m _step_logger()(\n\u001b[1;32m 2239\u001b[0m logging\u001b[38;5;241m.\u001b[39mINFO,\n\u001b[1;32m 2240\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorchdynamo done tracing 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (RETURN_VALUE)\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 2241\u001b[0m )\n\u001b[1;32m 2242\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE triggered compile\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 2243\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_subgraph\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2245\u001b[0m \u001b[43m \u001b[49m\u001b[43mreason\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mGraphCompileReason\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2246\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreturn_value\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mframe_summary\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph_break\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 2247\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2248\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_return_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 2249\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2250\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39madd_output_instructions([create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m)])\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:919\u001b[0m, in \u001b[0;36mOutputGraph.compile_subgraph\u001b[0;34m(self, tx, partial_convert, reason, compile_return_value)\u001b[0m\n\u001b[1;32m 916\u001b[0m append_prefix_insts()\n\u001b[1;32m 917\u001b[0m \u001b[38;5;66;03m# optimization to generate better code in a common case\u001b[39;00m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_output_instructions(\n\u001b[0;32m--> 919\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_and_call_fx_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mreversed\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstack_values\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroot\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 920\u001b[0m \u001b[38;5;241m+\u001b[39m [create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUNPACK_SEQUENCE\u001b[39m\u001b[38;5;124m\"\u001b[39m, arg\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(stack_values))]\n\u001b[1;32m 921\u001b[0m )\n\u001b[1;32m 922\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 923\u001b[0m graph_output_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_var(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgraph_out\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m 
\u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1087\u001b[0m, in \u001b[0;36mOutputGraph.compile_and_call_fx_graph\u001b[0;34m(self, tx, rv, root)\u001b[0m\n\u001b[1;32m 1084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtracing_context\u001b[38;5;241m.\u001b[39mfake_mode \u001b[38;5;241m=\u001b[39m backend_fake_mode\n\u001b[1;32m 1086\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrestore_global_state():\n\u001b[0;32m-> 1087\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_user_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1088\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m disable(compiled_fn)\n\u001b[1;32m 1090\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstats\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munique_graphs\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in 
\u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1140\u001b[0m, in \u001b[0;36mOutputGraph.call_user_compiler\u001b[0;34m(self, gm)\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mverify_correctness:\n\u001b[1;32m 1139\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m WrapperBackend(compiler_fn)\n\u001b[0;32m-> 1140\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1141\u001b[0m _step_logger()(logging\u001b[38;5;241m.\u001b[39mINFO, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdone 
compiler function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1142\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(compiled_fn), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompiler_fn did not return callable\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py:117\u001b[0m, in \u001b[0;36mwrap_backend_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 117\u001b[0m compiled_gm \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_gm\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/__init__.py:1662\u001b[0m, in \u001b[0;36m_TorchCompileInductorWrapper.__call__\u001b[0;34m(self, model_, inputs_)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_, inputs_):\n\u001b[1;32m 1660\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_inductor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompile_fx\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compile_fx\n\u001b[0;32m-> 1662\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompile_fx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mconfig_patches\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1168\u001b[0m, in \u001b[0;36mcompile_fx\u001b[0;34m(model_, example_inputs_, inner_compile, config_patches, decompositions)\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inference_compiler(unlifted_gm, example_inputs_)\n\u001b[1;32m 1165\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_fake_mode(fake_mode), torch\u001b[38;5;241m.\u001b[39m_guards\u001b[38;5;241m.\u001b[39mtracing(\n\u001b[1;32m 1166\u001b[0m tracing_context\n\u001b[1;32m 1167\u001b[0m ), compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m-> 1168\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43maot_autograd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[43mfw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43mbw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1172\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecompositions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecompositions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1173\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_fn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1174\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mkeep_inference_input_mutations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1175\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs_\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/backends/common.py:55\u001b[0m, in \u001b[0;36maot_autograd..compiler_fn\u001b[0;34m(gm, example_inputs)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# NB: NOT cloned!\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m enable_aot_logging(), patch_config:\n\u001b[0;32m---> 55\u001b[0m cg \u001b[38;5;241m=\u001b[39m \u001b[43maot_module_simplified\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot_autograd\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m disable(cg)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:887\u001b[0m, in \u001b[0;36maot_module_simplified\u001b[0;34m(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)\u001b[0m\n\u001b[1;32m 871\u001b[0m aot_config \u001b[38;5;241m=\u001b[39m AOTConfig(\n\u001b[1;32m 872\u001b[0m 
fw_compiler\u001b[38;5;241m=\u001b[39mfw_compiler,\n\u001b[1;32m 873\u001b[0m bw_compiler\u001b[38;5;241m=\u001b[39mbw_compiler,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 883\u001b[0m no_tangents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 884\u001b[0m )\n\u001b[1;32m 886\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m--> 887\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_aot_dispatcher_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 888\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunctional_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 889\u001b[0m \u001b[43m \u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 890\u001b[0m \u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 891\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;66;03m# TODO: There is something deeply wrong here; compiled_fn running with\u001b[39;00m\n\u001b[1;32m 894\u001b[0m \u001b[38;5;66;03m# the boxed calling convention, but aot_module_simplified somehow\u001b[39;00m\n\u001b[1;32m 895\u001b[0m \u001b[38;5;66;03m# historically returned a function that was not the boxed calling\u001b[39;00m\n\u001b[1;32m 896\u001b[0m \u001b[38;5;66;03m# convention. 
This should get fixed...\u001b[39;00m\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39mruntime_args):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:600\u001b[0m, in \u001b[0;36mcreate_aot_dispatcher_function\u001b[0;34m(flat_fn, flat_args, aot_config)\u001b[0m\n\u001b[1;32m 597\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m partial(aot_wrapper_dedupe, compiler_fn\u001b[38;5;241m=\u001b[39mcompiler_fn)\n\u001b[1;32m 598\u001b[0m \u001b[38;5;66;03m# You can put more passes here\u001b[39;00m\n\u001b[0;32m--> 600\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mfake_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m aot_config\u001b[38;5;241m.\u001b[39mis_export:\n\u001b[1;32m 602\u001b[0m mutated_user_inp_locs \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 603\u001b[0m idx \u001b[38;5;241m-\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 604\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m fw_metadata\u001b[38;5;241m.\u001b[39mmutated_inp_runtime_indices\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m idx \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 606\u001b[0m ]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:425\u001b[0m, in \u001b[0;36maot_wrapper_dedupe\u001b[0;34m(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ok:\n\u001b[0;32m--> 425\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mleaf_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(leaf_flat_args, fw_metadata):\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 429\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\\\u001b[39;00m\n\u001b[1;32m 430\u001b[0m \u001b[38;5;124;03mEncountered duplicate inputs that are mutated in the graph, but at least one input/output\u001b[39;00m\n\u001b[1;32m 431\u001b[0m \u001b[38;5;124;03mto the graph is a tensor subclass. This is not supported today. You can try to\u001b[39;00m\n\u001b[1;32m 432\u001b[0m \u001b[38;5;124;03mremove the aliasing yourself as a workaround, or otherwise file an issue on github.\"\"\"\u001b[39;00m\n\u001b[1;32m 433\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:630\u001b[0m, in \u001b[0;36maot_wrapper_synthetic_base\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)\u001b[0m\n\u001b[1;32m 628\u001b[0m \u001b[38;5;66;03m# Happy path: we don't need synthetic bases\u001b[39;00m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m synthetic_base_info \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 630\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 632\u001b[0m \u001b[38;5;66;03m# export path: ban synthetic bases for now, add later if requested.\u001b[39;00m\n\u001b[1;32m 633\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(flat_args, fw_metadata):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:295\u001b[0m, in 
\u001b[0;36maot_dispatch_autograd\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata)\u001b[0m\n\u001b[1;32m 292\u001b[0m tracing_context\u001b[38;5;241m.\u001b[39mfw_metadata \u001b[38;5;241m=\u001b[39m inner_meta\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m TracingContext\u001b[38;5;241m.\u001b[39mreport_output_strides() \u001b[38;5;28;01mas\u001b[39;00m fwd_output_strides:\n\u001b[0;32m--> 295\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m \u001b[43maot_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfw_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43madjusted_flat_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(compiled_fw_func, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 297\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m make_boxed_func(compiled_fw_func)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1100\u001b[0m, in \u001b[0;36mcompile_fx..fw_compiler_base\u001b[0;34m(model, example_inputs, is_inference)\u001b[0m\n\u001b[1;32m 1092\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m orig_output_end_idx \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m num_model_outputs\n\u001b[1;32m 1094\u001b[0m user_visible_outputs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 1095\u001b[0m n\u001b[38;5;241m.\u001b[39mname\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m n \u001b[38;5;129;01min\u001b[39;00m model_outputs[original_output_start_index:orig_output_end_idx]\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(n, torch\u001b[38;5;241m.\u001b[39mfx\u001b[38;5;241m.\u001b[39mNode)\n\u001b[1;32m 1098\u001b[0m }\n\u001b[0;32m-> 1100\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1101\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1102\u001b[0m \u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1103\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_fixed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfixed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1104\u001b[0m \u001b[43m \u001b[49m\u001b[43mcudagraphs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcudagraphs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1105\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mgraph_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgraph_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1106\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_inference\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_inference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1107\u001b[0m \u001b[43m \u001b[49m\u001b[43mboxed_forward_device_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforward_device\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1108\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_visible_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_visible_outputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1109\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py:83\u001b[0m, in \u001b[0;36mwrap_compiler_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 81\u001b[0m \u001b[38;5;66;03m# Call the compiler_fn - which is either aot_autograd or inductor\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;66;03m# with fake inputs\u001b[39;00m\n\u001b[0;32m---> 83\u001b[0m inner_compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 85\u001b[0m \u001b[38;5;66;03m# 
TODO: Failures here are troublesome because no real inputs,\u001b[39;00m\n\u001b[1;32m 86\u001b[0m \u001b[38;5;66;03m# need a different serialization strategy\u001b[39;00m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/debug.py:305\u001b[0m, in \u001b[0;36mDebugContext.wrap..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m DebugContext():\n\u001b[0;32m--> 305\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:320\u001b[0m, in \u001b[0;36mcompile_fx_inner\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 316\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m FxGraphCache\u001b[38;5;241m.\u001b[39mload(\n\u001b[1;32m 317\u001b[0m fx_codegen_and_compile, gm, example_inputs, graph_kwargs\n\u001b[1;32m 318\u001b[0m )\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m \u001b[43mfx_codegen_and_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 321\u001b[0m \u001b[43m \u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mgraph_kwargs\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[arg-type]\u001b[39;49;00m\n\u001b[1;32m 322\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 324\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFX codegen and compilation took \u001b[39m\u001b[38;5;132;01m%.3f\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m, time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start)\n\u001b[1;32m 326\u001b[0m \u001b[38;5;66;03m# Return the output strides to the caller via TracingContext\u001b[39;00m\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:535\u001b[0m, in \u001b[0;36mfx_codegen_and_compile\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 519\u001b[0m graph \u001b[38;5;241m=\u001b[39m GraphLowering(\n\u001b[1;32m 520\u001b[0m gm,\n\u001b[1;32m 521\u001b[0m \u001b[38;5;66;03m# example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 532\u001b[0m is_inference\u001b[38;5;241m=\u001b[39mis_inference,\n\u001b[1;32m 533\u001b[0m )\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_graph_handler(graph):\n\u001b[0;32m--> 535\u001b[0m \u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 536\u001b[0m output_strides: List[Optional[Tuple[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m]]] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m graph\u001b[38;5;241m.\u001b[39mgraph_outputs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;66;03m# We'll put the output strides in the compiled graph so we\u001b[39;00m\n\u001b[1;32m 539\u001b[0m \u001b[38;5;66;03m# can later return them to the caller via TracingContext\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:519\u001b[0m, in \u001b[0;36mGraphLowering.run\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;129m@dynamo_timed\u001b[39m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m--> 519\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/fx/interpreter.py:138\u001b[0m, in \u001b[0;36mInterpreter.run\u001b[0;34m(self, initial_env, enable_io_processing, *args)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 138\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv[node] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_node\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mextra_traceback:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:814\u001b[0m, in \u001b[0;36mGraphLowering.run_node\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 812\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayout_constraints\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 813\u001b[0m args, kwargs \u001b[38;5;241m=\u001b[39m layout_constraints[n\u001b[38;5;241m.\u001b[39mtarget](n, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 814\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_magic_method(n\u001b[38;5;241m.\u001b[39mtarget):\n\u001b[1;32m 816\u001b[0m \u001b[38;5;66;03m# TODO: this is sus, it probably should be handled in the\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;66;03m# lowerings themselves similarly to sym_size/sym-stride\u001b[39;00m\n\u001b[1;32m 818\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_magic_method\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:691\u001b[0m, in \u001b[0;36mGraphLowering.call_function\u001b[0;34m(self, target, args, kwargs)\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 690\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m via \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, lowerings[target])\n\u001b[0;32m--> 691\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mlowerings\u001b[49m\u001b[43m[\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 692\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 693\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_lowering.py:117\u001b[0m, in \u001b[0;36mconvolution\u001b[0;34m(x, weight, bias, stride, padding, dilation, transposed, output_padding, groups)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 116\u001b[0m mlir_template \u001b[38;5;241m=\u001b[39m MLIRConvTemplate([x, weight, bias], layout, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmlir_template\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39moutput_node()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_template.py:1189\u001b[0m, in \u001b[0;36mMLIRTemplate.generate\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 1184\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m patch\u001b[38;5;241m.\u001b[39mobject(V\u001b[38;5;241m.\u001b[39mgraph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mget_dtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fake_get_dtype(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node)):\n\u001b[1;32m 1185\u001b[0m kernel \u001b[38;5;241m=\u001b[39m MLIRTemplateKernel(kernel_name\u001b[38;5;241m=\u001b[39mkernel_name, input_nodes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minput_nodes, call_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayout\u001b[38;5;241m.\u001b[39msize, kernel_group\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1186\u001b[0m outer_func_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunction_name \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfunction_name\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1187\u001b[0m outer_func_render\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mouter_func_render \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mouter_func_render\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1188\u001b[0m kernel_arg_attributes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_arg_attributes() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mget_arg_attributes\u001b[39m\u001b[38;5;124m'\u001b[39m) 
\u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1189\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1191\u001b[0m kernel_hash_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmlir_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex_counter)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1192\u001b[0m extra_args \u001b[38;5;241m=\u001b[39m []\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py:238\u001b[0m, in \u001b[0;36mMLIRConvSingleBatchTemplate.render\u001b[0;34m(self, kernel, template_buffer_node, epilogue_nodes, tile_info, **kwargs)\u001b[0m\n\u001b[1;32m 229\u001b[0m kernel\u001b[38;5;241m.\u001b[39mepilogue_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\n\u001b[1;32m 230\u001b[0m output_node \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node\u001b[38;5;241m.\u001b[39mname,\n\u001b[1;32m 231\u001b[0m sram_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput_buffer\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 235\u001b[0m dim_aliasing \u001b[38;5;241m=\u001b[39m 
{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex0\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mc0\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex1\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_n\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex2\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mo_h\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex3\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_m\u001b[39m\u001b[38;5;124m\"\u001b[39m}\n\u001b[1;32m 236\u001b[0m )\n\u001b[1;32m 237\u001b[0m kernel\u001b[38;5;241m.\u001b[39mexception_nodes[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumel\u001b[39m\u001b[38;5;124m\"\u001b[39m : (I_W\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_W)\u001b[38;5;241m*\u001b[39m(I_H\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_H)\u001b[38;5;241m*\u001b[39mI_C\u001b[38;5;241m*\u001b[39mBATCH}\n\u001b[0;32m--> 238\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_template_from_string\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconv_template\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 239\u001b[0m 
kernel\u001b[38;5;241m.\u001b[39madd_loop_info([kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBATCH\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_C\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mI_C\u001b[39m\u001b[38;5;124m\"\u001b[39m]], [kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_M\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_N\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_K\u001b[39m\u001b[38;5;124m\"\u001b[39m]])\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m code\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/jinja2/environment.py:1299\u001b[0m, in \u001b[0;36mTemplate.render\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1296\u001b[0m ctx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_context(\u001b[38;5;28mdict\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n\u001b[1;32m 1298\u001b[0m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1299\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menvironment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconcat\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroot_render_func\u001b[49m\u001b[43m(\u001b[49m\u001b[43mctx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 1300\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1301\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menvironment\u001b[38;5;241m.\u001b[39mhandle_exception()\n", + "File \u001b[0;32m