diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 61eb96e1..eba48da2 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -6,7 +6,7 @@ on: jobs: build-and-test: - runs-on: ubuntu-latest + runs-on: self-hosted permissions: contents: read @@ -35,6 +35,7 @@ jobs: context: . file: ./Dockerfile push: true + no-cache: true tags: ghcr.io/psal-postech/torchsim-test:${{ github.sha }} # Step 4: Wait for GHCR propagation diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index bc356d85..32d6543c 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -674,6 +674,9 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Prepare volume directory + run: mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} + - name: Run run_cycle.sh run: | echo "Running run_cycle.sh" @@ -682,4 +685,14 @@ jobs: -e TORCHSIM_DUMP_PATH=/dump \ -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ - ${{ inputs.image_name }} PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh + ${{ inputs.image_name }} bash -c \ + "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && \ + cp PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out" + ls /tmp/torchsim-ci/${GITHUB_SHA} + + - name: Upload Accuracy Report Artifact + uses: actions/upload-artifact@v4 + with: + name: accuracy-report + path: /tmp/torchsim-ci/${{ github.sha }}/summary_cycle.out + if-no-files-found: error diff --git a/.gitignore b/.gitignore index 88eb2fb8..9decced5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ __pycache__/ -PyTorchSimBackend/build/ +TOGSim/build/ .vscode diff --git a/.gitmodules b/.gitmodules index f65e5f2b..24f9ccaf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,18 +1,15 @@ -[submodule "PyTorchSimBackend/extern/onnx"] 
- path = PyTorchSimBackend/extern/onnx +[submodule "TOGSim/extern/onnx"] + path = TOGSim/extern/onnx url = https://github.com/onnx/onnx.git -[submodule "PyTorchSimBackend/extern/protobuf"] - path = PyTorchSimBackend/extern/protobuf +[submodule "TOGSim/extern/protobuf"] + path = TOGSim/extern/protobuf url = https://github.com/protocolbuffers/protobuf.git -[submodule "PyTorchSimBackend/extern/booksim"] - path = PyTorchSimBackend/extern/booksim +[submodule "TOGSim/extern/booksim"] + path = TOGSim/extern/booksim url = https://github.com/PSAL-POSTECH/booksim.git -[submodule "PyTorchSimBackend/extern/torch2timeloop"] - path = PyTorchSimBackend/extern/torch2timeloop - url = https://github.com/Accelergy-Project/pytorch2timeloop-converter.git -[submodule "PyTorchSimBackend/extern/ramulator2"] - path = PyTorchSimBackend/extern/ramulator2 +[submodule "TOGSim/extern/ramulator2"] + path = TOGSim/extern/ramulator2 url = https://github.com/PSAL-POSTECH/ramulator2 -[submodule "PyTorchSimBackend/extern/stonneCore"] - path = PyTorchSimBackend/extern/stonneCore +[submodule "TOGSim/extern/stonneCore"] + path = TOGSim/extern/stonneCore url = https://github.com/PSAL-POSTECH/stonne_core.git diff --git a/Dockerfile b/Dockerfile index 293dcb60..37721940 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ FROM ghcr.io/psal-postech/torchsim_base:latest # Prepare PyTorchSim project COPY . /workspace/PyTorchSim -RUN cd PyTorchSim/PyTorchSimBackend && \ +RUN cd PyTorchSim/TOGSim && \ mkdir -p build && \ cd build && \ conan install .. --build=missing && \ diff --git a/Dockerfile.ksc2025 b/Dockerfile.ksc2025 new file mode 100644 index 00000000..2ac210e0 --- /dev/null +++ b/Dockerfile.ksc2025 @@ -0,0 +1,90 @@ +# Copyright (c) 2020 The Regents of the University of California +# All Rights Reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime + +# Copied from Gem5 Docker file +ENV DEBIAN_FRONTEND=noninteractive +RUN apt -y update && apt -y upgrade && \ + apt -y install build-essential git m4 scons zlib1g zlib1g-dev \ + libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \ + python3-dev python-is-python3 doxygen libboost-all-dev \ + libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \ + python3-venv black libssl-dev libasan5 libubsan1 +RUN pip install mypy pre-commit jupyter + +# Pass Access Token securely +ENV PATH=$PATH:/root/.local/bin +ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH + +# Build Gem5 +RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch TorchSim +RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) +ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt + +# Build LLVM RISC-V +RUN git clone https://github.com/PSAL-POSTECH/llvm-project.git --branch torchsim --depth 1 +RUN cd llvm-project && mkdir build && cd build && \ + cmake -DLLVM_ENABLE_PROJECTS=mlir -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/riscv-llvm -DLLVM_TARGETS_TO_BUILD=RISCV -G "Unix Makefiles" ../llvm && \ + make -j && make install + +# Store RISC-V LLVM for TorchSim +ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin +ENV TORCHSIM_LLVM_INCLUDE_PATH=/riscv-llvm/include +ENV TORCHSIM_DIR=/workspace/PyTorchSim +ENV LLVM_DIR=/riscv-llvm + +# Download RISC-V tool chain +RUN apt install -y wget && \ + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ + tar -zxvf riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && tar -zxvf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz
&& \ + rm *.tar.gz + +ENV RISCV=/workspace/riscv +ENV PATH=$RISCV/bin:$PATH + +# Install Spike simulator +RUN apt -y install device-tree-compiler +RUN git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \ + ../configure --prefix=$RISCV && make -j && make install + +# Install Proxy kernel +RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ + cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 && mkdir build && cd build && \ + ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install + +# Install torchsim dependency +RUN apt install -y ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 + +# Prepare ONNXim project +RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial +RUN cd PyTorchSim/TOGSim && \ + git submodule update --recursive --init && \ + mkdir -p build && \ + cd build && \ + conan install .. --build=missing && \ + cmake ..
&& \ + make -j$(nproc) \ No newline at end of file diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json deleted file mode 100644 index 8f196e81..00000000 --- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "core_type" : ["stonne", "ws_mesh"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json deleted file mode 100644 index c7ef15f7..00000000 --- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - 
"dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json deleted file mode 100644 index 2293e197..00000000 --- a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 8, - - "dram_type" : "ramulator2", - "dram_freq" : 700, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json deleted file mode 100644 index 08548638..00000000 --- a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 65536, - 
"core_print_interval" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 32, - - "dram_type" : "simple", - "dram_freq" : 1000, - "dram_channels": 1, - "dram_req_size": 32, - "dram_latency" : 100, - "dram_print_interval": 10000, - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json deleted file mode 100644 index 5d7b0d35..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - - "dram_type" : "ramulator2", - "dram_freq" :700, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json deleted file mode 100644 index 38acafc0..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 
10000, - - "dram_type" : "ramulator2", - "dram_freq" : 700, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 10000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json deleted file mode 100644 index 7348d5bc..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0": 0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json deleted file mode 100644 index 69ec8bd0..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - 
"num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0": 0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json deleted file mode 100644 index bff4e224..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1050, - "sram_size" : 16777216, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 4, - - "dram_type" : "ramulator2", - "dram_freq" :1200, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 19200, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json deleted file mode 100644 index b2661894..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "num_cores" : 2, - 
"core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_num_partitions" : 2, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - "icnt_print_interval" : 10000, - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json deleted file mode 100644 index 922ede5b..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_num_partitions" : 1, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json deleted file mode 100644 index 034542fe..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - - "dram_type" : "ramulator2", - "dram_freq" :700, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 20000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json deleted file mode 100644 index 82f42c00..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json deleted file mode 100644 index 132a52e6..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json deleted file mode 100644 index a93e8ae2..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1050, - "sram_size" : 32768, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 4, - - "dram_type" : "ramulator2", - "dram_freq" :1200, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 48000, - "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": 
{ - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json deleted file mode 100644 index e9a64f2e..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json deleted file mode 100644 index 37e18b35..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json deleted file mode 100644 index 
49225d77..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json deleted file mode 100644 index 4ea2c6ff..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json deleted file mode 100644 index 8aee751b..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 
100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json deleted file mode 100644 index f76fec32..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "core_type" : ["ws_mesh","ws_mesh"], - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m4.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json deleted file mode 100644 index 7571b830..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - 
"dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m8.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json deleted file mode 100644 index be163336..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/extern/torch2timeloop b/PyTorchSimBackend/extern/torch2timeloop deleted file mode 160000 index 62aa1754..00000000 --- a/PyTorchSimBackend/extern/torch2timeloop +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 62aa175421165cc9cd7dfb182a02fc3e26c01e3a diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc deleted file mode 100644 index 7744b0f5..00000000 --- a/PyTorchSimBackend/src/TMA.cc +++ /dev/null @@ -1,48 +0,0 @@ -#include "TMA.h" -#include "TileGraph.h" - -TMA::TMA(uint32_t id, uint32_t dram_req_size) { - _id = id; - _dram_req_size = dram_req_size; - _current_inst = nullptr; - _finished = true; -} - -void TMA::issue_tile(std::shared_ptr inst) { - _current_inst = std::move(inst); - std::vector& tile_size = _current_inst->get_tile_size(); - if (tile_size.size() <= 0 || tile_size.size() 
> get_max_dim()) { - spdlog::error("[TMA {}] issued tile is not supported format..", _id); - exit(EXIT_FAILURE); - } - _finished = false; -} - -std::shared_ptr> TMA::get_memory_access() { - auto addr_set = _current_inst->get_dram_address(_dram_req_size); - auto access_vec = std::make_shared>(); - Tile* owner = (Tile*)_current_inst->get_owner(); - std::shared_ptr owner_subgraph = owner->get_owner(); - unsigned long long base_daddr = _current_inst->get_base_dram_address(); - // Todo. We use a ternsor level buffer allocation, so we don't need to check all memfetch - bool is_cacheable = owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); - spdlog::trace("[SRAM Trace] Core-{}, Address: 0x{:016x}, Is_cacheable: {}", _id, base_daddr, is_cacheable); - spdlog::trace("[NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", - _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write()); - - for (auto addr: *addr_set) { - mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R; - mf_type type = _current_inst->is_dma_write() ? 
mf_type::WRITE_REQUEST : mf_type::READ_REQUEST; - mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, _current_inst->get_numa_id(), static_cast(_current_inst.get())); - access->set_cacheable(is_cacheable); - _current_inst->inc_waiting_request(); - access_vec->push_back(access); - } - _finished = true; - return access_vec; -} - -uint32_t TMA::generate_mem_access_id() { - static uint32_t id_counter{0}; - return id_counter++; -} \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index ca669361..577c45e9 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -7,7 +7,7 @@ from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen from PyTorchSimFrontend import extension_config -from Simulator.simulator import FunctionalSimulator, CycleSimulator, BackendSimulator +from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator LOCK_TIMEOUT = 600 @@ -27,21 +27,6 @@ def dump_metadata(args, arg_attributes, path): file.write(f'{arg_name}=({arg_attribute[0]}, {arg.dtype}, {arg.shape})\n') return -def parse_stack_sizes(file_path): - meta_path = file_path.split(".")[0]+".meta" - cmd = ["riscv64-unknown-elf-objcopy", "--dump-section", f".stack_sizes={meta_path}", file_path, "/dev/null"] - subprocess.run(cmd, check=True) - - with open(meta_path, 'rb') as f: - stack_sizes_data = list(f.read()) - if len(stack_sizes_data) <= 17: - raise ValueError("Invalid .stack_sizes section size") - - stack_size_bytes = stack_sizes_data[8:-9] - stack_size = int.from_bytes(stack_size_bytes, byteorder='little') - return stack_size - - def llvm_compile_command(input, output): opt_output = f"{input[:-3]}_opt.ll" return [re.sub(r"[ \n]+", " ", @@ -180,7 +165,7 @@ def load(cls, source_code, else: link_option = "" # Generate LLVM kernel calller and binary for validation - 
if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: + if extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: # Use custom malloc to avoid size error new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free" cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen) @@ -197,7 +182,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, arg_attributes) + val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, arg_attributes) val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name, new_link_option) @@ -228,7 +213,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return key # Generate MLIR kernel calller and binary for cycle calculation @@ -299,23 +284,23 @@ def dummy_simulator(*args, **kwargs): # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if not autotune and (extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate): + if not autotune and (extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE or validate): funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, vectorlane_size=vectorlane_size, spad_info=spad_info, cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS, silent_mode=silent_mode) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - backend_path = 
os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + backsim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG) backsim.vectorlane_size = vectorlane_size attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size) result_path = backsim.simulation(onnx_path, attribute_path, silent_mode=silent_mode) - result = BackendSimulator.get_result_from_file(result_path) + result = TOGSimulator.get_result_from_file(result_path) return result def dryrun_simulator(*args, **kwargs): @@ -329,11 +314,11 @@ def dryrun_simulator(*args, **kwargs): # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return # Todo. 
Support valude dependent mode for graph mode - if False: # extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: + if False: # extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, @@ -341,7 +326,7 @@ def dryrun_simulator(*args, **kwargs): cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) return result_path, runtime_path, None - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) and not autotune + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) and not autotune target_simulator = dryrun_simulator if is_dryrun else dummy_simulator target_simulator.arg_attributes = arg_attributes target_simulator.future = future diff --git a/PyTorchSimFrontend/extension_codegen_backend.py b/PyTorchSimFrontend/extension_codegen_backend.py deleted file mode 100644 index e569d251..00000000 --- a/PyTorchSimFrontend/extension_codegen_backend.py +++ /dev/null @@ -1,216 +0,0 @@ -import dataclasses -import contextlib -from typing import List -from typing import Dict -from torch._inductor.codegen import cpp, wrapper, common -from torch._inductor.scheduler import BaseScheduling -from torch._inductor.virtualized import V -from torch._inductor.utils import IndentedBuffer -import sympy - -cexpr = cpp.CppPrinter().doprint - -class ExtensionWrapperCodegen(wrapper.WrapperCodeGen): - def __init__(self): - super().__init__() - -class ExtensionOverrides(common.OpOverrides): - pass - -class ExtensionKernel(common.Kernel): - overrides = ExtensionOverrides - newvar_prefix = "auto " - suffix = ";" - - def __init__(self, args=None): - super().__init__(args) - self.call_ranges = None - self.ranges = None - self.itervars = None - self.reduction_depth = None - self.reduction_prefix = IndentedBuffer() - self.reduction_suffix = IndentedBuffer() - self.reduction_vars = {} - self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, 
name_prefix="tmp_acc") - - def load(self, name: str, index: sympy.Expr): - index = self.rename_indexing(index) - var = self.args.input(name) - line = f"{var}[{index}]" - dtype = V.graph.get_dtype(name) - self.cse.prefix = cpp.DTYPE_TO_CPP[dtype] + " " - return self.cse.generate(self.loads, line) - - def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): - index = self.rename_indexing(index) - var = self.args.output(name) - line = f"{var}[{index}] = {value}" - self.cse.generate(self.stores, line, assignment = False) - - def reduction(self, dtype, src_dtype, reduction_type, value): - argmax_or_argmin = reduction_type in {"argmax", "argmin"} - if argmax_or_argmin: - raise NotImplementedError() #TODO: argmin, argmax - else: - reduction_key = src_dtype, reduction_type, value - acc = self.reduction_cse.generate( - self.loads, f"reduction {reduction_key}", write=False - ) - self.reduction_vars[acc] = reduction_type - acc_type = cpp.reduction_acc_type(reduction_type, dtype) - self.reduction_prefix.writeline(f"{acc_type} {acc} = {cpp.reduction_init(reduction_type, dtype)};") - line = f"{acc} = {cpp.reduction_combine(reduction_type, acc, value)}" - self.cse.generate(self.stores, line, assignment = False) - self.reduction_cse.reduction_cache[reduction_key] = acc - return acc - - def store_reduction(self, name, index, value): - index = self.rename_indexing(index) - var = self.args.output(name) - self.reduction_suffix.writeline(f"{var}[{index}] = {value};")\ - - def codegen_loops(self): - code = common.BracesBuffer() - # Loop body part - loops = [LoopLevel(var, size) for var, size in zip(self.itervars, self.ranges)] - loops, reductions = [LoopNest(loops[: self.reduction_depth]), - LoopNest(loops[self.reduction_depth :])] - reductions.mark_reduction(self.reduction_vars) - - with contextlib.ExitStack() as stack: - loops.codegen(code, stack) - with contextlib.ExitStack() as stack_outer: - if self.reduction_prefix: - stack_outer.enter_context(code.indent()) - 
code.splice(self.reduction_prefix) - - with contextlib.ExitStack() as stack: - reductions.codegen(code, stack) - code.splice(self.loads) - code.splice(self.compute) - code.splice(self.stores) - code.splice(self.reduction_suffix) - return code - - def define_kernel(self, wrapper, src_code, kernel_name): - if src_code in wrapper.src_to_kernel: - kernel_name = wrapper.src_to_kernel[src_code] - else: - wrapper.src_to_kernel[src_code] = kernel_name - wrapper.define_kernel(kernel_name, src_code, cuda=False) - - def codegen_kernel(self, wrapper): - arg_defs, call_args, arg_types = self.args.cpp_argdefs() - arg_defs = ",\n".ljust(25).join(arg_defs) - arg_types = ",".join(arg_types) - code = common.BracesBuffer() - - # Todo. kernel name custom - kernel_name = f"Extensin_Kernel" - kernel_decl_name = kernel_name if V.graph.cpp_wrapper else "kernel" - code.writeline(f'extern "C" void {kernel_decl_name}({arg_defs})') - with code.indent(): - for old, new in self.args.aliases(): - code.writeline(f"auto {old} = {new};") - # Loop body part - code.splice(self.codegen_loops()) - - codecache_def = IndentedBuffer() - if not V.graph.cpp_wrapper: - codecache_def.writeline("async_compile.cpp('''") - codecache_def.splice(code) - if not V.graph.cpp_wrapper: - codecache_def.writeline("''')") - - self.define_kernel(wrapper, codecache_def.getvalue(), kernel_name) - # generate the code to call this - wrapper.generate_kernel_call(kernel_name, call_args, cuda=False) - print(code.getvalue()) - return code.getvalue() - - def set_ranges(self, lengths, reduction_lengths): - if self.call_ranges: - assert self.call_ranges == tuple(lengths) + tuple( - reduction_lengths - ), f"{self.call_ranges} == {tuple(lengths)} + {tuple(reduction_lengths)}" - assert self.reduction_depth == len(lengths) - else: - self.call_ranges = tuple(lengths) + tuple(reduction_lengths) - self.ranges = [self.rename_indexing(x) for x in self.call_ranges] - self.itervars = [sympy.Symbol(f"i{n}") for n in range(len(self.ranges))] - 
self.reduction_depth = len(lengths) - return ( - self.itervars[: self.reduction_depth], - self.itervars[self.reduction_depth :], - ) - -@dataclasses.dataclass -class LoopLevel: - var: sympy.Expr - size: sympy.Expr - reduction_vars: Dict[str, str] = None - - # Todo. Type change for reduction - INDEX_TYPE = "long" - def lines(self): - line = f"for({self.INDEX_TYPE} {self.var}=0; {self.var}<{cexpr(self.size)}; ++{self.var})" - return [line] - -@dataclasses.dataclass -class LoopNest: - loops: List[LoopLevel] - - def __bool__(self): - return bool(self.loops) - - def mark_reduction(self, reduction_vars): - for loop in self.loops: - loop.reduction_vars = reduction_vars - - def mark_parallel(self, par_depth): - loops = self.loops - loops[0].parallel = par_depth - for i in range(1, par_depth): - loops[i].collapsed = True - loops[0].simd = loops[par_depth - 1].simd - - def codegen(self, code, stack): - for loop in self.loops: - code.writelines(loop.lines()) - stack.enter_context(code.indent()) - -class ExtensionScheduling(BaseScheduling): - count = 0 - def __init__(self, scheduler): - self.scheduler = scheduler - self._scheduling = cpp.CppScheduling(scheduler) - - def can_fuse_vertical(self, node1, node2): - return False - - def can_fuse_horizontal(self, node1, node2): - return False - - def group_fn(self, sizes): - return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes) - - def codegen_nodes(self, nodes): - _, (group, reduction_group) = max( - nodes, key=lambda x: int(x.is_reduction()) - ).group - - ex_kernel = ExtensionKernel() - for node in nodes: - vars, reduction_vars = ex_kernel.set_ranges(group, reduction_group) - with ex_kernel: - node.run(vars, reduction_vars) - - wrapper = V.graph.wrapper_code - ex_kernel.codegen_kernel(wrapper) - pass - - def codegen_sync(self): - pass - - def flush(self): - self._scheduling.flush() \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 
fa5d22b5..3d6fbb76 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -3,73 +3,124 @@ import tempfile import importlib -# Hardware info config -CONFIG_VECTOR_LANE = int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128)) -CONFIG_VECTOR_LANE_STRIDE = int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2)) -CONFIG_SPAD_INFO = { - "spad_vaddr" : 0xD0000000, - "spad_paddr" : 0x2000000000, - "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane -} -CONFIG_PRECISION = 4 # 32bit -CONFIG_NUM_CORES = 1 -CONFIG_VLEN = 256 # 256bits / 32bits = 8 [elements] - -# Tile size config -CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - -CONFIG_TORCHSIM_DUMP_PATH = os.environ.get('TORCHSIM_DUMP_PATH', - default = f"{tempfile.gettempdir()}/torchinductor") -CONFIG_TORCHSIM_DUMP_FILE = int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) -CONFIG_TORCHSIM_VALIDATION_MODE = int(os.environ.get('TORCHSIM_VALIDATION_MODE', default=True)) -CONFIG_CLEANUP_DUMP_ARGS = int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) - -# LLVM PATH -CONFIG_TORCHSIM_LLVM_PATH = os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") -CONFIG_TORCHSIM_CUSTOM_PASS_PATH = os.environ.get('TORCHSIM_CUSTOM_PASS_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/GemminiLowerPass/build") -CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) -CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) - -# Backendsim config -CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG', - default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') -CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False)) -CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False)) -CONFIG_BACKENDSIM_DRYRUN = 
int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) -CONFIG_BACKENDSIM_DEBUG_LEVEL = os.environ.get("BACKENDSIM_DEBUG_LEVEL", "") - -# GEM5 config -CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") -CONFIG_GEM5_SCRIPT_PATH = os.environ.get('GEM5_SCRIPT_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/gem5_script/script_systolic.py") - -# AUTOTUNE config -CONFIG_AUTOTUNE = int(os.environ.get('AUTOTUNE', default=True)) -CONFIG_AUTOTUNE_TEMPLATE = int(os.environ.get('AUTOTUNE_TEMPLATE', default=True)) -CONFIG_MAX_AUTOTUNE_TRY = int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) -CONFIG_AUTOTUNE_TEMPLATE_TOPK = int(os.environ.get('AUTOTUNE_TEMPLATE_TOPK', default=4)) - -# For block sparse -CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0)) - -# For GEMM tile size -CONFIG_MANUAL_TILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False)) -CONFIG_TILE_M = int(os.environ.get('TORCHSIM_TILE_M', default=CONFIG_VECTOR_LANE)) -CONFIG_TILE_N = int(os.environ.get('TORCHSIM_TILE_N', default=CONFIG_VECTOR_LANE)) -CONFIG_TILE_K = int(os.environ.get('TORCHSIM_TILE_K', default=CONFIG_VECTOR_LANE)) -CONFIG_GEMM_CHEATSHEET_PATH = os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/validation/gemm_tpuv3_cheatsheet.json") -CONFIG_SUBTILE = int(os.environ.get('TORCHSIM_SUBTILE', default=True)) -CONFIG_MANUAL_SUBTILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False)) -CONFIG_SUBTILE_M = int(os.environ.get('TORCHSIM_SUBTILE_M', default=CONFIG_VECTOR_LANE)) -CONFIG_SUBTILE_N = int(os.environ.get('TORCHSIM_SUBTILE_N', default=CONFIG_VECTOR_LANE)) -CONFIG_SUBTILE_K = int(os.environ.get('TORCHSIM_SUBTILE_K', default=CONFIG_VECTOR_LANE)) - -# Advanced fusion options -CONFIG_FUSION_REDUCTION_EPILOGUE = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_EPILOGUE', default=True)) -CONFIG_FUSION_REDUCTION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_REDUCTION', 
default=True)) -CONFIG_FUSION_PROLOGUE = int(os.environ.get('TORCHSIM_FUSION_PROLOGUE', default=True)) +def __getattr__(name): + + # Hardware info config + if name == "CONFIG_VECTOR_LANE": + return int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128)) + if name == "CONFIG_VECTOR_LANE_STRIDE": + return int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2)) + if name == "CONFIG_SPAD_INFO": + return { + "spad_vaddr" : 0xD0000000, + "spad_paddr" : 0x2000000000, + "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane + } + if name == "CONFIG_PRECISION": + return 4 # 32bit + if name == "CONFIG_NUM_CORES": + return 1 + if name == "CONFIG_VLEN": + return 256 # 256bits / 32bits = 8 [elements] + + # Tile size config + if name == "CONFIG_TORCHSIM_DIR": + return os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') + + if name == "CONFIG_TORCHSIM_DUMP_PATH": + return os.environ.get('TORCHSIM_DUMP_PATH', default = f"{tempfile.gettempdir()}/torchinductor") + if name == "CONFIG_TORCHSIM_DUMP_FILE": + return int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) + if name == "CONFIG_TORCHSIM_FUNCTIONAL_MODE": + return int(os.environ.get('TORCHSIM_FUNCTIONAL_MODE', default=True)) + if name == "CONFIG_TORCHSIM_TIMING_MODE": + return int(os.environ.get("TORCHSIM_TIMING_MODE", True)) + if name == "CONFIG_CLEANUP_DUMP_ARGS": + return int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) + + # LLVM PATH + if name == "CONFIG_TORCHSIM_LLVM_PATH": + return os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") + if name == "CONFIG_TORCHSIM_CUSTOM_PASS_PATH": + return os.environ.get('TORCHSIM_CUSTOM_PASS_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/GemminiLowerPass/build") + if name == "CONFIG_TORCHSIM_DUMP_MLIR_IR": + return int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) + if name == "CONFIG_TORCHSIM_DUMP_LLVM_IR": + return int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) + + # 
TOGSim config + if name == "CONFIG_TOGSIM_CONFIG": + return os.environ.get('TORCHSIM_CONFIG', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json") + if name == "CONFIG_TOGSIM_EAGER_MODE": + return int(os.environ.get("TOGSIM_EAGER_MODE", default=False)) + if name == "CONFIG_TOGSIM_DRYRUN": + return int(os.environ.get('TOGSIM_DRYRUN', default=False)) + if name == "CONFIG_TOGSIM_DEBUG_LEVEL": + return os.environ.get("TOGSIM_DEBUG_LEVEL", "") + + # GEM5 config + if name == "CONFIG_GEM5_PATH": + return os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") + if name == "CONFIG_GEM5_SCRIPT_PATH": + return os.environ.get('GEM5_SCRIPT_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/gem5_script/script_systolic.py") + + # AUTOTUNE config + if name == "CONFIG_AUTOTUNE": + return int(os.environ.get('AUTOTUNE', default=False)) + if name == "CONFIG_AUTOTUNE_TEMPLATE": + return int(os.environ.get('AUTOTUNE_TEMPLATE', default=False)) + if name == "CONFIG_MAX_AUTOTUNE_TRY": + return int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) + if name == "CONFIG_AUTOTUNE_TEMPLATE_TOPK": + return int(os.environ.get('AUTOTUNE_TEMPLATE_TOPK', default=4)) + + # For block sparse + if name == "CONFIG_BLOCK_SPARSE": + return int(os.environ.get('BLOCK_SPARSE', default=0)) + + # For GEMM tile size + if name == "CONFIG_MANUAL_TILE_SIZE": + return int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False)) + if name == "CONFIG_TILE_M": + return int(os.getenv("TORCHSIM_TILE_M", __getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_TILE_N": + return int(os.getenv("TORCHSIM_TILE_N", __getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_TILE_K": + return int(os.getenv("TORCHSIM_TILE_K", __getattr__("CONFIG_VECTOR_LANE"))) + + if name == "CONFIG_SUBTILE": + return int(os.environ.get('TORCHSIM_SUBTILE', default=True)) + if name == "CONFIG_MANUAL_SUBTILE_SIZE": + return 
int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False)) + if name == "CONFIG_SUBTILE_M": + return int(os.environ.get('TORCHSIM_SUBTILE_M', default=__getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_SUBTILE_N": + return int(os.environ.get('TORCHSIM_SUBTILE_N', default=__getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_SUBTILE_K": + return int(os.environ.get('TORCHSIM_SUBTILE_K', default=__getattr__("CONFIG_VECTOR_LANE"))) + + if name == "CONFIG_GEMM_CHEATSHEET_PATH": + return os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/validation/gemm_tpuv3_cheatsheet.json") + # Compiler Optimization + if name == "CONFIG_COMPILER_OPTIMIZATION": + return os.environ.get('TORCHSIM_COMPILER_OPTIMIZATION', default="all") # options: all, none, custom + # Advanced fusion options + if name == "CONFIG_FUSION": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "fusion" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_REDUCTION_EPILOGUE": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "reduction_epliogue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_REDUCTION_REDUCTION": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "reduction_reduction" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_PROLOGUE": + return True if ((__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all") or ("prologue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION"))) else False + if name == "CONFIG_SINGLE_BATCH_CONV": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "single_batch_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_MULTI_TILE_CONV": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "multi_tile_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False # SRAM 
Buffer allocation plan def load_plan_from_module(module_path): @@ -96,3 +147,5 @@ def load_plan_from_module(module_path): CONFIG_TLS_MODE = int(os.environ.get('TORCHSIM_TLS_MODE', default=1)) CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0)) + +CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0)) \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py index 22a727c5..167544f2 100644 --- a/PyTorchSimFrontend/extension_op.py +++ b/PyTorchSimFrontend/extension_op.py @@ -13,7 +13,7 @@ from torch._inductor.codecache import write from PyTorchSimFrontend.extension_codecache import get_write_path from PyTorchSimFrontend import extension_config -from Simulator.simulator import BackendSimulator, TORCH_TO_NUMPY +from Simulator.simulator import TOGSimulator, TORCH_TO_NUMPY graph_template = { 0: { @@ -46,7 +46,7 @@ class MLIRExternKernelChoice(ExternKernelChoice): def call_name(self): - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) if is_dryrun: return f"yield from sparse_mm_dummy_stonne_outer" return f"torch.ops.extension_op.{self.name}" @@ -275,11 +275,11 @@ def prepare_outer_product_matrix(a, b, out): def sparse_mm_stonne_outer(a, b, out): onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out) - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json' - backsim = BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_single_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = 
backsim.simulation(onnx_path) - BackendSimulator.get_result_from_file(result_path) + TOGSimulator.get_result_from_file(result_path) # Load result data #with open(c_result_path, 'rb') as f: diff --git a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py deleted file mode 100644 index 3690f533..00000000 --- a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py +++ /dev/null @@ -1,236 +0,0 @@ -import os -import subprocess -import shlex -import re - -from torch._inductor.utils import IndentedBuffer -from torch._inductor.codegen import cpp -from torch._inductor.codecache import write_atomic - -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs - -class LLVMKernelCallerCodeGen(): - """ - Generate C that calls the llvm kernel. - """ - - def __init__(self, validation, arg_attributes): - super().__init__() - self.code = IndentedBuffer() - self.ending = ";" - self.open_bracket = "{" - self.closed_bracket = "}" - self.newline = "\n" - self.kernel_name = "kernel" - self.validation = validation - self.n_arg = len(arg_attributes) - self.arg_attributes = arg_attributes - self.arg_use_count = 1 - self.load_args = {} - self.kernel_start_addr = "" - self.kernel_end_addr = "" - - def get_argv_idx(self): - self.arg_use_count += 1 - return self.arg_use_count-1 - - def write_header(self): - self.writeline('#include ') - self.writeline('#include ') - self.writeline("#include ") - if self.validation: - self.writeline("#include ") - self.writeline('#include ') - self.writeline('#include ') - - def is_in_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_in(value) - - def is_out_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_out(value) - - def load_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_in_arg(arg_name): - argv_idx = self.get_argv_idx() if arg_name not in self.load_args else 
self.load_args[arg_name] - self.load_args[arg_name] = argv_idx - self.writeline(f'if(load_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def dump_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_out_arg(arg_name): - argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name] - self.writeline(f'if(dump_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def write_exit(self): - self.writeline(f'return 0{self.ending}') - - def generate_kernel_declare(self): - args_type_p = [f'{cpp.DTYPE_TO_CPP[arg_type[1]]}*' for arg_type in self.arg_attributes.values()] - - self.writeline(f"void {self.kernel_name}({', '.join(args_type_p)}){self.ending}{self.newline}") - - def generate_args_define(self): - for arg_name, (_, arg_type, arg_shape) in self.arg_attributes.items(): - self.writeline(f'{cpp.DTYPE_TO_CPP[arg_type]} {arg_name}[atoi(argv[{self.get_argv_idx()}])] __attribute__ ((aligned (4096))){self.ending}') - self.writeline(self.newline) - - def generate_load_dump_fn(self): - self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000000){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - 
self.writeline(self.closed_bracket) - - self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - self.writeline(self.closed_bracket) - - def generate_main(self): - self.writeline(f'{self.newline}int main(int argc, char *argv[]) {self.open_bracket}{self.newline}') - with self.code.indent(): - if self.validation: - self.load_arg() - self.writeline(self.newline) - - self.writeline(f"{self.kernel_name}({', '.join(list(self.arg_attributes))}){self.ending}{self.newline}") - - if self.validation: - self.dump_arg() - - self.write_exit() - self.writeline(self.closed_bracket) - - def writeline(self, line): - self.code.writeline(line) - - def generate_wrapper_file(self, path, name): - self.dump_path = path - - self.write_header() - self.generate_kernel_declare() - - if self.validation: - self.generate_load_dump_fn() - self.generate_main() - - write_path = os.path.join(path, name+".c",) - write_atomic(write_path, self.code.getvalue()) - return - - def add_extention(self, name, extension): - return name + "." 
+ extension - - def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): - main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) - main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) - kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) - kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) - - main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' - kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' - - target = os.path.join(write_path, binary_name) - link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' - - main_compile_cmd = shlex.split(main_compile) - kernel_compile_cmd = shlex.split(kernel_compile) - link_cmd = shlex.split(link) - - try: - subprocess.check_call(main_compile_cmd) - subprocess.check_call(kernel_compile_cmd) - subprocess.check_call(link_cmd) - except subprocess.CalledProcessError as e: - print("Command failed with exit code", e.returncode) - print("Error output:", e.output) - assert(0) - - def parse_stack_sizes(self, file_path, vlenb=256): - with open(file_path, 'r') as f: - stack_sizes_data = f.readlines() - - in_proc = False - stack_base = None - dynamic_expr = None - max_offset = 0 - - for line in stack_sizes_data: - line = line.strip() - if line.startswith(".cfi_startproc"): - in_proc = True - continue - elif line.startswith(".cfi_endproc") and in_proc: - if dynamic_expr: - total_stack = eval(dynamic_expr, {"vlenb": vlenb}) - return total_stack - elif stack_base: - return stack_base - else: - return max_offset - - # Skip outer function - if not in_proc: - continue - - if line.startswith(".cfi_def_cfa_offset"): - stack_base = int(line.split()[-1]) - - if ".cfi_escape" in line and "#" in line: - comment = line.split("#")[-1].strip() - m = re.search(r"sp \+ 
(\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) - if m: - base, scale = int(m.group(1)), int(m.group(2)) - dynamic_expr = f"{base} + {scale} * vlenb" - - def get_spad_size(self, binary_path): - cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if result.returncode != 0: - raise RuntimeError(f"Readelf error: {result.stderr}") - - output = result.stdout - spad_start = None - spad_end = None - for line in output.splitlines(): - if '.spad' in line and 'SECTION' in line: - parts = line.split() - spad_start = int(parts[1], 16) - elif 'spad_end' in line: - parts = line.split() - spad_end = int(parts[1], 16) - - if spad_start is None or spad_end is None: - return 0 - spad_size = spad_end - spad_start - return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_common.py b/PyTorchSimFrontend/llvm/llvm_common.py deleted file mode 100644 index 1c76b826..00000000 --- a/PyTorchSimFrontend/llvm/llvm_common.py +++ /dev/null @@ -1,304 +0,0 @@ -import torch -from torch._inductor.codegen import common -from torch._inductor.virtualized import V -import sympy - -from typing import Callable - -import sympy - -import torch.fx -from torch.utils._sympy.value_ranges import ValueRanges - -from torch._inductor.utils import ( - free_symbol_startswith, - get_sympy_Expr_dtype, - IndentedBuffer, - sympy_subs, - unique, -) - -schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") - -DTYPE_TO_LLVM = { - torch.float32: "float", - torch.float64: "double", - torch.float16: "half", - torch.int64: "i64", - torch.int32: "i32", - torch.int16: "i16", - torch.int8: "i8", - torch.uint8: "i8", - torch.bool: "i8", - torch.bfloat16: "bfloat", -} - -DTYPE_SIZE = { - torch.float32: 4, - torch.float64: 8, - torch.float16: 2, - torch.int64: 8, - torch.int32: 4, - torch.int16: 2, - torch.int8: 1, - torch.uint8: 1, - torch.bool: 1, - torch.bfloat16: 2, -} - -DTYPE_LOWP_FP = [ - 
torch.bfloat16, - torch.float16, -] - -class LLVMKernelArgs(common.KernelArgs): - LLVM_ARGS_IN = 0x01 - LLVM_ARGS_OUT = 0x02 - LLVM_ARGS_INOUT = 0x04 - LLVM_ARGS_VAR = 0x08 - - @staticmethod - def is_llvm_arg_in(value): - return (LLVMKernelArgs.LLVM_ARGS_IN & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - @staticmethod - def is_llvm_arg_out(value): - return (LLVMKernelArgs.LLVM_ARGS_OUT & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - def llvm_argdefs(self, only_args=False): - buffer_types = {x.get_name(): [x.get_dtype(), x.get_numel()] for x in V.graph.buffers} - for name, val in V.graph.graph_inputs.items(): - if isinstance(val, sympy.Expr): - buffer_types[name] = [get_sympy_Expr_dtype(val), 1] - else: - buffer_types[name] = [val.get_dtype(), val.get_numel()] - buffer_types.update( - {name: val.dtype for name, val in V.graph.constants.items()} - ) - - call_args = [] - arg_defs = [] - arg_attributes = {} - for inplaced in unique(self.inplace_buffers.values()): - if self._buffer_is_marked_removed(inplaced): - continue - outer = inplaced.other_names[-1] - inner = inplaced.inner_name - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_INOUT] + buffer_types[outer] - for outer, inner in self.input_buffers.items(): - if outer in self.inplace_buffers: - continue - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_IN] + buffer_types[outer] - for outer, inner in self.output_buffers.items(): - if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner): - continue - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_OUT] + buffer_types[outer] - for outer, inner in self.sizevars.items(): - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_VAR] + 
buffer_types[outer] - return arg_defs, call_args, arg_attributes - -class BaseLLVMKernel(common.Kernel): - newvar_prefix = "%" - name_prefix = "body" - vector_prefix = "vector_body" - suffix = "" - overrides = None - load_format = None - store_format = None - - def __init__(self, args=None): - super().__init__(args) - self.vector_compute = IndentedBuffer() - self.reductions_suffix = IndentedBuffer() - self.cse = common.CSE(self.newvar_prefix, self.suffix, self.name_prefix) - self.vector_cse = common.CSE(self.newvar_prefix, self.suffix, self.vector_prefix) - self.tile_size = None - self.tile_shape = {} - - def load(self, name: str, index: sympy.Expr): - raise NotImplementedError() - - def store_reduction(self, name, index, value): - raise NotImplementedError() - - def store(self, name, index, value, mode=None): - raise NotImplementedError() - - def reduction(self, dtype, src_dtype, reduction_type, value): - raise NotImplementedError() - - def widening(self, args, buf_bounds): - if not args[0] in self.tile_shape or not args[1] in self.tile_shape: - return args, [1, 1] - tile_shape0 = self.tile_shape[args[0]] - tile_shape1 = self.tile_shape[args[1]] - vec_len0 = tile_shape0[0] * tile_shape0[1] - vec_len1 = tile_shape1[0] * tile_shape1[1] - if tile_shape0 != tile_shape1: - temp = list(args) - idx = 0 if tile_shape0[0] != tile_shape1[0] else 1 - if tile_shape0[idx] > tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape1[idx-1]}" for i in range(vec_len0)] - else: - indexes = [f"i32 {i//tile_shape1[idx-1]}" for i in range(vec_len0)] - line = f"shufflevector <{vec_len1} x float> %{args[1]}, <{vec_len1} x float> undef, <{vec_len0} x i32> <{', '.join(indexes)}>" - temp[1] = self.cse.generate(self.compute, line, bounds=buf_bounds) - elif tile_shape0[idx] < tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape0[idx-1]}" for i in range(vec_len1)] - else: - indexes = [f"i32 {i//tile_shape0[idx-1]}" for i in range(vec_len1)] - line = f"shufflevector 
<{vec_len0} x float> %{args[0]}, <{vec_len0} x float> undef, <{vec_len1} x i32> <{', '.join(indexes)}>" - temp[0] = self.cse.generate(self.compute, line, bounds=buf_bounds) - args = tuple(temp) - return args, max(tile_shape0, tile_shape1) - - def __enter__(self): - class CSEProxy: - self.name = "CSEProxy" - - @staticmethod - def __getattr__(name: str) -> Callable[..., common.CSEVariable]: # type: ignore[misc] - def inner(*args, **kwargs): - # TritonTemplateKernel has no current_node - buf_bounds = ValueRanges.unknown() - if hasattr(V.interpreter, "current_node"): - fx_node = V.interpreter.current_node - assert isinstance(self.node_to_bounds, dict) - buf_bounds = self.node_to_bounds.get( - fx_node, ValueRanges.unknown() - ) - - vector_csevar = None - if isinstance(args[0], list): - vector_args = (args[0][0], args[1][0]) - vector_csevar = self.vector_cse.generate( - self.vector_compute, - getattr(parent_handler, "vector_" + name)(*vector_args, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - vector_csevar.update_on_args(name, vector_args, kwargs) - args = (args[0][1], args[1][1]) - if len(args) == 2: - args, tile_shape = self.widening(args, buf_bounds) - elif len(args) == 1: - tile_shape = self.tile_shape[args[0]] - else: - assert(0) # not implemented yet. 
- vec_len = tile_shape[0] * tile_shape[1] - csevar = self.cse.generate( - self.compute, - getattr(parent_handler, name)(*args, tile_size=vec_len, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - self.tile_shape[csevar] = tile_shape - csevar.update_on_args(name, args, kwargs) - if vector_csevar is not None: - return [vector_csevar, csevar] - return csevar - - return inner - - @staticmethod - def indirect_indexing(index_var, size, check=True): - # Skip CSE since this doesn't return an expression - return self.indirect_indexing(index_var, size, check) # type: ignore[attr-defined] - - @staticmethod - def load(name: str, index: sympy.Expr): - if name in self.cse.invalidated_stores: - # A load from an invalidated store requires us to - # keep the actual buffer around - V.kernel.must_keep_buffers.add(name) - if free_symbol_startswith(index, "%"): - return self.indirect_load(name, index) - store_cache = self.cse.store_cache - if name in store_cache: - return store_cache[name] - return self.load(name, index) - - @staticmethod - def store(name, index, value, mode=None): - self.store_buffer_names.add(name) - if mode is None: - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - if name not in V.graph.removed_buffers: - return self.store(name, index, value, mode=mode) - - @staticmethod - def store_reduction(name, index, value): - self.store_buffer_names.add(name) - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - - if name not in V.graph.removed_buffers: - return self.store_reduction(name, index, value) - - @staticmethod - def reduction(dtype, src_dtype, reduction_type, value): - return self.reduction(dtype, src_dtype, reduction_type, value) - - @staticmethod - def bucketize( - values, - offsets_name: str, - offsets_size: sympy.Expr, - 
indexing_dtype: torch.dtype, - right: bool, - ): - """ - [Note: Inductor bucketize op] - - Given values (tensor) and offsets_name (reference to the name of a 1D - tensor), calculate the bucket that each value belongs to. - - e.g. for values [-1, 0, 1, 2, 3, 4, 5, 9], offsets [0, 4, 4, 8], right=True - return = [ 0, 1, 1, 1, 1, 3, 3, 4]. - - When right == False, bucket i refers to range (offsets[i], offsets[i+1]]. - When right == True, bucket i refers to range [offsets[i], offsets[i+1]). - - Offsets must be non-decreasing or the result is undefined. - """ - return self.bucketize( - values, offsets_name, offsets_size, indexing_dtype, right - ) - - super().__enter__() - assert self.overrides - parent_handler = self.overrides(V.get_ops_handler()) - self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) - self.exit_stack.enter_context(V.set_kernel_handler(self)) - return self - - def rename_indexing(self, index) -> sympy.Expr: - # adds the necessary kernel args for index expressions - # and renames variables in index expressions to kernel arg names - if isinstance(index, (list, tuple)): - return [self.rename_indexing(x) for x in index] - index = V.graph.sizevars.simplify(index) - sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name) - replacements = { - x: self.args.size(x) - for x in sorted_symbols - if x.name.startswith("s") or x.name.startswith("ps") - } - return sympy_subs(index, replacements) diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index 537809de..e52d6cff 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -2,21 +2,17 @@ import torch import os import dataclasses -from torch._inductor.autotune_process import BenchmarkRequest from torch._inductor.autotune_process import TensorMeta from torch._inductor.codecache import get_hash, write from PyTorchSimFrontend import extension_config -from Simulator.simulator import BackendSimulator +from 
Simulator.simulator import TOGSimulator from typing import ( Any, Callable, - Dict, Iterable, List, Optional, - Sequence, - TYPE_CHECKING, Union, ) @@ -62,9 +58,9 @@ def make_run_fn( # Check already cached result. write_path = get_write_path(self.source_code) key, _ = write(self.source_code, "mlir", specified_dir=write_path) - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "backendsim_result/0") + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "togsim_result/0") if os.path.exists(result_path): - result = BackendSimulator.get_result_from_file(result_path) + result = TOGSimulator.get_result_from_file(result_path) def cached_run_fn(*args, **kwargs): return result return cached_run_fn diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py index 3fff9958..dff6b0fd 100644 --- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py +++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py @@ -1,16 +1,46 @@ +import os +import subprocess +import shlex +import re import torch -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.llvm.llvm_caller_codegen import LLVMKernelCallerCodeGen -from PyTorchSimFrontend.mlir.mlir_common import DTYPE_TO_C +from torch._inductor.utils import IndentedBuffer +from torch._inductor.codecache import write_atomic +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs, DTYPE_TO_C -class MLIRKernelCallerCodeGen(LLVMKernelCallerCodeGen): +class MLIRKernelCallerCodeGen(): + """ + Generate C that calls the llvm kernel. 
+ """ def __init__(self, validation, arg_attributes, cycle_sim=False): - super().__init__(validation, arg_attributes) + super().__init__() + self.code = IndentedBuffer() + self.ending = ";" + self.open_bracket = "{" + self.closed_bracket = "}" + self.newline = "\n" + self.kernel_name = "kernel" + self.validation = validation + self.n_arg = len(arg_attributes) + self.arg_attributes = arg_attributes + self.arg_use_count = 1 + self.load_args = {} + self.kernel_start_addr = "" + self.kernel_end_addr = "" self.cycle_sim = cycle_sim + def get_argv_idx(self): + self.arg_use_count += 1 + return self.arg_use_count-1 + def write_header(self): - super().write_header() + self.writeline('#include ') + self.writeline('#include ') + self.writeline("#include ") + if self.validation: + self.writeline("#include ") + self.writeline('#include ') + self.writeline('#include ') global_var_header = "gem5_global_var.h" if self.cycle_sim else "global_var.h" self.writeline(f"#include \"{global_var_header}\"") @@ -42,6 +72,9 @@ def dump_arg(self): self.writeline(f'return -1{self.ending}') self.writeline(self.closed_bracket) + def write_exit(self): + self.writeline(f'return 0{self.ending}') + def generate_kernel_declare(self): # memref to llvm arguments (memref -> ptr, ptr, i64, , ) allocated pointer, aligned pointer, offset, size, stride args_type_p = [f'{DTYPE_TO_C[arg_type[1]]}*, {DTYPE_TO_C[arg_type[1]]}*, int64_t, int64_t, int64_t' for (_, arg_type) in self.arg_attributes] @@ -86,4 +119,142 @@ def generate_main(self): self.dump_arg() self.write_exit() - self.writeline(self.closed_bracket) \ No newline at end of file + self.writeline(self.closed_bracket) + + def generate_load_dump_fn(self): + self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000000){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + 
self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + + def writeline(self, line): + self.code.writeline(line) + + def generate_wrapper_file(self, path, name): + self.dump_path = path + + self.write_header() + self.generate_kernel_declare() + + if self.validation: + self.generate_load_dump_fn() + self.generate_main() + + write_path = os.path.join(path, name+".c",) + write_atomic(write_path, self.code.getvalue()) + return + + def add_extention(self, name, extension): + return name + "." 
+ extension + + def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): + main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) + main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) + kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) + kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) + + main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' + kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' + + target = os.path.join(write_path, binary_name) + link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' + + main_compile_cmd = shlex.split(main_compile) + kernel_compile_cmd = shlex.split(kernel_compile) + link_cmd = shlex.split(link) + + try: + subprocess.check_call(main_compile_cmd) + subprocess.check_call(kernel_compile_cmd) + subprocess.check_call(link_cmd) + except subprocess.CalledProcessError as e: + print("Command failed with exit code", e.returncode) + print("Error output:", e.output) + assert(0) + + def parse_stack_sizes(self, file_path, vlenb=256): + with open(file_path, 'r') as f: + stack_sizes_data = f.readlines() + + in_proc = False + stack_base = None + dynamic_expr = None + max_offset = 0 + + for line in stack_sizes_data: + line = line.strip() + if line.startswith(".cfi_startproc"): + in_proc = True + continue + elif line.startswith(".cfi_endproc") and in_proc: + if dynamic_expr: + total_stack = eval(dynamic_expr, {"vlenb": vlenb}) + return total_stack + elif stack_base: + return stack_base + else: + return max_offset + + # Skip outer function + if not in_proc: + continue + + if line.startswith(".cfi_def_cfa_offset"): + stack_base = int(line.split()[-1]) + + if ".cfi_escape" in line and "#" in line: + comment = line.split("#")[-1].strip() + m = re.search(r"sp \+ 
(\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) + if m: + base, scale = int(m.group(1)), int(m.group(2)) + dynamic_expr = f"{base} + {scale} * vlenb" + + def get_spad_size(self, binary_path): + cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Readelf error: {result.stderr}") + + output = result.stdout + spad_start = None + spad_end = None + for line in output.splitlines(): + if '.spad' in line and 'SECTION' in line: + parts = line.split() + spad_start = int(parts[1], 16) + elif 'spad_end' in line: + parts = line.split() + spad_end = int(parts[1], 16) + + if spad_start is None or spad_end is None: + return 0 + spad_size = spad_end - spad_start + return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index b3352ea6..c24260ce 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -2,15 +2,18 @@ import sympy import re import os +import math from functools import reduce from operator import mul import torch from collections import defaultdict from concurrent.futures import ThreadPoolExecutor +from torch._dynamo.testing import rand_strided +from torch._inductor.autotune_process import TensorMeta from torch._dynamo.utils import dynamo_timed from torch._inductor.codegen import cpp, wrapper, common, memory_planning from torch._inductor.virtualized import V, _ops as ops -from torch._inductor.codecache import write_atomic, write +from torch._inductor.codecache import write_atomic from torch._inductor.utils import ( IndentedBuffer, is_welford_reduction, @@ -21,6 +24,7 @@ from PyTorchSimFrontend import extension_config from . 
import mlir_common from .mlir_common import LoopLevel, LoopNest +from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest def reduction_init(reduction_type, dtype): if dtype in cpp.DTYPE_LOWP_FP: @@ -95,8 +99,8 @@ def write_header(self): from torch import device, empty, empty_strided from {extension_codecache.__name__} import CustomAsyncCompile - from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_BACKENDSIM_EAGER_MODE - from Simulator.simulator import BackendSimulator + from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE + from Simulator.simulator import TOGSimulator from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer from torch._inductor.select_algorithm import extern_kernels @@ -118,7 +122,7 @@ def sram_plan_prefix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Alloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_alloc(buffer_name, [start, end]) + TOGSimulator.sram_alloc(buffer_name, [start, end]) def sram_plan_postfix(buffer_name, buffer): if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN): @@ -127,7 +131,7 @@ def sram_plan_postfix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Dealloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_dealloc(buffer_name, [start, end]) + TOGSimulator.sram_dealloc(buffer_name, [start, end]) def host2device_memcopy(buffer): pass @@ -420,6 +424,10 @@ def exp(operand, *args, var_info=None, **kwargs): shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f'math.exp %{operand} : {shape}', [tile_size, dtype] + @staticmethod + def exp2(operand, *args, var_info=None, **kwargs): + raise NotImplementedError() + @staticmethod def erf(operand, *args, var_info=None, **kwargs): # Check scalar @@ -1287,7 +1295,7 @@ def store_reduction(self, name, index, value): # mean 
reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1) divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(reduction_numel)} : f32") - if self.buffer_types[name][1] > 1: + if compute_vec_size > 1: divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>") else: divider_vec = divider @@ -1627,15 +1635,40 @@ def get_cycle(choice): choices = self.make_choices(*args) if len(choices) == 0: # can't autotune - return None + return [None, None] with ThreadPoolExecutor(max_workers=8) as executor: results = list(executor.map(get_cycle, choices)) max_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") self._log_autotune_result(choices[max_idx], results[max_idx]) - optimal_src_code = choices[max_idx][1] - return optimal_src_code + optimal_src_code, loop_size = choices[max_idx][1], choices[max_idx][-1] + return optimal_src_code, loop_size + + def run_bench(self, nodes, kernel_name, src_code): + _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() + input_call_args = tuple(self.args.input_buffers.keys()) + output_call_args = tuple(self.args.output_buffers.keys()) + full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) + full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) + + bmreq = MLIRBenchmarkRequest( + kernel_name=kernel_name, + input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), + output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), + extra_args={ + "vector_lane" : self.vector_lane, + "spad_info": self.spad_info, + "vlen" : self.vlen, + "arg_attributes" : arg_attributes, + "validate" : extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + "autotune" : True, + }, + source_code=src_code, + ) + dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, 
extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] + dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] + return bmreq.make_run_fn(dummy_inputs, dummy_outputs) def _log_autotune_result(self, best_choice, best_cycle): print( @@ -1647,8 +1680,8 @@ def _log_autotune_result(self, best_choice, best_cycle): def codegen_nodes(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) - if extension_config.CONFIG_AUTOTUNE and not extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: - optimal_src_code = self.autotune(nodes, kernel_name) + if extension_config.CONFIG_AUTOTUNE and extension_config.CONFIG_TORCHSIM_TIMING_MODE: + optimal_src_code = self.autotune(nodes, kernel_name)[0] if optimal_src_code is not None: return optimal_src_code return src_code diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 2644f125..c655dde3 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -1,15 +1,12 @@ import dataclasses import math from dataclasses import dataclass -from typing import Optional, Iterable from typing import Dict from typing import List from collections import defaultdict from functools import reduce from operator import mul import torch -from torch._dynamo.testing import rand_strided -from torch._inductor.autotune_process import TensorMeta from torch._inductor.codegen import common from torch._inductor.codegen import cpp from torch._inductor.virtualized import V @@ -35,7 +32,6 @@ ) from PyTorchSimFrontend import extension_config from PyTorchSimFrontend import extension_codecache -from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") DTYPE_TO_MLIR = { @@ -776,31 +772,6 @@ def 
codegen_nodes(self, nodes, kernel_name): self.meta_kernel() return src_code - def run_bench(self, nodes, kernel_name, src_code): - _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() - input_call_args = tuple(self.args.input_buffers.keys()) - output_call_args = tuple(self.args.output_buffers.keys()) - full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) - full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) - - bmreq = MLIRBenchmarkRequest( - kernel_name=kernel_name, - input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), - output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), - extra_args={ - "vector_lane" : self.vector_lane, - "spad_info": self.spad_info, - "vlen" : self.vlen, - "arg_attributes" : arg_attributes, - "validate" : extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - "autotune" : True, - }, - source_code=src_code, - ) - dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] - dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] - return bmreq.make_run_fn(dummy_inputs, dummy_outputs) - def codegen_kernel(self, kernel_name): arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs() arg_defs = ",\n".ljust(25).join(arg_defs) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index 52979d73..77826730 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -82,7 +82,7 @@ def outer_func_render(self, kernel_name, input_args): Y = self.output_node Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) options 
= dict( kernel=self.kernel, KERNEL_NAME=kernel_name, @@ -93,8 +93,8 @@ def outer_func_render(self, kernel_name, input_args): OUTPUT=Y, PADDING_H=self.padding[0], PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, + VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + TOGSIM_EAGER_MODE=eager_mode, input_reorder=self.input_reorder ) code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 26018a94..0bf01421 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -5,7 +5,6 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode from PyTorchSimFrontend.mlir import mlir_common -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Multi Channel Tile Conv2D kernel @@ -121,7 +120,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index a2959b4d..92b9a525 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -121,7 +121,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index afbe9289..ab124852 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -121,7 +121,7 @@ def {{ FUNC_NAME 
}}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 777d0a7b..66aa0a27 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -125,7 +125,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index c2120e7b..6271b548 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -1,4 +1,3 @@ -import os import json from pathlib import Path from torch import empty_strided diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index 6508ea86..af59d88f 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -15,7 +15,7 @@ from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate -from PyTorchSimFrontend.extension_config import CONFIG_VECTOR_LANE, CONFIG_USE_TIMING_POOLING +from PyTorchSimFrontend import extension_config aten = torch.ops.aten aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm") @@ -106,11 +106,11 @@ def convolution( layout = conv_layout(x, weight, None, **kwargs) # Select conv kernel - if BATCH == 1 and stride[0] == 1: + if BATCH == 1 and stride[0] == 1 and extension_config.CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchTemplate([x, weight, bias], layout, 
**kwargs) - elif BATCH == 1 and stride[0] != 1: + elif BATCH == 1 and stride[0] != 1 and extension_config.CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchStridedTemplate([x, weight, bias], layout, **kwargs) - elif I_C < CONFIG_VECTOR_LANE // 8: # 8 is hard-coded for now. This should be changed to a better heuristic. + elif I_C < extension_config.CONFIG_VECTOR_LANE // 8 and extension_config.CONFIG_MULTI_TILE_CONV: # 8 is hard-coded for now. This should be changed to a better heuristic. mlir_template = MLIRConvMultiTileTemplate([x, weight, bias], layout, **kwargs) else: mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs) @@ -187,5 +187,5 @@ def custom_unsafe_index(x, indices): lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()}) lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()}) lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()}) -if CONFIG_USE_TIMING_POOLING: +if extension_config.CONFIG_USE_TIMING_POOLING: lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py index 2cca36b6..3658f992 100644 --- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py +++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py @@ -1,11 +1,9 @@ -import os from typing import List, Optional, cast from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import Buffer from torch._inductor.ir import IRNode -from torch._inductor.ir import ReinterpretView from PyTorchSimFrontend.mlir import mlir_common import sympy 
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 26b90401..38603319 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -3,12 +3,9 @@ import sympy from functools import reduce import operator -from sympy import symbols, sympify, Symbol -from collections import OrderedDict -from concurrent.futures import ThreadPoolExecutor +from sympy import symbols, sympify from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel -from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest from torch._inductor import config from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode, BaseSchedulerNode @@ -97,6 +94,8 @@ def can_fuse_vertical(self, node1, node2): return self.can_fuse_horizontal(node1, node2) def can_fuse_horizontal(self, node1, node2): + if not extension_config.CONFIG_FUSION: + return False if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size: return False _, (vars1, reduce1) = node1.group @@ -217,7 +216,7 @@ def codegen_nodes(self, nodes): ex_kernel.call_kernel(kernel_name) _, args, _, _ = ex_kernel.args.mlir_argdefs() args = ", ".join(args) - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): V.graph.wrapper_code.writeline( f"yield ({kernel_name}, ({args}))" @@ -288,7 +287,7 @@ def codegen_template(self, template_node, epilogue_nodes): kernel.call_kernel(kernel_name) V.graph.removed_buffers |= kernel.removed_buffers _, args, _, _ = self.kernel_group.args.mlir_argdefs() - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name + 
f"_{len(args)}" args = ", ".join(args) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index e6e9dd0c..df3621eb 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -13,8 +13,8 @@ from typing import List, Optional from unittest.mock import patch -from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE, DeferredLine -from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, View +from torch._inductor.codegen.common import KernelTemplate, ChoiceCaller, CSE, DeferredLine +from torch._inductor.ir import Buffer, IRNode, TemplateBuffer from torch._inductor.select_algorithm import PartialRender from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller from torch._inductor.autotune_process import TensorMeta @@ -29,7 +29,7 @@ from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common -from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR, CONFIG_AUTOTUNE_TEMPLATE_TOPK +from PyTorchSimFrontend import extension_config from . 
import mlir_common class IndentedBufferGroup: @@ -234,7 +234,7 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) if check_spad_size: - dir_path = f"{CONFIG_TORCHSIM_DIR}/validation/gemm_candidates" + dir_path = f"{extension_config.CONFIG_TORCHSIM_DIR}/validation/gemm_candidates" os.makedirs(dir_path, exist_ok=True) file_path = f"{dir_path}/gemm_{M}_{K}_{N}.txt" line_to_write = f"{tile_M} {tile_K} {tile_N}\n" @@ -494,7 +494,7 @@ def make_choices(self, tile_candidates, render, template_node, prologue_nodes, e print(f"[Auto-tune] Trying tile size: {list(tile_info)}") src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) bench_runner = self.run_bench([template_node], self.kernel_name, src_code) - choices.append((bench_runner, src_code, tile_info)) + choices.append((bench_runner, src_code, tile_info, self.loop_size)) self.reset(reason=None) return choices @@ -506,7 +506,12 @@ def _log_autotune_result(self, best_choice, best_cycle): ) def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): - src_code = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + if extension_config.CONFIG_AUTOTUNE_TEMPLATE and len(tile_candidates): + src_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + self.loop_size = loop_size + else: + tile_info = tile_candidates[0] if tile_candidates else None + src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) with V.set_kernel_handler(self): self.meta_kernel() @@ -1118,7 +1123,7 @@ def set_tile_size(self, template_fusion_info, prologue=False): numel_per_lane = 
tile_desc.get_numel_per_lane() r_tile_size = tile_desc.get_tile_size()[-1] nr_outer_loop = (numel_per_lane + r_tile_size-1) // r_tile_size - tile_desc.vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... + tile_desc.vmap.forced_vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... self.reduction_fusion = True self.r_tile_size = tile_desc.get_tile_size()[-1] @@ -1129,7 +1134,7 @@ def set_tile_size(self, template_fusion_info, prologue=False): self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop) else: - tile_desc.vec_size=64 + tile_desc.vmap.forced_vec_size = 64 if prologue: self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane() @@ -1225,7 +1230,7 @@ def make_kernel_render( template=self, kwargs=kwargs ) - tile_candidates = self.get_tile_candidates(**kwargs)[:CONFIG_AUTOTUNE_TEMPLATE_TOPK] + tile_candidates = self.get_tile_candidates(**kwargs)[:extension_config.CONFIG_AUTOTUNE_TEMPLATE_TOPK] return kernel, tile_candidates, render return MLIRTemplateCaller( diff --git a/README.md b/README.md index 4289195e..dbfdf2e8 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ The `tests` directory contains several AI workloads examples. ```bash python tests/test_matmul.py ``` -The result is stored to `TORCHSIM_DUMP_PATH/hash/backendsim_result/`. The log file contains detailed core, memory, and interconnect stats. +The result is stored to `TORCHSIM_DUMP_PATH/hash/togsim_result/`. The log file contains detailed core, memory, and interconnect stats. ### Run Your Own Model on PyTorchSim You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device. 
@@ -131,9 +131,9 @@ Wrapper Codegen Path = /tmp/torchinductor_root/yd/cyda7nhzv5mtakfhfcxtmmhtsv6kg7 [Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/fy6nnyudtno/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/fy6nnyudtno/cycle_bin --vlane 128 [Gem5Simulator] Simulation is still running... [SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=0000000000010400:10846 --base-path=/tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/fy6nnyudtno/validation_binary /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg0_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg1_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/buf0/0.raw -[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 -[BackendSimulator] Simulation is still running.. -[BackendSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/backendsim_result/0" +[TOGSimulator] cmd> /root/workspace/PyTorchSim/TOGSim/build/bin/Simulator --config /root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 +[TOGSimulator] Simulation is still running.. 
+[TOGSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0" ---------------------------- |Matmul Forward Test Passed| ---------------------------- @@ -143,25 +143,25 @@ Simulation consists of three steps 1. `Gem5Simulator` obatins compute latency for TOG. 2. `SpikeSimulator` verifies the output code. -3. `BackendSimulator` simulates a NPU architecture. +3. `TOGSimulator` simulates a NPU architecture. If you want to turn off the `SpikeSimulator` for fast simulation, you can set as below. ```bash -export TORCHSIM_VALIDATION_MODE=False +export TORCHSIM_FUNCTIONAL_MODE=False ``` Log contains memory & core stats. ```bash [info] HBM2-CH_0: avg BW utilization 37% (255 reads, 128 writes) [info] Row hits: 359, Row misses: 26, Row conflicts: 0 [info] ========= Core stat ========= -[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active cycle 0, idle cycle 1014 -[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active cycle 128, idle cycle 886 -[info] Core [0] : TMA active cycle 3 TMA idle cycle 1011 DRAM BW 182.000 GB/s (6144) -[info] Core [0] : Vector Unit Utilization(%) 4.34, active cycle 44, idle_cycle 0 -[info] Core [0] : Numa hit count : 0, Numa miss count : 0 -[info] Core [0] : Total cycle 1014 -[info] Total execution cycle: 1014 -[info] Simulation time: 0.039296 seconds +[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active_cycles 0, idle_cycles 1014 +[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active_cycles 128, idle_cycles 886 +[info] Core [0] : DMA active_cycles 3 DMA idle_cycles 1011 DRAM BW 182.000 GB/s (6144) +[info] Core [0] : Vector Unit Utilization(%) 4.34, active_cycles 44, idle_cycle 0 +[info] Core [0] : NUMA local memory: 34 requests, remote memory: 0 requests +[info] Core [0] : Total_cycles 1014 +[info] Total execution cycles: 1014 +[info] Wall-clock time for simulation: 0.039296 seconds ``` The log is dumped in 
`TORCHSIM_DUMP_PATH` and you can set the path as below. ```bash @@ -186,7 +186,7 @@ Our load generator supports multi-tenancy experiments. You can run a simple exam python tests/test_scheduler.py ``` Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`. -In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSim config file(`.json`). The compiled PyTorch models are then registered with a unique model id. +In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.json`). The compiled PyTorch models are then registered with a unique model id. ```python3 import os @@ -195,11 +195,11 @@ import torch from torchvision.models import resnet18 from test_transformer import EncoderBlock base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') -config = f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) # Register compiled model target_model0 = resnet18().eval() @@ -344,14 +344,14 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ## TOGSim Configuration ![NPU_Core](./docs/npu_core.jpg) -`PyTorchSimBackend/configs` directory contains example NPU configuration files in the JSON format. +`TOGSim/configs` directory contains example NPU configuration files in the JSON format. 
``` "num_cores" : 2, // Number of NPU cores - "core_freq" : 940, // Core's frequency (MHz) + "core_freq_mhz" : 940, // Core's frequency (MHz) "num_systolic_array_per_core" : 2, // Number of systolic array per core "dram_type" : "ramulator2", // DRAM type (ex. ramulator2, simple) - "dram_freq" : 940, // DRAM frequency (MHz) + "dram_freq_mhz" : 940, // DRAM frequency (MHz) "dram_channels": 32, // Number of DRAM channels "dram_req_size": 32, // DRAM request size (B) "dram_latency" : 10, // DRAM latency (cycle) @@ -361,9 +361,10 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing "l2d_type" : "datacache", "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", - "icnt_type" : "simple", // Interconnect type (ex. booksim, simple) - "icnt_latency" : 7, // Interconnect latency (cycle) - "icnt_freq" : 28000, // Interconnect frequency (MHz) + "icnt_type" : "simple", // Interconnect type (ex. booksim, simple) + "icnt_latency" : 7, // Interconnect latency (cycle) + "icnt_freq_mhz" : 940, // Interconnect frequency (MHz) + "icnt_injection_ports_per_core" : 16 // Interconnect injection ports per core "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", // Booksim2 config file path "precision" : 4, // Element's precision in tensor (Byte) @@ -376,7 +377,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ``` You can set TOGSim config path as below. ```bash -export TORCHSIM_CONFIG=/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json ``` ## Future Works Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon. 
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 10358321..0b633fa9 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -5,7 +5,7 @@ from pathlib import Path import importlib.util from PyTorchSimFrontend.extension_codecache import hash_prefix -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def import_module_from_path(module_name, path): @@ -144,7 +144,7 @@ class PyTorchSimRunner: PARTITION_BUSY = 0 PARTITION_IDLE = 1 SELECT_NOTHING = 2 - def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: + def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: self.module = self.setup_device() self.num_partion = num_partion self.launch_model_dicts = [] @@ -156,11 +156,11 @@ def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: self.partition_state.append(self.PARTITION_IDLE) self.finish_req_dict = {} - self.backend_simulator = backend_simulator + self.tog_simulator = tog_simulator # Dry run for compile and create generator - os.environ["BACKENDSIM_DRYRUN"] = "1" - os.environ["BACKENDSIM_EAGER_MODE"] = "1" + os.environ["TOGSIM_DRYRUN"] = "1" + os.environ["TOGSIM_EAGER_MODE"] = "1" @staticmethod def setup_device(): @@ -171,7 +171,7 @@ def setup_device(): import torch.utils.cpp_extension module = torch.utils.cpp_extension.load( - name="extension_device", + name="npu", sources=[ str(source_file), ], @@ -179,7 +179,7 @@ def setup_device(): verbose=True, ) - torch.utils.rename_privateuse1_backend("extension_device") + torch.utils.rename_privateuse1_backend("npu") from torch._inductor.codegen.common import ( get_scheduling_for_device, get_wrapper_codegen_for_device, @@ -192,13 +192,13 @@ def setup_device(): MLIRScheduling ) register_backend_for_device( - "extension_device", MLIRScheduling, ExtensionWrapperCodegen + "npu", MLIRScheduling, ExtensionWrapperCodegen ) assert( - 
get_scheduling_for_device("extension_device") == MLIRScheduling + get_scheduling_for_device("npu") == MLIRScheduling ) assert( - get_wrapper_codegen_for_device("extension_device") + get_wrapper_codegen_for_device("npu") == ExtensionWrapperCodegen ) return module @@ -222,7 +222,7 @@ def is_all_idle(self): return all([self.is_partition_idle(i) for i in range(self.num_partion)]) def prepare_model(self, req_model: SchedulerDNNModel): - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "backend_result", req_model.model_name) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "togsim_result", req_model.model_name) os.makedirs(result_path, exist_ok=True) index = str(len(os.listdir(result_path))) @@ -244,7 +244,7 @@ def prepare_launch_kernel(self, kernel, inputs): onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - attribute_path = self.backend_simulator.create_attribute_file(attribute_path, inputs) + attribute_path = self.tog_simulator.create_attribute_file(attribute_path, inputs) return onnx_path, attribute_path def launch_kernel(self, current_cycle, partion_idx=0): @@ -260,11 +260,11 @@ def launch_kernel(self, current_cycle, partion_idx=0): else: onnx_path, attribute_path = kernel, inputs self.partition_state[partion_idx] = self.PARTITION_BUSY - return self.backend_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) + return self.tog_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) class FIFORunner(PyTorchSimRunner): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) def select_kernel(self, partition_idx): while len(self.nested_launch_model_dicts[partition_idx]) or len(self.launch_model_dicts[partition_idx]): @@ -298,8 
+298,8 @@ def select_kernel(self, partition_idx): return self.SELECT_NOTHING class RoundRobinRunner(PyTorchSimRunner): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) self.next_pointer = None def select_kernel(self, partition_idx): @@ -347,7 +347,7 @@ class Scheduler: FIFO_ENGINE = 0 RR_ENGINE = 1 - def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, backend_config=extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) -> None: + def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG) -> None: self.current_cycle = 0 self.max_batch = max_batch self.num_request_queue = num_request_queue @@ -356,13 +356,13 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, self.request_queue.append([]) self.finish_queue : List[Request] = [] - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - self.backend_simulator = BackendSimulator(backend_path, backend_config) - self.backend_simulator.interactive_simulation() + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + self.tog_simulator = TOGSimulator(togsim_path, togsim_config) + self.tog_simulator.interactive_simulation() if engine_select == Scheduler.FIFO_ENGINE: - self.execution_engine = FIFORunner(self.backend_simulator, self.num_request_queue) + self.execution_engine = FIFORunner(self.tog_simulator, self.num_request_queue) elif engine_select == Scheduler.RR_ENGINE: - self.execution_engine = RoundRobinRunner(self.backend_simulator, self.num_request_queue) + self.execution_engine = RoundRobinRunner(self.tog_simulator, self.num_request_queue) else: print(f"Not supporetd engine type {engine_select}") exit(1) @@ -469,8 +469,8 @@ def 
schedule(self): # Need to forward the time until next_arrival_time if self.execution_engine.is_all_idle(): - reason = self.backend_simulator.until(self.msec_to_cycle(next_time)) - self.current_cycle = self.backend_simulator.cycle() + reason = self.tog_simulator.until(self.msec_to_cycle(next_time)) + self.current_cycle = self.tog_simulator.cycle() else: self.run(next_time) return @@ -490,8 +490,8 @@ def execute_cycle(): return [] # Schedule jobs and update the current time - result_list = self.backend_simulator.until(self.msec_to_cycle(until_time)) - self.current_cycle = self.backend_simulator.cycle() + result_list = self.tog_simulator.until(self.msec_to_cycle(until_time)) + self.current_cycle = self.tog_simulator.cycle() for core_idx in result_list: # Kernel is finished. So set idle state @@ -526,7 +526,7 @@ def is_request_queue_empty(self): def is_finished(self): if self.is_request_queue_empty() and self.execution_engine.is_all_idle(): - self.backend_simulator.wait() + self.tog_simulator.wait() return True return False @@ -534,7 +534,7 @@ def current_time(self): return self.cycle_to_msec(self.current_cycle) def cycle_to_msec(self, cycle): - freq = self.backend_simulator.get_core_freq() + freq = self.tog_simulator.get_core_freq() return cycle / (freq / 1000) def msec_to_cycle(self, msec): @@ -542,5 +542,5 @@ def msec_to_cycle(self, msec): if (msec == -1): return msec - freq = self.backend_simulator.get_core_freq() + freq = self.tog_simulator.get_core_freq() return int(msec * (freq / 1000)) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index bd048538..c586c2fd 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -12,7 +12,7 @@ import torch import numpy as np -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs from PyTorchSimFrontend import extension_config TORCH_TO_NUMPY = { @@ -64,10 +64,10 @@ def dump_args(self, args, arg_attributes, load_path, dump_path): for 
(arg_name, arg_attribute), arg in zip(arg_attributes, args): size = arg_attribute[2] if arg_attribute[1] != torch.bool else (arg_attribute[2] + 7) // 8 array_size.append(size) - if LLVMKernelArgs.is_llvm_arg_in(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_in(arg_attribute[0]): index = self.write_arg(arg, load_path, arg_name) file_path.append(os.path.join(load_path, arg_name, f'{index}.raw')) - elif LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + elif MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): path = os.path.join(dump_path, arg_name) os.makedirs(path, exist_ok=True) file_path.append(os.path.join(path, f'{self.get_biggest_filename(path)}.raw')) @@ -101,8 +101,9 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True) os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True) run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}' - if not silent_mode: - print("[SpikeSimulator] cmd> ", run) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[Spike] cmd> ", run) + print("[Spike] Running Spike simulator") run_cmd = shlex.split(run) try: stdout_setting = subprocess.DEVNULL if silent_mode else None @@ -110,7 +111,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting) except subprocess.CalledProcessError as e: if not silent_mode: - print("[SpikeSimulator] Command failed with exit code", e.returncode) + print("[Spike] Command failed with exit code", e.returncode) error_msg = "" if e.returncode == 200: error_msg = "INVALID_SPAD_ACCESS" @@ -121,7 +122,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= raise RuntimeError(f"{error_msg}") for (arg_name, arg_attribute), arg, 
path in zip(arg_attributes, args, file_path): - if LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): self.load_tensor(arg, arg_name, arg_attribute, path) if cleanup: @@ -155,7 +156,7 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[Gem5Simulator] Simulation is still running." + tail) + sys.stdout.write("\r[Gem5] Gem5 is running." + tail) time.sleep(1) print("") @@ -163,9 +164,10 @@ def show_progress(): gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)] try: # Create progress thread - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) or silent_mode + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) or silent_mode if not is_dryrun: - print("[Gem5Simulator] cmd> ", " ".join(gem5_cmd)) + if extension_config.CONFIG_DEBUG_MODE: + print("[Gem5] cmd> ", " ".join(gem5_cmd)) finished = False progress_thread = threading.Thread(target=show_progress) progress_thread.start() @@ -175,11 +177,11 @@ def show_progress(): else: output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) except subprocess.CalledProcessError as e: - print(f"[Gem5Simulator] Gem5 simulation failed with error: \"{e.output.decode()}\"") + print(f"[Gem5] Gem5 simulation failed with error: \"{e.output.decode()}\"") if not is_dryrun: finished = True progress_thread.join() - raise RuntimeError(f"GEM5 Simulation Failed: \"{e.output.decode()}\"") + raise RuntimeError(f"Gem5 Simulation Failed: \"{e.output.decode()}\"") with open(f"{dir_path}/stats.txt", "r") as stat_file: raw_list = stat_file.readlines() @@ -188,18 +190,18 @@ def show_progress(): cycle_list = cycle_list[:-1] return cycle_list -class BackendSimulator(): - BACKEND_RESULT_PATH_KEY = "BACKEND_RESULT_PATH" - FINISH_STR = "Simulation Finished" +class 
TOGSimulator(): + TOGSIM_RESULT_PATH_KEY = "TOGSIM_RESULT_PATH" + FINISH_STR = "Simulation finished" ALLOC_POOL = dict() # For eagermode buffer plan - def __init__(self, backend_path, config_path, vectorlane_size=-1) -> None: - self.base_dir = backend_path + def __init__(self, togsim_path, config_path, vectorlane_size=-1) -> None: + self.base_dir = togsim_path self.config_path = config_path self.config_json = self.load_json(self.config_path) self.process = None self.vectorlane_size = vectorlane_size - def get_backend_command(self): + def get_togsim_command(self): bin = os.path.join(self.base_dir, "build/bin/Simulator") config = os.path.join(self.base_dir, self.config_path) cmd = f"{bin} --config {config}" @@ -211,16 +213,16 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[BackendSimulator] Simulation is still running." + tail) + sys.stdout.write("\r[TOGSim] TOGSim is running." + tail) time.sleep(1) print("") - cmd = f"{self.get_backend_command()} --models_list {model_path}" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --models_list {model_path}" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" if attribute_path: cmd = f"{cmd} --attributes_list {attribute_path}" - if not silent_mode: - print("[BackendSimulator] cmd> ", cmd) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) # Create progress thread if not silent_mode: @@ -236,25 +238,26 @@ def show_progress(): if not silent_mode: finished = True progress_thread.join() - print("[BackendSimulator] Command failed with exit code", e.returncode) - print("[BackendSimulator] Error output:", e.output) + print("[TOGSim] Command failed with exit code", e.returncode) + print("[TOGSim] Error output:", e.output) assert 0 # Save result to 
result_path - result_path = os.path.join(os.path.dirname(model_path), "backendsim_result") + result_path = os.path.join(os.path.dirname(model_path), "togsim_result") os.makedirs(result_path, exist_ok=True) file_name = str(len(os.listdir(result_path))) result_path = os.path.join(result_path, file_name) with open(result_path, "w") as f: f.write(result.decode()) - print(f'[BackendSimulator] Simulation of "{model_path}" is stored to "{result_path}"') + print(f'[TOGSim] Simulation of "{model_path}" is stored to "{result_path}"') return result_path def interactive_simulation(self): - cmd = f"{self.get_backend_command()} --mode interactive" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --mode interactive" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" - print("[BackendSimulator] cmd> ", cmd) + if extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) if self.process is None: self.process = subprocess.Popen( shlex.split(cmd), @@ -263,27 +266,27 @@ def interactive_simulation(self): universal_newlines=True ) else: - print("[BackendSimulator] Simulator is already running.") + print("[TOGSim] Simulator is already running.") def stop(self): if self.process: self.process.terminate() self.process.wait() self.process = None - print("[BackendSimulator] Simulator stopped.") + print("[TOGSim] Simulator stopped.") def wait(self): if self.process: - print("[BackendSimulator] Waiting for simulation to complete...") + print("[TOGSim] Waiting for simulation to complete...") self.quit() self.process.wait() self.process = None - print("[BackendSimulator] Simulation completed.") + print("[TOGSim] Simulation completed.") def send_command(self, command): if self.process: try: - if not extension_config.CONFIG_BACKENDSIM_DRYRUN: + if not extension_config.CONFIG_TOGSIM_DRYRUN: print(command, 
flush=True) self.process.stdin.write(command + '\n') self.process.stdin.flush() @@ -367,8 +370,8 @@ def load_json(self, config_path): raise ValueError(f"Invalid JSON format: {e}") def get_core_freq(self): - if "core_freq" in self.config_json: - return self.config_json["core_freq"] * 1000 * 1000 # MHz + if "core_freq_mhz" in self.config_json: + return self.config_json["core_freq_mhz"] * 1000 * 1000 # MHz else: raise KeyError("Key 'core_freq' not found in JSON.") @@ -403,13 +406,13 @@ def get_result_from_file(result_path): simulation_finished_idx = -1 simulation_finished = False for idx, line in enumerate(lines): - if BackendSimulator.FINISH_STR in line: + if TOGSimulator.FINISH_STR in line: simulation_finished = True simulation_finished_idx = idx break if simulation_finished_idx == -1: - print("[BackendSimulator] Tried to parsing wrong formated output file!") + print("[TOGSim] Tried to parsing wrong formated output file!") return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time total_stat_lines = lines[simulation_finished_idx:] @@ -440,15 +443,15 @@ def get_result_from_file(result_path): if 'DRAM: AVG BW Util' in line: avg_dram_bw = float(re.search(r'AVG BW Util (\d+\.?\d*)%', line).group(1)) - if 'Total execution cycle' in line: - total_cycle = int(re.search(r'Total execution cycle: (\d+)', line).group(1)) + if 'Total execution cycles' in line: + total_cycle = int(re.search(r'Total execution cycles: (\d+)', line).group(1)) # Parse total simulation time - if 'Simulation time' in line: - simulation_time = float(re.search(r'Simulation time: (\d+\.?\d*) seconds', line).group(1)) + if 'Wall-clock time for simulation' in line: + simulation_time = float(re.search(r'Wall-clock time for simulation: (\d+\.?\d*) seconds', line).group(1)) return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": - sim = BackendSimulator("/workspace/PyTorchSim/PyTorchSimBackend", 
"/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") + sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") sim.interactive_simulation() sim.until(4000) \ No newline at end of file diff --git a/PyTorchSimBackend/CMakeLists.txt b/TOGSim/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/CMakeLists.txt rename to TOGSim/CMakeLists.txt diff --git a/PyTorchSimBackend/conanfile.txt b/TOGSim/conanfile.txt similarity index 100% rename from PyTorchSimBackend/conanfile.txt rename to TOGSim/conanfile.txt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet.icnt b/TOGSim/configs/booksim2_configs/anynet.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet.icnt rename to TOGSim/configs/booksim2_configs/anynet.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet_file b/TOGSim/configs/booksim2_configs/anynet_file similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet_file rename to TOGSim/configs/booksim2_configs/anynet_file diff --git a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt similarity index 75% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt index d18ff6e7..3102fecc 100644 --- a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt +++ b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt @@ -2,7 +2,7 @@ use_map = 0 flit_size = 32 topology = anynet -network_file = /workspace/PyTorchSim/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net +network_file = /workspace/PyTorchSim/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net routing_function = min subnets = 1 routing_delay = 4 diff --git 
a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.net diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m16.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m16.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m1.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m1.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m2.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m2.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt 
b/TOGSim/configs/booksim2_configs/fly_c2_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c2_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m4.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m4.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m2.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m2.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m8.icnt similarity index 100% rename from 
PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py b/TOGSim/configs/booksim2_configs/make_anynet_topology.py similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py rename to TOGSim/configs/booksim2_configs/make_anynet_topology.py diff --git a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt rename to TOGSim/configs/booksim2_configs/mesh_sif-age.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt diff --git a/TOGSim/configs/heterogeneous_c2_simple_noc.json b/TOGSim/configs/heterogeneous_c2_simple_noc.json new file mode 100644 index 
00000000..60f160a8 --- /dev/null +++ b/TOGSim/configs/heterogeneous_c2_simple_noc.json @@ -0,0 +1,29 @@ +{ + "core_type" : ["stonne", "ws_mesh"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 8, + "num_stonne_port" : 64, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml b/TOGSim/configs/ramulator2_configs/DDR4.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml rename to TOGSim/configs/ramulator2_configs/DDR4.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml b/TOGSim/configs/ramulator2_configs/HBM2.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml rename to TOGSim/configs/ramulator2_configs/HBM2.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml b/TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml rename to TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg b/TOGSim/configs/ramulator_configs/ALDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/ALDRAM-config.cfg 
diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg b/TOGSim/configs/ramulator_configs/DDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg rename to TOGSim/configs/ramulator_configs/DDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg b/TOGSim/configs/ramulator_configs/DDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg rename to TOGSim/configs/ramulator_configs/DDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg b/TOGSim/configs/ramulator_configs/DSARP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg rename to TOGSim/configs/ramulator_configs/DSARP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg b/TOGSim/configs/ramulator_configs/GDDR5-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg rename to TOGSim/configs/ramulator_configs/GDDR5-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg b/TOGSim/configs/ramulator_configs/HBM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg rename to TOGSim/configs/ramulator_configs/HBM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg b/TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg 
diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg b/TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg 
b/TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg b/TOGSim/configs/ramulator_configs/PCM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg rename to TOGSim/configs/ramulator_configs/PCM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg b/TOGSim/configs/ramulator_configs/SALP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg rename to TOGSim/configs/ramulator_configs/SALP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg b/TOGSim/configs/ramulator_configs/STTMRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg rename to TOGSim/configs/ramulator_configs/STTMRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg b/TOGSim/configs/ramulator_configs/TLDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/TLDRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg 
b/TOGSim/configs/ramulator_configs/WideIO-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg b/TOGSim/configs/ramulator_configs/WideIO2-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO2-config.cfg diff --git a/TOGSim/configs/stonne_big_c1_simple_noc.json b/TOGSim/configs/stonne_big_c1_simple_noc.json new file mode 100644 index 00000000..5d563fbe --- /dev/null +++ b/TOGSim/configs/stonne_big_c1_simple_noc.json @@ -0,0 +1,22 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 8, + "num_stonne_port" : 64, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycless": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/stonne_single_c1_simple_noc.json b/TOGSim/configs/stonne_single_c1_simple_noc.json new file mode 100644 index 00000000..304e84b3 --- /dev/null +++ b/TOGSim/configs/stonne_single_c1_simple_noc.json @@ -0,0 +1,22 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 1, + "num_stonne_port" : 8, + + "dram_type" : "ramulator2", + 
"dram_freq_mhz" : 700, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 8 +} \ No newline at end of file diff --git a/TOGSim/configs/stonne_validation_c1_simple_noc.json b/TOGSim/configs/stonne_validation_c1_simple_noc.json new file mode 100644 index 00000000..38d4244c --- /dev/null +++ b/TOGSim/configs/stonne_validation_c1_simple_noc.json @@ -0,0 +1,23 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 1, + "num_stonne_port" : 32, + + "dram_type" : "simple", + "dram_freq_mhz" : 1000, + "dram_channels": 1, + "dram_req_size_byte": 32, + "dram_latency" : 100, + "dram_stats_print_period_cycles": 10000, + "l2d_type" : "datacache", + "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 8 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json new file mode 100644 index 00000000..58519aad --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :700, + "dram_channels": 16, + "dram_req_size_byte": 32, + + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 700, + 
"icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json new file mode 100644 index 00000000..1257891c --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json @@ -0,0 +1,18 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycless": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json new file mode 100644 index 00000000..b92d8029 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json new file mode 100644 index 00000000..34896fc7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json @@ 
-0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json new file mode 100644 index 00000000..59be9fd4 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1050, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 4, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :1200, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "l2d_type" : "datacache", + "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1050, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json new file mode 100644 index 00000000..271e7e1c --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + 
"dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt" +} diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json similarity index 70% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json rename to TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json index d51e9c5f..7382c4c8 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json +++ b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json @@ -1,26 +1,25 @@ { "num_cores" : 2, - "core_freq" : 940, + "core_freq_mhz" : 940, "sram_size" : 65536, "core_print_interval" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", "dram_freq" : 940, - "dram_channels": 32, + "dram_channels": 8, "dram_req_size": 32, "dram_latency" : 10, - "dram_size" : 32, "dram_nbl" : 2, "dram_print_interval": 10000, "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - + "icnt_type" : "booksim2", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt", - + "icnt_latency" : 1, + "icnt_freq" : 940, + "icnt_injection_ports_per_core" : 16, + "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m8.icnt", + "precision" : 4, "scheduler" : "simple", "num_partition" : 2, diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json new file mode 100644 index 00000000..6561ffc0 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + 
"num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "dram_num_partitions" : 2, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", + "icnt_stats_print_period_cycles" : 10000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json new file mode 100644 index 00000000..fad63cc3 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json @@ -0,0 +1,20 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "dram_num_partitions" : 1, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json new file mode 100644 index 00000000..2207f2b9 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json @@ -0,0 +1,18 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + 
"dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json new file mode 100644 index 00000000..76f51b40 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json new file mode 100644 index 00000000..42e003c7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json @@ -0,0 +1,25 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +} diff --git 
a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json new file mode 100644 index 00000000..44ec72fe --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1050, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 4, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :1200, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "l2d_type" : "datacache", + "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1050, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json new file mode 100644 index 00000000..045407b7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json new file mode 100644 index 00000000..d8f95d70 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" 
:800, + "dram_channels": 2, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json new file mode 100644 index 00000000..a5fa9585 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 4, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_booksim.json b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json new file mode 100644 index 00000000..cf560171 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "booksim2", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json new file mode 100644 index 00000000..8da61d72 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz"
: 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json new file mode 100644 index 00000000..c5f429f9 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json @@ -0,0 +1,18 @@ +{ + "core_type" : ["ws_mesh","ws_mesh"], + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json new file mode 100644 index 00000000..254520be --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 2, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json new
file mode 100644 index 00000000..e39867a7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 4, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/PyTorchSimBackend/extern/booksim b/TOGSim/extern/booksim similarity index 100% rename from PyTorchSimBackend/extern/booksim rename to TOGSim/extern/booksim diff --git a/PyTorchSimBackend/extern/onnx b/TOGSim/extern/onnx similarity index 100% rename from PyTorchSimBackend/extern/onnx rename to TOGSim/extern/onnx diff --git a/PyTorchSimBackend/extern/protobuf b/TOGSim/extern/protobuf similarity index 100% rename from PyTorchSimBackend/extern/protobuf rename to TOGSim/extern/protobuf diff --git a/PyTorchSimBackend/extern/ramulator2 b/TOGSim/extern/ramulator2 similarity index 100% rename from PyTorchSimBackend/extern/ramulator2 rename to TOGSim/extern/ramulator2 diff --git a/PyTorchSimBackend/extern/ramulator_custom/.gitignore b/TOGSim/extern/ramulator_custom/.gitignore similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/.gitignore rename to TOGSim/extern/ramulator_custom/.gitignore diff --git a/PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt b/TOGSim/extern/ramulator_custom/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt rename to TOGSim/extern/ramulator_custom/CMakeLists.txt diff --git a/PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp b/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp similarity index 100% rename from
PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp rename to TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp b/TOGSim/extern/ramulator_custom/src/Config.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp rename to TOGSim/extern/ramulator_custom/src/Config.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.h b/TOGSim/extern/ramulator_custom/src/Config.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.h rename to TOGSim/extern/ramulator_custom/src/Config.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Controller.h b/TOGSim/extern/ramulator_custom/src/Controller.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Controller.h rename to TOGSim/extern/ramulator_custom/src/Controller.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp b/TOGSim/extern/ramulator_custom/src/DDR4.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp rename to TOGSim/extern/ramulator_custom/src/DDR4.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h b/TOGSim/extern/ramulator_custom/src/DDR4.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h rename to TOGSim/extern/ramulator_custom/src/DDR4.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h b/TOGSim/extern/ramulator_custom/src/DRAM.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h rename to TOGSim/extern/ramulator_custom/src/DRAM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp b/TOGSim/extern/ramulator_custom/src/HBM.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp rename to TOGSim/extern/ramulator_custom/src/HBM.cpp diff --git 
a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.h b/TOGSim/extern/ramulator_custom/src/HBM.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.h rename to TOGSim/extern/ramulator_custom/src/HBM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Memory.h b/TOGSim/extern/ramulator_custom/src/Memory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Memory.h rename to TOGSim/extern/ramulator_custom/src/Memory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp b/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h b/TOGSim/extern/ramulator_custom/src/MemoryFactory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp b/TOGSim/extern/ramulator_custom/src/Ramulator.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp rename to TOGSim/extern/ramulator_custom/src/Ramulator.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp b/TOGSim/extern/ramulator_custom/src/Refresh.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp rename to TOGSim/extern/ramulator_custom/src/Refresh.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h b/TOGSim/extern/ramulator_custom/src/Refresh.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h rename to TOGSim/extern/ramulator_custom/src/Refresh.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp b/TOGSim/extern/ramulator_custom/src/Request.cpp 
similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp rename to TOGSim/extern/ramulator_custom/src/Request.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.h b/TOGSim/extern/ramulator_custom/src/Request.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.h rename to TOGSim/extern/ramulator_custom/src/Request.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h b/TOGSim/extern/ramulator_custom/src/Scheduler.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h rename to TOGSim/extern/ramulator_custom/src/Scheduler.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h b/TOGSim/extern/ramulator_custom/src/SpeedyController.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h rename to TOGSim/extern/ramulator_custom/src/SpeedyController.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp b/TOGSim/extern/ramulator_custom/src/StatType.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp rename to TOGSim/extern/ramulator_custom/src/StatType.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.h b/TOGSim/extern/ramulator_custom/src/StatType.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.h rename to TOGSim/extern/ramulator_custom/src/StatType.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h b/TOGSim/extern/ramulator_custom/src/Statistics.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h rename to TOGSim/extern/ramulator_custom/src/Statistics.h diff --git a/PyTorchSimBackend/extern/stonneCore b/TOGSim/extern/stonneCore similarity index 100% rename from PyTorchSimBackend/extern/stonneCore rename to TOGSim/extern/stonneCore diff --git 
a/PyTorchSimBackend/include/Cache.h b/TOGSim/include/Cache.h similarity index 100% rename from PyTorchSimBackend/include/Cache.h rename to TOGSim/include/Cache.h diff --git a/PyTorchSimBackend/include/Cache_defs.h b/TOGSim/include/Cache_defs.h similarity index 100% rename from PyTorchSimBackend/include/Cache_defs.h rename to TOGSim/include/Cache_defs.h diff --git a/PyTorchSimBackend/include/Cache_stats.h b/TOGSim/include/Cache_stats.h similarity index 100% rename from PyTorchSimBackend/include/Cache_stats.h rename to TOGSim/include/Cache_stats.h diff --git a/PyTorchSimBackend/include/Common.h b/TOGSim/include/Common.h similarity index 100% rename from PyTorchSimBackend/include/Common.h rename to TOGSim/include/Common.h diff --git a/PyTorchSimBackend/include/Core.h b/TOGSim/include/Core.h similarity index 85% rename from PyTorchSimBackend/include/Core.h rename to TOGSim/include/Core.h index a3d55fa2..e4d2f30a 100644 --- a/PyTorchSimBackend/include/Core.h +++ b/TOGSim/include/Core.h @@ -9,7 +9,7 @@ #include "Dram.h" #include "Tile.h" #include "SimulationConfig.h" -#include "TMA.h" +#include "DMA.h" class Core { public: @@ -27,9 +27,9 @@ class Core { virtual void pop_memory_request(); virtual mem_fetch* top_memory_request() { return _request_queue.front(); } virtual void push_memory_response(mem_fetch* response); - void check_tag() { _tma.check_table(); } - void inc_numa_hit() { _stat_numa_hit++; } - void inc_numa_miss() { _stat_numa_miss++; } + void check_tag() { _dma.check_table(); } + void inc_numa_local_access() { _stat_numa_local_access++; } + void inc_numa_remote_access() { _stat_numa_remote_access++; } std::queue>& get_compute_pipeline(int compute_type); enum { @@ -50,20 +50,18 @@ class Core { /* Core id & config file */ const uint32_t _id; const SimulationConfig _config; - size_t _sram_size; - size_t _used_sram_size; uint32_t _num_systolic_array_per_core; uint32_t _systolic_array_rr = 0; - /* TMA Unit */ - TMA _tma; + /* DMA Unit */ + DMA _dma; /* cycle */ 
cycle_type _core_cycle; cycle_type _stat_tot_vu_compute_cycle = 0; std::vector _stat_tot_sa_compute_cycle; - cycle_type _stat_tot_tma_cycle = 0; - cycle_type _stat_tot_tma_idle_cycle = 0; + cycle_type _stat_tot_dma_cycle = 0; + cycle_type _stat_tot_dma_idle_cycle = 0; cycle_type _stat_tot_vu_compute_idle_cycle = 0; std::vector _stat_tot_sa_compute_idle_cycle; std::vector _stat_inst_count; @@ -71,13 +69,13 @@ class Core { uint64_t _stat_tot_mem_response = 0; uint64_t _stat_gemm_inst = 0; uint64_t _stat_skip_dma = 0; - uint64_t _stat_numa_hit = 0; - uint64_t _stat_numa_miss = 0; + uint64_t _stat_numa_local_access = 0; + uint64_t _stat_numa_remote_access = 0; cycle_type _stat_vu_compute_cycle = 0; std::vector _stat_sa_compute_cycle; - cycle_type _stat_tma_cycle = 0; - cycle_type _stat_tma_idle_cycle = 0; + cycle_type _stat_dma_cycle = 0; + cycle_type _stat_dma_idle_cycle = 0; cycle_type _stat_vu_compute_idle_cycle = 0; std::vector _stat_sa_compute_idle_cycle; uint64_t _stat_mem_response = 0; diff --git a/PyTorchSimBackend/include/TMA.h b/TOGSim/include/DMA.h similarity index 94% rename from PyTorchSimBackend/include/TMA.h rename to TOGSim/include/DMA.h index f8355470..2f41c6f3 100644 --- a/PyTorchSimBackend/include/TMA.h +++ b/TOGSim/include/DMA.h @@ -1,8 +1,9 @@ -#ifndef TMA_H -#define TMA_H +#ifndef DMA_H +#define DMA_H #include #include +#include #include #include #include "Instruction.h" @@ -16,9 +17,9 @@ struct VectorCompare { } }; -class TMA { +class DMA { public: - TMA(uint32_t id, uint32_t dram_req_size); + DMA(uint32_t id, uint32_t dram_req_size); void issue_tile(std::shared_ptr inst); bool is_finished() { return _finished; } @@ -114,7 +115,7 @@ class TMA { } std::shared_ptr& get_current_inst() { return _current_inst; } - std::shared_ptr> get_memory_access(); + std::shared_ptr> get_memory_access(cycle_type core_cycle, int nr_req); uint32_t generate_mem_access_id(); const uint32_t get_max_dim() { return _max_dim; } @@ -130,5 +131,7 @@ class TMA { bool 
_finished=true; std::map, uint32_t>> tag_table; std::map, std::vector>>> waiters; + std::queue _pending_accesses; + bool _generated_once = false; }; #endif \ No newline at end of file diff --git a/PyTorchSimBackend/include/DelayQueue.h b/TOGSim/include/DelayQueue.h similarity index 100% rename from PyTorchSimBackend/include/DelayQueue.h rename to TOGSim/include/DelayQueue.h diff --git a/PyTorchSimBackend/include/Dram.h b/TOGSim/include/Dram.h similarity index 99% rename from PyTorchSimBackend/include/Dram.h rename to TOGSim/include/Dram.h index 5e51b96d..d28ac25f 100644 --- a/PyTorchSimBackend/include/Dram.h +++ b/TOGSim/include/Dram.h @@ -6,7 +6,7 @@ #include #include "Common.h" -#include "TMA.h" +#include "DMA.h" #include "ramulator2.hh" #include "Hashing.h" #include "Cache.h" diff --git a/PyTorchSimBackend/include/Hashing.h b/TOGSim/include/Hashing.h similarity index 100% rename from PyTorchSimBackend/include/Hashing.h rename to TOGSim/include/Hashing.h diff --git a/PyTorchSimBackend/include/Instruction.h b/TOGSim/include/Instruction.h similarity index 96% rename from PyTorchSimBackend/include/Instruction.h rename to TOGSim/include/Instruction.h index 4c14dd81..9fad13f4 100644 --- a/PyTorchSimBackend/include/Instruction.h +++ b/TOGSim/include/Instruction.h @@ -60,9 +60,7 @@ class Instruction : public std::enable_shared_from_this { std::vector get_trace_address() { return _trace_address; } bool load_indirect_index(const std::string& path, uint64_t*& indirect_index, const std::vector& tile_size); void set_trace_address(std::vector& trace_address) { _trace_address = trace_address; } - size_t get_free_sram_size() { return _free_sram_size; } addr_type get_base_dram_address() { return dram_addr; } - void set_free_sram_size(size_t sram_size) { _free_sram_size=sram_size; } void* get_owner() { return _owner; } void set_owner(void *owner) { _owner = owner;} void set_owner_ready_queue(std::list>* q) { _owner_ready_queue_ref = q; } @@ -103,7 +101,6 @@ class Instruction : 
public std::enable_shared_from_this { size_t _tile_numel; size_t _nr_waiting_request=0; size_t _precision=0; - size_t _free_sram_size=0; addr_type dram_addr; uint32_t _numa_id = 0; // For DMA instruction int _compute_type = 0; diff --git a/PyTorchSimBackend/include/Interconnect.h b/TOGSim/include/Interconnect.h similarity index 95% rename from PyTorchSimBackend/include/Interconnect.h rename to TOGSim/include/Interconnect.h index 8467b7aa..e6b325d0 100644 --- a/PyTorchSimBackend/include/Interconnect.h +++ b/TOGSim/include/Interconnect.h @@ -1,6 +1,6 @@ #ifndef INTERCONNECT_H #define INTERCONNECT_H -#include "TMA.h" +#include "DMA.h" #include "booksim2/Interconnect.hpp" #include #include @@ -51,8 +51,9 @@ class SimpleInterconnect : public Interconnect { mem_fetch* access; }; - std::vector> _in_buffers; + std::vector>> _in_buffers; std::vector> _out_buffers; + std::vector _rr_next_src; std::vector _busy_node; }; diff --git a/PyTorchSimBackend/include/IntervalTree.h b/TOGSim/include/IntervalTree.h similarity index 100% rename from PyTorchSimBackend/include/IntervalTree.h rename to TOGSim/include/IntervalTree.h diff --git a/PyTorchSimBackend/include/L2Cache.h b/TOGSim/include/L2Cache.h similarity index 100% rename from PyTorchSimBackend/include/L2Cache.h rename to TOGSim/include/L2Cache.h diff --git a/PyTorchSimBackend/include/Memfetch.h b/TOGSim/include/Memfetch.h similarity index 100% rename from PyTorchSimBackend/include/Memfetch.h rename to TOGSim/include/Memfetch.h diff --git a/PyTorchSimBackend/include/Model.h b/TOGSim/include/Model.h similarity index 100% rename from PyTorchSimBackend/include/Model.h rename to TOGSim/include/Model.h diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h similarity index 82% rename from PyTorchSimBackend/include/SimulationConfig.h rename to TOGSim/include/SimulationConfig.h index 06a41c9f..64cfa223 100644 --- a/PyTorchSimBackend/include/SimulationConfig.h +++ 
b/TOGSim/include/SimulationConfig.h @@ -18,8 +18,7 @@ struct SimulationConfig { std::vector core_type; std::string stonne_config_path; uint32_t num_cores; - uint32_t core_freq; - uint32_t sram_size; + uint32_t core_freq_mhz; uint32_t core_print_interval = 0; uint32_t num_systolic_array_per_core = 1; uint32_t num_stonne_per_core = 1; @@ -28,7 +27,8 @@ struct SimulationConfig { /* DRAM config */ DramType dram_type; uint32_t dram_num_partitions = 1; - uint32_t dram_freq; + uint32_t dram_channels_per_partitions = 0; + uint32_t dram_freq_mhz; uint32_t dram_channels; uint32_t dram_req_size; uint32_t dram_latency; @@ -43,11 +43,11 @@ struct SimulationConfig { /* ICNT config */ IcntType icnt_type; - uint32_t icnt_node_per_core = 1; + uint32_t icnt_injection_ports_per_core = 1; std::string icnt_config_path; - uint32_t icnt_freq; + uint32_t icnt_freq_mhz; uint32_t icnt_latency; - uint32_t icnt_print_interval=0; + uint32_t icnt_stats_print_period_cycles=0; /* Sheduler config */ uint32_t num_partition=1; @@ -57,7 +57,6 @@ struct SimulationConfig { std::map partiton_map; /* Other configs */ - uint32_t precision; std::string layout; uint64_t align_address(uint64_t addr) { @@ -65,6 +64,6 @@ struct SimulationConfig { } float max_dram_bandwidth() { - return dram_freq * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s + return dram_freq_mhz * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s } }; \ No newline at end of file diff --git a/PyTorchSimBackend/include/Simulator.h b/TOGSim/include/Simulator.h similarity index 100% rename from PyTorchSimBackend/include/Simulator.h rename to TOGSim/include/Simulator.h diff --git a/PyTorchSimBackend/include/SparseCore.h b/TOGSim/include/SparseCore.h similarity index 100% rename from PyTorchSimBackend/include/SparseCore.h rename to TOGSim/include/SparseCore.h diff --git a/PyTorchSimBackend/include/Tile.h b/TOGSim/include/Tile.h similarity index 100% rename from PyTorchSimBackend/include/Tile.h rename to 
TOGSim/include/Tile.h diff --git a/PyTorchSimBackend/include/TileGraph.h b/TOGSim/include/TileGraph.h similarity index 100% rename from PyTorchSimBackend/include/TileGraph.h rename to TOGSim/include/TileGraph.h diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h similarity index 100% rename from PyTorchSimBackend/include/TileGraphParser.h rename to TOGSim/include/TileGraphParser.h diff --git a/PyTorchSimBackend/include/scheduler/Scheduler.h b/TOGSim/include/scheduler/Scheduler.h similarity index 100% rename from PyTorchSimBackend/include/scheduler/Scheduler.h rename to TOGSim/include/scheduler/Scheduler.h diff --git a/PyTorchSimBackend/src/CMakeLists.txt b/TOGSim/src/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/src/CMakeLists.txt rename to TOGSim/src/CMakeLists.txt diff --git a/PyTorchSimBackend/src/Cache.cc b/TOGSim/src/Cache.cc similarity index 100% rename from PyTorchSimBackend/src/Cache.cc rename to TOGSim/src/Cache.cc diff --git a/PyTorchSimBackend/src/Cache_stats.cc b/TOGSim/src/Cache_stats.cc similarity index 100% rename from PyTorchSimBackend/src/Cache_stats.cc rename to TOGSim/src/Cache_stats.cc diff --git a/PyTorchSimBackend/src/Common.cc b/TOGSim/src/Common.cc similarity index 74% rename from PyTorchSimBackend/src/Common.cc rename to TOGSim/src/Common.cc index 687f32f5..b5c092b3 100644 --- a/PyTorchSimBackend/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -39,15 +39,14 @@ SimulationConfig initialize_config(json config) { for (int i=0; i(config, "core_print_interval"); + parsed_config.core_print_interval = get_config_value(config, "core_stats_print_period_cycles"); /* Stonne config */ if (config.contains("stonne_config_path")) @@ -63,20 +62,27 @@ SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented dram type {} ", (std::string)config["dram_type"])); - parsed_config.dram_freq = config["dram_freq"]; + parsed_config.dram_freq_mhz = 
config["dram_freq_mhz"]; if (config.contains("dram_latency")) parsed_config.dram_latency = config["dram_latency"]; - if (config.contains("dram_config_path")) - parsed_config.dram_config_path = config["dram_config_path"]; + if (config.contains("ramulator_config_path")) + parsed_config.dram_config_path = config["ramulator_config_path"]; parsed_config.dram_channels = config["dram_channels"]; - if (config.contains("dram_req_size")) - parsed_config.dram_req_size = config["dram_req_size"]; - if (config.contains("dram_print_interval")) - parsed_config.dram_print_interval = config["dram_print_interval"]; - if(config.contains("dram_nbl")) - parsed_config.dram_nbl = config["dram_nbl"]; - if (config.contains("dram_num_partitions")) + if (config.contains("dram_req_size_byte")) + parsed_config.dram_req_size = config["dram_req_size_byte"]; + if (config.contains("dram_stats_print_period_cycles")) + parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"]; + if(config.contains("dram_num_burst_length")) + parsed_config.dram_nbl = config["dram_num_burst_length"]; + if (config.contains("dram_num_partitions")) { parsed_config.dram_num_partitions = config["dram_num_partitions"]; + if (parsed_config.dram_channels % parsed_config.dram_num_partitions != 0) { + throw std::runtime_error("[Config] DRAM channels must be divisible by dram_num_partitions"); + } + } + parsed_config.dram_channels_per_partitions = + parsed_config.dram_channels / parsed_config.dram_num_partitions; + /* L2D config */ if (config.contains("l2d_type")) { @@ -104,17 +110,18 @@ SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented icnt type {} ", (std::string)config["icnt_type"])); - parsed_config.icnt_freq = config["icnt_freq"]; + parsed_config.icnt_freq_mhz = config["icnt_freq_mhz"]; if (config.contains("icnt_latency")) parsed_config.icnt_latency = config["icnt_latency"]; - if (config.contains("icnt_config_path")) - 
parsed_config.icnt_config_path = config["icnt_config_path"]; - if (config.contains("icnt_print_interval")) - parsed_config.icnt_print_interval = config["icnt_print_interval"]; - if (config.contains("icnt_node_per_core")) - parsed_config.icnt_node_per_core = config["icnt_node_per_core"]; + if (config.contains("booksim_config_path")) + parsed_config.icnt_config_path = config["booksim_config_path"]; + if (config.contains("icnt_stats_print_period_cycles")) + parsed_config.icnt_stats_print_period_cycles = config["icnt_stats_print_period_cycles"]; + if (config.contains("icnt_injection_ports_per_core")) + parsed_config.icnt_injection_ports_per_core = config["icnt_injection_ports_per_core"]; - parsed_config.scheduler_type = config["scheduler"]; + if (config.contains("scheduler")) + parsed_config.scheduler_type = config["scheduler"]; if (config.contains("num_partition")) parsed_config.num_partition = config["num_partition"]; if (config.contains("partition")) { diff --git a/PyTorchSimBackend/src/Core.cc b/TOGSim/src/Core.cc similarity index 71% rename from PyTorchSimBackend/src/Core.cc rename to TOGSim/src/Core.cc index 4be41a70..30858193 100644 --- a/PyTorchSimBackend/src/Core.cc +++ b/TOGSim/src/Core.cc @@ -4,11 +4,9 @@ Core::Core(uint32_t id, SimulationConfig config) : _id(id), _config(config), _core_cycle(0), - _stat_tma_cycle(0), + _stat_dma_cycle(0), _num_systolic_array_per_core(config.num_systolic_array_per_core), - _tma(id, config.dram_req_size) { - _sram_size = _config.sram_size * 1024; - _used_sram_size = 0; + _dma(id, config.dram_req_size) { _sa_compute_pipeline.resize(_num_systolic_array_per_core); _stat_tot_sa_compute_cycle.resize(_num_systolic_array_per_core); _stat_sa_compute_cycle.resize(_num_systolic_array_per_core); @@ -25,14 +23,9 @@ bool Core::can_issue(const std::shared_ptr& op) { void Core::issue(std::shared_ptr op) { if (op->get_instructions().size()){ - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}, Free size: {}", 
- _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size(), - op->get_instructions().back()->get_free_sram_size()); - } else { - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}", - _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size()); + spdlog::trace("[{}][Core {}][TILE_SCHEDULED]", + _core_cycle, _id); } - //_used_sram_size += op->get_required_sram_size(); for (const auto& inst : op->get_instructions()) { if (inst->is_ready()) op->enqueue_ready(inst); @@ -125,39 +118,38 @@ void Core::dma_cycle() { /* Set tag table of async dma load */ if (instruction->is_dma_read() && instruction->is_async_dma()) { auto& key = instruction->get_tag_id(); - assert(!_tma.get_tag_finish(instruction->subgraph_id, key)); - _tma.set_tag_finish(instruction->subgraph_id, key); - spdlog::trace("[Core {}][{}] {} ASYNC FINISHED, Used sram: {}, Release sram: {}, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(instruction->get_opcode()), - _used_sram_size, instruction->get_free_sram_size(), + assert(!_dma.get_tag_finish(instruction->subgraph_id, key)); + _dma.set_tag_finish(instruction->subgraph_id, key); + spdlog::trace("[{}][Core {}] {} ASYNC FINISHED, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(instruction->get_opcode()), instruction->subgraph_id, instruction->get_addr_name(), fmt::format("[{}]", fmt::join(instruction->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", "))); - for (auto & wait_inst : _tma.get_tag_waiter(instruction->subgraph_id, key)) { - _tma.mark_tag_used(instruction->subgraph_id, key); + for (auto & wait_inst : _dma.get_tag_waiter(instruction->subgraph_id, key)) { + _dma.mark_tag_used(instruction->subgraph_id, key); finish_instruction(wait_inst); } 
} _dma_finished_queue.erase(_dma_finished_queue.begin()); } - if (_tma.is_finished()) { + if (_dma.is_finished()) { /* Finish instruction when it is DMA store */ - if (_tma.get_current_inst() != nullptr) { - std::shared_ptr finished_inst = std::move(_tma.get_current_inst()); + if (_dma.get_current_inst() != nullptr) { + std::shared_ptr finished_inst = std::move(_dma.get_current_inst()); if (finished_inst->is_dma_write()) { /* Only DMA write operation is finished! */ finish_instruction(finished_inst); } else if (finished_inst->is_dma_read() && finished_inst->is_async_dma()) { /* Register tag table for async dma load */ - _tma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); + _dma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); finish_instruction(finished_inst); } else if(!finished_inst->is_dma_read()) { - spdlog::error("[Core {}][{}] TMA instruction in not valid", _id, _core_cycle); + spdlog::error("[{}][Core {}] DMA instruction is not valid", _core_cycle, _id); exit(EXIT_FAILURE); } else if (finished_inst->get_opcode() == Opcode::BAR) { - spdlog::trace("[Core {}][{}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(), fmt::format("[{}]", fmt::join(finished_inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(finished_inst->get_tag_idx_list(), ", ")), @@ -170,27 +162,27 @@ void Core::dma_cycle() { /* Issue new DMA operation */ if (!_ld_inst_queue.empty()) { std::shared_ptr inst = _ld_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _ld_inst_queue.pop(); } else if (!_st_inst_queue.empty()) { std::shared_ptr inst = _st_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _st_inst_queue.pop(); } else { - /* TMA is idle */ - 
_stat_tma_idle_cycle++; + /* DMA is idle */ + _stat_dma_idle_cycle++; return; } } /* Generate memfetch */ - auto access_vec = _tma.get_memory_access(); + auto access_vec = _dma.get_memory_access(_core_cycle, _config.icnt_injection_ports_per_core); for (auto access : *access_vec) { access->set_start_cycle(_core_cycle); _request_queue.push(access); } - /* Increase tma stat cycle */ - _stat_tma_cycle++; + /* Increase dma stat cycle */ + _stat_dma_cycle++; } void Core::cycle() { @@ -218,20 +210,20 @@ void Core::cycle() { /* Check another MOVIN with same tag is issued */ auto& key = inst->get_tag_id(); if (inst->is_sparse_inst()) { - _tma.register_tag(inst->subgraph_id, key); - _tma.set_tag_sparse(inst->subgraph_id, key); + _dma.register_tag(inst->subgraph_id, key); + _dma.set_tag_sparse(inst->subgraph_id, key); finish_instruction(inst); issued = true; _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; - } else if (inst->is_async_dma() && _tma.tag_key_exist(inst->subgraph_id, key)) { - bool finished = _tma.get_tag_finish(inst->subgraph_id, key); + } else if (inst->is_async_dma() && _dma.tag_key_exist(inst->subgraph_id, key)) { + bool finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished) finish_instruction(inst); else - _tma.register_tag_waiter(inst->subgraph_id, key, inst); - spdlog::trace("[Core {}][{}] {} SKIPPED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + _dma.register_tag_waiter(inst->subgraph_id, key, inst); + spdlog::trace("[{}][Core {}][SKIPPED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), + inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -240,8 +232,8 @@ void Core::cycle() { 
_stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; } else { - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -252,8 +244,12 @@ void Core::cycle() { } } case Opcode::MOVOUT: - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), + inst->get_addr_name(), + fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); _st_inst_queue.push(inst); issued = true; break; @@ -269,13 +265,14 @@ void Core::cycle() { inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle; inst->bubble_cycle = bubble_cycle; } + if (inst->get_compute_cycle() == 0) { inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; instructions.erase(it); } else { - spdlog::trace("[Core {}][SA {}][{}] {}-{} ISSUED, finsh at {}", _id, _systolic_array_rr, _core_cycle, + spdlog::trace("[{}][Core {}][INST_ISSUED][SA {}] {}-{}, finish at {}", _core_cycle, _id, _systolic_array_rr, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle); target_pipeline.push(inst); issued =
true; @@ -288,7 +285,7 @@ void Core::cycle() { case Opcode::BAR: { auto& key = inst->get_tag_id(); - uint32_t finished = _tma.get_tag_finish(inst->subgraph_id, key); + uint32_t finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished == -1) { for (auto child_inst : inst->get_child_inst()) { if (child_inst->get_opcode() == Opcode::COMP && child_inst->get_compute_type() == MATMUL) { @@ -297,12 +294,12 @@ void Core::cycle() { } finish_instruction(inst); } else if (finished != 0) { - _tma.mark_tag_used(inst->subgraph_id, key); + _dma.mark_tag_used(inst->subgraph_id, key); finish_instruction(inst); } else { - _tma.register_tag_waiter(inst->subgraph_id, key, inst); + _dma.register_tag_waiter(inst->subgraph_id, key, inst); } - spdlog::trace("[Core {}][{}] {} ISSUED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -344,31 +341,26 @@ void Core::cycle() { } void Core::finish_instruction(std::shared_ptr& inst) { - size_t free_sram_size = inst->get_free_sram_size(); if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already finished!!", _id, _core_cycle, + spdlog::error("[{}][Core {}][ERROR] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::trace("[Core {}][{}] {}-{} FINISHED, Used sram: {}, Release sram: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), - _used_sram_size, inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}][INST_FINISHED] {}-{}", + _core_cycle, _id, 
opcode_to_string(inst->get_opcode()), inst->get_compute_type()); } else if (inst->get_opcode() != Opcode::BAR && inst->is_async_dma()){ - spdlog::trace("[Core {}][{}] {} ASYNC REGISTERED, Used sram: {}, Release sram: {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), _used_sram_size, - inst->get_free_sram_size(), inst->subgraph_id, inst->get_addr_name(), + spdlog::trace("[{}][Core {}][ASYNC] {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->subgraph_id, inst->get_addr_name(), inst->get_tag_id(), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); } else if ((inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) && !inst->is_async_dma()) { - spdlog::trace("[Core {}][{}] {} FINISHED, free_sram_size: {} addr_name: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), - inst->get_addr_name()); + spdlog::trace("[{}][Core {}][INST_FINISHED] {} addr_name: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name()); } - //_used_sram_size -= free_sram_size; } bool Core::running() { @@ -378,7 +370,7 @@ bool Core::running() { for (int i=0; i<_num_systolic_array_per_core;i++) running = running || !_sa_compute_pipeline.at(i).empty(); running = running || !_dma_waiting_queue.empty() || !_dma_finished_queue.empty(); - running = running || !_tma.empty(); + running = running || !_dma.empty(); running = running || !_ld_inst_queue.empty(); running = running || !_st_inst_queue.empty(); return running; @@ -419,43 +411,62 @@ void Core::print_stats() { std::vector sa_utilization; update_stats(); spdlog::info("===== Instructions count ====="); - for (int i=0; i < static_cast(Opcode::COUNT); i++) { - if (i == static_cast(Opcode::COMP)) - 
spdlog::info("Core [{}] : {} inst count {} (GEMM: {}, Vector: {}), skipped inst count {}", _id, opcode_to_string(static_cast(i)), _stat_inst_count.at(i), _stat_gemm_inst, _stat_inst_count.at(i) - _stat_gemm_inst, _stat_tot_skipped_inst.at(i)); - else - spdlog::info("Core [{}] : {} inst count {}, skipped inst count {}", _id, opcode_to_string(static_cast(i)), _stat_inst_count.at(i), _stat_tot_skipped_inst.at(i)); + for (int i = 0; i < static_cast(Opcode::COUNT); i++) { + auto opcode = static_cast(i); + auto inst = _stat_inst_count.at(i); + auto skipped = _stat_tot_skipped_inst.at(i); + auto name = opcode_to_string(opcode); + + if (opcode == Opcode::COMP) { + auto gemm = _stat_gemm_inst; + auto vector = inst - gemm; + if (skipped) + spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {}), skipped inst_count {}", + _id, name, inst, gemm, vector, skipped); + else + spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {})", + _id, name, inst, gemm, vector); + } + else { + if (skipped) + spdlog::info("Core [{}] : {:8} inst_count {}, skipped inst_count {}", + _id, name, inst, skipped); + else + spdlog::info("Core [{}] : {:8} inst_count {}", + _id, name, inst); + } } spdlog::info("========= Core stat ========="); for (int i=0; i<_num_systolic_array_per_core; i++) sa_utilization.push_back(static_cast(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i), _stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i)); - float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq / (_core_cycle * 1000); // B/cycle - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", 
_id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle, dram_bw, _stat_tot_mem_response); - spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, + float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq_mhz / (_core_cycle * 1000); // B/cycle + spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response); + spdlog::info("Core [{}] : Vector unit utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, static_cast(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Numa hit count : {}, Numa miss count : {}", _id, _stat_numa_hit, _stat_numa_miss); - spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("Core [{}] : NUMA local memory: {} requests, remote memory: {} requests", _id, _stat_numa_local_access, _stat_numa_remote_access); + spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); } void Core::print_current_stats() { std::vector sa_utilization; for (int i=0; i<_num_systolic_array_per_core; i++) sa_utilization.push_back(static_cast(_stat_sa_compute_cycle.at(i) * 100) / _config.core_print_interval); - float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq / (_config.core_print_interval * 1000); // B/cycle + float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq_mhz / (_config.core_print_interval * 1000); // B/cycle auto level = spdlog::level::info; if(_id != 0) level = spdlog::level::debug; spdlog::info("========= Core stat ========="); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles
{}, idle_cycles {}", _id, i, sa_utilization.at(i), _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i)); - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tma_cycle, _stat_tma_idle_cycle, dram_bw, _stat_mem_response); - spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, + spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response); + spdlog::info("Core [{}] : Vector unit Utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, static_cast(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); update_stats(); } @@ -468,13 +479,13 @@ void Core::update_stats() { } _stat_tot_vu_compute_cycle += _stat_vu_compute_cycle; - _stat_tot_tma_cycle += _stat_tma_cycle; - _stat_tot_tma_idle_cycle += _stat_tma_idle_cycle; + _stat_tot_dma_cycle += _stat_dma_cycle; + _stat_tot_dma_idle_cycle += _stat_dma_idle_cycle; _stat_tot_mem_response += +_stat_mem_response; _stat_vu_compute_cycle = 0; - _stat_tma_cycle = 0; - _stat_tma_idle_cycle = 0; + _stat_dma_cycle = 0; + _stat_dma_idle_cycle = 0; _stat_vu_compute_idle_cycle = 0; _stat_mem_response = 0; } \ No newline at end of file diff --git a/TOGSim/src/DMA.cc b/TOGSim/src/DMA.cc new file mode 100644 index 00000000..f8f21025 --- /dev/null +++ b/TOGSim/src/DMA.cc @@ -0,0 +1,83 @@ +#include "DMA.h" +#include "TileGraph.h" + +DMA::DMA(uint32_t id, uint32_t dram_req_size) { + _id = id; + _dram_req_size = dram_req_size; + _current_inst = nullptr; + _finished = true; +} + +void DMA::issue_tile(std::shared_ptr inst) { + _current_inst = std::move(inst); + std::vector& tile_size = _current_inst->get_tile_size(); + if 
(tile_size.size() <= 0 || tile_size.size() > get_max_dim()) { + spdlog::error("[DMA {}] issued tile is not supported format..", _id); + exit(EXIT_FAILURE); + } + _finished = false; +} + +std::shared_ptr> DMA::get_memory_access(cycle_type core_cycle, int nr_req) { + + if (!_generated_once) { + std::shared_ptr> addr_set = + _current_inst->get_dram_address(_dram_req_size); + + Tile* owner = (Tile*)_current_inst->get_owner(); + std::shared_ptr owner_subgraph = owner->get_owner(); + unsigned long long base_daddr = _current_inst->get_base_dram_address(); + + bool is_cacheable = + owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); + + spdlog::trace("[{}][Core {}][SRAM] Address: 0x{:016x}, Is_cacheable: {}", + core_cycle, _id, base_daddr, is_cacheable); + spdlog::trace("[{}][Core {}][NUMA] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", + core_cycle, _id, owner_subgraph->get_core_id(), + _current_inst->get_numa_id(), _current_inst->get_addr_name(), + _current_inst->is_dma_write()); + for (const auto& addr : *addr_set) { + mem_access_type acc_type = + _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W + : mem_access_type::GLOBAL_ACC_R; + mf_type type = + _current_inst->is_dma_write() ? 
mf_type::WRITE_REQUEST + : mf_type::READ_REQUEST; + + mem_fetch* access = new mem_fetch( + addr, acc_type, type, _dram_req_size, + _current_inst->get_numa_id(), + static_cast(_current_inst.get())); + + access->set_cacheable(is_cacheable); + _current_inst->inc_waiting_request(); + _pending_accesses.push(access); + } + _generated_once = true; + } + + if (nr_req == -1) + nr_req = _pending_accesses.size(); + + // Return pending accesses up to nr_req + auto access_vec = std::make_shared>(); + for (int i = 0; i < nr_req; i++) { + if (_pending_accesses.empty()) + break; + access_vec->push_back(_pending_accesses.front()); + _pending_accesses.pop(); + } + + if (_pending_accesses.empty()) { + _finished = true; + _generated_once = false; + } + + return access_vec; +} + +uint32_t DMA::generate_mem_access_id() { + static uint32_t id_counter{0}; + return id_counter++; +} \ No newline at end of file diff --git a/PyTorchSimBackend/src/DelayQueue.cc b/TOGSim/src/DelayQueue.cc similarity index 100% rename from PyTorchSimBackend/src/DelayQueue.cc rename to TOGSim/src/DelayQueue.cc diff --git a/PyTorchSimBackend/src/Dram.cc b/TOGSim/src/Dram.cc similarity index 97% rename from PyTorchSimBackend/src/Dram.cc rename to TOGSim/src/Dram.cc index ab074bda..089c582e 100644 --- a/PyTorchSimBackend/src/Dram.cc +++ b/TOGSim/src/Dram.cc @@ -17,10 +17,10 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _n_bl = config.dram_nbl; _req_size = config.dram_req_size; _n_partitions = config.dram_num_partitions; - _n_ch_per_partition = _n_ch / _n_partitions; + _n_ch_per_partition = config.dram_channels_per_partitions; _config = config; - spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}", config.max_dram_bandwidth(), config.dram_freq, _n_ch, _req_size); + spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size); /* Initialize DRAM 
Channels */ for (int ch = 0; ch < _n_ch; ch++) { m_to_crossbar_queue.push_back(std::queue()); diff --git a/PyTorchSimBackend/src/Hashing.cc b/TOGSim/src/Hashing.cc similarity index 100% rename from PyTorchSimBackend/src/Hashing.cc rename to TOGSim/src/Hashing.cc diff --git a/PyTorchSimBackend/src/Instruction.cc b/TOGSim/src/Instruction.cc similarity index 100% rename from PyTorchSimBackend/src/Instruction.cc rename to TOGSim/src/Instruction.cc diff --git a/PyTorchSimBackend/src/Interconnect.cc b/TOGSim/src/Interconnect.cc similarity index 77% rename from PyTorchSimBackend/src/Interconnect.cc rename to TOGSim/src/Interconnect.cc index 8a684ff7..ab2d5d89 100644 --- a/PyTorchSimBackend/src/Interconnect.cc +++ b/TOGSim/src/Interconnect.cc @@ -4,12 +4,15 @@ SimpleInterconnect::SimpleInterconnect(SimulationConfig config) : _latency(config.icnt_latency) { _cycles = 0; _config = config; - _n_nodes = config.num_cores + config.dram_channels; + _n_nodes = config.num_cores * _config.icnt_injection_ports_per_core + config.dram_channels; _in_buffers.resize(_n_nodes); _out_buffers.resize(_n_nodes); _busy_node.resize(_n_nodes); + _rr_next_src.resize(_n_nodes); for(int node = 0; node < _n_nodes; node++) { _busy_node[node] = false; + _in_buffers.at(node).resize(_n_nodes); + _rr_next_src[node] = 0; } } @@ -19,35 +22,36 @@ bool SimpleInterconnect::running() { } void SimpleInterconnect::cycle() { - for(int node = 0; node < _n_nodes; node++) { - int src_node = (_rr_start + node ) % _n_nodes; - if(!_in_buffers[src_node].empty() && _in_buffers[src_node].front().finish_cycle <= _cycles) { - uint32_t dest = _in_buffers[src_node].front().dest; - if(!_busy_node[dest]) { - _out_buffers[dest].push(_in_buffers[src_node].front().access); - _in_buffers[src_node].pop(); - _busy_node[dest] = true; - // spdlog::trace("PUSH TO OUTBUFFER {} {}", src_node, dest); + for(int dest = 0; dest < _n_nodes; dest++) { + int src_start = _rr_next_src[dest]; + bool pushed = false; + + for(int i = 0; i < _n_nodes; 
i++) { + int src = (src_start + i) % _n_nodes; + + if (!_in_buffers[src][dest].empty() && + _in_buffers[src][dest].front().finish_cycle <= _cycles) { + + _out_buffers[dest].push(_in_buffers[src][dest].front().access); + _in_buffers[src][dest].pop(); + _rr_next_src[dest] = (src + 1) % _n_nodes; + pushed = true; + break; } } } - - for(int node = 0; node < _n_nodes; node++) { - _busy_node[node] = false; - } - _rr_start = (_rr_start + 1) % _n_nodes; _cycles++; } void SimpleInterconnect::push(uint32_t src, uint32_t dest, mem_fetch* request) { SimpleInterconnect::Entity entity; - if(_in_buffers[src].empty()) + if(_in_buffers[src][dest].empty()) entity.finish_cycle = _cycles + _latency; else - entity.finish_cycle = _in_buffers[src].back().finish_cycle + 1; + entity.finish_cycle = _in_buffers[src][dest].back().finish_cycle + 1; entity.dest = dest; entity.access = request; - _in_buffers[src].push(entity); + _in_buffers[src][dest].push(entity); } bool SimpleInterconnect::is_full(uint32_t nid, mem_fetch* request) { @@ -72,11 +76,11 @@ void SimpleInterconnect::pop(uint32_t nid) { Booksim2Interconnect::Booksim2Interconnect(SimulationConfig config) { _config = config; - _n_nodes = config.num_cores * _config.icnt_node_per_core + config.dram_channels; - spdlog::info("Initialize Booksim2"); + _n_nodes = config.num_cores * _config.icnt_injection_ports_per_core + config.dram_channels; + spdlog::info("Initialize Booksim2"); char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); std::string onnxim_path = onnxim_path_env != NULL? 
- std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); _config_path = fs::path(onnxim_path).append("configs").append((std::string)config.icnt_config_path).string(); spdlog::info("Config path : {}", _config_path); diff --git a/PyTorchSimBackend/src/L2Cache.cc b/TOGSim/src/L2Cache.cc similarity index 100% rename from PyTorchSimBackend/src/L2Cache.cc rename to TOGSim/src/L2Cache.cc diff --git a/PyTorchSimBackend/src/Simulator.cc b/TOGSim/src/Simulator.cc similarity index 90% rename from PyTorchSimBackend/src/Simulator.cc rename to TOGSim/src/Simulator.cc index 63bd3146..41a2c7a5 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -3,9 +3,9 @@ Simulator::Simulator(SimulationConfig config) : _config(config), _core_cycles(0) { // Create dram object - _core_period = 1000000 / (config.core_freq); - _icnt_period = 1000000 / (config.icnt_freq); - _dram_period = 1000000 / (config.dram_freq); + _core_period = 1000000 / (config.core_freq_mhz); + _icnt_period = 1000000 / (config.icnt_freq_mhz); + _dram_period = 1000000 / (config.dram_freq_mhz); _core_time = 0; _dram_time = 0; _icnt_time = 0; @@ -14,20 +14,20 @@ Simulator::Simulator(SimulationConfig config) _n_cores = config.num_cores; _n_memories = config.dram_channels; _memory_req_size = config.dram_req_size; - _noc_node_per_core = config.icnt_node_per_core; + _noc_node_per_core = config.icnt_injection_ports_per_core; char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); std::string onnxim_path = onnxim_path_env != NULL? 
- std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); // Create core objects _cores.resize(_n_cores); for (int core_index = 0; core_index < _n_cores; core_index++) { if (config.core_type[core_index] == CoreType::WS_MESH) { - spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB, Systolic array per core: {}", - core_index, config.core_freq , config.sram_size, config.num_systolic_array_per_core); + spdlog::info("[Config/Core] Core {}: {} MHz, Systolic array per core: {}", + core_index, config.core_freq_mhz, config.num_systolic_array_per_core); _cores.at(core_index) = std::make_unique(core_index, _config); } else if(config.core_type[core_index] == CoreType::STONNE) { - spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq); + spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq_mhz); _cores.at(core_index) = std::make_unique(core_index, _config); } else { throw std::runtime_error(fmt::format("Not implemented Core type {} ", @@ -51,7 +51,7 @@ Simulator::Simulator(SimulationConfig config) } // Create interconnect object - spdlog::info("[Config/Interconnect] Inerconnect freq: {} MHz", config.icnt_freq); + spdlog::info("[Config/Interconnect] Interconnect freq: {} MHz", config.icnt_freq_mhz); if (config.icnt_type == IcntType::SIMPLE) { spdlog::info("[Config/Interconnect] SimpleInerconnect selected"); _icnt = std::make_unique(config); @@ -62,7 +62,7 @@ Simulator::Simulator(SimulationConfig config) spdlog::error("[Configuration] Invalid interconnect type...!"); exit(EXIT_FAILURE); } - _icnt_interval = config.icnt_print_interval; + _icnt_interval = config.icnt_stats_print_period_cycles; // Initialize Scheduler for (int i=0; itop_memory_request(); front->set_core_id(core_id); if (!_icnt->is_full(port_id, front)) { - //int node_id = _dram->get_channel_id(front) / 16; - //if (core_id == node_id) - // 
_cores[core_id]->inc_numa_hit(); - //else - // _cores[core_id]->inc_numa_miss(); + int node_id = _dram->get_channel_id(front) / _config.dram_channels_per_partitions; + if (core_id == node_id) + _cores[core_id]->inc_numa_local_access(); + else + _cores[core_id]->inc_numa_remote_access(); _icnt->push(port_id , get_dest_node(front), front); _cores[core_id]->pop_memory_request(); _nr_from_core++; @@ -229,7 +229,7 @@ void Simulator::cycle() { if (IS_ICNT_CYCLE(_cycle_mask)) icnt_cycle(); } - spdlog::info("Simulation Finished"); + spdlog::info("Simulation finished"); for (auto &core: _cores) { core->check_tag(); } @@ -291,5 +291,5 @@ void Simulator::print_core_stat() for (int core_id = 0; core_id < _n_cores; core_id++) { _cores[core_id]->print_stats(); } - spdlog::info("Total execution cycle: {}", _core_cycles); -} \ No newline at end of file + spdlog::info("Total execution cycles: {}", _core_cycles); +} diff --git a/PyTorchSimBackend/src/SparseCore.cc b/TOGSim/src/SparseCore.cc similarity index 86% rename from PyTorchSimBackend/src/SparseCore.cc rename to TOGSim/src/SparseCore.cc index 64d3da55..d5629b9c 100644 --- a/PyTorchSimBackend/src/SparseCore.cc +++ b/TOGSim/src/SparseCore.cc @@ -27,14 +27,14 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) } Config stonneConfig = stonneCores.at(0)->getStonneConfig(); - unsigned int core_freq = config.core_freq; // MHz; + unsigned int core_freq_mhz = config.core_freq_mhz; // MHz; num_ms = stonneConfig.m_MSNetworkCfg.ms_size; r_port_nr = config.num_stonne_port; w_port_nr = config.num_stonne_port; - double compute_throughput = static_cast(num_ms) * core_freq / 1e3; // FLOPs/sec - double dn_bandwidth = static_cast(r_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s - double rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s + double compute_throughput = static_cast(num_ms) * core_freq_mhz / 1e3; // FLOPs/sec + double dn_bandwidth 
= static_cast(r_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s + double rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s for (int i=0; i tile) { } } if (selected_core_idx == -1) { - spdlog::error("[StonneCore {}] Faield to issue tile", _id); + spdlog::error("[StonneCore {}] Failed to issue tile", _id); exit(1); } stonneCores.at(selected_core_idx)->init(1); @@ -84,7 +84,7 @@ void SparseCore::issue(std::shared_ptr tile) { setTraceMode(selected_core_idx, is_trace_mode); percore_tiles.at(selected_core_idx).push_back(tile); coreBusy.at(selected_core_idx) = true; - spdlog::info("[StonneCore {}][{}] issued new tile (trace_mode: {})", _id, selected_core_idx, is_trace_mode); + spdlog::info("[{}][StonneCore {}/{}][Launch] New operation (trace_mode: {})", _core_cycle, _id, selected_core_idx, is_trace_mode); }; bool SparseCore::can_issue(const std::shared_ptr& op) { @@ -100,8 +100,8 @@ void SparseCore::checkStatus(uint32_t subcore_id) { int new_status = stonneCore->getMCFSMStats(); int compute_cycle = stonneCore->getMSStats().n_multiplications; if (traceCoreStatus.at(subcore_id) != new_status) { - spdlog::trace("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}", - _id, _core_cycle, traceCoreStatus.at(subcore_id), new_status, + spdlog::trace("[{}][StonneCore {}/{}][Transition] status {} -> {}, Load/Store: {}/{}, compute_cycle: {}", + _core_cycle, _id, subcore_id, traceCoreStatus.at(subcore_id), new_status, traceLoadTraffic.at(subcore_id).size(), traceStoreTraffic.at(subcore_id).size(), (compute_cycle - traceCoreCycle.at(subcore_id))/num_ms); if (traceLoadTraffic.at(subcore_id).size()) { TraceNode load_node = TraceNode(traceNodeList.at(subcore_id).size()+2, "load", TraceNode::StonneTraceLoad); @@ -151,14 +151,14 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { traceStoreTraffic.at(subcore_id).insert(target_addr); break; default: - 
spdlog::error("[SparseCore] Invalid request type from core"); + spdlog::error("[StonneCore] Invalid request type from core"); return; } req->request_time = _core_cycle; req->stonneId = subcore_id; std::tuple key = std::make_tuple(target_addr, acc_type, type, allocTrafficID()); registerMemfetch(key, [this, req, acc_type, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + spdlog::trace("[{}][StonneCore][DRAM Response] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ _core_cycle, _core_cycle - req->request_time, req->getAddress(), int(req->getcmd()), _config.dram_req_size); req->setReply(); stonneCores.at(req->stonneId)->pushResponse(req); @@ -168,7 +168,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { /* Finish stonne core */ if (coreBusy.at(subcore_id) && stonneCore->isFinished()) { stonneCore->finish(); - spdlog::info("[SparseCore][{}] Operation finished at {}", _id, _core_cycle); + spdlog::info("[{}][StonneCore {}/{}][Finish] Operation done", _core_cycle, _id, subcore_id); std::shared_ptr target_tile = percore_tiles.at(subcore_id).front(); SST_STONNE::StonneOpDesc *opDesc = static_cast(target_tile->get_custom_data()); if (opDesc->trace_path != "") @@ -239,7 +239,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_R; auto type = mf_type::READ_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -247,8 +247,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - 
spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -260,7 +260,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_W; auto type = mf_type::WRITE_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -268,8 +268,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -285,7 +285,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { inst->finish_cycle = _core_cycle + inst->get_compute_cycle(); else inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle(); - spdlog::trace("[Core {}][{}][{}] {} ISSUED, finsh at {}", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore 
{}/{}][INST_ISSUED] {}, finish at {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode()), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -313,7 +313,7 @@ void SparseCore::cycle() { for (auto& req_pair : request_merge_table) { _request_queue.push(req_pair.second); request_merge_table.erase(req_pair.first); - spdlog::debug("[SparseCore][{}][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ + spdlog::debug("[{}][StonneCore][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ _core_cycle, _id, req_pair.second->get_addr(), int(req_pair.second->get_access_type()), int(req_pair.second->get_type()), _config.dram_req_size, nr_request); nr_request++; @@ -366,9 +366,9 @@ void SparseCore::print_current_stats() { } cycle_type nr_mul = percore_stat.at(i).n_multiplications; percore_stat.at(i).reset(); - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } void SparseCore::print_stats() { @@ -383,9 +383,9 @@ void SparseCore::print_stats() { percore_total_stat.at(i) += percore_stat.at(i); } cycle_type nr_mul = percore_total_stat.at(i).n_multiplications; - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } std::shared_ptr SparseCore::pop_finished_tile() { @@ -399,18 +399,18 @@ std::shared_ptr SparseCore::pop_finished_tile() { void SparseCore::finish_instruction(std::shared_ptr& inst) { if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already
finished!!", _id, _core_cycle, + spdlog::error("[{}][StonneCore {}][Error] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::info("[StonneCore {}][{}] {} FINISHED", - _id, _core_cycle, opcode_to_string(inst->get_opcode())); + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", + _core_cycle, _id, opcode_to_string(inst->get_opcode())); } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) { - spdlog::info("[StonneCore {}][{}] {} FINISHED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode())); } } @@ -460,5 +460,5 @@ void SparseCore::dumpTrace(int stonne_core_id, const std::string& path) { outFile << traceNodeList.at(stonne_core_id)[i]; } outFile << "\n}" << std::endl; - spdlog::info("[StonneCore] Success to save trace dump file to \"{}\"", path); + spdlog::info("[{}][StonneCore] Success to save trace dump file to \"{}\"", _core_cycle, path); } diff --git a/PyTorchSimBackend/src/Tile.cc b/TOGSim/src/Tile.cc similarity index 100% rename from PyTorchSimBackend/src/Tile.cc rename to TOGSim/src/Tile.cc diff --git a/PyTorchSimBackend/src/TileGraph.cc b/TOGSim/src/TileGraph.cc similarity index 96% rename from PyTorchSimBackend/src/TileGraph.cc rename to TOGSim/src/TileGraph.cc index 33e995e9..120d49e2 100644 --- a/PyTorchSimBackend/src/TileGraph.cc +++ b/TOGSim/src/TileGraph.cc @@ -111,7 +111,6 @@ void TileGraph::allocate_subgraph(int core_id, int slot_id) { for (auto it = _subgraph_vec.begin(); it != _subgraph_vec.end(); ++it) { if ((*it)->get_core_id() == -1 || (*it)->get_core_id() == core_id) { - spdlog::trace("[TileGraph] Core {} allocated new subgraph(affinity={}) (remains: {})", 
core_id, (*it)->get_core_id(), _subgraph_vec.size()-1); std::shared_ptr subgraph = *it; _cpu_graph_map[core_id][slot_id] = subgraph; _subgraph_vec.erase(it); diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc similarity index 98% rename from PyTorchSimBackend/src/TileGraphParser.cc rename to TOGSim/src/TileGraphParser.cc index 4a562724..42776a51 100644 --- a/PyTorchSimBackend/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -627,9 +627,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa } } } - /* Set last instruction's free sram size */ - if(parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); parent->append_child(child); /* Create new tile */ @@ -682,11 +679,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa tile_vec.back()->inc_required_sram_size(inst->get_tile_numel() * inst->get_precision()); } - /* Set last instruction's free sram size */ - std::shared_ptr parent = tile_vec.back(); - if (parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); - return tile_vec; } diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.cc b/TOGSim/src/helper/CommandLineParser.cc similarity index 100% rename from PyTorchSimBackend/src/helper/CommandLineParser.cc rename to TOGSim/src/helper/CommandLineParser.cc diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.h b/TOGSim/src/helper/CommandLineParser.h similarity index 100% rename from PyTorchSimBackend/src/helper/CommandLineParser.h rename to TOGSim/src/helper/CommandLineParser.h diff --git a/PyTorchSimBackend/src/main.cc b/TOGSim/src/main.cc similarity index 95% rename from PyTorchSimBackend/src/main.cc rename to TOGSim/src/main.cc index 214e7131..1af11257 100644 --- a/PyTorchSimBackend/src/main.cc +++ b/TOGSim/src/main.cc @@ -9,7 +9,7 @@ namespace fs = std::filesystem; namespace po = 
boost::program_options; -const char* env_value = std::getenv("BACKENDSIM_DRYRUN"); +const char* env_value = std::getenv("TOGSIM_DRYRUN"); bool isDryRun = (env_value != nullptr && std::string(env_value) == "1"); void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) { @@ -38,7 +38,7 @@ int until(Simulator *simulator, cycle_type until_cycle) { void interactive_mode(Simulator* simulator) { std::string command; - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; while (std::getline(std::cin, command)) { std::istringstream iss(command); @@ -79,7 +79,7 @@ void interactive_mode(Simulator* simulator) { spdlog::error("Error: unknown command {} Available commands are: launch, until, quit.", token); } if (isDryRun) - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; } simulator->cycle(); if (simulator->get_core_cycle()==0) @@ -149,6 +149,6 @@ int main(int argc, char** argv) { /* Simulation time measurement */ auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; - spdlog::info("Simulation time: {:2f} seconds", duration.count()); + spdlog::info("Wall-clock time for simulation: {:2f} seconds", duration.count()); return 0; } diff --git a/PyTorchSimBackend/src/scheduler/Scheduler.cc b/TOGSim/src/scheduler/Scheduler.cc similarity index 100% rename from PyTorchSimBackend/src/scheduler/Scheduler.cc rename to TOGSim/src/scheduler/Scheduler.cc diff --git a/experiments/BERT.py b/experiments/BERT.py index 3534505d..c5bb454e 100644 --- a/experiments/BERT.py +++ b/experiments/BERT.py @@ -9,7 +9,7 @@ def run_BERT(size, input_seq, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request # from tests.test_transformer import EncoderBlock from 
tests.Fusion.test_transformer_fusion import EncoderBlock - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() hidden_dim = {'base': 768, 'large': 1024, 'xlarge': 2048} @@ -36,7 +36,7 @@ def run_BERT(size, input_seq, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def run_BERT(size, input_seq, config): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_BERT(size, input_seq, config) diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index a32cd0a6..28e6ad5e 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -export TORCHSIM_CONFIG=$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR diff --git 
a/experiments/artifact/cycle_validation/summary_cycle.py b/experiments/artifact/cycle_validation/summary_cycle.py index 529d0161..c0f48ac3 100644 --- a/experiments/artifact/cycle_validation/summary_cycle.py +++ b/experiments/artifact/cycle_validation/summary_cycle.py @@ -88,7 +88,7 @@ def compute_mae(errors): name = file[:-4] with open(full_path, errors="ignore") as f: for line in f: - match = re.search(r"Total execution cycle:\s*([0-9]+)", line) + match = re.search(r"Total execution cycles:\s*([0-9]+)", line) if match: cycle_map[name] = int(match.group(1)) break diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh index 7d0c0da2..2b9625e9 100755 --- a/experiments/artifact/speedup/run_speedup.sh +++ b/experiments/artifact/speedup/run_speedup.sh @@ -1,7 +1,7 @@ #!/bin/bash LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs -CONFIG_DIR="$TORCHSIM_DIR/PyTorchSimBackend/configs" -SIMULATOR_BIN="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +CONFIG_DIR="$TORCHSIM_DIR/TOGSim/configs" +SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" configs=( "systolic_ws_128x128_c2_simple_noc_tpuv3.json" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh index 66829f02..4055b355 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh @@ -26,7 +26,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh index 2f9718f1..83b3798a 100755 --- 
a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh @@ -27,7 +27,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh index 8ff7e2b6..f1467614 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh @@ -25,7 +25,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh index aa35735c..2ed3ca2a 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh @@ -33,7 +33,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/attention.py b/experiments/attention.py index e8f89dac..5a8c5f45 100644 --- a/experiments/attention.py +++ b/experiments/attention.py @@ -14,7 +14,7 @@ def attention(query, key, value): p_attn = scores.softmax(dim=-2) return torch.matmul(value.transpose(-1, 
-2), p_attn) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() query = torch.randn(size).to(device=device) key = torch.randn(size).to(device=device) @@ -36,7 +36,7 @@ def attention(query, key, value): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -50,7 +50,7 @@ def attention(query, key, value): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_attention(size, config) diff --git a/experiments/conv.py b/experiments/conv.py index e8b97906..c8ca9a37 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -15,7 +15,7 @@ def custom_conv2d(a, b, bias): conv2d.weight = torch.nn.Parameter(b) # conv2d.bias = torch.nn.Parameter(bias) return conv2d(a) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() conv_input = torch.randn(batch_size, i_c, i_h, 
i_w).to(memory_format=torch.channels_last, device=device) conv_kernel = torch.randn(o_c, i_c, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device) @@ -37,7 +37,7 @@ def custom_conv2d(a, b, bias): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def custom_conv2d(a, b, bias): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_conv2d(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], config) \ No newline at end of file diff --git a/experiments/gemm.py b/experiments/gemm.py index e7a639ad..67dc4f79 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -10,7 +10,7 @@ def run_matmul(input_size, hidden_size, output_size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request def custom_matmul(a, b): return torch.matmul(a, b) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() torch.manual_seed(0) input = torch.randn(input_size, hidden_size).to(device=device) @@ -31,7 +31,7 @@ def custom_matmul(a, b): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', 
default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -45,8 +45,8 @@ def custom_matmul(a, b): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] from Scheduler.scheduler import PyTorchSimRunner module = PyTorchSimRunner.setup_device() diff --git a/experiments/layernorm.py b/experiments/layernorm.py index f149394e..0beaac6c 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -8,7 +8,7 @@ def run_layernorm(size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) opt_fn = torch.compile(dynamic=False)(torch.nn.LayerNorm(size[-1]).to(device=device)) @@ -27,7 +27,7 @@ def run_layernorm(size, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = 
config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -42,7 +42,7 @@ def run_layernorm(size, config): os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_layernorm(size, config) diff --git a/experiments/resnet18.py b/experiments/resnet18.py index 5d9dcf86..23d62e40 100644 --- a/experiments/resnet18.py +++ b/experiments/resnet18.py @@ -8,7 +8,7 @@ def run_resnet(batch, config): from torchvision.models import resnet18 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet18().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del 
os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/resnet50.py b/experiments/resnet50.py index bd52afc1..60a46071 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -8,7 +8,7 @@ def run_resnet(batch, config): from torchvision.models import resnet50 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet50().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/softmax.py b/experiments/softmax.py index 14d28fee..532ef091 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -8,7 +8,7 @@ def run_softmax(size, config, dim=1): from Scheduler.scheduler import 
Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) opt_fn = torch.compile(dynamic=False)(torch.nn.Softmax(dim=dim).to(device=device)) @@ -27,7 +27,7 @@ def run_softmax(size, config, dim=1): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -41,7 +41,7 @@ def run_softmax(size, config, dim=1): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_softmax(size, config) diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh index 469cf766..22118b1e 100644 --- a/scripts/CompilerOpt_experiment/DMAopt.sh +++ b/scripts/CompilerOpt_experiment/DMAopt.sh @@ -1,5 +1,5 @@ #!/bin/bash -export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" +export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" # None FG DMA export TORCHSIM_SUBTILE=0 diff --git a/scripts/build_from_source.sh b/scripts/build_from_source.sh index 
d9806069..fb9e82e3 100644 --- a/scripts/build_from_source.sh +++ b/scripts/build_from_source.sh @@ -6,7 +6,7 @@ cd $home apt -y update && apt -y upgrade && apt -y install scons git clone https://github.com/PSAL-POSTECH/gem5.git cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) -export GEM5_PATH=$home/gem5/release/gem5.opt +export GEM5_PATH=$home/gem5/build/RISCV/gem5.opt cd $home # LLVM diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh index 3dfba3d9..2989e4fd 100755 --- a/scripts/chiplet.sh +++ b/scripts/chiplet.sh @@ -14,16 +14,16 @@ fi GEMM_PATH="$1" INDEX_NAME="$2" -SIMULATOR_PATH="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +SIMULATOR_PATH="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" GEMM_DIR_NAME=$(basename "$GEMM_PATH") echo "GEMM Directory Name: $GEMM_DIR_NAME" CONFIG_LIST=( - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" ) CONFIG_LIST2=( - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" ) shift shift @@ -51,7 +51,7 @@ for CONFIG in "${CONFIG_LIST[@]}"; do # Run Simulator echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done done @@ -65,6 +65,6 @@ for CONFIG in "${CONFIG_LIST2[@]}"; do # Run Simulator # echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list 
"$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done wait \ No newline at end of file diff --git a/scripts/end2end.sh b/scripts/end2end.sh index 7ca5c93d..579b8c14 100755 --- a/scripts/end2end.sh +++ b/scripts/end2end.sh @@ -7,34 +7,34 @@ BASE_PATH=$1 # Input as the first argument total_sum=0 total_core=0 total_vector=0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - # echo "Processing folder: $backend_folder" +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + # echo "Processing folder: $togsim_folder" - # Find all files within the backendsim_result folder - mapfile -t files < <(find "$backend_folder" -type f) + # Find all files within the togsim_result folder + mapfile -t files < <(find "$togsim_folder" -type f) for file in "${files[@]}"; do # echo "Processing $file" - # Extract the last line containing "Total cycle" - total_cycle=$(grep "Total cycle" "$file" | tail -n 1 | sed -E 's/.*Total cycle ([0-9]+).*/\1/') + # Extract the last line containing "Total_cycles" + total_cycle=$(grep "Total_cycles" "$file" | tail -n 1 | sed -E 's/.*Total_cycles ([0-9]+).*/\1/') # echo "total_cycle: $total_cycle" - active_cycles=($(grep -o 'active cycle [0-9]*' "$file" | awk '{print $3}')) + active_cycles=($(grep -o 'active_cycles [0-9]*' "$file" | awk '{print $3}')) num_cycles=${#active_cycles[@]} if [ "$num_cycles" -ge 3 ]; then core_cycle=${active_cycles[$((num_cycles-3))]} else - 
echo "Error: cannot find core active cycle" + echo "Error: cannot find core active_cycles" fi if [[ "$num_cycles" -ge 1 ]]; then - # Extract the last two active cycles + # Extract the last two active_cycless vector_core_cycle=${active_cycles[$((num_cycles-1))]} else - echo "Error: cannot find vector core active cycle" + echo "Error: cannot find vector core active_cycles" fi echo "file: $file total_cycle: $total_cycle SA core_cycle: $core_cycle vector_core_cycle: $vector_core_cycle" diff --git a/scripts/get_tog_result.sh b/scripts/get_tog_result.sh index 9359e1e5..6fd399e0 100755 --- a/scripts/get_tog_result.sh +++ b/scripts/get_tog_result.sh @@ -3,8 +3,8 @@ total_cycles=0 # Read through input stream line by line while IFS= read -r line; do - # Check if the line contains both "[BackendSimulator]" and "stored" - if [[ "$line" == *"[BackendSimulator]"* && "$line" == *"stored"* ]]; then + # Check if the line contains both "[TOGSimulator]" and "stored" + if [[ "$line" == *"[TOGSimulator]"* && "$line" == *"stored"* ]]; then # Extract the file path from the line file_path=$(echo "$line" | sed -n 's/.*stored to "\(.*\)"$/\1/p') diff --git a/scripts/sim_time.sh b/scripts/sim_time.sh index 15c60736..95df5982 100755 --- a/scripts/sim_time.sh +++ b/scripts/sim_time.sh @@ -6,15 +6,15 @@ BASE_PATH=$1 # Input as the first argument # Initialize total_sum as string for awk processing total_sum=0.0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - mapfile -t files < <(find "$backend_folder" -type f) +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + mapfile -t files < <(find "$togsim_folder" -type f) for file in "${files[@]}"; do - sim_time=$(grep "Simulation 
time:" "$file" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+(\.[0-9]+)?).*/\1/') + sim_time=$(grep "Wall-clock time for simulation:" "$file" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+(\.[0-9]+)?).*/\1/') echo "file: $file total_cycle: $sim_time" if [[ -n "$sim_time" ]]; then diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh index 0b7bc6f5..94e00527 100755 --- a/scripts/sparsity_experiment/run.sh +++ b/scripts/sparsity_experiment/run.sh @@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8 export TORCHSIM_FORCE_TIME_N=8 OUTPUT_DIR="12GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -13,7 +13,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -21,7 +21,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB" -export 
TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="12GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -45,7 +45,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py 
--sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py index 2f184f4c..be30795b 100644 --- a/scripts/stonne_experiment2/tog_gen.py +++ b/scripts/stonne_experiment2/tog_gen.py @@ -5,7 +5,7 @@ from collections import defaultdict sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) from AsmParser.tog_generator import tog_generator -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def extract_simulation_stats(result_path): @@ -19,9 +19,9 @@ def extract_simulation_stats(result_path): for line in lines: if "nr_multiplications" in line: nr_multiplications = line.strip().split(":")[-1].strip() - elif "Total execution cycle" in line: + elif "Total execution cycles" in line: total_cycle = line.strip().split(":")[-1].strip() - elif "Simulation time" in line: + elif "Wall-clock time for simulation" in line: sim_time = line.strip().split(":")[-1].replace("seconds", "").strip() return nr_multiplications, total_cycle, sim_time @@ -71,9 +71,9 @@ def extract_simulation_stats(result_path): if "outerPro" in path: continue tog_path = os.path.join(path, "tile_graph.onnx") - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json' - backsim = 
BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_validation_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = backsim.simulation(tog_path) nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path) sim_time, total_cycle = float(sim_time), int(total_cycle) diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py index cf0dc1bb..c32b4364 100644 --- a/tests/test_compile_overhead.py +++ b/tests/test_compile_overhead.py @@ -21,7 +21,7 @@ # shutil.rmtree("/tmp/torchinductor") #except FileNotFoundError: # print("no cache") - scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_hetro.py b/tests/test_hetro.py index 5e36d730..557ea5d6 100644 --- a/tests/test_hetro.py +++ b/tests/test_hetro.py @@ -26,7 +26,7 @@ def custom_matmul(a, b): K = args.K sparsity = args.sparsity mode = args.mode - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" print("M: ", M) print("N: ", N) @@ -36,7 +36,7 @@ def custom_matmul(a, b): with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - 
backend_config=config_path) + togsim_config=config_path) # Register compiled model opt_model1 = torch.compile(custom_matmul) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index c64093a0..91bf0ad8 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -7,13 +7,13 @@ base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -config = f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' target_model1 = model1().eval() target_model2 = model2(768, 12).eval() # Init scheduler -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py index f3b54159..5a34d161 100644 --- a/tests/test_scheduler_batching.py +++ b/tests/test_scheduler_batching.py @@ -17,7 +17,7 @@ target_model1 = model1().eval() # Init scheduler - scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = 
torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py index 1cf0d3b3..c7abf0ae 100644 --- a/tests/test_spmm_scheduler.py +++ b/tests/test_spmm_scheduler.py @@ -25,7 +25,7 @@ output_size = args.output_size w1_sparsity = args.w1_sparsity w2_sparsity = args.w2_sparsity - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" print("batch_size: ", batch_size) print("input_size: ", input_size) @@ -37,7 +37,7 @@ with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - backend_config=config_path) + togsim_config=config_path) target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval() target_model2 = model2(768, 12).eval() diff --git a/tutorial/session1/HelloPyTorchSim.ipynb b/tutorial/session1/HelloPyTorchSim.ipynb new file mode 100644 index 00000000..dfb086a4 --- /dev/null +++ b/tutorial/session1/HelloPyTorchSim.ipynb @@ -0,0 +1,1216 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hello, PyTorchSim!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import os\n", + "import sys\n", + "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "sys.path.append(base_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One Touch Simulation\n", + "### Normal Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "cpu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/ro/croutbd6yxrzgdstfcplx7yrpn2do5frwhyx2md5r7rvrubdhdgd.py\n", + "[Gem5] Gem5 is running... \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0\"\n" + ] + } + ], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", + " if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", + " message = f\"|{name} Test Passed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " else:\n", + " message = f\"|{name} Test Failed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " print(\"npu out: \", npu_out.cpu())\n", + " print(\"cpu out: \", cpu_out)\n", + " exit(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------\n", + "|MatMul Test Passed|\n", + "--------------------\n" + ] + } + ], + "source": [ + "test_result(\"MatMul\", npu_out, cpu_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# from Scheduler.scheduler import PyTorchSimRunner\n", + "# npu_device = PyTorchSimRunner.setup_device().custom_device()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normal Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + 
"source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "torch.manual_seed(0)\n", + "cpu_input = torch.randn(128, 128).to(device)\n", + "cpu_weight = torch.randn(128, 128).to(device)\n", + "cpu_target = torch.randn(128, 128).to(device)\n", + "cpu_input.requires_grad = True\n", + "cpu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "cpu_out = opt_fn(cpu_input, cpu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "cpu_loss = loss_fn(cpu_out, cpu_target)\n", + "cpu_loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/5i/c5isqyualxbaqsmuhsux7oubvkypfmh4kvamqvgref6z3ypnrpw5.py\n", + "[Gem5] Gem5 is running... \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/19\"\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. 
", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 15\u001b[0m\n\u001b[1;32m 13\u001b[0m loss_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mCrossEntropyLoss()\n\u001b[1;32m 14\u001b[0m npu_loss \u001b[38;5;241m=\u001b[39m loss_fn(npu_out, npu_target)\n\u001b[0;32m---> 15\u001b[0m \u001b[43mnpu_loss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:266\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 261\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 263\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 266\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 267\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + 
"\u001b[0;31mRuntimeError\u001b[0m: 0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. " + ] + } + ], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "npu_device = PyTorchSimRunner.setup_device().custom_device()\n", + "torch.manual_seed(0)\n", + "npu_input = torch.randn(128, 128).to(npu_device)\n", + "npu_weight = torch.randn(128, 128).to(npu_device)\n", + "npu_target = torch.randn(128, 128).to(npu_device)\n", + "npu_input.requires_grad = True\n", + "npu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(npu_input, npu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "npu_loss = loss_fn(npu_out, npu_target)\n", + "npu_loss.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'test_result' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtest_result\u001b[49m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Input Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_input\u001b[38;5;241m.\u001b[39mgrad, cpu_input\u001b[38;5;241m.\u001b[39mgrad)\n\u001b[1;32m 2\u001b[0m test_result(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Weight Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_weight\u001b[38;5;241m.\u001b[39mgrad, cpu_weight\u001b[38;5;241m.\u001b[39mgrad)\n", + "\u001b[0;31mNameError\u001b[0m: name 'test_result' is not defined" + ] + } + ], + "source": [ + "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n", + 
"test_result(\"MatMul Weight Grad\", npu_weight.grad, cpu_weight.grad)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mapping\n", + "\n", + "Default mapping is based on heuristic." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/5z/c5z4ur2k2svn2gaawn776ev3t6gsa7esgu36la63523cqpbbt56d.py\n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n" + ] + } + ], + "source": [ + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:53:14.002] [info] Total execution cycle: 47158\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual Mapping\n", + "User can set tile size manually." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/mv/cmv6cp7oo3wwndv76iv3sib7r74tnbvodfwxi3rw33k7grlh3h4h.py\n", + "[Gem5] Gem5 is running. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running... \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/75hiq5mugpq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "\n", + "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"1\"\n", + "os.environ['TORCHSIM_TILE_M']=\"512\"\n", + "os.environ['TORCHSIM_TILE_N']=\"512\"\n", + "os.environ['TORCHSIM_TILE_K']=\"512\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:54:00.878] [info] Total execution cycle: 53704\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Autotune" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Auto-tune] Trying tile size: [1024, 1024, 256, 128, 1024, 256]\n", + "[Auto-tune] Trying tile size: [256, 1024, 1024, 128, 1024, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 256, 1024, 128, 256, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 1024, 128, 128, 1024, 128]\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to 
\"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/0\"\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/7j33rcic2qn/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/7j33rcic2qn/togsim_result/0\"\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/vsaamplubl5/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/vsaamplubl5/togsim_result/0\"\n", + "[Auto-tune] Optimal tile size: [1024, 1024, 128, 128, 1024, 128], cycles: 46423\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/3b/c3bebp4b4rp73grbvhbaq4xdxny7f5m7fgqkgpflp2cjn3x5uugr.py\n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"0\"\n", + "os.environ['AUTOTUNE_TEMPLATE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:54:53.051] [info] Total execution cycle: 46422\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Execution Mode\n", + "### Functional & Timing mode (Default)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py\n", + "[Gem5] Gem5 is running.. 
\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/4\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "os.environ['AUTOTUNE_TEMPLATE']=\"0\"\n", + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Functional only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"0\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Timing only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 7\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 8\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m 
\u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour 
compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, 
inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m 
\u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"0\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim Configuration\n", + "### Single Core" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[22], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m 
\u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour 
compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, 
inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m 
\u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:32:01.843] [info] Total execution cycle: 47126\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/11 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multi-Core" + ] + }, + { + 
"cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12\"\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:34:48.969] [info] Total execution cycle: 40736\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim log level\n", + "### log level info" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 
7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in 
skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which 
autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 
885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in 
\u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, 
arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_DUMP_PATH']=\"/workspace/PyTorchSim\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### log level trace" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/togsim_result/1\"\n" + ] + } + ], + "source": [ + "os.environ['BACKENDSIM_DEBUG_LEVEL']=\"trace\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torchvision.models import resnet18\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n", + "from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_BACKEND_CONFIG\n", + "\n", + "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=CONFIG_TORCHSIM_BACKEND_CONFIG)\n", + "device = scheduler.execution_engine.module.custom_device()\n", + "\n", + "model = resnet18().eval()\n", + "input = torch.randn(1, 3, 224, 224).to(device=device)\n", + "opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))\n", + "\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_fn)\n", + "request = Request(\"resnet18\", [input], [], request_queue_idx=0)\n", + "scheduler.add_request(request, request_time=0)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " with torch.no_grad():\n", + " scheduler.schedule()\n", + "\n", + "print(\"ResNet18 Simulation Done\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch 
extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 13:05:13.597] [info] [LoadConfig] Success to open \"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 0: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 1: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 0: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 1: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] Ramulator2 config: /root/workspace/PyTorchSim/PyTorchSimBackend/configs/../configs/ramulator2_configs/HBM2.yaml\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] DRAM Bandwidth 716 GB/s, Freq: 700 MHz, Channels: 32, Request_size: 32B\n", + "[2025-11-30 13:05:13.597] [info] [Config/L2Cache] No L2 cache\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] Interconnect freq: 20000 MHz\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] SimpleInerconnect selected\n", + "[0] BackendSim> [Reqest] Resnet18 request time: 0\n", + "[Request issue] partition: 0 batch size: 1\n", + "[Request-0 issue] partition: 0 arrival_time: 0 start_time: 0.0\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py\n", + "launch /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx /tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0 0 0\n", + "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open \"/tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0\"\n", + "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open 
\"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg0 address: 0xa3056c0\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg1 address: 0xc4a3d40\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"systolic_size\": \"128\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"stonneGraph\": \"0\"\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Register graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 at 0\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Tile Graph FIFO Scheduled\n", + "until -1\n", + "[2025-11-30 13:05:22.117] [info] HBM2-CH_0: BW utilization 0% (0 reads, 0 writes)\n", + "[2025-11-30 13:05:22.319] [info] [Scheduler 0] Graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 finish at 2424\n", + "[2025-11-30 13:05:22.319] [info] Total compute time 2424\n", + "cycle\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 33\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# Run scheduler\u001b[39;00m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m scheduler\u001b[38;5;241m.\u001b[39mis_finished():\n\u001b[0;32m---> 33\u001b[0m \u001b[43mscheduler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:475\u001b[0m, in \u001b[0;36mScheduler.schedule\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 473\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcurrent_cycle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbackend_simulator\u001b[38;5;241m.\u001b[39mcycle()\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 475\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnext_time\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:507\u001b[0m, in \u001b[0;36mScheduler.run\u001b[0;34m(self, until_time)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m until_time \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mis_any_idle(req_empty_info):\n\u001b[0;32m--> 507\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mexecute_cycle\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 508\u001b[0m req_empty_info \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest_empty(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion)]\n\u001b[1;32m 509\u001b[0m \u001b[38;5;66;03m# if result is not -1, schedule new request\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:484\u001b[0m, in \u001b[0;36mScheduler.run..execute_cycle\u001b[0;34m()\u001b[0m\n\u001b[1;32m 482\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m 
\u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion):\n\u001b[1;32m 483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mpartition_state[i] \u001b[38;5;241m==\u001b[39m PyTorchSimRunner\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[0;32m--> 484\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_cycle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 485\u001b[0m launch_ret_info\u001b[38;5;241m.\u001b[39mappend(ret)\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcheck_finish_request()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:254\u001b[0m, in \u001b[0;36mPyTorchSimRunner.launch_kernel\u001b[0;34m(self, current_cycle, partion_idx)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[1;32m 253\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx]\n\u001b[0;32m--> 254\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartion_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;241m==\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING:\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:290\u001b[0m, in \u001b[0;36mFIFORunner.select_kernel\u001b[0;34m(self, partition_idx)\u001b[0m\n\u001b[1;32m 287\u001b[0m nested_gen \u001b[38;5;241m=\u001b[39m kernel(\u001b[38;5;241m*\u001b[39minputs)\n\u001b[1;32m 288\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnested_launch_model_dicts[partition_idx] \u001b[38;5;241m=\u001b[39m {req : nested_gen}\n\u001b[1;32m 289\u001b[0m kernel, inputs \u001b[38;5;241m=\u001b[39m \\\n\u001b[0;32m--> 290\u001b[0m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnested_launch_model_dicts\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_idx\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m kernel, inputs\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 293\u001b[0m \u001b[38;5;66;03m# Retry\u001b[39;00m\n", + "File \u001b[0;32m/tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py:227\u001b[0m, in \u001b[0;36mConv2D_1_3_224_22464_3_7_7_2_2_3_3_1_1_3\u001b[0;34m(X, W, Y)\u001b[0m\n\u001b[1;32m 224\u001b[0m W \u001b[38;5;241m=\u001b[39m W\u001b[38;5;241m.\u001b[39mpermute(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mcontiguous() \u001b[38;5;66;03m# (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;66;03m# Launch 
kernel\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[43mmlir_kernel_1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mW\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m (mlir_kernel_1, (X, W, Y))\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:307\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dryrun_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdryrun_simulator\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 307\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfilelock\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FileLock\n\u001b[1;32m 309\u001b[0m lock_dir \u001b[38;5;241m=\u001b[39m get_lock_dir()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/concurrent/futures/_base.py:453\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 453\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 455\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/threading.py:320\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 321\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "from torchvision.models import resnet18\n", + "\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator\n", + "CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "\n", + "lambda_requests = 10\n", + "max_time = 30\n", + "\n", + "target_model1 = resnet18().eval()\n", + "\n", + "# Init scheduler\n", + "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f\"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\")\n", + "# Register compiled model\n", + "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n", + "\n", + "# Generate time stamp\n", + "for 
request_time in poisson_request_generator(lambda_requests, max_time):\n", + " # Init input data\n", + " model_input1 = torch.randn(1, 3, 224, 224)\n", + "\n", + " # Init request\n", + " new_request1 = Request(\"resnet18\", [model_input1], [], request_queue_idx=0)\n", + "\n", + " # Add request to scheduler\n", + " print(\"[Reqest] Resnet18 request time: \", request_time, flush=True)\n", + " scheduler.add_request(new_request1, request_time=request_time)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " scheduler.schedule()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compiler Optimization\n", + "### GeMM + ReLU fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/vr/cvrlybtkuzkk6pmnlfxu7o55375z24tajmiow6mszaen5t4ra6zo.py\n", + "[Gem5] Gem5 is running. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/5o2xythi5z3/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/5o2xythi5z3/togsim_result/0\"\n" + ] + } + ], + "source": [ + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cat: /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0: No such file or directory\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Disable fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/tl/ctlqjsvukam6d4kteerml7exwbt4paw7cjtjbxcwdlsd7e4koriq.py\n", + "[Gem5] Gem5 is running... \n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/37dfo4nczcq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/37dfo4nczcq/togsim_result/0\"\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_COMPILER_OPTIMIZATION']=\"none\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 12:52:49.376] [info] Total execution cycle: 47164\n", + "[2025-11-30 12:52:52.444] [info] Total execution cycle: 58510\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/2 | grep \"Total execution cycle\"\n", + "!cat /tmp/torchinductor/tmp/37dfo4nczcq/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Single kernel mode (TODO: remove it?)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/sympy/core/assumptions.py:499\u001b[0m, in \u001b[0;36mmake_property..getit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_assumptions\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfact\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n", + "\u001b[0;31mKeyError\u001b[0m: 'extended_negative'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m model \u001b[38;5;241m=\u001b[39m resnet18()\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 9\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(model)\n\u001b[0;32m---> 10\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m 
\u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:655\u001b[0m, in \u001b[0;36mcatch_errors_wrapper..catch_errors\u001b[0;34m(frame, cache_entry, frame_state)\u001b[0m\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m hijacked_callback(frame, cache_entry, hooks, frame_state)\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_lock, _disable_current_modes():\n\u001b[0;32m--> 655\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mcallback\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:727\u001b[0m, in \u001b[0;36mconvert_frame.._convert_frame\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 725\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43minner_convert\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 729\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:383\u001b[0m, in \u001b[0;36mconvert_frame_assert.._convert_frame_assert\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 370\u001b[0m signpost_event(\n\u001b[1;32m 371\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 372\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_convert_frame_assert._compile\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 379\u001b[0m },\n\u001b[1;32m 380\u001b[0m )\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config\u001b[38;5;241m.\u001b[39mpatch(_patch_config_if_changed()):\n\u001b[0;32m--> 383\u001b[0m compiled_product \u001b[38;5;241m=\u001b[39m \u001b[43m_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_globals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_locals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_builtins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiler_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport_constraints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mframe_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mframe_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcompile_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_product\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:646\u001b[0m, in \u001b[0;36m_compile\u001b[0;34m(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_context(CompileContext(compile_id)):\n\u001b[1;32m 645\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 646\u001b[0m guarded_code \u001b[38;5;241m=\u001b[39m \u001b[43mcompile_inner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m guarded_code\n\u001b[1;32m 648\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 649\u001b[0m Unsupported,\n\u001b[1;32m 650\u001b[0m TorchRuntimeError,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 657\u001b[0m BisectValidationException,\n\u001b[1;32m 658\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:562\u001b[0m, in \u001b[0;36m_compile..compile_inner\u001b[0;34m(code, one_graph, hooks, transform)\u001b[0m\n\u001b[1;32m 560\u001b[0m CompileContext\u001b[38;5;241m.\u001b[39mget()\u001b[38;5;241m.\u001b[39mattempt \u001b[38;5;241m=\u001b[39m attempt\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 562\u001b[0m out_code \u001b[38;5;241m=\u001b[39m \u001b[43mtransform_code_object\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mRestartAnalysis \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py:1033\u001b[0m, in \u001b[0;36mtransform_code_object\u001b[0;34m(code, transformations, 
safe)\u001b[0m\n\u001b[1;32m 1030\u001b[0m instructions \u001b[38;5;241m=\u001b[39m cleaned_instructions(code, safe)\n\u001b[1;32m 1031\u001b[0m propagate_line_nums(instructions)\n\u001b[0;32m-> 1033\u001b[0m \u001b[43mtransformations\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstructions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcode_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1034\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m clean_and_assemble_instructions(instructions, keys, code_options)[\u001b[38;5;241m1\u001b[39m]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:151\u001b[0m, in \u001b[0;36mpreserve_global_state.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m cleanup \u001b[38;5;241m=\u001b[39m setup_compile_debug()\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 153\u001b[0m cleanup\u001b[38;5;241m.\u001b[39mclose()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:527\u001b[0m, in \u001b[0;36m_compile..transform\u001b[0;34m(instructions, code_options)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 526\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m tracing(tracer\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mtracing_context), tracer\u001b[38;5;241m.\u001b[39mset_current_tx():\n\u001b[0;32m--> 527\u001b[0m \u001b[43mtracer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 528\u001b[0m 
\u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mUnspecializeRestartAnalysis:\n\u001b[1;32m 529\u001b[0m speculation_log\u001b[38;5;241m.\u001b[39mclear()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2128\u001b[0m, in \u001b[0;36mInstructionTranslator.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 2127\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m-> 2128\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:818\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mpush_tx(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m (\n\u001b[1;32m 816\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minstruction_pointer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mshould_exit\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 819\u001b[0m ):\n\u001b[1;32m 820\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m BackendCompilerFailed:\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:781\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.step\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 777\u001b[0m unimplemented(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmissing: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minst\u001b[38;5;241m.\u001b[39mopname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 778\u001b[0m TracingContext\u001b[38;5;241m.\u001b[39mset_current_loc(\n\u001b[1;32m 779\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_filename, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlineno, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\n\u001b[1;32m 780\u001b[0m )\n\u001b[0;32m--> 781\u001b[0m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minst\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopname\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minst\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 783\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inst\u001b[38;5;241m.\u001b[39mopname \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 784\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m Unsupported:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2243\u001b[0m, in \u001b[0;36mInstructionTranslator.RETURN_VALUE\u001b[0;34m(self, inst)\u001b[0m\n\u001b[1;32m 2238\u001b[0m _step_logger()(\n\u001b[1;32m 2239\u001b[0m logging\u001b[38;5;241m.\u001b[39mINFO,\n\u001b[1;32m 2240\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorchdynamo done tracing 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (RETURN_VALUE)\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 2241\u001b[0m )\n\u001b[1;32m 2242\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE triggered compile\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 2243\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_subgraph\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2245\u001b[0m \u001b[43m \u001b[49m\u001b[43mreason\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mGraphCompileReason\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2246\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreturn_value\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mframe_summary\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph_break\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 2247\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2248\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_return_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 2249\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2250\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39madd_output_instructions([create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m)])\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:919\u001b[0m, in \u001b[0;36mOutputGraph.compile_subgraph\u001b[0;34m(self, tx, partial_convert, reason, compile_return_value)\u001b[0m\n\u001b[1;32m 916\u001b[0m append_prefix_insts()\n\u001b[1;32m 917\u001b[0m \u001b[38;5;66;03m# optimization to generate better code in a common case\u001b[39;00m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_output_instructions(\n\u001b[0;32m--> 919\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_and_call_fx_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mreversed\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstack_values\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroot\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 920\u001b[0m \u001b[38;5;241m+\u001b[39m [create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUNPACK_SEQUENCE\u001b[39m\u001b[38;5;124m\"\u001b[39m, arg\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(stack_values))]\n\u001b[1;32m 921\u001b[0m )\n\u001b[1;32m 922\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 923\u001b[0m graph_output_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_var(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgraph_out\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m 
\u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1087\u001b[0m, in \u001b[0;36mOutputGraph.compile_and_call_fx_graph\u001b[0;34m(self, tx, rv, root)\u001b[0m\n\u001b[1;32m 1084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtracing_context\u001b[38;5;241m.\u001b[39mfake_mode \u001b[38;5;241m=\u001b[39m backend_fake_mode\n\u001b[1;32m 1086\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrestore_global_state():\n\u001b[0;32m-> 1087\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_user_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1088\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m disable(compiled_fn)\n\u001b[1;32m 1090\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstats\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munique_graphs\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in 
\u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1140\u001b[0m, in \u001b[0;36mOutputGraph.call_user_compiler\u001b[0;34m(self, gm)\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mverify_correctness:\n\u001b[1;32m 1139\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m WrapperBackend(compiler_fn)\n\u001b[0;32m-> 1140\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1141\u001b[0m _step_logger()(logging\u001b[38;5;241m.\u001b[39mINFO, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdone 
compiler function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1142\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(compiled_fn), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompiler_fn did not return callable\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py:117\u001b[0m, in \u001b[0;36mwrap_backend_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 117\u001b[0m compiled_gm \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_gm\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/__init__.py:1662\u001b[0m, in \u001b[0;36m_TorchCompileInductorWrapper.__call__\u001b[0;34m(self, model_, inputs_)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_, inputs_):\n\u001b[1;32m 1660\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_inductor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompile_fx\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compile_fx\n\u001b[0;32m-> 1662\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompile_fx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mconfig_patches\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1168\u001b[0m, in \u001b[0;36mcompile_fx\u001b[0;34m(model_, example_inputs_, inner_compile, config_patches, decompositions)\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inference_compiler(unlifted_gm, example_inputs_)\n\u001b[1;32m 1165\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_fake_mode(fake_mode), torch\u001b[38;5;241m.\u001b[39m_guards\u001b[38;5;241m.\u001b[39mtracing(\n\u001b[1;32m 1166\u001b[0m tracing_context\n\u001b[1;32m 1167\u001b[0m ), compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m-> 1168\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43maot_autograd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[43mfw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43mbw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1172\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecompositions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecompositions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1173\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_fn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1174\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mkeep_inference_input_mutations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1175\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs_\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/backends/common.py:55\u001b[0m, in \u001b[0;36maot_autograd..compiler_fn\u001b[0;34m(gm, example_inputs)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# NB: NOT cloned!\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m enable_aot_logging(), patch_config:\n\u001b[0;32m---> 55\u001b[0m cg \u001b[38;5;241m=\u001b[39m \u001b[43maot_module_simplified\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot_autograd\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m disable(cg)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:887\u001b[0m, in \u001b[0;36maot_module_simplified\u001b[0;34m(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)\u001b[0m\n\u001b[1;32m 871\u001b[0m aot_config \u001b[38;5;241m=\u001b[39m AOTConfig(\n\u001b[1;32m 872\u001b[0m 
fw_compiler\u001b[38;5;241m=\u001b[39mfw_compiler,\n\u001b[1;32m 873\u001b[0m bw_compiler\u001b[38;5;241m=\u001b[39mbw_compiler,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 883\u001b[0m no_tangents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 884\u001b[0m )\n\u001b[1;32m 886\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m--> 887\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_aot_dispatcher_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 888\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunctional_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 889\u001b[0m \u001b[43m \u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 890\u001b[0m \u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 891\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;66;03m# TODO: There is something deeply wrong here; compiled_fn running with\u001b[39;00m\n\u001b[1;32m 894\u001b[0m \u001b[38;5;66;03m# the boxed calling convention, but aot_module_simplified somehow\u001b[39;00m\n\u001b[1;32m 895\u001b[0m \u001b[38;5;66;03m# historically returned a function that was not the boxed calling\u001b[39;00m\n\u001b[1;32m 896\u001b[0m \u001b[38;5;66;03m# convention. 
This should get fixed...\u001b[39;00m\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39mruntime_args):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:600\u001b[0m, in \u001b[0;36mcreate_aot_dispatcher_function\u001b[0;34m(flat_fn, flat_args, aot_config)\u001b[0m\n\u001b[1;32m 597\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m partial(aot_wrapper_dedupe, compiler_fn\u001b[38;5;241m=\u001b[39mcompiler_fn)\n\u001b[1;32m 598\u001b[0m \u001b[38;5;66;03m# You can put more passes here\u001b[39;00m\n\u001b[0;32m--> 600\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mfake_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m aot_config\u001b[38;5;241m.\u001b[39mis_export:\n\u001b[1;32m 602\u001b[0m mutated_user_inp_locs \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 603\u001b[0m idx \u001b[38;5;241m-\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 604\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m fw_metadata\u001b[38;5;241m.\u001b[39mmutated_inp_runtime_indices\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m idx \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 606\u001b[0m ]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:425\u001b[0m, in \u001b[0;36maot_wrapper_dedupe\u001b[0;34m(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ok:\n\u001b[0;32m--> 425\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mleaf_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(leaf_flat_args, fw_metadata):\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 429\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\\\u001b[39;00m\n\u001b[1;32m 430\u001b[0m \u001b[38;5;124;03mEncountered duplicate inputs that are mutated in the graph, but at least one input/output\u001b[39;00m\n\u001b[1;32m 431\u001b[0m \u001b[38;5;124;03mto the graph is a tensor subclass. This is not supported today. You can try to\u001b[39;00m\n\u001b[1;32m 432\u001b[0m \u001b[38;5;124;03mremove the aliasing yourself as a workaround, or otherwise file an issue on github.\"\"\"\u001b[39;00m\n\u001b[1;32m 433\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:630\u001b[0m, in \u001b[0;36maot_wrapper_synthetic_base\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)\u001b[0m\n\u001b[1;32m 628\u001b[0m \u001b[38;5;66;03m# Happy path: we don't need synthetic bases\u001b[39;00m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m synthetic_base_info \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 630\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 632\u001b[0m \u001b[38;5;66;03m# export path: ban synthetic bases for now, add later if requested.\u001b[39;00m\n\u001b[1;32m 633\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(flat_args, fw_metadata):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:295\u001b[0m, in 
\u001b[0;36maot_dispatch_autograd\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata)\u001b[0m\n\u001b[1;32m 292\u001b[0m tracing_context\u001b[38;5;241m.\u001b[39mfw_metadata \u001b[38;5;241m=\u001b[39m inner_meta\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m TracingContext\u001b[38;5;241m.\u001b[39mreport_output_strides() \u001b[38;5;28;01mas\u001b[39;00m fwd_output_strides:\n\u001b[0;32m--> 295\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m \u001b[43maot_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfw_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43madjusted_flat_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(compiled_fw_func, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 297\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m make_boxed_func(compiled_fw_func)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1100\u001b[0m, in \u001b[0;36mcompile_fx..fw_compiler_base\u001b[0;34m(model, example_inputs, is_inference)\u001b[0m\n\u001b[1;32m 1092\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m orig_output_end_idx \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m num_model_outputs\n\u001b[1;32m 1094\u001b[0m user_visible_outputs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 1095\u001b[0m n\u001b[38;5;241m.\u001b[39mname\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m n \u001b[38;5;129;01min\u001b[39;00m model_outputs[original_output_start_index:orig_output_end_idx]\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(n, torch\u001b[38;5;241m.\u001b[39mfx\u001b[38;5;241m.\u001b[39mNode)\n\u001b[1;32m 1098\u001b[0m }\n\u001b[0;32m-> 1100\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1101\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1102\u001b[0m \u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1103\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_fixed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfixed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1104\u001b[0m \u001b[43m \u001b[49m\u001b[43mcudagraphs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcudagraphs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1105\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mgraph_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgraph_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1106\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_inference\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_inference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1107\u001b[0m \u001b[43m \u001b[49m\u001b[43mboxed_forward_device_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforward_device\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1108\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_visible_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_visible_outputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1109\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py:83\u001b[0m, in \u001b[0;36mwrap_compiler_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 81\u001b[0m \u001b[38;5;66;03m# Call the compiler_fn - which is either aot_autograd or inductor\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;66;03m# with fake inputs\u001b[39;00m\n\u001b[0;32m---> 83\u001b[0m inner_compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 85\u001b[0m \u001b[38;5;66;03m# 
TODO: Failures here are troublesome because no real inputs,\u001b[39;00m\n\u001b[1;32m 86\u001b[0m \u001b[38;5;66;03m# need a different serialization strategy\u001b[39;00m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/debug.py:305\u001b[0m, in \u001b[0;36mDebugContext.wrap..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m DebugContext():\n\u001b[0;32m--> 305\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:320\u001b[0m, in \u001b[0;36mcompile_fx_inner\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 316\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m FxGraphCache\u001b[38;5;241m.\u001b[39mload(\n\u001b[1;32m 317\u001b[0m fx_codegen_and_compile, gm, example_inputs, graph_kwargs\n\u001b[1;32m 318\u001b[0m )\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m \u001b[43mfx_codegen_and_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 321\u001b[0m \u001b[43m \u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mgraph_kwargs\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[arg-type]\u001b[39;49;00m\n\u001b[1;32m 322\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 324\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFX codegen and compilation took \u001b[39m\u001b[38;5;132;01m%.3f\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m, time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start)\n\u001b[1;32m 326\u001b[0m \u001b[38;5;66;03m# Return the output strides to the caller via TracingContext\u001b[39;00m\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:535\u001b[0m, in \u001b[0;36mfx_codegen_and_compile\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 519\u001b[0m graph \u001b[38;5;241m=\u001b[39m GraphLowering(\n\u001b[1;32m 520\u001b[0m gm,\n\u001b[1;32m 521\u001b[0m \u001b[38;5;66;03m# example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 532\u001b[0m is_inference\u001b[38;5;241m=\u001b[39mis_inference,\n\u001b[1;32m 533\u001b[0m )\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_graph_handler(graph):\n\u001b[0;32m--> 535\u001b[0m \u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 536\u001b[0m output_strides: List[Optional[Tuple[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m]]] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m graph\u001b[38;5;241m.\u001b[39mgraph_outputs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;66;03m# We'll put the output strides in the compiled graph so we\u001b[39;00m\n\u001b[1;32m 539\u001b[0m \u001b[38;5;66;03m# can later return them to the caller via TracingContext\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:519\u001b[0m, in \u001b[0;36mGraphLowering.run\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;129m@dynamo_timed\u001b[39m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m--> 519\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/fx/interpreter.py:138\u001b[0m, in \u001b[0;36mInterpreter.run\u001b[0;34m(self, initial_env, enable_io_processing, *args)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 138\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv[node] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_node\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mextra_traceback:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:814\u001b[0m, in \u001b[0;36mGraphLowering.run_node\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 812\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayout_constraints\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 813\u001b[0m args, kwargs \u001b[38;5;241m=\u001b[39m layout_constraints[n\u001b[38;5;241m.\u001b[39mtarget](n, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 814\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_magic_method(n\u001b[38;5;241m.\u001b[39mtarget):\n\u001b[1;32m 816\u001b[0m \u001b[38;5;66;03m# TODO: this is sus, it probably should be handled in the\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;66;03m# lowerings themselves similarly to sym_size/sym-stride\u001b[39;00m\n\u001b[1;32m 818\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_magic_method\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:691\u001b[0m, in \u001b[0;36mGraphLowering.call_function\u001b[0;34m(self, target, args, kwargs)\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 690\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m via \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, lowerings[target])\n\u001b[0;32m--> 691\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mlowerings\u001b[49m\u001b[43m[\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 692\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 693\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_lowering.py:117\u001b[0m, in \u001b[0;36mconvolution\u001b[0;34m(x, weight, bias, stride, padding, dilation, transposed, output_padding, groups)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 116\u001b[0m mlir_template \u001b[38;5;241m=\u001b[39m MLIRConvTemplate([x, weight, bias], layout, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmlir_template\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39moutput_node()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_template.py:1189\u001b[0m, in \u001b[0;36mMLIRTemplate.generate\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 1184\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m patch\u001b[38;5;241m.\u001b[39mobject(V\u001b[38;5;241m.\u001b[39mgraph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mget_dtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fake_get_dtype(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node)):\n\u001b[1;32m 1185\u001b[0m kernel \u001b[38;5;241m=\u001b[39m MLIRTemplateKernel(kernel_name\u001b[38;5;241m=\u001b[39mkernel_name, input_nodes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minput_nodes, call_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayout\u001b[38;5;241m.\u001b[39msize, kernel_group\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1186\u001b[0m outer_func_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunction_name \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfunction_name\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1187\u001b[0m outer_func_render\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mouter_func_render \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mouter_func_render\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1188\u001b[0m kernel_arg_attributes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_arg_attributes() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mget_arg_attributes\u001b[39m\u001b[38;5;124m'\u001b[39m) 
\u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1189\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1191\u001b[0m kernel_hash_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmlir_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex_counter)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1192\u001b[0m extra_args \u001b[38;5;241m=\u001b[39m []\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py:238\u001b[0m, in \u001b[0;36mMLIRConvSingleBatchTemplate.render\u001b[0;34m(self, kernel, template_buffer_node, epilogue_nodes, tile_info, **kwargs)\u001b[0m\n\u001b[1;32m 229\u001b[0m kernel\u001b[38;5;241m.\u001b[39mepilogue_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\n\u001b[1;32m 230\u001b[0m output_node \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node\u001b[38;5;241m.\u001b[39mname,\n\u001b[1;32m 231\u001b[0m sram_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput_buffer\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 235\u001b[0m dim_aliasing \u001b[38;5;241m=\u001b[39m 
{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex0\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mc0\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex1\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_n\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex2\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mo_h\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex3\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_m\u001b[39m\u001b[38;5;124m\"\u001b[39m}\n\u001b[1;32m 236\u001b[0m )\n\u001b[1;32m 237\u001b[0m kernel\u001b[38;5;241m.\u001b[39mexception_nodes[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumel\u001b[39m\u001b[38;5;124m\"\u001b[39m : (I_W\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_W)\u001b[38;5;241m*\u001b[39m(I_H\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_H)\u001b[38;5;241m*\u001b[39mI_C\u001b[38;5;241m*\u001b[39mBATCH}\n\u001b[0;32m--> 238\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_template_from_string\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconv_template\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 239\u001b[0m 
kernel\u001b[38;5;241m.\u001b[39madd_loop_info([kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBATCH\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_C\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mI_C\u001b[39m\u001b[38;5;124m\"\u001b[39m]], [kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_M\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_N\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_K\u001b[39m\u001b[38;5;124m\"\u001b[39m]])\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m code\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/jinja2/environment.py:1299\u001b[0m, in \u001b[0;36mTemplate.render\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1296\u001b[0m ctx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_context(\u001b[38;5;28mdict\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n\u001b[1;32m 1298\u001b[0m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1299\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menvironment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconcat\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroot_render_func\u001b[49m\u001b[43m(\u001b[49m\u001b[43mctx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 1300\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1301\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menvironment\u001b[38;5;241m.\u001b[39mhandle_exception()\n", + "File \u001b[0;32m