diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 61eb96e1..eba48da2 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -6,7 +6,7 @@ on: jobs: build-and-test: - runs-on: ubuntu-latest + runs-on: self-hosted permissions: contents: read @@ -35,6 +35,7 @@ jobs: context: . file: ./Dockerfile push: true + no-cache: true tags: ghcr.io/psal-postech/torchsim-test:${{ github.sha }} # Step 4: Wait for GHCR propagation diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index c27df48a..32d6543c 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -662,3 +662,37 @@ jobs: -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py + + test_accuracy: + name: Run test_accuracy + runs-on: self-hosted + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Prepare volume directory + run: mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} + + - name: Run run_cycle.sh + run: | + echo "Running run_cycle.sh" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} bash -c \ + "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && \ + cp PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out" + ls /tmp/torchsim-ci/${GITHUB_SHA} + + - name: Upload Accuracy Report Artifact + uses: actions/upload-artifact@v4 + with: + name: accuracy-report + path: /tmp/torchsim-ci/${{ github.sha }}/summary_cycle.out + if-no-files-found: error diff --git a/.gitignore 
b/.gitignore index 88eb2fb8..9decced5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ __pycache__/ -PyTorchSimBackend/build/ +TOGSim/build/ .vscode diff --git a/.gitmodules b/.gitmodules index f65e5f2b..24f9ccaf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,18 +1,15 @@ -[submodule "PyTorchSimBackend/extern/onnx"] - path = PyTorchSimBackend/extern/onnx +[submodule "TOGSim/extern/onnx"] + path = TOGSim/extern/onnx url = https://github.com/onnx/onnx.git -[submodule "PyTorchSimBackend/extern/protobuf"] - path = PyTorchSimBackend/extern/protobuf +[submodule "TOGSim/extern/protobuf"] + path = TOGSim/extern/protobuf url = https://github.com/protocolbuffers/protobuf.git -[submodule "PyTorchSimBackend/extern/booksim"] - path = PyTorchSimBackend/extern/booksim +[submodule "TOGSim/extern/booksim"] + path = TOGSim/extern/booksim url = https://github.com/PSAL-POSTECH/booksim.git -[submodule "PyTorchSimBackend/extern/torch2timeloop"] - path = PyTorchSimBackend/extern/torch2timeloop - url = https://github.com/Accelergy-Project/pytorch2timeloop-converter.git -[submodule "PyTorchSimBackend/extern/ramulator2"] - path = PyTorchSimBackend/extern/ramulator2 +[submodule "TOGSim/extern/ramulator2"] + path = TOGSim/extern/ramulator2 url = https://github.com/PSAL-POSTECH/ramulator2 -[submodule "PyTorchSimBackend/extern/stonneCore"] - path = PyTorchSimBackend/extern/stonneCore +[submodule "TOGSim/extern/stonneCore"] + path = TOGSim/extern/stonneCore url = https://github.com/PSAL-POSTECH/stonne_core.git diff --git a/Dockerfile b/Dockerfile index 293dcb60..37721940 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ FROM ghcr.io/psal-postech/torchsim_base:latest # Prepare PyTorchSim project COPY . /workspace/PyTorchSim -RUN cd PyTorchSim/PyTorchSimBackend && \ +RUN cd PyTorchSim/TOGSim && \ mkdir -p build && \ cd build && \ conan install .. 
--build=missing && \ diff --git a/Dockerfile.ksc2025 b/Dockerfile.ksc2025 new file mode 100644 index 00000000..2ac210e0 --- /dev/null +++ b/Dockerfile.ksc2025 @@ -0,0 +1,90 @@ +# Copyright (c) 2020 The Regents of the University of California +# All Rights Reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime + +# Copied from Gem5 Docker file +ENV DEBIAN_FRONTEND=noninteractive +RUN apt -y update && apt -y upgrade && \ + apt -y install build-essential git m4 scons zlib1g zlib1g-dev \ + libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \ + python3-dev python-is-python3 doxygen libboost-all-dev \ + libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \ + python3-venv black libssl-dev libasan5 libubsan1 +RUN pip install mypy pre-commit jupyter + +# Pass Access Token securely +ENV PATH=$PATH:/root/.local/bin +ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH + +# Build Gem5 +RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch TorchSim +RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) +ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt + +# Build LLVM RISC-V +RUN git clone https://github.com/PSAL-POSTECH/llvm-project.git --branch torchsim --depth 1 +RUN cd llvm-project && mkdir build && cd build && \ + cmake -DLLVM_ENABLE_PROJECTS=mlir -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/riscv-llvm -DLLVM_TARGETS_TO_BUILD=RISCV -G "Unix Makefiles" ../llvm && \ + make -j && make install + +# Store RISC-V LLVM for TorchSim +ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin +ENV TORCHSIM_LLVM_INCLUDE_PATH=/riscv-llvm/include +ENV TORCHSIM_DIR=/workspace/PyTorchSim +ENV LLVM_DIR=/riscv-llvm + +# Download RISC-V tool chain +RUN apt install -y wget && \ + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ + tar -zxvf riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && tar -zxvf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz 
&& \ + rm *.tar.gz + +ENV RISCV=/workspace/riscv +ENV PATH=$RISCV/bin:$PATH + +# Install Spike simulator +RUN apt -y install device-tree-compiler +RUN git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \ + ../configure --prefix=$RISCV && make -j && make install + +# Install Proxy kernel +RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ + cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 && mkdir build && cd build && \ + ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install + +# Install torchsim dependency +RUN apt install -y ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 + +# Prepare ONNXim project +RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial +RUN cd PyTorchSim/TOGSim && \ + git submodule update --recursive --init && \ + mkdir -p build && \ + cd build && \ + conan install .. --build=missing && \ + cmake .. 
&& \ + make -j$(nproc) \ No newline at end of file diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json deleted file mode 100644 index 8f196e81..00000000 --- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "core_type" : ["stonne", "ws_mesh"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json deleted file mode 100644 index c7ef15f7..00000000 --- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - 
"dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json deleted file mode 100644 index 2293e197..00000000 --- a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 8, - - "dram_type" : "ramulator2", - "dram_freq" : 700, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json deleted file mode 100644 index 08548638..00000000 --- a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 65536, - 
"core_print_interval" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 32, - - "dram_type" : "simple", - "dram_freq" : 1000, - "dram_channels": 1, - "dram_req_size": 32, - "dram_latency" : 100, - "dram_print_interval": 10000, - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json deleted file mode 100644 index 5d7b0d35..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - - "dram_type" : "ramulator2", - "dram_freq" :700, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json deleted file mode 100644 index 38acafc0..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 
10000, - - "dram_type" : "ramulator2", - "dram_freq" : 700, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 10000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json deleted file mode 100644 index 7348d5bc..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0": 0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json deleted file mode 100644 index 69ec8bd0..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - 
"num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0": 0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json deleted file mode 100644 index bff4e224..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1050, - "sram_size" : 16777216, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 4, - - "dram_type" : "ramulator2", - "dram_freq" :1200, - "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 19200, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json deleted file mode 100644 index b2661894..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "num_cores" : 2, - 
"core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_num_partitions" : 2, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - "icnt_print_interval" : 10000, - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json deleted file mode 100644 index 922ede5b..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_num_partitions" : 1, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json deleted file mode 100644 index 034542fe..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, - - "dram_type" : "ramulator2", - "dram_freq" :700, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 20000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json deleted file mode 100644 index 82f42c00..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json deleted file mode 100644 index 132a52e6..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "dram_type" : "ramulator2", - "dram_freq" : 940, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json deleted file mode 100644 index a93e8ae2..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1050, - "sram_size" : 32768, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 4, - - "dram_type" : "ramulator2", - "dram_freq" :1200, - "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq" : 48000, - "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": 
{ - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json deleted file mode 100644 index e9a64f2e..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json deleted file mode 100644 index 37e18b35..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json deleted file mode 100644 index 
49225d77..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json deleted file mode 100644 index 4ea2c6ff..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json deleted file mode 100644 index 8aee751b..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 
100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json deleted file mode 100644 index f76fec32..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "core_type" : ["ws_mesh","ws_mesh"], - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m4.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json deleted file mode 100644 index 7571b830..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - 
"dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m8.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json deleted file mode 100644 index be163336..00000000 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, - - "dram_type" : "ramulator2", - "dram_freq" :800, - "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" -} \ No newline at end of file diff --git a/PyTorchSimBackend/extern/torch2timeloop b/PyTorchSimBackend/extern/torch2timeloop deleted file mode 160000 index 62aa1754..00000000 --- a/PyTorchSimBackend/extern/torch2timeloop +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 62aa175421165cc9cd7dfb182a02fc3e26c01e3a diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/TMA.cc deleted file mode 100644 index 7744b0f5..00000000 --- a/PyTorchSimBackend/src/TMA.cc +++ /dev/null @@ -1,48 +0,0 @@ -#include "TMA.h" -#include "TileGraph.h" - -TMA::TMA(uint32_t id, uint32_t dram_req_size) { - _id = id; - _dram_req_size = dram_req_size; - _current_inst = nullptr; - _finished = true; -} - -void TMA::issue_tile(std::shared_ptr inst) { - _current_inst = std::move(inst); - std::vector& tile_size = _current_inst->get_tile_size(); - if (tile_size.size() <= 0 || tile_size.size() 
> get_max_dim()) { - spdlog::error("[TMA {}] issued tile is not supported format..", _id); - exit(EXIT_FAILURE); - } - _finished = false; -} - -std::shared_ptr> TMA::get_memory_access() { - auto addr_set = _current_inst->get_dram_address(_dram_req_size); - auto access_vec = std::make_shared>(); - Tile* owner = (Tile*)_current_inst->get_owner(); - std::shared_ptr owner_subgraph = owner->get_owner(); - unsigned long long base_daddr = _current_inst->get_base_dram_address(); - // Todo. We use a ternsor level buffer allocation, so we don't need to check all memfetch - bool is_cacheable = owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); - spdlog::trace("[SRAM Trace] Core-{}, Address: 0x{:016x}, Is_cacheable: {}", _id, base_daddr, is_cacheable); - spdlog::trace("[NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", - _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write()); - - for (auto addr: *addr_set) { - mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R; - mf_type type = _current_inst->is_dma_write() ? 
mf_type::WRITE_REQUEST : mf_type::READ_REQUEST; - mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, _current_inst->get_numa_id(), static_cast(_current_inst.get())); - access->set_cacheable(is_cacheable); - _current_inst->inc_waiting_request(); - access_vec->push_back(access); - } - _finished = true; - return access_vec; -} - -uint32_t TMA::generate_mem_access_id() { - static uint32_t id_counter{0}; - return id_counter++; -} \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 20152e9f..577c45e9 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -7,7 +7,7 @@ from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen from PyTorchSimFrontend import extension_config -from Simulator.simulator import FunctionalSimulator, CycleSimulator, BackendSimulator +from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator LOCK_TIMEOUT = 600 @@ -27,21 +27,6 @@ def dump_metadata(args, arg_attributes, path): file.write(f'{arg_name}=({arg_attribute[0]}, {arg.dtype}, {arg.shape})\n') return -def parse_stack_sizes(file_path): - meta_path = file_path.split(".")[0]+".meta" - cmd = ["riscv64-unknown-elf-objcopy", "--dump-section", f".stack_sizes={meta_path}", file_path, "/dev/null"] - subprocess.run(cmd, check=True) - - with open(meta_path, 'rb') as f: - stack_sizes_data = list(f.read()) - if len(stack_sizes_data) <= 17: - raise ValueError("Invalid .stack_sizes section size") - - stack_size_bytes = stack_sizes_data[8:-9] - stack_size = int.from_bytes(stack_size_bytes, byteorder='little') - return stack_size - - def llvm_compile_command(input, output): opt_output = f"{input[:-3]}_opt.ll" return [re.sub(r"[ \n]+", " ", @@ -142,6 +127,10 @@ class SpadOverflowError(Exception): def __init__(self, message="SPAD overflow occurred."): 
super().__init__(message) +class TileSizeError(Exception): + def __init__(self, message="SPAD overflow occurred."): + super().__init__(message) + class MLIRCodeCache: cache = dict() clear = staticmethod(cache.clear) # Todo: Cache @@ -176,7 +165,7 @@ def load(cls, source_code, else: link_option = "" # Generate LLVM kernel calller and binary for validation - if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: + if extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: # Use custom malloc to avoid size error new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free" cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen) @@ -193,7 +182,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, arg_attributes) + val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, arg_attributes) val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name, new_link_option) @@ -224,7 +213,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return key # Generate MLIR kernel calller and binary for cycle calculation @@ -278,8 +267,12 @@ def task(): loop_size = kwargs["loop_size"] else: loop_size = [] + + # In the autotune mode, skip validation to speed up + autotune = kwargs.get('autotune', False) + validate = kwargs.get('validate', False) if not autotune else False + def dummy_simulator(*args, **kwargs): - validate = kwargs.get('validate', False) # Wait for compilation key = future.result() from filelock import FileLock @@ -291,57 +284,49 @@ def dummy_simulator(*args, **kwargs): # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = 
FunctionalSimulator.get_runtime_dump_path(result_path) - if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate: + if not autotune and (extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE or validate): funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, vectorlane_size=vectorlane_size, spad_info=spad_info, cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS, silent_mode=silent_mode) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + backsim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG) backsim.vectorlane_size = vectorlane_size attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size) result_path = backsim.simulation(onnx_path, attribute_path, silent_mode=silent_mode) - result = BackendSimulator.get_result_from_file(result_path) + result = TOGSimulator.get_result_from_file(result_path) return result def dryrun_simulator(*args, **kwargs): - autotune = kwargs.get('autotune', False) key = future.result() - # Run simulator pass - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key)) - # Dump arguments and meta data - dump_metadata(args, arg_attributes, result_path) - runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: - return - - if autotune: - onnx_path = os.path.join(result_path, "tile_graph.onnx") - attribute_path = os.path.join(runtime_path, "attribute") - backend_path = 
os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) - backsim.vectorlane_size = vectorlane_size - attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size) - result_path_2 = backsim.simulation(onnx_path, attribute_path) - result = BackendSimulator.get_result_from_file(result_path_2) - return result_path, runtime_path, result + from filelock import FileLock + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + # Run simulator pass + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key)) + # Dump arguments and meta data + dump_metadata(args, arg_attributes, result_path) + runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: + return - # Todo. Support valude dependent mode for graph mode - if False: # extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: - funcsim = FunctionalSimulator(result_path, key) - funcsim.run_spike(args, arg_attributes, - runtime_path, self.validation_binary_name, - vectorlane_size=vectorlane_size, spad_info=spad_info, - cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) + # Todo. 
Support valude dependent mode for graph mode + if False: # extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: + funcsim = FunctionalSimulator(result_path, key) + funcsim.run_spike(args, arg_attributes, + runtime_path, self.validation_binary_name, + vectorlane_size=vectorlane_size, spad_info=spad_info, + cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) return result_path, runtime_path, None - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) and not autotune target_simulator = dryrun_simulator if is_dryrun else dummy_simulator target_simulator.arg_attributes = arg_attributes target_simulator.future = future diff --git a/PyTorchSimFrontend/extension_codegen_backend.py b/PyTorchSimFrontend/extension_codegen_backend.py deleted file mode 100644 index e569d251..00000000 --- a/PyTorchSimFrontend/extension_codegen_backend.py +++ /dev/null @@ -1,216 +0,0 @@ -import dataclasses -import contextlib -from typing import List -from typing import Dict -from torch._inductor.codegen import cpp, wrapper, common -from torch._inductor.scheduler import BaseScheduling -from torch._inductor.virtualized import V -from torch._inductor.utils import IndentedBuffer -import sympy - -cexpr = cpp.CppPrinter().doprint - -class ExtensionWrapperCodegen(wrapper.WrapperCodeGen): - def __init__(self): - super().__init__() - -class ExtensionOverrides(common.OpOverrides): - pass - -class ExtensionKernel(common.Kernel): - overrides = ExtensionOverrides - newvar_prefix = "auto " - suffix = ";" - - def __init__(self, args=None): - super().__init__(args) - self.call_ranges = None - self.ranges = None - self.itervars = None - self.reduction_depth = None - self.reduction_prefix = IndentedBuffer() - self.reduction_suffix = IndentedBuffer() - self.reduction_vars = {} - self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc") - - def load(self, name: str, index: sympy.Expr): - index = 
self.rename_indexing(index) - var = self.args.input(name) - line = f"{var}[{index}]" - dtype = V.graph.get_dtype(name) - self.cse.prefix = cpp.DTYPE_TO_CPP[dtype] + " " - return self.cse.generate(self.loads, line) - - def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): - index = self.rename_indexing(index) - var = self.args.output(name) - line = f"{var}[{index}] = {value}" - self.cse.generate(self.stores, line, assignment = False) - - def reduction(self, dtype, src_dtype, reduction_type, value): - argmax_or_argmin = reduction_type in {"argmax", "argmin"} - if argmax_or_argmin: - raise NotImplementedError() #TODO: argmin, argmax - else: - reduction_key = src_dtype, reduction_type, value - acc = self.reduction_cse.generate( - self.loads, f"reduction {reduction_key}", write=False - ) - self.reduction_vars[acc] = reduction_type - acc_type = cpp.reduction_acc_type(reduction_type, dtype) - self.reduction_prefix.writeline(f"{acc_type} {acc} = {cpp.reduction_init(reduction_type, dtype)};") - line = f"{acc} = {cpp.reduction_combine(reduction_type, acc, value)}" - self.cse.generate(self.stores, line, assignment = False) - self.reduction_cse.reduction_cache[reduction_key] = acc - return acc - - def store_reduction(self, name, index, value): - index = self.rename_indexing(index) - var = self.args.output(name) - self.reduction_suffix.writeline(f"{var}[{index}] = {value};")\ - - def codegen_loops(self): - code = common.BracesBuffer() - # Loop body part - loops = [LoopLevel(var, size) for var, size in zip(self.itervars, self.ranges)] - loops, reductions = [LoopNest(loops[: self.reduction_depth]), - LoopNest(loops[self.reduction_depth :])] - reductions.mark_reduction(self.reduction_vars) - - with contextlib.ExitStack() as stack: - loops.codegen(code, stack) - with contextlib.ExitStack() as stack_outer: - if self.reduction_prefix: - stack_outer.enter_context(code.indent()) - code.splice(self.reduction_prefix) - - with contextlib.ExitStack() as stack: - 
reductions.codegen(code, stack) - code.splice(self.loads) - code.splice(self.compute) - code.splice(self.stores) - code.splice(self.reduction_suffix) - return code - - def define_kernel(self, wrapper, src_code, kernel_name): - if src_code in wrapper.src_to_kernel: - kernel_name = wrapper.src_to_kernel[src_code] - else: - wrapper.src_to_kernel[src_code] = kernel_name - wrapper.define_kernel(kernel_name, src_code, cuda=False) - - def codegen_kernel(self, wrapper): - arg_defs, call_args, arg_types = self.args.cpp_argdefs() - arg_defs = ",\n".ljust(25).join(arg_defs) - arg_types = ",".join(arg_types) - code = common.BracesBuffer() - - # Todo. kernel name custom - kernel_name = f"Extensin_Kernel" - kernel_decl_name = kernel_name if V.graph.cpp_wrapper else "kernel" - code.writeline(f'extern "C" void {kernel_decl_name}({arg_defs})') - with code.indent(): - for old, new in self.args.aliases(): - code.writeline(f"auto {old} = {new};") - # Loop body part - code.splice(self.codegen_loops()) - - codecache_def = IndentedBuffer() - if not V.graph.cpp_wrapper: - codecache_def.writeline("async_compile.cpp('''") - codecache_def.splice(code) - if not V.graph.cpp_wrapper: - codecache_def.writeline("''')") - - self.define_kernel(wrapper, codecache_def.getvalue(), kernel_name) - # generate the code to call this - wrapper.generate_kernel_call(kernel_name, call_args, cuda=False) - print(code.getvalue()) - return code.getvalue() - - def set_ranges(self, lengths, reduction_lengths): - if self.call_ranges: - assert self.call_ranges == tuple(lengths) + tuple( - reduction_lengths - ), f"{self.call_ranges} == {tuple(lengths)} + {tuple(reduction_lengths)}" - assert self.reduction_depth == len(lengths) - else: - self.call_ranges = tuple(lengths) + tuple(reduction_lengths) - self.ranges = [self.rename_indexing(x) for x in self.call_ranges] - self.itervars = [sympy.Symbol(f"i{n}") for n in range(len(self.ranges))] - self.reduction_depth = len(lengths) - return ( - self.itervars[: 
self.reduction_depth], - self.itervars[self.reduction_depth :], - ) - -@dataclasses.dataclass -class LoopLevel: - var: sympy.Expr - size: sympy.Expr - reduction_vars: Dict[str, str] = None - - # Todo. Type change for reduction - INDEX_TYPE = "long" - def lines(self): - line = f"for({self.INDEX_TYPE} {self.var}=0; {self.var}<{cexpr(self.size)}; ++{self.var})" - return [line] - -@dataclasses.dataclass -class LoopNest: - loops: List[LoopLevel] - - def __bool__(self): - return bool(self.loops) - - def mark_reduction(self, reduction_vars): - for loop in self.loops: - loop.reduction_vars = reduction_vars - - def mark_parallel(self, par_depth): - loops = self.loops - loops[0].parallel = par_depth - for i in range(1, par_depth): - loops[i].collapsed = True - loops[0].simd = loops[par_depth - 1].simd - - def codegen(self, code, stack): - for loop in self.loops: - code.writelines(loop.lines()) - stack.enter_context(code.indent()) - -class ExtensionScheduling(BaseScheduling): - count = 0 - def __init__(self, scheduler): - self.scheduler = scheduler - self._scheduling = cpp.CppScheduling(scheduler) - - def can_fuse_vertical(self, node1, node2): - return False - - def can_fuse_horizontal(self, node1, node2): - return False - - def group_fn(self, sizes): - return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes) - - def codegen_nodes(self, nodes): - _, (group, reduction_group) = max( - nodes, key=lambda x: int(x.is_reduction()) - ).group - - ex_kernel = ExtensionKernel() - for node in nodes: - vars, reduction_vars = ex_kernel.set_ranges(group, reduction_group) - with ex_kernel: - node.run(vars, reduction_vars) - - wrapper = V.graph.wrapper_code - ex_kernel.codegen_kernel(wrapper) - pass - - def codegen_sync(self): - pass - - def flush(self): - self._scheduling.flush() \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 59f3818c..3d6fbb76 100644 --- a/PyTorchSimFrontend/extension_config.py +++ 
b/PyTorchSimFrontend/extension_config.py @@ -3,74 +3,124 @@ import tempfile import importlib -# Hardware info config -CONFIG_VECTOR_LANE = int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128)) -CONFIG_VECTOR_LANE_STRIDE = int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2)) -CONFIG_SPAD_INFO = { - "spad_vaddr" : 0xD0000000, - "spad_paddr" : 0x2000000000, - "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane -} -CONFIG_PRECISION = 4 # 32bit -CONFIG_NUM_CORES = 1 -CONFIG_VLEN = 256 # 256bits / 32bits = 8 [elements] - -# Tile size config -CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - -# DUMP PATH -CONFIG_BACKEND_RESULT_PATH_KEY = os.getenv("BACKEND_RESULT_PATH_KEY") - -CONFIG_TORCHSIM_DUMP_PATH = os.environ.get('TORCHSIM_DUMP_PATH', - default = f"{tempfile.gettempdir()}/torchinductor") -CONFIG_TORCHSIM_DUMP_FILE = int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) -CONFIG_TORCHSIM_VALIDATION_MODE = int(os.environ.get('TORCHSIM_VALIDATION_MODE', default=True)) -CONFIG_CLEANUP_DUMP_ARGS = int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) - -# LLVM PATH -CONFIG_TORCHSIM_LLVM_PATH = os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") -CONFIG_TORCHSIM_CUSTOM_PASS_PATH = os.environ.get('TORCHSIM_CUSTOM_PASS_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/GemminiLowerPass/build") -CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) -CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) - -# Backendsim config -CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG', - default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') -CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False)) -CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False)) -CONFIG_BACKENDSIM_DRYRUN = 
int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) -CONFIG_BACKENDSIM_DEBUG_LEVEL = os.environ.get("BACKENDSIM_DEBUG_LEVEL", "") - -# GEM5 config -CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") -CONFIG_GEM5_SCRIPT_PATH = os.environ.get('GEM5_SCRIPT_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/gem5_script/script_systolic.py") - -# AUTOTUNE config -CONFIG_AUTOTUNE = int(os.environ.get('AUTOTUNE', default=True)) -CONFIG_MAX_AUTOTUNE_TRY = int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) - -# For block sparse -CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0)) - -# For GEMM tile size -CONFIG_MANUAL_TILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False)) -CONFIG_TILE_M = int(os.environ.get('TORCHSIM_TILE_M', default=CONFIG_VECTOR_LANE)) -CONFIG_TILE_N = int(os.environ.get('TORCHSIM_TILE_N', default=CONFIG_VECTOR_LANE)) -CONFIG_TILE_K = int(os.environ.get('TORCHSIM_TILE_K', default=CONFIG_VECTOR_LANE)) -CONFIG_GEMM_CHEATSHEET_PATH = os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH', - default=f"{CONFIG_TORCHSIM_DIR}/validation/gemm_tpuv3_cheatsheet.json") -CONFIG_SUBTILE = int(os.environ.get('TORCHSIM_SUBTILE', default=True)) -CONFIG_MANUAL_SUBTILE_SIZE = int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False)) -CONFIG_SUBTILE_M = int(os.environ.get('TORCHSIM_SUBTILE_M', default=CONFIG_VECTOR_LANE)) -CONFIG_SUBTILE_N = int(os.environ.get('TORCHSIM_SUBTILE_N', default=CONFIG_VECTOR_LANE)) -CONFIG_SUBTILE_K = int(os.environ.get('TORCHSIM_SUBTILE_K', default=CONFIG_VECTOR_LANE)) - -# Advanced fusion options -CONFIG_FUSION_REDUCTION_EPILOGUE = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_EPILOGUE', default=True)) -CONFIG_FUSION_REDUCTION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_REDUCTION', default=True)) -CONFIG_FUSION_PROLOGUE = int(os.environ.get('TORCHSIM_FUSION_PROLOGUE', default=True)) +def __getattr__(name): + + # Hardware info config + if name == 
"CONFIG_VECTOR_LANE": + return int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128)) + if name == "CONFIG_VECTOR_LANE_STRIDE": + return int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2)) + if name == "CONFIG_SPAD_INFO": + return { + "spad_vaddr" : 0xD0000000, + "spad_paddr" : 0x2000000000, + "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane + } + if name == "CONFIG_PRECISION": + return 4 # 32bit + if name == "CONFIG_NUM_CORES": + return 1 + if name == "CONFIG_VLEN": + return 256 # 256bits / 32bits = 8 [elements] + + # Tile size config + if name == "CONFIG_TORCHSIM_DIR": + return os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') + + if name == "CONFIG_TORCHSIM_DUMP_PATH": + return os.environ.get('TORCHSIM_DUMP_PATH', default = f"{tempfile.gettempdir()}/torchinductor") + if name == "CONFIG_TORCHSIM_DUMP_FILE": + return int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) + if name == "CONFIG_TORCHSIM_FUNCTIONAL_MODE": + return int(os.environ.get('TORCHSIM_FUNCTIONAL_MODE', default=True)) + if name == "CONFIG_TORCHSIM_TIMING_MODE": + return int(os.environ.get("TORCHSIM_TIMING_MODE", True)) + if name == "CONFIG_CLEANUP_DUMP_ARGS": + return int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) + + # LLVM PATH + if name == "CONFIG_TORCHSIM_LLVM_PATH": + return os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") + if name == "CONFIG_TORCHSIM_CUSTOM_PASS_PATH": + return os.environ.get('TORCHSIM_CUSTOM_PASS_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/GemminiLowerPass/build") + if name == "CONFIG_TORCHSIM_DUMP_MLIR_IR": + return int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) + if name == "CONFIG_TORCHSIM_DUMP_LLVM_IR": + return int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) + + # TOGSim config + if name == "CONFIG_TOGSIM_CONFIG": + return os.environ.get('TORCHSIM_CONFIG', + 
default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json") + if name == "CONFIG_TOGSIM_EAGER_MODE": + return int(os.environ.get("TOGSIM_EAGER_MODE", default=False)) + if name == "CONFIG_TOGSIM_DRYRUN": + return int(os.environ.get('TOGSIM_DRYRUN', default=False)) + if name == "CONFIG_TOGSIM_DEBUG_LEVEL": + return os.environ.get("TOGSIM_DEBUG_LEVEL", "") + + # GEM5 config + if name == "CONFIG_GEM5_PATH": + return os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") + if name == "CONFIG_GEM5_SCRIPT_PATH": + return os.environ.get('GEM5_SCRIPT_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/gem5_script/script_systolic.py") + + # AUTOTUNE config + if name == "CONFIG_AUTOTUNE": + return int(os.environ.get('AUTOTUNE', default=False)) + if name == "CONFIG_AUTOTUNE_TEMPLATE": + return int(os.environ.get('AUTOTUNE_TEMPLATE', default=False)) + if name == "CONFIG_MAX_AUTOTUNE_TRY": + return int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) + if name == "CONFIG_AUTOTUNE_TEMPLATE_TOPK": + return int(os.environ.get('AUTOTUNE_TEMPLATE_TOPK', default=4)) + + # For block sparse + if name == "CONFIG_BLOCK_SPARSE": + return int(os.environ.get('BLOCK_SPARSE', default=0)) + + # For GEMM tile size + if name == "CONFIG_MANUAL_TILE_SIZE": + return int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False)) + if name == "CONFIG_TILE_M": + return int(os.getenv("TORCHSIM_TILE_M", __getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_TILE_N": + return int(os.getenv("TORCHSIM_TILE_N", __getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_TILE_K": + return int(os.getenv("TORCHSIM_TILE_K", __getattr__("CONFIG_VECTOR_LANE"))) + + if name == "CONFIG_SUBTILE": + return int(os.environ.get('TORCHSIM_SUBTILE', default=True)) + if name == "CONFIG_MANUAL_SUBTILE_SIZE": + return int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False)) + if name == "CONFIG_SUBTILE_M": + return 
int(os.environ.get('TORCHSIM_SUBTILE_M', default=__getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_SUBTILE_N": + return int(os.environ.get('TORCHSIM_SUBTILE_N', default=__getattr__("CONFIG_VECTOR_LANE"))) + if name == "CONFIG_SUBTILE_K": + return int(os.environ.get('TORCHSIM_SUBTILE_K', default=__getattr__("CONFIG_VECTOR_LANE"))) + + if name == "CONFIG_GEMM_CHEATSHEET_PATH": + return os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH', + default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/validation/gemm_tpuv3_cheatsheet.json") + # Compiler Optimization + if name == "CONFIG_COMPILER_OPTIMIZATION": + return os.environ.get('TORCHSIM_COMPILER_OPTIMIZATION', default="all") # options: all, none, custom + # Advanced fusion options + if name == "CONFIG_FUSION": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "fusion" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_REDUCTION_EPILOGUE": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "reduction_epliogue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_REDUCTION_REDUCTION": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "reduction_reduction" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_FUSION_PROLOGUE": + return True if ((__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all") or ("prologue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION"))) else False + if name == "CONFIG_SINGLE_BATCH_CONV": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "single_batch_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + if name == "CONFIG_MULTI_TILE_CONV": + return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "multi_tile_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False # SRAM Buffer allocation plan def load_plan_from_module(module_path): @@ -97,3 +147,5 @@ def 
load_plan_from_module(module_path): CONFIG_TLS_MODE = int(os.environ.get('TORCHSIM_TLS_MODE', default=1)) CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0)) + +CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0)) \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py index 22a727c5..167544f2 100644 --- a/PyTorchSimFrontend/extension_op.py +++ b/PyTorchSimFrontend/extension_op.py @@ -13,7 +13,7 @@ from torch._inductor.codecache import write from PyTorchSimFrontend.extension_codecache import get_write_path from PyTorchSimFrontend import extension_config -from Simulator.simulator import BackendSimulator, TORCH_TO_NUMPY +from Simulator.simulator import TOGSimulator, TORCH_TO_NUMPY graph_template = { 0: { @@ -46,7 +46,7 @@ class MLIRExternKernelChoice(ExternKernelChoice): def call_name(self): - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) if is_dryrun: return f"yield from sparse_mm_dummy_stonne_outer" return f"torch.ops.extension_op.{self.name}" @@ -275,11 +275,11 @@ def prepare_outer_product_matrix(a, b, out): def sparse_mm_stonne_outer(a, b, out): onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out) - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json' - backsim = BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_single_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = backsim.simulation(onnx_path) - BackendSimulator.get_result_from_file(result_path) + 
TOGSimulator.get_result_from_file(result_path) # Load result data #with open(c_result_path, 'rb') as f: diff --git a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py deleted file mode 100644 index 3690f533..00000000 --- a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py +++ /dev/null @@ -1,236 +0,0 @@ -import os -import subprocess -import shlex -import re - -from torch._inductor.utils import IndentedBuffer -from torch._inductor.codegen import cpp -from torch._inductor.codecache import write_atomic - -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs - -class LLVMKernelCallerCodeGen(): - """ - Generate C that calls the llvm kernel. - """ - - def __init__(self, validation, arg_attributes): - super().__init__() - self.code = IndentedBuffer() - self.ending = ";" - self.open_bracket = "{" - self.closed_bracket = "}" - self.newline = "\n" - self.kernel_name = "kernel" - self.validation = validation - self.n_arg = len(arg_attributes) - self.arg_attributes = arg_attributes - self.arg_use_count = 1 - self.load_args = {} - self.kernel_start_addr = "" - self.kernel_end_addr = "" - - def get_argv_idx(self): - self.arg_use_count += 1 - return self.arg_use_count-1 - - def write_header(self): - self.writeline('#include ') - self.writeline('#include ') - self.writeline("#include ") - if self.validation: - self.writeline("#include ") - self.writeline('#include ') - self.writeline('#include ') - - def is_in_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_in(value) - - def is_out_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_out(value) - - def load_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_in_arg(arg_name): - argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name] - self.load_args[arg_name] = argv_idx - self.writeline(f'if(load_arg({arg_name}, 
sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def dump_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_out_arg(arg_name): - argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name] - self.writeline(f'if(dump_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def write_exit(self): - self.writeline(f'return 0{self.ending}') - - def generate_kernel_declare(self): - args_type_p = [f'{cpp.DTYPE_TO_CPP[arg_type[1]]}*' for arg_type in self.arg_attributes.values()] - - self.writeline(f"void {self.kernel_name}({', '.join(args_type_p)}){self.ending}{self.newline}") - - def generate_args_define(self): - for arg_name, (_, arg_type, arg_shape) in self.arg_attributes.items(): - self.writeline(f'{cpp.DTYPE_TO_CPP[arg_type]} {arg_name}[atoi(argv[{self.get_argv_idx()}])] __attribute__ ((aligned (4096))){self.ending}') - self.writeline(self.newline) - - def generate_load_dump_fn(self): - self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000000){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) 
{self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - self.writeline(self.closed_bracket) - - def generate_main(self): - self.writeline(f'{self.newline}int main(int argc, char *argv[]) {self.open_bracket}{self.newline}') - with self.code.indent(): - if self.validation: - self.load_arg() - self.writeline(self.newline) - - self.writeline(f"{self.kernel_name}({', '.join(list(self.arg_attributes))}){self.ending}{self.newline}") - - if self.validation: - self.dump_arg() - - self.write_exit() - self.writeline(self.closed_bracket) - - def writeline(self, line): - self.code.writeline(line) - - def generate_wrapper_file(self, path, name): - self.dump_path = path - - self.write_header() - self.generate_kernel_declare() - - if self.validation: - self.generate_load_dump_fn() - self.generate_main() - - write_path = os.path.join(path, name+".c",) - write_atomic(write_path, self.code.getvalue()) - return - - def add_extention(self, name, extension): - return name + "." 
+ extension - - def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): - main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) - main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) - kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) - kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) - - main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' - kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' - - target = os.path.join(write_path, binary_name) - link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' - - main_compile_cmd = shlex.split(main_compile) - kernel_compile_cmd = shlex.split(kernel_compile) - link_cmd = shlex.split(link) - - try: - subprocess.check_call(main_compile_cmd) - subprocess.check_call(kernel_compile_cmd) - subprocess.check_call(link_cmd) - except subprocess.CalledProcessError as e: - print("Command failed with exit code", e.returncode) - print("Error output:", e.output) - assert(0) - - def parse_stack_sizes(self, file_path, vlenb=256): - with open(file_path, 'r') as f: - stack_sizes_data = f.readlines() - - in_proc = False - stack_base = None - dynamic_expr = None - max_offset = 0 - - for line in stack_sizes_data: - line = line.strip() - if line.startswith(".cfi_startproc"): - in_proc = True - continue - elif line.startswith(".cfi_endproc") and in_proc: - if dynamic_expr: - total_stack = eval(dynamic_expr, {"vlenb": vlenb}) - return total_stack - elif stack_base: - return stack_base - else: - return max_offset - - # Skip outer function - if not in_proc: - continue - - if line.startswith(".cfi_def_cfa_offset"): - stack_base = int(line.split()[-1]) - - if ".cfi_escape" in line and "#" in line: - comment = line.split("#")[-1].strip() - m = re.search(r"sp \+ 
(\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) - if m: - base, scale = int(m.group(1)), int(m.group(2)) - dynamic_expr = f"{base} + {scale} * vlenb" - - def get_spad_size(self, binary_path): - cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if result.returncode != 0: - raise RuntimeError(f"Readelf error: {result.stderr}") - - output = result.stdout - spad_start = None - spad_end = None - for line in output.splitlines(): - if '.spad' in line and 'SECTION' in line: - parts = line.split() - spad_start = int(parts[1], 16) - elif 'spad_end' in line: - parts = line.split() - spad_end = int(parts[1], 16) - - if spad_start is None or spad_end is None: - return 0 - spad_size = spad_end - spad_start - return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_common.py b/PyTorchSimFrontend/llvm/llvm_common.py deleted file mode 100644 index 1c76b826..00000000 --- a/PyTorchSimFrontend/llvm/llvm_common.py +++ /dev/null @@ -1,304 +0,0 @@ -import torch -from torch._inductor.codegen import common -from torch._inductor.virtualized import V -import sympy - -from typing import Callable - -import sympy - -import torch.fx -from torch.utils._sympy.value_ranges import ValueRanges - -from torch._inductor.utils import ( - free_symbol_startswith, - get_sympy_Expr_dtype, - IndentedBuffer, - sympy_subs, - unique, -) - -schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") - -DTYPE_TO_LLVM = { - torch.float32: "float", - torch.float64: "double", - torch.float16: "half", - torch.int64: "i64", - torch.int32: "i32", - torch.int16: "i16", - torch.int8: "i8", - torch.uint8: "i8", - torch.bool: "i8", - torch.bfloat16: "bfloat", -} - -DTYPE_SIZE = { - torch.float32: 4, - torch.float64: 8, - torch.float16: 2, - torch.int64: 8, - torch.int32: 4, - torch.int16: 2, - torch.int8: 1, - torch.uint8: 1, - torch.bool: 1, - torch.bfloat16: 2, -} - -DTYPE_LOWP_FP = [ - 
torch.bfloat16, - torch.float16, -] - -class LLVMKernelArgs(common.KernelArgs): - LLVM_ARGS_IN = 0x01 - LLVM_ARGS_OUT = 0x02 - LLVM_ARGS_INOUT = 0x04 - LLVM_ARGS_VAR = 0x08 - - @staticmethod - def is_llvm_arg_in(value): - return (LLVMKernelArgs.LLVM_ARGS_IN & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - @staticmethod - def is_llvm_arg_out(value): - return (LLVMKernelArgs.LLVM_ARGS_OUT & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - def llvm_argdefs(self, only_args=False): - buffer_types = {x.get_name(): [x.get_dtype(), x.get_numel()] for x in V.graph.buffers} - for name, val in V.graph.graph_inputs.items(): - if isinstance(val, sympy.Expr): - buffer_types[name] = [get_sympy_Expr_dtype(val), 1] - else: - buffer_types[name] = [val.get_dtype(), val.get_numel()] - buffer_types.update( - {name: val.dtype for name, val in V.graph.constants.items()} - ) - - call_args = [] - arg_defs = [] - arg_attributes = {} - for inplaced in unique(self.inplace_buffers.values()): - if self._buffer_is_marked_removed(inplaced): - continue - outer = inplaced.other_names[-1] - inner = inplaced.inner_name - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_INOUT] + buffer_types[outer] - for outer, inner in self.input_buffers.items(): - if outer in self.inplace_buffers: - continue - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_IN] + buffer_types[outer] - for outer, inner in self.output_buffers.items(): - if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner): - continue - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_OUT] + buffer_types[outer] - for outer, inner in self.sizevars.items(): - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_VAR] + 
buffer_types[outer] - return arg_defs, call_args, arg_attributes - -class BaseLLVMKernel(common.Kernel): - newvar_prefix = "%" - name_prefix = "body" - vector_prefix = "vector_body" - suffix = "" - overrides = None - load_format = None - store_format = None - - def __init__(self, args=None): - super().__init__(args) - self.vector_compute = IndentedBuffer() - self.reductions_suffix = IndentedBuffer() - self.cse = common.CSE(self.newvar_prefix, self.suffix, self.name_prefix) - self.vector_cse = common.CSE(self.newvar_prefix, self.suffix, self.vector_prefix) - self.tile_size = None - self.tile_shape = {} - - def load(self, name: str, index: sympy.Expr): - raise NotImplementedError() - - def store_reduction(self, name, index, value): - raise NotImplementedError() - - def store(self, name, index, value, mode=None): - raise NotImplementedError() - - def reduction(self, dtype, src_dtype, reduction_type, value): - raise NotImplementedError() - - def widening(self, args, buf_bounds): - if not args[0] in self.tile_shape or not args[1] in self.tile_shape: - return args, [1, 1] - tile_shape0 = self.tile_shape[args[0]] - tile_shape1 = self.tile_shape[args[1]] - vec_len0 = tile_shape0[0] * tile_shape0[1] - vec_len1 = tile_shape1[0] * tile_shape1[1] - if tile_shape0 != tile_shape1: - temp = list(args) - idx = 0 if tile_shape0[0] != tile_shape1[0] else 1 - if tile_shape0[idx] > tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape1[idx-1]}" for i in range(vec_len0)] - else: - indexes = [f"i32 {i//tile_shape1[idx-1]}" for i in range(vec_len0)] - line = f"shufflevector <{vec_len1} x float> %{args[1]}, <{vec_len1} x float> undef, <{vec_len0} x i32> <{', '.join(indexes)}>" - temp[1] = self.cse.generate(self.compute, line, bounds=buf_bounds) - elif tile_shape0[idx] < tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape0[idx-1]}" for i in range(vec_len1)] - else: - indexes = [f"i32 {i//tile_shape0[idx-1]}" for i in range(vec_len1)] - line = f"shufflevector 
<{vec_len0} x float> %{args[0]}, <{vec_len0} x float> undef, <{vec_len1} x i32> <{', '.join(indexes)}>" - temp[0] = self.cse.generate(self.compute, line, bounds=buf_bounds) - args = tuple(temp) - return args, max(tile_shape0, tile_shape1) - - def __enter__(self): - class CSEProxy: - self.name = "CSEProxy" - - @staticmethod - def __getattr__(name: str) -> Callable[..., common.CSEVariable]: # type: ignore[misc] - def inner(*args, **kwargs): - # TritonTemplateKernel has no current_node - buf_bounds = ValueRanges.unknown() - if hasattr(V.interpreter, "current_node"): - fx_node = V.interpreter.current_node - assert isinstance(self.node_to_bounds, dict) - buf_bounds = self.node_to_bounds.get( - fx_node, ValueRanges.unknown() - ) - - vector_csevar = None - if isinstance(args[0], list): - vector_args = (args[0][0], args[1][0]) - vector_csevar = self.vector_cse.generate( - self.vector_compute, - getattr(parent_handler, "vector_" + name)(*vector_args, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - vector_csevar.update_on_args(name, vector_args, kwargs) - args = (args[0][1], args[1][1]) - if len(args) == 2: - args, tile_shape = self.widening(args, buf_bounds) - elif len(args) == 1: - tile_shape = self.tile_shape[args[0]] - else: - assert(0) # not implemented yet. 
- vec_len = tile_shape[0] * tile_shape[1] - csevar = self.cse.generate( - self.compute, - getattr(parent_handler, name)(*args, tile_size=vec_len, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - self.tile_shape[csevar] = tile_shape - csevar.update_on_args(name, args, kwargs) - if vector_csevar is not None: - return [vector_csevar, csevar] - return csevar - - return inner - - @staticmethod - def indirect_indexing(index_var, size, check=True): - # Skip CSE since this doesn't return an expression - return self.indirect_indexing(index_var, size, check) # type: ignore[attr-defined] - - @staticmethod - def load(name: str, index: sympy.Expr): - if name in self.cse.invalidated_stores: - # A load from an invalidated store requires us to - # keep the actual buffer around - V.kernel.must_keep_buffers.add(name) - if free_symbol_startswith(index, "%"): - return self.indirect_load(name, index) - store_cache = self.cse.store_cache - if name in store_cache: - return store_cache[name] - return self.load(name, index) - - @staticmethod - def store(name, index, value, mode=None): - self.store_buffer_names.add(name) - if mode is None: - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - if name not in V.graph.removed_buffers: - return self.store(name, index, value, mode=mode) - - @staticmethod - def store_reduction(name, index, value): - self.store_buffer_names.add(name) - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - - if name not in V.graph.removed_buffers: - return self.store_reduction(name, index, value) - - @staticmethod - def reduction(dtype, src_dtype, reduction_type, value): - return self.reduction(dtype, src_dtype, reduction_type, value) - - @staticmethod - def bucketize( - values, - offsets_name: str, - offsets_size: sympy.Expr, - 
indexing_dtype: torch.dtype, - right: bool, - ): - """ - [Note: Inductor bucketize op] - - Given values (tensor) and offsets_name (reference to the name of a 1D - tensor), calculate the bucket that each value belongs to. - - e.g. for values [-1, 0, 1, 2, 3, 4, 5, 9], offsets [0, 4, 4, 8], right=True - return = [ 0, 1, 1, 1, 1, 3, 3, 4]. - - When right == False, bucket i refers to range (offsets[i], offsets[i+1]]. - When right == True, bucket i refers to range [offsets[i], offsets[i+1]). - - Offsets must be non-decreasing or the result is undefined. - """ - return self.bucketize( - values, offsets_name, offsets_size, indexing_dtype, right - ) - - super().__enter__() - assert self.overrides - parent_handler = self.overrides(V.get_ops_handler()) - self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) - self.exit_stack.enter_context(V.set_kernel_handler(self)) - return self - - def rename_indexing(self, index) -> sympy.Expr: - # adds the necessary kernel args for index expressions - # and renames variables in index expressions to kernel arg names - if isinstance(index, (list, tuple)): - return [self.rename_indexing(x) for x in index] - index = V.graph.sizevars.simplify(index) - sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name) - replacements = { - x: self.args.size(x) - for x in sorted_symbols - if x.name.startswith("s") or x.name.startswith("ps") - } - return sympy_subs(index, replacements) diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index af101f44..e52d6cff 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -1,20 +1,28 @@ import functools import torch +import os import dataclasses -from torch._inductor.autotune_process import BenchmarkRequest from torch._inductor.autotune_process import TensorMeta +from torch._inductor.codecache import get_hash, write +from PyTorchSimFrontend import extension_config +from Simulator.simulator import 
TOGSimulator from typing import ( Any, Callable, - Dict, Iterable, List, Optional, - Sequence, - TYPE_CHECKING, Union, ) + +# FIXME. Avoid circular import +def hash_prefix(hash_value): + return hash_value[1:12] + +def get_write_path(src_code): + return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(get_hash(src_code.strip()))) + @dataclasses.dataclass class MLIRBenchmarkRequest(): def __init__( @@ -46,16 +54,30 @@ def make_run_fn( ) -> Callable[[], None]: from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile custom_async_compile = CustomAsyncCompile() + + # Check already cached result. + write_path = get_write_path(self.source_code) + key, _ = write(self.source_code, "mlir", specified_dir=write_path) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "togsim_result/0") + if os.path.exists(result_path): + result = TOGSimulator.get_result_from_file(result_path) + def cached_run_fn(*args, **kwargs): + return result + return cached_run_fn + + # Run a candidate code run_method = custom_async_compile.mlir( self.source_code, vectorlane_size=self.extra_args["vector_lane"], loop_size=None, spad_info=self.extra_args["spad_info"], vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"], - origins="Unknown", silent_mode=True) + origins="Unknown", silent_mode=True, + validate=self.extra_args['validate'], autotune=self.extra_args['autotune']) args = [ tensor for tensor in list(input_tensors) + list(output_tensors) ] + # Generate partial function. 
return functools.partial( run_method, diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py index 79e03bd5..178ea987 100644 --- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py @@ -6,8 +6,6 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common BMM_TEMPLATE = r""" @@ -162,51 +160,31 @@ def render(self, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, prologue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): - if template_buffer_node is not None: - self.output_node = template_buffer_node - - # Extract input arguments info - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - W_tensor = empty_strided(W.layout.size, W.layout.stride) - X_tensor = empty_strided(X.layout.size, X.layout.stride) - if len(W_tensor.size()) > 3 or len(W_tensor.size()) == 2: - W_tensor = W_tensor.view([-1, W_tensor.shape[-2], W_tensor.shape[-1]]) - if len(X_tensor.size()) > 3 or len(X_tensor.size()) == 2: - X_tensor = X_tensor.view([-1, X_tensor.shape[-2], X_tensor.shape[-1]]) - B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2] - - W_stride = W_tensor.stride() - X_stride = X_tensor.stride() - - # Select tile size - n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 - TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node) - SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or prologue_nodes else kernel.vector_lane - SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or 
prologue_nodes else kernel.vector_lane - SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + X, W, Y, Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + if tile_info is None: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node)[0] + else: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info TOG_latency = M if TILE_M > M else TILE_M kernel.loop_size = [TOG_latency, TILE_N, TILE_K] - TILE_K = TILE_K // 2 if prologue_nodes else TILE_K # Select template code nr_reduction_nodes = [node for node in epilogue_nodes if node.is_reduction()] if epilogue_nodes is not None else [] if nr_reduction_nodes: - template = BMM_REDUCTION_TEMPLATE - epilogue_dim_aliasing = {"index0":"index0", "index1":"index2", "index2": "index1"} - nr_rdim = 1 + template = BMM_REDUCTION_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index2", "index2": "index1"} + nr_rdim = 1 elif prologue_nodes: - template = BMM_PROLOGUE_TEMPLATE - epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} - nr_rdim = 0 + template = BMM_PROLOGUE_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} + nr_rdim = 0 else: - template = BMM_TEMPLATE - epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} - nr_rdim = 0 + template = BMM_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} + nr_rdim = 0 # Prepare tile descriptors vlane_stride = 1 @@ -323,19 +301,53 @@ def render(self, dram_idx = Y_idx, dram_tile_desc = Y_tile_desc, nr_rdim = nr_rdim, + r_dim_size = M, dim_aliasing = epilogue_dim_aliasing ) code = self._template_from_string(template).render(**kernel.render_options) kernel.add_loop_info([kernel.render_options["M"], 
kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) \ No newline at end of file + def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + + # Extract input arguments info + X, W = self.input_nodes[0], self.input_nodes[1] + Y = self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + W_tensor = empty_strided(W.layout.size, W.layout.stride) + X_tensor = empty_strided(X.layout.size, X.layout.stride) + if len(W_tensor.size()) > 3 or len(W_tensor.size()) == 2: + W_tensor = W_tensor.view([-1, W_tensor.shape[-2], W_tensor.shape[-1]]) + if len(X_tensor.size()) > 3 or len(X_tensor.size()) == 2: + X_tensor = X_tensor.view([-1, X_tensor.shape[-2], X_tensor.shape[-1]]) + B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2] + + W_stride = W_tensor.stride() + X_stride = X_tensor.stride() + + # Select tile size + n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 + n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 + return X,W,Y,Bias,W_tensor,X_tensor,B,M,N,K,n_extra_node, n_prologue_node + + def get_tile_candidates(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + prologue_nodes: Optional[List[IRNode]] = 
None, + **kwargs): + X, W, Y, Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + return self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node) + + def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): + tile_candidates = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node) + for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or n_prologue_node else kernel.vector_lane + SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + TILE_K = TILE_K // 2 if n_prologue_node else TILE_K + tile_candidates[idx] = TILE_M,TILE_N,TILE_K,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py index 3fff9958..dff6b0fd 100644 --- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py +++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py @@ -1,16 +1,46 @@ +import os +import subprocess +import shlex +import re import torch -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.llvm.llvm_caller_codegen import LLVMKernelCallerCodeGen -from PyTorchSimFrontend.mlir.mlir_common import DTYPE_TO_C +from torch._inductor.utils import IndentedBuffer +from torch._inductor.codecache import write_atomic +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs, DTYPE_TO_C -class MLIRKernelCallerCodeGen(LLVMKernelCallerCodeGen): +class MLIRKernelCallerCodeGen(): + """ + Generate C that calls the llvm kernel. 
+ """ def __init__(self, validation, arg_attributes, cycle_sim=False): - super().__init__(validation, arg_attributes) + super().__init__() + self.code = IndentedBuffer() + self.ending = ";" + self.open_bracket = "{" + self.closed_bracket = "}" + self.newline = "\n" + self.kernel_name = "kernel" + self.validation = validation + self.n_arg = len(arg_attributes) + self.arg_attributes = arg_attributes + self.arg_use_count = 1 + self.load_args = {} + self.kernel_start_addr = "" + self.kernel_end_addr = "" self.cycle_sim = cycle_sim + def get_argv_idx(self): + self.arg_use_count += 1 + return self.arg_use_count-1 + def write_header(self): - super().write_header() + self.writeline('#include ') + self.writeline('#include ') + self.writeline("#include ") + if self.validation: + self.writeline("#include ") + self.writeline('#include ') + self.writeline('#include ') global_var_header = "gem5_global_var.h" if self.cycle_sim else "global_var.h" self.writeline(f"#include \"{global_var_header}\"") @@ -42,6 +72,9 @@ def dump_arg(self): self.writeline(f'return -1{self.ending}') self.writeline(self.closed_bracket) + def write_exit(self): + self.writeline(f'return 0{self.ending}') + def generate_kernel_declare(self): # memref to llvm arguments (memref -> ptr, ptr, i64, , ) allocated pointer, aligned pointer, offset, size, stride args_type_p = [f'{DTYPE_TO_C[arg_type[1]]}*, {DTYPE_TO_C[arg_type[1]]}*, int64_t, int64_t, int64_t' for (_, arg_type) in self.arg_attributes] @@ -86,4 +119,142 @@ def generate_main(self): self.dump_arg() self.write_exit() - self.writeline(self.closed_bracket) \ No newline at end of file + self.writeline(self.closed_bracket) + + def generate_load_dump_fn(self): + self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000000){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + 
self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + + def writeline(self, line): + self.code.writeline(line) + + def generate_wrapper_file(self, path, name): + self.dump_path = path + + self.write_header() + self.generate_kernel_declare() + + if self.validation: + self.generate_load_dump_fn() + self.generate_main() + + write_path = os.path.join(path, name+".c",) + write_atomic(write_path, self.code.getvalue()) + return + + def add_extention(self, name, extension): + return name + "." 
+ extension + + def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): + main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) + main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) + kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) + kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) + + main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' + kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' + + target = os.path.join(write_path, binary_name) + link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' + + main_compile_cmd = shlex.split(main_compile) + kernel_compile_cmd = shlex.split(kernel_compile) + link_cmd = shlex.split(link) + + try: + subprocess.check_call(main_compile_cmd) + subprocess.check_call(kernel_compile_cmd) + subprocess.check_call(link_cmd) + except subprocess.CalledProcessError as e: + print("Command failed with exit code", e.returncode) + print("Error output:", e.output) + assert(0) + + def parse_stack_sizes(self, file_path, vlenb=256): + with open(file_path, 'r') as f: + stack_sizes_data = f.readlines() + + in_proc = False + stack_base = None + dynamic_expr = None + max_offset = 0 + + for line in stack_sizes_data: + line = line.strip() + if line.startswith(".cfi_startproc"): + in_proc = True + continue + elif line.startswith(".cfi_endproc") and in_proc: + if dynamic_expr: + total_stack = eval(dynamic_expr, {"vlenb": vlenb}) + return total_stack + elif stack_base: + return stack_base + else: + return max_offset + + # Skip outer function + if not in_proc: + continue + + if line.startswith(".cfi_def_cfa_offset"): + stack_base = int(line.split()[-1]) + + if ".cfi_escape" in line and "#" in line: + comment = line.split("#")[-1].strip() + m = re.search(r"sp \+ 
(\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) + if m: + base, scale = int(m.group(1)), int(m.group(2)) + dynamic_expr = f"{base} + {scale} * vlenb" + + def get_spad_size(self, binary_path): + cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Readelf error: {result.stderr}") + + output = result.stdout + spad_start = None + spad_end = None + for line in output.splitlines(): + if '.spad' in line and 'SECTION' in line: + parts = line.split() + spad_start = int(parts[1], 16) + elif 'spad_end' in line: + parts = line.split() + spad_end = int(parts[1], 16) + + if spad_start is None or spad_end is None: + return 0 + spad_size = spad_end - spad_start + return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 21d2868e..c24260ce 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -2,26 +2,29 @@ import sympy import re import os +import math from functools import reduce from operator import mul import torch from collections import defaultdict from concurrent.futures import ThreadPoolExecutor +from torch._dynamo.testing import rand_strided +from torch._inductor.autotune_process import TensorMeta from torch._dynamo.utils import dynamo_timed from torch._inductor.codegen import cpp, wrapper, common, memory_planning from torch._inductor.virtualized import V, _ops as ops -from torch._inductor.codecache import write_atomic, write +from torch._inductor.codecache import write_atomic from torch._inductor.utils import ( IndentedBuffer, is_welford_reduction, sympy_product ) from torch.utils._sympy.functions import ModularIndexing, FloorDiv -import PyTorchSimFrontend.extension_codecache as extension_codecache - +from PyTorchSimFrontend import extension_codecache from 
PyTorchSimFrontend import extension_config from . import mlir_common from .mlir_common import LoopLevel, LoopNest +from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest def reduction_init(reduction_type, dtype): if dtype in cpp.DTYPE_LOWP_FP: @@ -96,8 +99,8 @@ def write_header(self): from torch import device, empty, empty_strided from {extension_codecache.__name__} import CustomAsyncCompile - from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_BACKENDSIM_EAGER_MODE - from Simulator.simulator import BackendSimulator + from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE + from Simulator.simulator import TOGSimulator from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer from torch._inductor.select_algorithm import extern_kernels @@ -119,7 +122,7 @@ def sram_plan_prefix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Alloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_alloc(buffer_name, [start, end]) + TOGSimulator.sram_alloc(buffer_name, [start, end]) def sram_plan_postfix(buffer_name, buffer): if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN): @@ -128,7 +131,7 @@ def sram_plan_postfix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Dealloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_dealloc(buffer_name, [start, end]) + TOGSimulator.sram_dealloc(buffer_name, [start, end]) def host2device_memcopy(buffer): pass @@ -421,6 +424,10 @@ def exp(operand, *args, var_info=None, **kwargs): shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f'math.exp %{operand} : {shape}', [tile_size, dtype] + @staticmethod + def exp2(operand, *args, var_info=None, **kwargs): + raise NotImplementedError() + @staticmethod def erf(operand, *args, var_info=None, **kwargs): # Check scalar @@ -1076,8 +1083,8 @@ def 
load(self, name: str, index: sympy.Expr): # Extract sram info local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, buffer=apply_buffer) - vlane_split_axis = local_tile_desc.vlane_split_axis - vlane_stride = local_tile_desc.vlane_stride + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride tile_numel_per_lane = local_tile_desc.get_numel_per_lane() tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) tile_stride = local_tile_desc.get_tile_stride() @@ -1123,8 +1130,8 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): # Prepare dma instruction local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index) - vlane_split_axis = local_tile_desc.vlane_split_axis - vlane_stride = local_tile_desc.vlane_stride + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) @@ -1271,8 +1278,8 @@ def store_reduction(self, name, index, value): # Tile is always reuduced in inner loop local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix) - vlane_split_axis = local_tile_desc.vlane_split_axis - vlane_stride = local_tile_desc.vlane_stride + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) @@ -1288,7 +1295,7 @@ def store_reduction(self, name, index, value): # mean reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1) divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(reduction_numel)} : f32") - if self.buffer_types[name][1] > 1: + if compute_vec_size > 1: divider_vec = 
self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>") else: divider_vec = divider @@ -1354,15 +1361,15 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): self.register_var_info(div_vec, [compute_vec_size, "index"]) self.register_var_info(mod_vec, [compute_vec_size, "index"]) dim = ops.modular(ops.div(vector_index, div_vec), mod_vec) - if idx == tile_desc.vlane_split_axis: # Need to add vector lane offset - offset = tile_desc.vlane_stride #* strides[idx] - outer_sz = tile_size[idx] // tile_desc.vlane_stride + if idx == tile_desc.vmap.vlane_split_axis: # Need to add vector lane offset + offset = tile_desc.vmap.vlane_stride #* strides[idx] + outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride nr_vector_lane = self.get_const_cse(self.vector_lane, "index") nr_vector_lane_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{nr_vector_lane} : index to vector<{compute_vec_size}xindex>") self.register_var_info(nr_vector_lane_vec, [compute_vec_size, "index"]) - vlane_stride_coeff = self.get_const_cse(tile_desc.vlane_stride, "index") + vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index") vlane_outer_coeff = self.get_const_cse(outer_sz, "index") vlane_stride_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_stride_coeff} : index to vector<{compute_vec_size}xindex>") vlane_outer_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_outer_coeff} : index to vector<{compute_vec_size}xindex>") @@ -1432,9 +1439,9 @@ def index_expr(self, index, dtype): # FIXME. 
This is a temporary solution to get tile stride of the reduction case tile_desc = mlir_common.MLIRMultiDimTile( base_tile_desc.get_tile_size(), - base_tile_desc.vector_lane, - base_tile_desc.vlane_split_axis, - base_tile_desc.vlane_stride, + base_tile_desc.vmap.vector_lane, + base_tile_desc.vmap.vlane_split_axis, + base_tile_desc.vmap.vlane_stride, base_tile_desc.get_compute_vec_size(), ) axis_order = list(range(len(tile_desc.get_tile_size()))) @@ -1536,83 +1543,148 @@ def codegen_loops(self): def make_choices(self, nodes, kernel_name): choices = [] initial_tile_size = self.kernel_group.tile_desc.get_tile_size() - previous_ranges = self.ranges - prevent_infinite_loop = 0 - if len(initial_tile_size) < 2: - return choices # Can't autotune for 1-D tile size + prev_ranges = self.ranges + prev_tail_threshold = self.kernel_group.tile_desc.tail_ratio_threshold + + # Allow more tail ratio during autotuning + self.kernel_group.tile_desc.tail_ratio_threshold = 0.3 + + if prev_ranges == [1] or len(prev_ranges) == 0: + return choices + #if len(initial_tile_size) < 2: + # return choices # Can't autotune for 1-D tile size + for vlane_stride in [2, 4, 8]: - os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = str(vlane_stride) - previous_tile_size = initial_tile_size - increase_dim = -2 # increase the first dimension - while previous_tile_size[increase_dim] * 2 <= previous_ranges[increase_dim] and previous_tile_size[increase_dim] <= 2 ** 13 and prevent_infinite_loop < 10: - incrase_dim = -1 # only increase the last dimension - prevent_infinite_loop += 1 - while previous_tile_size[incrase_dim] * 2 <= previous_ranges[incrase_dim] and previous_tile_size[incrase_dim] <= 2 ** 13: + self.kernel_group.tile_desc.set_tile_size(initial_tile_size) + self.kernel_group.tile_desc.vmap.vlane_stride = vlane_stride + prevent_infinite_loop = 0 + + # Get the dimension to increase + candidate_axes = [ + axis for axis, constr in enumerate(self.kernel_group.tile_desc.tile_constraint) + if not constr.fixed + ] 
+ search_space = set() + + # Try initial tile size + self.reset(None) + src_code = super().codegen_nodes(nodes, kernel_name) + current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) + search_space.add(current_tile_sz) + + print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") + self._prepare_simulator_headers(src_code) + bench_runner = self.run_bench(nodes, kernel_name, src_code) + choices.append((bench_runner, src_code, current_tile_sz, self.kernel_group.tile_desc.vmap.vlane_stride)) + + while prevent_infinite_loop < 10 and candidate_axes: + for axis in list(candidate_axes): + prev_tile_sz = self.kernel_group.tile_desc.get_tile_size() + + # If tile size is maximized for this axis, remove from candidate axes + if prev_tile_sz[axis] >= prev_ranges[axis] * 2 or prev_tile_sz[axis] >= 2 ** 13: + candidate_axes.remove(axis) + self.reset(None) + continue + + # Try increase tile size for this axis + try: + self.kernel_group.tile_desc.scale_tile_dim(axis, prev_ranges[axis], 2) + except extension_codecache.TileSizeError as e: + # Failed to find proper tile size + candidate_axes.remove(axis) + self.reset(None) + continue + + self.reset(None) src_code = super().codegen_nodes(nodes, kernel_name) - if self.stop_autotune: - print(f"[Auto-tune] Skipping autotuning due to enough tile size: {self.kernel_group.tile_desc.get_tile_size()}") - break - print(f"[Auto-tune] Trying tile size: {self.kernel_group.tile_desc.get_tile_size()}, vlane_stride: {vlane_stride}") - previous_tile_size = self.kernel_group.tile_desc.get_tile_size() + current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) + + # FIXME. How to intergrate this constraint to tile system? 
+ pad = self.kernel_group.tile_desc.vmap.get_used_vlane(current_tile_sz) * self.kernel_group.tile_desc.vmap.vlane_stride + vlane_size = current_tile_sz[self.kernel_group.tile_desc.vmap.vlane_split_axis] + if vlane_size > pad and vlane_size % pad: + prevent_infinite_loop += 1 + continue + + # If tile size is converged for this axis, remove from candidate axes + if current_tile_sz in search_space: + candidate_axes.remove(axis) + continue + + # Add this choice + search_space.add(current_tile_sz) + print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") self._prepare_simulator_headers(src_code) bench_runner = self.run_bench(nodes, kernel_name, src_code) - choices.append((bench_runner, src_code, self.kernel_group)) - self.reset(f"tile_size_{incrase_dim}") - previous_tile_size[incrase_dim] = initial_tile_size[incrase_dim] - self.kernel_group.tile_desc.set_tile_size(previous_tile_size) - self.reset(f"tile_size_{increase_dim}") - self.reset("vlane_stride") + choices.append((bench_runner, src_code, self.kernel_group.tile_desc.get_tile_size(), self.kernel_group.tile_desc.vmap.vlane_stride)) + prevent_infinite_loop += 1 + self.kernel_group.tile_desc.prev_tail_threshold = prev_tail_threshold return choices - def autotune(self, nodes, kernel_name): + def autotune(self, *args): def get_cycle(choice): - bench_runner, src_code, kernel_group = choice + bench_runner = choice[0] for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY): # TODO: make simple try: - # bench_runner = self.run_bench(nodes, kernel_name, src_code) - if int(os.environ.get('BACKENDSIM_DRYRUN', default=False)): - _, _, out = bench_runner(autotune=1) - else: - out = bench_runner(validate=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE) + out = bench_runner() return out[-1] except (extension_codecache.SpadOverflowError, RuntimeError) as e: return float("inf") - #if 
isinstance(e, RuntimeError) and str(e) != "STACK_OVERFLOW": - # print(f"Benchmark[trial-{n_try}] failed with unexpected error: {e}") - # return float("inf") - #print(f"Benchmark failed due to spad overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}") - #self.kernel_group = kernel_group # Reset to the original tile desc - #self.reset("spad_overflow") - #src_code = super().codegen_nodes(nodes, kernel_name) - #bench_runner = self.run_bench(nodes, kernel_name, src_code) - #kernel_group = self.kernel_group - #self._prepare_simulator_headers(src_code) return float("inf") # Exceeded maximum number of autotuning attempts - - choices = self.make_choices(nodes, kernel_name) + choices = self.make_choices(*args) if len(choices) == 0: # can't autotune - return None + return [None, None] with ThreadPoolExecutor(max_workers=8) as executor: results = list(executor.map(get_cycle, choices)) max_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") - print(f"[Auto-tune] Optimal tile size: {choices[max_idx][2].tile_desc.get_tile_size()}, vlane_stride: {choices[max_idx][2].tile_desc.vlane_stride}, cycles: {results[max_idx]}") - optimal_src_code = choices[max_idx][1] - return optimal_src_code + self._log_autotune_result(choices[max_idx], results[max_idx]) + optimal_src_code, loop_size = choices[max_idx][1], choices[max_idx][-1] + return optimal_src_code, loop_size + + def run_bench(self, nodes, kernel_name, src_code): + _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() + input_call_args = tuple(self.args.input_buffers.keys()) + output_call_args = tuple(self.args.output_buffers.keys()) + full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) + full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) + + bmreq = MLIRBenchmarkRequest( + kernel_name=kernel_name, + input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), + 
output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), + extra_args={ + "vector_lane" : self.vector_lane, + "spad_info": self.spad_info, + "vlen" : self.vlen, + "arg_attributes" : arg_attributes, + "validate" : extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + "autotune" : True, + }, + source_code=src_code, + ) + dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] + dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] + return bmreq.make_run_fn(dummy_inputs, dummy_outputs) + + def _log_autotune_result(self, best_choice, best_cycle): + print( + f"[Auto-tune] Optimal tile size: {list(best_choice[2])}, " + f"vlane_stride: {best_choice[3]}, " + f"cycles: {best_cycle}" + ) def codegen_nodes(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) - if not extension_config.CONFIG_AUTOTUNE or extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: - return src_code - else: - optimal_src_code = self.autotune(nodes, kernel_name) - if optimal_src_code: + if extension_config.CONFIG_AUTOTUNE and extension_config.CONFIG_TORCHSIM_TIMING_MODE: + optimal_src_code = self.autotune(nodes, kernel_name)[0] + if optimal_src_code is not None: return optimal_src_code - else: - return src_code + return src_code def _prepare_simulator_headers(self, src_code): write_path = extension_codecache.get_write_path(src_code) @@ -1664,78 +1736,73 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims) - if kg_tile_desc.vlane_split_axis in local_dims: - local_vlane_split_axis = local_dims.index(kg_tile_desc.vlane_split_axis) + if kg_tile_desc.vmap.vlane_split_axis in local_dims: + local_vlane_split_axis = 
local_dims.index(kg_tile_desc.vmap.vlane_split_axis) else: local_vlane_split_axis = max(len(local_dims) - 1, 0) # Case 0. Tile is 0-D scalar if len(local_dims) == 0: if not store_reduction: - local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vlane_stride]) # Force it to use vector instruction. - local_tile_desc.vlane_split_axis = local_vlane_split_axis # last axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vmap.vlane_stride]) # Force it to use vector instruction. + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis # last axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: local_tile_desc.set_tile_size([1]) - local_tile_desc.vlane_split_axis = 0 - local_tile_desc.vlane_stride = 1 + local_tile_desc.vmap.vlane_split_axis = 0 + local_tile_desc.vmap.vlane_stride = 1 dram_stride = [0] # Edge case # Case 1. Tile is 1-D vector type elif len(local_dims) == 1 and len(local_dims) <= self.reduction_depth: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(local_dims[0])]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 2. Tile is 1-D vector type with reduction elif len(local_dims) == 1 and len(local_dims) == self.reduction_depth + 1: local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(local_dims[0])]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis + 1 - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + 1 + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 3. 
Tile is 2-D tile elif len(local_dims) == 2: is_reduction = self.reduction_depth == 1 and not store_reduction if is_reduction: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [1, 0]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 3. Tile is 3-D tile elif len(local_dims) == 3: is_reduction = self.reduction_depth < 3 and not store_reduction if is_reduction: axis_order = [1, 2, 0] if self.get_nr_rdim()==1 else [2, 1, 0] local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], axis_order) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 4. Tile is 4-D tile (e.g., Convolution epilogue) elif len(local_dims) == 4: is_reduction = self.reduction_depth < 3 and not store_reduction if is_reduction: raise NotImplementedError("Currently not implemented... 
;)") local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: raise NotImplementedError("Currently not implemented... ;)") if len(implicit_local_dims)!=0 and len(local_dims) != len(implicit_local_dims) and self.is_modular_indexing(index): - tile_size = local_tile_desc.get_tile_size() - new_tile_size = [] - new_vlane_split_axis = local_tile_desc.vlane_split_axis - implicit_dim_size = list(kg_tile_desc.implicit_dim_size.values()) - for i, target_dim_size in enumerate(implicit_dim_size): - new_tile_size += [1]*(len(target_dim_size)-1) + tile_size[i:i+1] - if local_tile_desc.vlane_split_axis >= i: - new_vlane_split_axis += len(target_dim_size)-1 - # Update - local_tile_desc.set_tile_size(new_tile_size) - local_tile_desc.vlane_split_axis = new_vlane_split_axis + for axis_constraints in self.kernel_group.tile_desc.implicit_dim_size.values(): + if len(axis_constraints) <= 1: + continue + sorted_constraints = sorted(axis_constraints, key=lambda c: int(c.args[1])) + for constraint in sorted_constraints[1:]: + index = index.replace(constraint.original_expr, 0) # Calculate dram stride dram_stride = [0] * local_tile_desc.get_nr_dim() @@ -1780,6 +1847,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe new_tile_sizes = list(self.kernel_group.tile_desc.get_tile_size()) new_tile_sizes[dim_idx] = new_size self.kernel_group.tile_desc.set_tile_size(new_tile_sizes) + self.kernel_group.tile_desc.tile_constraint[dim_idx].fixed = True # Send recompile signal self.reset("recompile") diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 73996351..c655dde3 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ 
b/PyTorchSimFrontend/mlir/mlir_common.py @@ -1,19 +1,18 @@ import dataclasses import math +from dataclasses import dataclass from typing import Dict from typing import List from collections import defaultdict from functools import reduce from operator import mul import torch -from torch._dynamo.testing import rand_strided -from torch._inductor.autotune_process import TensorMeta from torch._inductor.codegen import common from torch._inductor.codegen import cpp from torch._inductor.virtualized import V from torch._inductor.ir import MultiOutputLayout from torch._inductor.dependencies import MemoryDep, StarDep, WeakDep -from torch.utils._sympy.functions import ModularIndexing +from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Mod import sympy import contextlib @@ -32,7 +31,7 @@ unique, ) from PyTorchSimFrontend import extension_config -from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest +from PyTorchSimFrontend import extension_codecache schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") DTYPE_TO_MLIR = { @@ -209,169 +208,72 @@ def set_info(outer, inner, arg_type): set_info(outer, inner, self.MLIR_ARGS_VAR) return arg_defs, call_args, arg_attributes, buffer_types -class MLIRMultiDimTile(): - def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None, vec_size=None): - self.name = "" - self._tile_size = list(tile_size) - self._tile_stride = None - self.tile_axis_order = list(range(len(tile_size))) - self.vec_size = vec_size - self.update_tile_stride() - - # Vector lane mapping config +class VectorLaneMapping(): + def __init__(self, vector_lane: int, forced_vec_size: int, vlane_split_axis: int, vlane_stride: int): self.vector_lane = vector_lane self.vlane_split_axis = vlane_split_axis self.vlane_stride = vlane_stride - self.implicit_dim_size = None - self.nr_rdim = 0 - - # Dram offset - self.offset = sympy.Integer(0) - - def set_name(self, name: str): - self.name = name - - def 
set_tile_size(self, tile_size, tile_axis_order=None): - self._tile_size = tile_size - if tile_axis_order is None: - self.tile_axis_order = list(range(len(tile_size))) - else: - self.tile_axis_order = tile_axis_order - self.update_tile_stride() - - def set_tile_size_stride(self, tile_size, tile_stride): - self._tile_size = tile_size - self._tile_stride = tile_stride - - def get_name(self) -> str: - return self.name - - def get_tile_size(self): - return self._tile_size - - def get_numel(self): - """ - Return size of multi-dimensional tile - """ - size = 1 - for dim_size in self._tile_size: - size *= dim_size - return size - - def get_numel_per_lane(self): - tile_size_per_lane = self.get_tile_size_per_lane() - size = 1 - for dim_size in tile_size_per_lane: - size *= dim_size - return size - - def update_tile_stride(self): - strides = [1] * len(self._tile_size) - init = 1 + self.forced_vec_size = forced_vec_size - original_indices = list(range(len(self.tile_axis_order))) - sorted_pairs = sorted( - zip(self.tile_axis_order, self._tile_size, original_indices), - key=lambda x: x[0], reverse=True + def get_used_vlane(self, tile_size: list[int]): + return min( + math.ceil(tile_size[self.vlane_split_axis] / self.vlane_stride), + self.vector_lane ) - for _, size, original_indices in sorted_pairs: - strides[original_indices] = init - init *= size - self._tile_stride = strides - def get_tile_stride(self): - return self._tile_stride + def get_tile_size_per_lane(self, tile_size: list[int]): + per_lane = tile_size.copy() + used = self.get_used_vlane(tile_size) + if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(per_lane): + raise AssertionError("Not allowed split_axis") + per_lane[self.vlane_split_axis] = math.ceil(per_lane[self.vlane_split_axis] / used) + return per_lane - def get_tile_stride_per_lane(self): - tile_stride = list(self.get_tile_stride()) # original strides - tile_size = list(self.get_tile_size()) # original tile size - split_axis = self.vlane_split_axis 
+ def get_numel_per_lane(self, tile_size: list[int]): + return math.prod(self.get_tile_size_per_lane(tile_size)) - tile_size_per_lane = self.get_tile_size_per_lane() - coeff = tile_size[split_axis]//tile_size_per_lane[split_axis] + def get_tile_stride_per_lane(self, tile_size: list[int], tile_stride: list[int]): + tile_stride = tile_stride.copy() # original strides + get_tile_size_per_lane = self.get_tile_size_per_lane(tile_size) + coeff = tile_size[self.vlane_split_axis]//get_tile_size_per_lane[self.vlane_split_axis] # Propagate stride according to per-lane tile size for i in range(len(tile_stride)): - if tile_stride[i] > tile_stride[split_axis]: + if tile_stride[i] > tile_stride[self.vlane_split_axis]: tile_stride[i] = tile_stride[i] // coeff return tile_stride - def get_tile_size_per_lane(self): - tile_size_per_lane = list(self._tile_size) - if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(tile_size_per_lane): - raise AssertionError("Not allowed split_axis") - used_vlane = self.get_used_vlane() - tile_size_per_lane[self.vlane_split_axis] = \ - self.div_round_up(tile_size_per_lane[self.vlane_split_axis], used_vlane) - return tile_size_per_lane - - def get_nr_dim(self): - """ - Return number of dimensions - """ - return len(self._tile_size) - - def get_dim_size(self, index): - if isinstance(index, int): - return self._tile_size[index] - elif "index" in str(index): - return self._tile_size[int(str(index)[5:])] - raise NotImplementedError("Unsupported format of index") - - def get_mlir_shape(self, dtype): - str_tile_size = [str(dim) for dim in self._tile_size] - shape = "x".join(str_tile_size) - return f"memref<{shape}x{dtype}, 1>" - - def get_mlir_vshape(self, mlir_dtype): - return f"vector<{self.get_compute_vec_size()}x{mlir_dtype}>" if self.get_compute_vec_size() > 1 else f"{mlir_dtype}" - - def get_used_vlane(self): - """ - Return number of used vector lane - """ - if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(self._tile_size): - raise 
AssertionError("Not allowed split_axis") - return min(self.div_round_up(self._tile_size[self.vlane_split_axis], self.vlane_stride), self.vector_lane) - - def get_vlane_stride(self): - return self.vlane_stride - - def get_compute_vec_size(self): - # Granule size used in compute loop - if self.vec_size is not None: - return self.vec_size - if self.nr_rdim: - assert self.nr_rdim!=0 - val = self.get_numel_per_lane() // self.get_reduction_numel() - if self.get_numel_per_lane() >= val * 8: - return val*8 - elif self.get_numel_per_lane() >= val * 4: - return val*4 - elif self.get_numel_per_lane() >= val * 2: - return val*2 + def get_compute_vec_size(self, tile_size: list[int], reduction_numel: int, nr_rdim: int) -> int: + if self.forced_vec_size is not None: + return self.forced_vec_size + + per_lane = self.get_numel_per_lane(tile_size) + stride = self.vlane_stride + if nr_rdim: + val = per_lane // max(reduction_numel, 1) + for mult in [8, 4, 2]: + if per_lane >= val * mult: + return val * mult return val - if (self.get_numel_per_lane() // self.vlane_stride) >= 8: - return self.vlane_stride * 8 - if (self.get_numel_per_lane() // self.vlane_stride) >= 4: - return self.vlane_stride * 4 - if (self.get_numel_per_lane() // self.vlane_stride) >= 2: - return self.vlane_stride * 2 - return self.vlane_stride + for mult in [8, 4, 2]: + if (per_lane // stride) >= mult: + return stride * mult + return stride - @staticmethod - def div_round_up(size, round_val): - return (size + round_val - 1) // round_val +class TileAdjustMixin(): + def __init__(self): + self.tail_ratio_threshold = 0.01 def apply_divisor(self, axis: int, divisor: int, mode: str = "split"): - # Apply divisor to tile size at given axis. - # This method based on axis order. 
+ """Split or pad a given axis of the tile.""" old_size = self._tile_size[axis] - if divisor == 1: + if divisor <= 1: return - padded = self.div_round_up(old_size, divisor) * divisor - outer = self.div_round_up(old_size, divisor) - inner = divisor + + padded = math.ceil(old_size / divisor) * divisor + outer = math.ceil(old_size / divisor) + inner = divisor + if mode == "pad": self._tile_size[axis] = padded self.update_tile_stride() @@ -382,54 +284,277 @@ def apply_divisor(self, axis: int, divisor: int, mode: str = "split"): new_sizes.insert(axis + 1, inner) self._tile_size = new_sizes - # Update tile_axis_order old_order_val = self.tile_axis_order[axis] new_order = list(self.tile_axis_order) new_order.insert(axis + 1, old_order_val + 0.1) - sorted_pairs = sorted( - zip(range(len(new_order)), new_order), - key=lambda x: x[1] - ) - self.tile_axis_order = [idx for idx, _ in sorted_pairs] + self.tile_axis_order = [idx for idx, _ in sorted( + zip(range(len(new_order)), new_order), key=lambda x: x[1] + )] self.update_tile_stride() - if self.vlane_split_axis == axis: - self.vlane_split_axis = axis - elif self.vlane_split_axis > axis: - self.vlane_split_axis += 1 + # Adjust split axis for vmap + if self.vmap.vlane_split_axis > axis: + self.vmap.vlane_split_axis += 1 return - else: - raise ValueError(f"Unknown mode: {mode}. Supported modes are 'pad' and 'split'.") - def get_reduction_numel(self): - return reduce(mul, self.get_tile_size()[-1*self.nr_rdim:], 1) + raise ValueError(f"Unknown mode: {mode}. 
Supported: 'pad', 'split'.") - def is_dim_dividable(self, dim_sizes): + def is_dim_dividable(self, dim_sizes: list[int]) -> bool: if len(dim_sizes) != len(self._tile_size): - raise ValueError("dim_sizes must match the tile size dimensions.") - dim_sizes_cpy = [int(d) for d in dim_sizes] - remain = dim_sizes_cpy[self.vlane_split_axis] % self.vlane_stride + raise ValueError("dim_sizes must match the tile size dimensions") + + dim_sizes_cpy = list(dim_sizes) + axis, stride = self.vmap.vlane_split_axis, self.vmap.vlane_stride + remain = dim_sizes_cpy[axis] % stride if remain: - dim_sizes_cpy[self.vlane_split_axis] += self.vlane_stride - remain + dim_sizes_cpy[axis] += stride - remain + return all(d % t == 0 for d, t in zip(dim_sizes_cpy, self._tile_size)) - def adjust_tile_to_divisible(self, dim_sizes): + def adjust_tile_to_divisible(self, dim_sizes: list[int]) -> list[int]: + """Adjust current tile to be divisible by given dimensions.""" + if len(dim_sizes) != len(self._tile_size): + raise ValueError("dim_sizes must match the tile size dimensions") + def _adjust_one(dim_size, tile_size): for candidate in range(tile_size, 0, -1): if dim_size % candidate == 0: return candidate return 1 - if len(dim_sizes) != len(self._tile_size): - raise ValueError("dim_sizes must match the tile size dimensions.") candidate_tile_size = [_adjust_one(d, t) for d, t in zip(dim_sizes, self._tile_size)] - # FIXME. Is this the only solution? 
- # Round up - remain = candidate_tile_size[self.vlane_split_axis] % self.vlane_stride + for i in range(len(candidate_tile_size)): + self.tile_constraint[i].must_divide_dim = True + + axis, stride = self.vmap.vlane_split_axis, self.vmap.vlane_stride + remain = candidate_tile_size[axis] % stride + if remain: - candidate_tile_size[self.vlane_split_axis] += self.vlane_stride - remain + candidate_tile_size[axis] += stride - remain + self.tile_constraint[axis].must_divide_dim = False return candidate_tile_size + def scale_tile_dim(self, axis, dim_sz, scale_factor=2): + axis_constrinat = self.tile_constraint[axis] + current_sz = self._tile_size[axis] + new_sz = axis_constrinat.adjust(current_sz, int(current_sz * scale_factor), dim_sz) + self._tile_size[axis] = new_sz + self.update_tile_stride() + return current_sz != new_sz + + def decrease_tile_size(self, dim_size): + tile_size = self._tile_size + vlane_split_axis, vlane_stride, vector_lane = self.vmap.vlane_split_axis, self.vmap.vlane_stride, self.vmap.vector_lane + tile_size = list(tile_size) + + # Decrease vlane_split_axis when it is too large + if tile_size[vlane_split_axis] > 2 * vlane_stride * vector_lane: + if self.scale_tile_dim(vlane_split_axis, dim_size[vlane_split_axis], scale_factor=0.5): + return + + for i in range(len(tile_size)): + if i == vlane_split_axis: + continue + if tile_size[i] > 1: + if self.scale_tile_dim(i, dim_size[i], scale_factor=0.5): + return + + # Decrease vlane_split_axis at the end to maximize the vlane usage + self.scale_tile_dim(vlane_split_axis, dim_size[vlane_split_axis], scale_factor=0.5) + return + + def trim_large_tail(self, ranges: list[int]): + for i, (dim_range, tile_range) in enumerate(zip(ranges, self._tile_size)): + ALPHA = 1.0 + BETA = 0.5 + constraint = self.tile_constraint[i] + if constraint.fixed: + continue + elif constraint.must_divide_dim: + BETA = 0 + + padding_ratio = TileAdjustMixin.get_padding_ratio(tile_range, dim_range) + if padding_ratio < 
self.tail_ratio_threshold: + continue + best_tile = tile_range + best_cost = ( + ALPHA * padding_ratio + + BETA * (dim_range / tile_range) + ) + + min_tile = 1 + for candidate in range(tile_range - 1, min_tile - 1, -1): + new_candidate = constraint.adjust(tile_range, candidate, dim_range) + ratio = TileAdjustMixin.get_padding_ratio(new_candidate, dim_range) + iter_penalty = (dim_range / new_candidate) + + cost = ALPHA * ratio + BETA * iter_penalty + if cost < best_cost: + best_tile, best_cost = new_candidate, cost + self._tile_size[i] = best_tile + + def select_vlane_axis(self): + best_vlane_split_axis = 0 + best_used_vlane = math.ceil(self._tile_size[0] / self.vmap.vlane_stride) + for i, dim in enumerate(self._tile_size[1:len(self._tile_size)-self.nr_rdim]): + used_vlane = math.ceil(dim / self.vmap.vlane_stride) + if used_vlane > best_used_vlane: + best_used_vlane = used_vlane + best_vlane_split_axis = i+1 + self.vmap.vlane_split_axis = best_vlane_split_axis + + def pad_vlane_tile(self): + # FIXME. this doesn't follow tile constraints... 
+ vlane_split_axis, vlane_stride, vector_lane = self.vmap.vlane_split_axis, self.vmap.vlane_stride, self.vmap.vector_lane + used_vlane = min(math.ceil(self._tile_size[vlane_split_axis] / vlane_stride), vector_lane) + padded_size = used_vlane * vlane_stride + self._tile_size[vlane_split_axis] = math.ceil(self._tile_size[vlane_split_axis] / padded_size) * padded_size + + def apply_constraints(self, constraints, ranges): + for idx, (axis_constraints, axis_size) in enumerate(zip(constraints.values(), ranges)): + for const in axis_constraints: + if const.args[1] == 1: + continue + divider = int(const.args[1]) + + if not self.tile_constraint[idx].fixed: + self.tile_constraint[idx].fixed = True + self._tile_size[idx] = divider + elif self.tile_constraint[idx].fixed and self._tile_size[idx] > divider: + self._tile_size[idx] = divider + self.update_tile_stride() + + @staticmethod + def init_tile_size(ranges, vlane_stride, vector_lane): + nr_dim = len(ranges) + tile_size = [1] * nr_dim + if len(tile_size) == 2: + tile_size[-1] = vlane_stride * vector_lane + tile_size[-2] = 2 * vector_lane + elif len(tile_size) == 0: # Scalar + tile_size = [1] + ranges = [1] + elif len(tile_size) == 1 and ranges[0]==1: + tile_size[0] = 1 + elif len(tile_size) == 1: + tile_size[0] = 2 * vlane_stride * vector_lane + elif len(tile_size) == 3: + tile_size[-1] = vector_lane + tile_size[-2] = 4 * vector_lane + tile_size[-3] = 2 + elif len(tile_size) == 4: + tile_size[-1] = vector_lane + tile_size[-2] = 4 * vector_lane + tile_size[-3] = 2 + tile_size[-4] = 1 + else: + raise NotImplementedError("dummy tile size fail!") + return tile_size + + @staticmethod + def get_padding_ratio(tile_range: int, dim_range: int) -> float: + if tile_range <= 0 or dim_range <= 0: + raise ValueError("tile_range and dim_range must be positive integers") + tail = dim_range % tile_range + padding = (tile_range - tail) % tile_range + return float(padding / dim_range) + +@dataclass +class TileConstraint: + multiple_of: int = 
1 + must_divide_dim: bool = False + fixed: bool = False + + def adjust(self, old: int, new: int, dim: int) -> int: + if self.fixed: + return old # Fixed tile size + + tail = new % self.multiple_of + new -= tail + if not self.must_divide_dim: + return max(new, self.multiple_of) + + while new > 0: + if dim % new == 0: + return new + new -= self.multiple_of + raise extension_codecache.TileSizeError("Cannot find suitable tile size under the given constraints.") + +class MLIRMultiDimTile(TileAdjustMixin): + def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None, forced_vec_size=None): + super().__init__() + self.name = "" + self._tile_size = list(tile_size) + self._tile_stride = None + self.tile_constraint = [TileConstraint(vlane_stride) for _ in tile_size] + self.tile_axis_order = list(range(len(tile_size))) + self.update_tile_stride() + + # Vector lane mapping config + self.vmap = VectorLaneMapping( + vector_lane=vector_lane, + forced_vec_size=forced_vec_size, + vlane_split_axis=vlane_split_axis, + vlane_stride=vlane_stride + ) + + self.implicit_dim_size = None + self.nr_rdim = 0 + self.offset = sympy.Integer(0) # Dram offset + + def set_name(self, name: str): self.name = name + def get_name(self) -> str: return self.name + def get_tile_size(self): return list(self._tile_size) + def get_tile_stride(self): return list(self._tile_stride) + def get_numel(self) -> int :return math.prod(self._tile_size) + def get_nr_dim(self) -> str: return len(self._tile_size) + def get_reduction_numel(self): return reduce(mul, self.get_tile_size()[-1*self.nr_rdim:], 1) + + def set_tile_size(self, tile_size, tile_axis_order=None, constraints=None): + self._tile_size = list(tile_size) + self.tile_axis_order = list(range(len(tile_size))) if tile_axis_order is None else tile_axis_order + self.update_tile_stride() + + def set_tile_size_stride(self, tile_size, tile_stride): + self._tile_size = list(tile_size) + self._tile_stride = list(tile_stride) + + def 
update_tile_stride(self): + strides = [1] * len(self._tile_size) + init = 1 + + original_indices = list(range(len(self.tile_axis_order))) + sorted_pairs = sorted( + zip(self.tile_axis_order, self._tile_size, original_indices), + key=lambda x: x[0], reverse=True + ) + for _, size, original_indices in sorted_pairs: + strides[original_indices] = init + init *= size + self._tile_stride = strides + + def get_dim_size(self, index): + if isinstance(index, int): + return self._tile_size[index] + elif "index" in str(index): + return self._tile_size[int(str(index)[5:])] + raise NotImplementedError("Unsupported format of index") + + # Vector mapping delegation + def get_tile_size_per_lane(self): return self.vmap.get_tile_size_per_lane(self._tile_size) + def get_used_vlane(self): return self.vmap.get_used_vlane(self._tile_size) + def get_numel_per_lane(self): return self.vmap.get_numel_per_lane(self._tile_size) + def get_tile_stride_per_lane(self): return self.vmap.get_tile_stride_per_lane(self._tile_size, self._tile_stride) + def get_compute_vec_size(self): return self.vmap.get_compute_vec_size(self._tile_size, self.get_reduction_numel(), self.nr_rdim) + + # Helper functions for codegen + def get_mlir_shape(self, dtype): + shape = "x".join([str(dim) for dim in self._tile_size]) + return f"memref<{shape}x{dtype}, 1>" + + def get_mlir_vshape(self, mlir_dtype): + return f"vector<{self.get_compute_vec_size()}x{mlir_dtype}>" if self.get_compute_vec_size() > 1 else f"{mlir_dtype}" + class MLIRWrapperKenrelGroup(cpp.KernelGroup): def __init__(self): super().__init__() @@ -525,191 +650,96 @@ def call_kernel(self, kernel_name): def is_modular_indexing(self, expr): return "ModularIndexing" in str(expr) - def compute_tile_size(self, nodes, vars, reduction_vars): - # Handle implict dims. Input operand could have larger dimension space. 
- implicit_ranges = False - target_operand : MemoryDep = None - implicit_dim_size = defaultdict(list) - for read_operand in nodes[0].read_writes.reads: - read_operand : MemoryDep - if isinstance(read_operand, StarDep) or isinstance(read_operand, WeakDep): # FIXME: WeakDep & StarDep are not supported (MoE case) - continue - read_index = read_operand.index - for arg in read_index.args: - if "ModularIndexing" in str(arg) or "//" in str(arg): - implicit_ranges = True - target_operand = read_operand - break - - if implicit_ranges: - #print("This operation contain implicit dimension space!") - linearized_stride = [1] * len(target_operand.var_names) - for i in range(len(target_operand[3])-2, -1, -1): - linearized_stride[i] = linearized_stride[i+1] * target_operand[3][i+1] - - linearized_index = sympy.Integer(0) - for dim, stride in zip(target_operand[2], linearized_stride): - linearized_index += stride * dim - - new_dim_expression = [] - new_dim_size = [] - for arg in target_operand.index.args: + def implicit_dim_ops(self, nodes): + target_patterns = (ModularIndexing, FloorDiv, Mod) + target_operands = [] + for target_node in nodes: + for read_operand in target_node.read_writes.reads: + read_operand: MemoryDep + if isinstance(read_operand, StarDep) or isinstance(read_operand, WeakDep): + continue + read_index = read_operand.index + for arg_expr in read_index.args: + if arg_expr.atoms(*target_patterns): + target_operands.append(read_operand) + return target_operands + + def extract_dividers(self, implicit_ops): + # When a specific axis is processed, the key constraint to verify is the divider. + # The tile size must be forced to match the divider size. 
+ dim_dividers = defaultdict(set) + for operand in implicit_ops: + subs_map = { + s: sympy.symbols(s.name.replace("c", "index", 1)) + for s in operand.index.free_symbols + } + rev_subs_map = { + sympy.symbols(s.name.replace("c", "index", 1)) : s + for s in operand.index.free_symbols + } + new_index = operand.index.subs(subs_map) + for arg in new_index.args: if len(arg.free_symbols) != 1: raise NotImplementedError("Not supporting this view operation...!") - if arg.is_Mul and arg.args[0].is_number: arg = arg.args[1] if isinstance(arg, ModularIndexing): modular_expr = ModularIndexing(arg.args[0], arg.args[1], arg.args[2]) + modular_expr.original_expr = arg elif arg.is_symbol: - modular_expr = ModularIndexing(arg, 1, target_operand.ranges[arg]) + modular_expr = ModularIndexing(arg, 1, operand.ranges[rev_subs_map[arg]]) + modular_expr.original_expr = arg elif "//" in str(arg): - modular_expr = ModularIndexing(arg.args[0], arg.args[1], target_operand.ranges[arg.args[0]]//arg.args[1]) + modular_expr = ModularIndexing(arg.args[0], arg.args[1], operand.ranges[rev_subs_map[arg.args[0]]]//arg.args[1]) + modular_expr.original_expr = arg else: raise NotImplementedError("What is this case?") - new_dim_expression.append(modular_expr) - new_dim_size.append(modular_expr.args[2]) - implicit_dim_size[int(str(modular_expr.args[0])[1:])].append(int(modular_expr.args[2])) - - # Sanity check - for dim, sub_dims in implicit_dim_size.items(): - sz = reduce(mul, sub_dims, 1) - if sz != target_operand[3][dim]: - raise NotImplementedError("Not supporting type...") - - vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop - - # FIXME: Naive decrease tile size - def decrease_tile_size(tile_size, vlane_split_axis): - is_decreased = False - - # Decrease vlane_split_axis when it is too large - if tile_size[vlane_split_axis] > vlane_stride * self.vector_lane: - tile_size[vlane_split_axis] = int(tile_size[vlane_split_axis] // 2) - return tile_size - - for i in 
range(len(tile_size)): - if i == vlane_split_axis: - continue - if tile_size[i] > 1: - tile_size[i] = int(tile_size[i] // 2) - is_decreased = True - break - - # Decrease vlane_split_axis at the end to maximize the vlane usage - if not is_decreased: - if tile_size[vlane_split_axis] > 1: - tile_size[vlane_split_axis] = int(tile_size[vlane_split_axis] // 2) - return tile_size - - # Dummy tile size - def dummy_tile_size(): - tile_size = [1] * (len(vars) + len(reduction_vars)) - if len(tile_size) == 2: - tile_size[-1] = vlane_stride * self.vector_lane - tile_size[-2] = 2 * self.vector_lane - elif len(tile_size) == 0: # Scalar - tile_size = [1] - self.ranges = [1] - elif len(tile_size) == 1: - tile_size[0] = 2 * vlane_stride * self.vector_lane - elif len(tile_size) == 3: - tile_size[-1] = self.vector_lane - tile_size[-2] = 4 * self.vector_lane - tile_size[-3] = 2 - elif len(tile_size) == 4: - tile_size[-1] = self.vector_lane - tile_size[-2] = 4 * self.vector_lane - tile_size[-3] = 2 - tile_size[-4] = 1 - else: - raise NotImplementedError("dummy tile size fail!") - return tile_size + dim_dividers[modular_expr.args[0]].add(modular_expr) + return dim_dividers + def compute_tile_size(self, nodes, vars, reduction_vars): + vlane_split_axis = len(vars) - 1 vlane_stride = extension_config.CONFIG_VECTOR_LANE_STRIDE - if self.recodegen is None: - tile_size = dummy_tile_size() - else: + + # Set initial tile size & vector lane mapping + if self.kernel_group.tile_desc is None: + tile_size = MLIRMultiDimTile.init_tile_size(self.ranges, vlane_stride, self.vector_lane) + init_tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane, vlane_split_axis, vlane_stride) + init_tile_desc.nr_rdim = len(reduction_vars) + self.kernel_group.set_tile_info(init_tile_desc) + + # Handle edge case + if len(self.ranges)==1 and self.ranges[0] == 1: # Scalar case 2 + self.kernel_group.tile_desc.vmap.vlane_stride = 1 + self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 + elif vlane_split_axis == -1: # 
Reduction only case + self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 + self.kernel_group.tile_desc.vmap.vlane_stride = self.kernel_group.tile_desc.get_tile_size()[0] + + # Handle implict dims. Input operand could be high dimension tensor. + # Note: https://github.com/PSAL-POSTECH/PyTorchSim/issues/173 + implicit_ops = self.implicit_dim_ops(nodes) + if implicit_ops: + tile_constraints = self.extract_dividers(implicit_ops) + self.kernel_group.tile_desc.apply_constraints(tile_constraints, self.ranges) + self.kernel_group.tile_desc.implicit_dim_size = tile_constraints + + # Check recodegen reason + if self.recodegen is not None: if self.recodegen == "spad_overflow": - tile_size = self.kernel_group.tile_desc.get_tile_size() - decrease_tile_size(tile_size, vlane_split_axis) - elif self.recodegen == "vlane_stride": - tile_size = dummy_tile_size() - elif "tile_size" in self.recodegen: - dim = int(self.recodegen.split("_")[-1]) - tile_size = self.kernel_group.tile_desc.get_tile_size() # TODO: - tile_size[dim] = tile_size[dim] * 2 + self.kernel_group.tile_desc.decrease_tile_size(self.ranges) elif self.recodegen == "recompile": return self.kernel_group.tile_desc else: raise NotImplementedError(f"Unknown recodegen reason: {self.recodegen}") - # FIXME: Not considering removed buffers - n_buffer = sum( - len(node.read_writes.reads) + len(node.read_writes.writes) - for node in nodes - ) - - spad_overflow = True - # Find proper tile size - while spad_overflow: - # Adjust tile size to avoid too much paddings - for i in range(1, len(tile_size)+1): - target_range = self.ranges[-i] - if implicit_ranges: - target_range = implicit_dim_size[len(tile_size)-i][-1] - - if tile_size[-i] > target_range: - remains = (target_range % vlane_stride) - self.stop_autotune = True - tile_size[-i] = target_range - if remains: - tile_size[-i] += vlane_stride - remains - - # Adjust tile size - for i in range(len(vars)): - if tile_size[i] >= self.vector_lane: # maximize used vector lane - 
vlane_split_axis = i - used_vlane = min((tile_size[vlane_split_axis] + vlane_stride - 1) // vlane_stride, self.vector_lane) - padded_size = used_vlane * vlane_stride - tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size - - # Check spad overflow - spad_usage_per_vlane = n_buffer * math.prod(tile_size) * self.precision // used_vlane - if spad_usage_per_vlane >= self.spad_info["spad_size"]: - new_tile_size = decrease_tile_size(tile_size.copy(), vlane_split_axis) - if new_tile_size == tile_size: - raise NotImplementedError("Error: Cannot find proper tile size") - tile_size = new_tile_size - spad_overflow = True - self.stop_autotune = True # for auto-tune - continue - else: - spad_overflow = False - - # Maximize the utilizaiotn of vectorlane - if len(reduction_vars): - minimum_stride = max(self.roundup_vectorlane(tile_size[vlane_split_axis]) // self.vector_lane, 2) - vlane_stride = min(minimum_stride, 8) - - # Handle scalar case - if len(self.ranges)==1 and self.ranges[0] == 1: - vlane_stride = 1 - vlane_split_axis = 0 - tile_size[0] = 1 - elif vlane_split_axis == -1: - vlane_split_axis = 0 - vlane_stride = tile_size[0] - - # Select tile info. 
- # Note: Kernel Group have to share same tile desc for fusion - tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane) - tile_desc.vlane_split_axis = vlane_split_axis - tile_desc.vlane_stride = vlane_stride - tile_desc.implicit_dim_size = implicit_dim_size - tile_desc.nr_rdim = len(reduction_vars) - return tile_desc + # Adjust tile size & vector lane mapping + self.kernel_group.tile_desc.trim_large_tail(self.ranges) + self.kernel_group.tile_desc.select_vlane_axis() + self.kernel_group.tile_desc.pad_vlane_tile() + self.kernel_group.tile_desc.update_tile_stride() + return self.kernel_group.tile_desc def codegen_nodes(self, nodes, kernel_name): recompile_try = 0 @@ -724,7 +754,6 @@ def codegen_nodes(self, nodes, kernel_name): tile_desc = self.compute_tile_size(nodes, vars, reduction_vars) self.compute_body_loop.size = tile_desc.get_numel_per_lane() self.compute_body_loop.step = tile_desc.get_compute_vec_size() - self.kernel_group.set_tile_info(tile_desc) try: _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs() with self as kernel: @@ -743,29 +772,6 @@ def codegen_nodes(self, nodes, kernel_name): self.meta_kernel() return src_code - def run_bench(self, nodes, kernel_name, src_code): - _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() - input_call_args = tuple(self.args.input_buffers.keys()) - output_call_args = tuple(self.args.output_buffers.keys()) - full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) - full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) - - bmreq = MLIRBenchmarkRequest( - kernel_name=kernel_name, - input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), - output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), - extra_args={ - "vector_lane" : self.vector_lane, - "spad_info": self.spad_info, - "vlen" : self.vlen, - "arg_attributes" : arg_attributes - }, - source_code=src_code, - ) - dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, 
extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] - dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] - return bmreq.make_run_fn(dummy_inputs, dummy_outputs) - def codegen_kernel(self, kernel_name): arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs() arg_defs = ",\n".ljust(25).join(arg_defs) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py new file mode 100644 index 00000000..77826730 --- /dev/null +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -0,0 +1,120 @@ +import os +import math +from typing import List, Optional + +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs +from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel +from torch._inductor.ir import IRNode +from PyTorchSimFrontend import extension_config + +class MLIRConvCommonTemplate(MLIRTemplate): + WRAPPER_TEMPLATE = None + def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): + super().__init__("kernel", input_nodes, layout, input_reorder) + self.stride = kwargs["stride"] + self.padding = kwargs["padding"] + self.dilation = kwargs["dilation"] + self.weight_shape = [str(i) for i in input_nodes[1].layout.size] + self.input_shape = [str(i) for i in input_nodes[0].layout.size] + self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ + + "_".join([str(i) for i in self.stride]) \ + + "_" + "_".join([str(i) for i in self.padding]) \ + + "_" + "_".join([str(i) for i in self.dilation]) + self.kernel_args = ['X', 'W', 'Bias', 'Y'] + + def get_padded_input_size(self, X): + input_padded = list(X.layout.size) + input_padded[2] += 2 * self.padding[0] + input_padded[3] += 2 * self.padding[1] + return math.prod(input_padded) + + def render(self, + 
kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, + **kwargs): + raise NotImplementedError() + + def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): + raise NotImplementedError() + + def extract_info(self, kernel, template_buffer_node, epilogue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + self.kernel = kernel + self.epilogue_nodes = epilogue_nodes + + X, W = self.input_nodes[0], self.input_nodes[1] + Y = self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + if epilogue_nodes is not None: + extra_node_rw = { + item.name for epilogue_node in epilogue_nodes + for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes + if item.name != Y.name + } + n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 + + BATCH, I_C, I_H, I_W = X.layout.size + O_C, _, K_H, K_W = W.layout.size + O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] + O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] + PADDING_H=self.padding[0] + PADDING_W=self.padding[1] + STRIDE_H=self.stride[0] + STRIDE_W=self.stride[1] + return X,W,Y,Bias,n_extra_node,BATCH,I_C,I_H,I_W,O_C,K_H,K_W,O_H,O_W,PADDING_H,PADDING_W,STRIDE_H,STRIDE_W + + def get_tile_candidates(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + **kwargs): + # Extract input arguments info + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) + return self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + + def outer_func_render(self, kernel_name, input_args): + X, W = self.input_nodes[0], self.input_nodes[1] + Y = 
self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) + options = dict( + kernel=self.kernel, + KERNEL_NAME=kernel_name, + FUNC_NAME=self.function_name + f"_{len(input_args)}", + INPUT=X, + WEIGHT=W, + BIAS=Bias, + OUTPUT=Y, + PADDING_H=self.padding[0], + PADDING_W=self.padding[1], + VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + TOGSIM_EAGER_MODE=eager_mode, + input_reorder=self.input_reorder + ) + code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options) + return code, self.function_name + f"_{len(input_args)}" + + def get_arg_attributes(self): + arg_attributes = [] + + X = self.input_nodes[0] + X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] + X_shape[0] += 2 * self.padding[0] + X_shape[1] += 2 * self.padding[1] + + def compute_stride(shape): + stride = [1] * len(shape) + for i in range(len(shape)-2, -1, -1): + stride[i] = stride[i+1] * shape[i+1] + return stride + + X_stride = compute_stride(X_shape) + arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) + + return arg_attributes diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 6dd17576..0bf01421 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -1,17 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as 
extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Multi Channel Tile Conv2D kernel @@ -104,7 +97,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class MLIRConvMultiTileTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -126,67 +120,30 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvMultiTileTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], 
self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -284,69 +241,13 @@ def 
render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) - SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_K = TILE_K - - TOG_latency = O_W if TILE_M > O_W else TILE_M - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - 
X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file + tile_candidates = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index 8b1bf7c5..92b9a525 100644 --- 
a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -1,17 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Single Batch Conv2D kernel @@ -105,7 +98,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class MLIRConvSingleBatchTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -127,67 +121,30 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvSingleBatchTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] 
- - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, 
TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] # Prepare tile descriptors @@ -283,66 +240,13 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - SUB_TILE_K = TILE_K - TOG_latency = O_W if TILE_M > O_W else TILE_M - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - 
INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file + tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + 
SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index 2284c86c..ab124852 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -1,17 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Single Batch Conv2D (Stride != 1) kernel @@ -105,7 +98,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class MLIRConvSingleBatchStridedTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -127,67 +121,30 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvSingleBatchStridedTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", 
input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, 
I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -284,66 +241,13 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - SUB_TILE_K = TILE_K - TOG_latency = O_W if TILE_M > O_W else TILE_M - return 
TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - 
self.hash_value = get_hash(code.strip()) \ No newline at end of file + tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 890b76b7..66aa0a27 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -1,17 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Conv2D kernel @@ -109,7 +102,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class 
MLIRConvTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -131,67 +125,29 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if 
item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) - SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info + TOG_latency = BATCH if TILE_M > BATCH else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -289,68 +245,14 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) - 
SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - SUB_TILE_K = TILE_K - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N - TOG_latency = BATCH if TILE_M > BATCH else TILE_M - TOG_latency = 8 if TOG_latency < 8 else TOG_latency - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, 
[MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file + tile_candidates = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index ae793c06..6271b548 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -1,4 +1,3 @@ -import os import json from pathlib import Path from torch import empty_strided @@ -8,8 +7,6 @@ from PyTorchSimFrontend.mlir.mlir_template import 
MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir import mlir_common @@ -114,30 +111,13 @@ def render(self, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, prologue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): - if template_buffer_node is not None: - self.output_node = template_buffer_node - - # Extract input arguments info - X, W, Y = self.input_nodes[0], self.input_nodes[1], self.output_node - X_tensor = empty_strided(X.layout.size, X.layout.stride) - W_tensor = empty_strided(W.layout.size, W.layout.stride) - if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2: - raise NotImplementedError("Please report this case to us...") - - # Extract fusion info - n_epilogue_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 - n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 - n_extra_read = set() - if epilogue_nodes is not None: - for enode in epilogue_nodes: - n_extra_read.update(enode.node.get_read_names()) - if self.output_node.name in n_extra_read: - n_extra_read.remove(self.output_node.name) - - # Select tile size - M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1] - TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) + X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + if tile_info is None: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node)[0] + else: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info 
# Select template code if (M == 0) or (N == 0) or (K == 0): # exception for MoE @@ -275,12 +255,47 @@ def render(self, dram_idx = Y_idx, dram_tile_desc = Y_tile_desc, nr_rdim = nr_rdim, + r_dim_size = M, dim_aliasing = epilogue_dim_aliasing ) code = self._template_from_string(template).render(**kernel.render_options) kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code + def get_tile_candidates(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + prologue_nodes: Optional[List[IRNode]] = None, + **kwargs): + X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + return self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) + + def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + + # Extract input arguments info + X, W, Y = self.input_nodes[0], self.input_nodes[1], self.output_node + X_tensor = empty_strided(X.layout.size, X.layout.stride) + W_tensor = empty_strided(W.layout.size, W.layout.stride) + if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2: + raise NotImplementedError("Please report this case to us...") + + # Extract fusion info + n_epilogue_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 + n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 + n_extra_read = set() + if epilogue_nodes is not None: + for enode in epilogue_nodes: + n_extra_read.update(enode.node.get_read_names()) + if self.output_node.name in n_extra_read: + n_extra_read.remove(self.output_node.name) + + # Select tile size + M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1] + return 
X,W,Y,M,N,K,n_epilogue_node,n_prologue_node,len(n_extra_read) + def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): # Check cheat sheet cheatsheet_path = extension_config.CONFIG_GEMM_CHEATSHEET_PATH @@ -292,52 +307,49 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no data = json.load(f) gemm_shape = f"{M}_{K}_{N}" - if gemm_shape in data: + if extension_config.CONFIG_MANUAL_TILE_SIZE: + # case 1: use manual tile size + TILE_M = extension_config.CONFIG_TILE_M + TILE_N = extension_config.CONFIG_TILE_N + TILE_K = extension_config.CONFIG_TILE_K + tile_candidates = [[TILE_M, TILE_N, TILE_K]] + elif gemm_shape in data: + # case 2: cached tile size tile_info = data[gemm_shape] TILE_M = tile_info["TILE_M"] TILE_N = tile_info["TILE_N"] TILE_K = tile_info["TILE_K"] - else: # case 2: use gemm_combination_mapping + tile_candidates = [[TILE_M, TILE_N, TILE_K]] + else: + # case 3: use gemm_combination_mapping min_tile = (n_extra_node + n_prologue_node) == 0 - TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=True) - # case 3: use manual tile size - if extension_config.CONFIG_MANUAL_TILE_SIZE: - TILE_M = extension_config.CONFIG_TILE_M - TILE_N = extension_config.CONFIG_TILE_N - TILE_K = extension_config.CONFIG_TILE_K + tile_candidates = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True) # Edge case if (M == 0) or (N == 0) or (K == 0): TILE_M, TILE_N, TILE_K = 1, 1, 1 + tile_candidates = [[TILE_M, TILE_N, TILE_K]] - # Calculate Sub Tile Size for fine-grained DMA - if extension_config.CONFIG_SUBTILE: - # Case 1: adjust selective fine-grained DMA (SFG-DMA) - SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane - if (TILE_M == M and TILE_N == N and TILE_N <= 512): - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - else: # Avoid Row 
Conflict of weights + full_tile_candidates = [] + for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + # Calculate Sub Tile Size for fine-grained DMA + if extension_config.CONFIG_SUBTILE: + # Case 1: adjust selective fine-grained DMA (SFG-DMA) + SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane + if (TILE_M == M and TILE_N == N and TILE_N <= 512): + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + else: # Avoid Row Conflict of weights + SUB_TILE_N = TILE_N + SUB_TILE_K = TILE_K + # Case 2: use manual sub tile size (FG-DMA) + if extension_config.CONFIG_MANUAL_SUBTILE_SIZE: + SUB_TILE_M = extension_config.CONFIG_SUBTILE_M + SUB_TILE_N = extension_config.CONFIG_SUBTILE_N + SUB_TILE_K = extension_config.CONFIG_SUBTILE_K + # Case 3: None Subtile + else: + SUB_TILE_M = TILE_M SUB_TILE_N = TILE_N - SUB_TILE_K = TILE_K - # Case 2: use manual sub tile size (FG-DMA) - if extension_config.CONFIG_MANUAL_SUBTILE_SIZE: - SUB_TILE_M = extension_config.CONFIG_SUBTILE_M - SUB_TILE_N = extension_config.CONFIG_SUBTILE_N - SUB_TILE_K = extension_config.CONFIG_SUBTILE_K - # Case 3: None Subtile - else: - SUB_TILE_M = TILE_M - SUB_TILE_N = TILE_N - SUB_TILE_K = TILE_K - return TILE_M,TILE_N,TILE_K, SUB_TILE_M,SUB_TILE_N,SUB_TILE_K - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) + SUB_TILE_K = TILE_K + full_tile_candidates.append([TILE_M,TILE_N,TILE_K, SUB_TILE_M,SUB_TILE_N,SUB_TILE_K]) + return full_tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py 
b/PyTorchSimFrontend/mlir/mlir_lowering.py index 6508ea86..af59d88f 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -15,7 +15,7 @@ from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate -from PyTorchSimFrontend.extension_config import CONFIG_VECTOR_LANE, CONFIG_USE_TIMING_POOLING +from PyTorchSimFrontend import extension_config aten = torch.ops.aten aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm") @@ -106,11 +106,11 @@ def convolution( layout = conv_layout(x, weight, None, **kwargs) # Select conv kernel - if BATCH == 1 and stride[0] == 1: + if BATCH == 1 and stride[0] == 1 and extension_config.CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchTemplate([x, weight, bias], layout, **kwargs) - elif BATCH == 1 and stride[0] != 1: + elif BATCH == 1 and stride[0] != 1 and extension_config.CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchStridedTemplate([x, weight, bias], layout, **kwargs) - elif I_C < CONFIG_VECTOR_LANE // 8: # 8 is hard-coded for now. This should be changed to a better heuristic. + elif I_C < extension_config.CONFIG_VECTOR_LANE // 8 and extension_config.CONFIG_MULTI_TILE_CONV: # 8 is hard-coded for now. This should be changed to a better heuristic. 
mlir_template = MLIRConvMultiTileTemplate([x, weight, bias], layout, **kwargs) else: mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs) @@ -187,5 +187,5 @@ def custom_unsafe_index(x, indices): lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()}) lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()}) lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()}) -if CONFIG_USE_TIMING_POOLING: +if extension_config.CONFIG_USE_TIMING_POOLING: lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py index 6f605d56..3658f992 100644 --- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py +++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py @@ -1,13 +1,9 @@ -import os from typing import List, Optional, cast from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import Buffer from torch._inductor.ir import IRNode -from torch._inductor.ir import ReinterpretView -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common import sympy @@ -42,6 +38,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): if template_buffer_node is not None: self.output_node = template_buffer_node @@ -99,14 +96,3 @@ def render(self, code = self._template_from_string(TEMPLATE).render(**kernel.render_options) 
kernel.add_loop_info([X.get_numel()], [kernel.vector_lane, kernel.vector_lane]) return code - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 2bbdb41d..38603319 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -3,7 +3,7 @@ import sympy from functools import reduce import operator -from sympy import symbols, sympify, Symbol +from sympy import symbols, sympify from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel @@ -94,6 +94,8 @@ def can_fuse_vertical(self, node1, node2): return self.can_fuse_horizontal(node1, node2) def can_fuse_horizontal(self, node1, node2): + if not extension_config.CONFIG_FUSION: + return False if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size: return False _, (vars1, reduce1) = node1.group @@ -214,7 +216,7 @@ def codegen_nodes(self, nodes): ex_kernel.call_kernel(kernel_name) _, args, _, _ = ex_kernel.args.mlir_argdefs() args = ", ".join(args) - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): V.graph.wrapper_code.writeline( f"yield ({kernel_name}, ({args}))" @@ -259,85 +261,6 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False) 
return kernel_name - def codegen_template_code(self, kernel, render, template_node, prologue_nodes, epilogue_nodes): - with kernel: - _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() - for node in [template_node, *prologue_nodes, *epilogue_nodes]: - node.mark_run() - # Partial codgen template nodes - partial_code = render() - - # Swap load/store functions - kernel.load = kernel.load_epilogue - kernel.store = kernel.store_epilogue - kernel.store_reduction = kernel.store_reduction_epilogue - kernel.reduction = kernel.reduction_epilogue - - # Codegen prologue nodes - if prologue_nodes: - # Flush created varaibles, since template fusion doen't share variable - with kernel.prologue_buffer_group.as_local(): - _, (group, reduction_group) = max( - [prologue_nodes[-1]], key=lambda x: int(x.is_reduction()) - ).group - prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True) - kernel.kernel_group.set_tile_info(prologue_tile_desc) - vars, reduction_vars = kernel.set_ranges(group, reduction_group) - for node in prologue_nodes: - # Reuse created spad - read_list = sorted([i.name for i in node.read_writes.reads]) - candidate_found = False - # Why? There is a case that memdep.get_size() != data.get_size() - buf_dict = {} - buf_dict.update({val.name : val for val in V.graph.buffers}) - buf_dict.update(V.graph.graph_inputs) - for candidate_read in read_list: - if candidate_read in buf_dict and reduce(operator.mul, buf_dict[candidate_read].get_size(), 1) == node.node.get_numel(): - prologue_input_arg = candidate_read - candidate_found = True - break - assert(candidate_found) - assert(len(node.read_writes.writes)==1) - prologue_output_arg = list(node.read_writes.writes)[0].name - template_buf = self.kernel_group.args.input_buffers[prologue_output_arg] - target_buf = f"{template_buf}_buffer" # FIXME. How to pass spad buffer name? 
- - # To skip the dma code gen - kernel.buffer_names[prologue_input_arg] = target_buf - kernel.buffer_names[prologue_output_arg] = target_buf - - # Edge delete - kernel.kernel_group.args.input_buffers = { - (arg if buf != template_buf else prologue_input_arg): buf - for arg, buf in kernel.kernel_group.args.input_buffers.items() - } - node.codegen((vars, reduction_vars)) - - # Codegen epilogue nodes - tile_desc = kernel.set_tile_size(kernel.epilogue_info) - kernel.kernel_group.set_tile_info(tile_desc) - kernel.call_ranges = None - if epilogue_nodes: - with kernel.epilogue_buffer_group.as_local(): - _, (group, reduction_group) = max( - epilogue_nodes, key=lambda x: int(x.is_reduction()) - ).group - vars, reduction_vars = kernel.set_ranges(group, reduction_group) - for node in epilogue_nodes: - node.codegen((vars, reduction_vars)) - - with V.set_kernel_handler(kernel): - src_code = ( - partial_code - if isinstance(partial_code, str) - else partial_code.finalize() - ) - - # For consistency, white space could make wrong write_path - buffer = IndentedBuffer() - buffer.splice(src_code) - return buffer.getvalue() - def codegen_template(self, template_node, epilogue_nodes): # Handle prologue pattern prologue_nodes = [] @@ -350,24 +273,13 @@ def codegen_template(self, template_node, epilogue_nodes): epilogue_nodes = epilogue_nodes[i+1:] break - _, (numel, rnumel) = template_node.group + # Generate template code template_buffer = template_node.node - kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) + kernel, tile_candidates, render = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() - - src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes) 
- wrapper = V.graph.wrapper_code - - if src_code in wrapper.src_to_kernel: # [CONV] check inner function is already defined - kernel_name = wrapper.src_to_kernel[src_code] - kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_name=kernel_name) # update kernel name - src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes) + src_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) with V.set_kernel_handler(kernel): - spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n" - spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({kernel.spad_info['spad_size']*kernel.vector_lane})));" - codegen_header(src_code, (kernel.header.getvalue()+spad_end_symbol+spad_section_end_symbol, kernel.gem5_header.getvalue())) - kernel.meta_kernel() kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info, kernel.loop_size, origins={str(i) for i in template_node.node.origins}) self.define_function(kernel) @@ -375,7 +287,7 @@ def codegen_template(self, template_node, epilogue_nodes): kernel.call_kernel(kernel_name) V.graph.removed_buffers |= kernel.removed_buffers _, args, _, _ = self.kernel_group.args.mlir_argdefs() - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name + f"_{len(args)}" args = ", ".join(args) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 820d5c0d..df3621eb 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -6,26 +6,30 @@ import contextlib import math import sympy +from 
functools import reduce +import operator from collections import OrderedDict from typing import List, Optional from unittest.mock import patch -from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE, DeferredLine -from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, View +from torch._inductor.codegen.common import KernelTemplate, ChoiceCaller, CSE, DeferredLine +from torch._inductor.ir import Buffer, IRNode, TemplateBuffer from torch._inductor.select_algorithm import PartialRender from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller from torch._inductor.autotune_process import TensorMeta from torch._inductor.virtualized import V, NullHandler, _ops as ops from torch._inductor.utils import IndentedBuffer +from torch._inductor.codecache import write_atomic +import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, reduction_partial_combine_vec, reduction_combine_vec, is_welford_reduction from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common -from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR +from PyTorchSimFrontend import extension_config from . 
import mlir_common class IndentedBufferGroup: @@ -93,7 +97,8 @@ def __init__(self, kernel_group = None, outer_func_name=None, outer_func_render=None, - kernel_arg_attributes=None) -> None: + kernel_arg_attributes=None, + reason=None) -> None: super().__init__(kernel_group if kernel_group is not None else mlir_common.MLIRWrapperKenrelGroup()) self.kernel_name = kernel_name self.input_nodes = input_nodes @@ -125,6 +130,15 @@ def __init__(self, self.reduction_mean = [] # Dim info self.dim_aliasing = {} + self.reason = reason + + def reset(self, reason): + self.__init__( + self.kernel_name, self.input_nodes, + self.call_size, self.kernel_group, + self.outer_func_name, self.outer_func_render, + self.kernel_arg_attributes, reason + ) def add_loop_info(self, mat_size, tile_size): for idx, (loop_size, stride) in enumerate(zip(mat_size, tile_size)): @@ -185,7 +199,8 @@ def gemmini_gemm_mapping(self, M, N, K): return inner_I, inner_J, inner_K - def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False): + def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False, is_conv=False): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 # double buffer @@ -219,7 +234,7 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) if check_spad_size: - dir_path = f"{CONFIG_TORCHSIM_DIR}/validation/gemm_candidates" + dir_path = f"{extension_config.CONFIG_TORCHSIM_DIR}/validation/gemm_candidates" os.makedirs(dir_path, exist_ok=True) file_path = f"{dir_path}/gemm_{M}_{K}_{N}.txt" line_to_write = f"{tile_M} {tile_K} {tile_N}\n" @@ -249,52 +264,22 @@ def 
gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p max_used_spad_size = used_spad_size maximize_i_j = tile_M * tile_N mapping = (tile_M, tile_N, tile_K) - return mapping - - def search_mapping_space(self, mapping, idx, increment, stride, dilation, n_extra_node=0): - if idx == 0 or idx == 1 or idx == 4 or idx == 5 or idx == 6: - raise NotImplementedError("Only O_H and O_W are supported for search_mapping_space") - spad_size_per_lane = self.spad_info["spad_size"] - spad_size = spad_size_per_lane * self.vector_lane - max_spad_size = spad_size // 2 # double buffer - max_spad_per_lane = spad_size_per_lane // 2 # double buffer - - mapping = list(mapping) - mapping[idx] += increment - k_h, k_w, o_h, o_w, M, N, K = mapping - i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0] - i_w = 1 + (o_w - 1) * stride[1] + (k_w - 1) * dilation[1] - weight_size = k_w * k_h * K * N - input_size = i_w * i_h * M * K - output_size = o_w * o_h * M * N - used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision - weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N) - input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) - output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) - used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane: - mapping = (k_h, k_w, o_h, o_w, M, N, K) - else: - mapping[idx] -= increment - - return mapping + if check_spad_size: + tile_candidates.append((used_spad_size, (tile_M, tile_N, tile_K))) - def pseudo_auto_tune(self, mapping, stride, dilation, O_H, O_W, n_extra_node=0): - # pseudo auto-tune - if mapping[2] == 1 and not (O_H == 1): - mapping = self.search_mapping_space(mapping, 2, 1, stride, dilation, n_extra_node=n_extra_node) - if mapping[3] == 1 and not (O_W == 1): - mapping = 
self.search_mapping_space(mapping, 3, 1, stride, dilation, n_extra_node=n_extra_node) - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 # double buffer max_spad_per_lane = spad_size_per_lane // 2 # double buffer max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False) + M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] max_k_h_w = 1 # maximize kernel size max_o_h_w = 1 # maximize output size K = min(K, self.vector_lane) @@ -312,27 +297,30 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w and max_o_h_w <= o_h * o_w: - max_used_spad_size = used_spad_size - max_k_h_w = k_h * k_w - max_o_h_w = o_h * o_w - mapping = (k_h, k_w, o_h, o_w, M, N, K) + check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) + if check_spad_size: + tile_candidates.append((used_spad_size, (k_h, k_w, o_h, o_w, M, N, K))) + if max_used_spad_size < used_spad_size and max_k_h_w <= k_h * k_w and max_o_h_w <= o_h * o_w: + max_used_spad_size = used_spad_size + max_k_h_w = k_h * k_w + max_o_h_w = o_h * o_w + mapping = (k_h, k_w, o_h, 
o_w, M, N, K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") - # FIXME: this should be implemented with auto-tuning - mapping = self.pseudo_auto_tune(mapping, stride, dilation, O_H, O_W, n_extra_node=n_extra_node) - - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False) + M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] max_k_h_w = K_W for o_h in sympy.divisors(O_H): for o_w in sympy.divisors(O_W): @@ -347,22 +335,28 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h: - max_used_spad_size = used_spad_size - max_k_h_w = k_h - mapping = (k_h, K_W, o_h, o_w, M, N, K) + check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) + if check_spad_size: + tile_candidates.append((used_spad_size, (k_h, K_W, o_h, o_w, M, N, K))) + if max_used_spad_size < used_spad_size and max_k_h_w <= k_h: + max_used_spad_size = used_spad_size + max_k_h_w = k_h + mapping = (k_h, K_W, o_h, o_w, M, N, 
K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False) + M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] max_k_h_w = 1 for o_h in sympy.divisors(O_H): for k_h in sympy.divisors(K_H): @@ -377,13 +371,18 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * k_w, K) output_size_per_lane = self.get_spad_size_per_lane(M * o_h * (1 + n_extra_node), N) used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w: - max_used_spad_size = used_spad_size - max_k_h_w = k_h * k_w - mapping = (k_h, k_w, o_h, M, M, N, K) + check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) + if check_spad_size: + tile_candidates.append((used_spad_size, (k_h, k_w, o_h, M, M, N, K))) + if max_used_spad_size < used_spad_size and max_k_h_w <= k_h * k_w: + max_used_spad_size = used_spad_size + max_k_h_w = k_h * k_w + mapping = (k_h, k_w, o_h, M, M, N, K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: 
x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def meta_kernel(self): wrapper = V.graph.wrapper_code @@ -407,6 +406,131 @@ def call_kernel(self, kernel_name): kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args, cuda=False) + def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info): + with self as kernel: + _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() + for node in [template_node, *prologue_nodes, *epilogue_nodes]: + node.mark_run() + + # Partial codgen template nodes + partial_code = render(kwargs={**render.keywords['kwargs'], 'tile_info': tile_info}) + + # Swap load/store functions + kernel.load = kernel.load_epilogue + kernel.store = kernel.store_epilogue + kernel.store_reduction = kernel.store_reduction_epilogue + kernel.reduction = kernel.reduction_epilogue + + # Codegen prologue nodes + if prologue_nodes: + # Flush created varaibles, since template fusion doen't share variable + with kernel.prologue_buffer_group.as_local(): + _, (group, reduction_group) = max( + [prologue_nodes[-1]], key=lambda x: int(x.is_reduction()) + ).group + prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True) + kernel.kernel_group.set_tile_info(prologue_tile_desc) + vars, reduction_vars = kernel.set_ranges(group, reduction_group) + for node in prologue_nodes: + # Reuse created spad + read_list = sorted([i.name for i in node.read_writes.reads]) + candidate_found = False + # Why? 
There is a case that memdep.get_size() != data.get_size() + buf_dict = {} + buf_dict.update({val.name : val for val in V.graph.buffers}) + buf_dict.update(V.graph.graph_inputs) + for candidate_read in read_list: + if candidate_read in buf_dict and reduce(operator.mul, buf_dict[candidate_read].get_size(), 1) == node.node.get_numel(): + prologue_input_arg = candidate_read + candidate_found = True + break + assert(candidate_found) + assert(len(node.read_writes.writes)==1) + prologue_output_arg = list(node.read_writes.writes)[0].name + template_buf = self.kernel_group.args.input_buffers[prologue_output_arg] + target_buf = f"{template_buf}_buffer" # FIXME. How to pass spad buffer name? + + # To skip the dma code gen + kernel.buffer_names[prologue_input_arg] = target_buf + kernel.buffer_names[prologue_output_arg] = target_buf + + # Edge delete + kernel.kernel_group.args.input_buffers = { + (arg if buf != template_buf else prologue_input_arg): buf + for arg, buf in kernel.kernel_group.args.input_buffers.items() + } + node.codegen((vars, reduction_vars)) + + # Codegen epilogue nodes + tile_desc = kernel.set_tile_size(kernel.epilogue_info) + kernel.kernel_group.set_tile_info(tile_desc) + kernel.call_ranges = None + if epilogue_nodes: + with kernel.epilogue_buffer_group.as_local(): + _, (group, reduction_group) = max( + epilogue_nodes, key=lambda x: int(x.is_reduction()) + ).group + vars, reduction_vars = kernel.set_ranges(group, reduction_group) + for node in epilogue_nodes: + node.codegen((vars, reduction_vars)) + + with V.set_kernel_handler(kernel): + src_code = ( + partial_code + if isinstance(partial_code, str) + else partial_code.finalize() + ) + + # For consistency, white space could make wrong write_path + buffer = IndentedBuffer() + buffer.splice(src_code) + src_code = buffer.getvalue() + self._prepare_simulator_headers(src_code) + return src_code + + def make_choices(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): + choices = [] + for 
tile_info in tile_candidates: + print(f"[Auto-tune] Trying tile size: {list(tile_info)}") + src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) + bench_runner = self.run_bench([template_node], self.kernel_name, src_code) + choices.append((bench_runner, src_code, tile_info, self.loop_size)) + self.reset(reason=None) + return choices + + def _log_autotune_result(self, best_choice, best_cycle): + tile_size = best_choice[2] + print( + f"[Auto-tune] Optimal tile size: {list(tile_size)}, " + f"cycles: {best_cycle}" + ) + + def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): + if extension_config.CONFIG_AUTOTUNE_TEMPLATE and len(tile_candidates): + src_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + self.loop_size = loop_size + else: + tile_info = tile_candidates[0] if tile_candidates else None + src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) + + with V.set_kernel_handler(self): + self.meta_kernel() + return src_code + + def _prepare_simulator_headers(self, src_code): + spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n" + spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));" + + write_path = extension_codecache.get_write_path(src_code) + if not os.path.exists(write_path): + os.makedirs(write_path, exist_ok=True) + spike_write_path = os.path.join(write_path, "global_var.h") + gem5_write_path = os.path.join(write_path, "gem5_global_var.h") + if not os.path.exists(spike_write_path): + write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol) + if not os.path.exists(gem5_write_path): + write_atomic(gem5_write_path, self.gem5_header.getvalue()) + def codegen_prologue_body(self): body = IndentedBuffer() with 
self.prologue_buffer_group.as_local(): @@ -685,8 +809,8 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com sram_var = tile_desc.get_name() tile_shape = tile_desc.get_mlir_shape(mlir_dtype) tile_stride = tile_desc.get_tile_stride() - vlane_split_axis = tile_desc.vlane_split_axis - vlane_stride = tile_desc.vlane_stride + vlane_split_axis = tile_desc.vmap.vlane_split_axis + vlane_stride = tile_desc.vmap.vlane_stride zero_cse = self.get_const_cse(0, "index") sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim()) @@ -734,8 +858,8 @@ def load_epilogue(self, name: str, index: sympy.Expr): # Want to use tile_desc from epilogue_info index_var = self.parse_indices(index) dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] - vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - vlane_stride = self.kernel_group.tile_desc.vlane_stride + vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) tile_stride = self.kernel_group.tile_desc.get_tile_stride() @@ -773,7 +897,7 @@ def load_epilogue(self, name: str, index: sympy.Expr): vshape = f"vector<{vsize}x{mlir_dtype}>" if compute_vec_size > 1: - offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.reduction_axis_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})") + offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})") compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"]) operation = "affine.vector_load" line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" @@ -793,8 +917,8 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): index_var 
= self.parse_indices(index) dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] - vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - vlane_stride = self.kernel_group.tile_desc.vlane_stride + vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) tile_stride = self.kernel_group.tile_desc.get_tile_stride() @@ -859,8 +983,8 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value): vec_size = self.compute_body_loop.step type_name = mlir_common.DTYPE_TO_MLIR[dtype] new_tile_size = self.kernel_group.tile_desc.get_tile_size()[:-1] + [vec_size] - new_vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - new_vlane_stride = self.kernel_group.tile_desc.vlane_stride + new_vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + new_vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride local_tile_desc = mlir_common.MLIRMultiDimTile(new_tile_size, self.vector_lane, new_vlane_split_axis, new_vlane_stride, vec_size) tile_shape = local_tile_desc.get_mlir_shape(type_name) @@ -906,8 +1030,8 @@ def store_reduction_epilogue(self, name, index, value): index_var = self.parse_indices(index, self.reductions_suffix, comments="// Store reduction") dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()][:-1] # Assume that there is only one reduction axis - vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - vlane_stride = self.kernel_group.tile_desc.vlane_stride + vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride # Create final buffer descriptor nr_outer_loop = self.reduction_nr_outer_loop @@ -958,12 +1082,7 @@ def store_reduction_epilogue(self, name, index, value): if self.welford_reduce_out is not None: # NOTE: It not a real 
welford algorithm... We just used E(X^2) - E(X)^2 - divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.reduction_axis_size)} : f32") - if self.reduction_axis_size - 1 > 0: - divider2 = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.reduction_axis_size-1)} : f32") - else: - divider2 = divider - + divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.r_dim_size)} : f32") if self.buffer_types[name][1] > 1: divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to {new_reduced_shape}") else: @@ -1002,19 +1121,20 @@ def set_tile_size(self, template_fusion_info, prologue=False): if 'nr_rdim' in template_fusion_info and template_fusion_info['nr_rdim']==1: tile_desc.nr_rdim = 1 numel_per_lane = tile_desc.get_numel_per_lane() - reduction_axis_size = tile_desc.get_tile_size()[-1] - nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size - tile_desc.vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... + r_tile_size = tile_desc.get_tile_size()[-1] + nr_outer_loop = (numel_per_lane + r_tile_size-1) // r_tile_size + tile_desc.vmap.forced_vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... 
self.reduction_fusion = True - self.reduction_axis_size = tile_desc.get_tile_size()[-1] + self.r_tile_size = tile_desc.get_tile_size()[-1] + self.r_dim_size = template_fusion_info['r_dim_size'] self.reduction_nr_outer_loop = nr_outer_loop self.reduction_loop_idx = "reduce_loop_idx" - self.compute_body_loop.size = reduction_axis_size + self.compute_body_loop.size = r_tile_size self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop) else: - tile_desc.vec_size=64 + tile_desc.vmap.forced_vec_size = 64 if prologue: self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane() @@ -1110,7 +1230,8 @@ def make_kernel_render( template=self, kwargs=kwargs ) - return kernel, render, self.codegen_header + tile_candidates = self.get_tile_candidates(**kwargs)[:extension_config.CONFIG_AUTOTUNE_TEMPLATE_TOPK] + return kernel, tile_candidates, render return MLIRTemplateCaller( kernel_hash_name, @@ -1122,5 +1243,8 @@ def make_kernel_render( self, ) + def get_tile_candidates(self, **kwargs): + return [] + def render(self, **kwargs) -> str: raise NotImplementedError \ No newline at end of file diff --git a/README.md b/README.md index 56b58b28..dbfdf2e8 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ # PyTorchSim: A Comprehensive, Fast, and Accurate NPU Simulation Framework [![Docker Image CI](https://github.com/PSAL-POSTECH/PyTorchSim/actions/workflows/docker-image.yml/badge.svg)](https://github.com/PSAL-POSTECH/PyTorchSim/actions/workflows/docker-image.yml) -PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framework -- We define a RISC-V-based NPU architecture and implement PyTorch compiler backend to run inference & training for PyTorch models -- Achieved high speed and accuracy with our novel Tile-Level Simulation (TLS) with compiler-generated Tile-Operation Graph (TOG), exploiting deterministic tile compute latency -- A generic 
and extensible NPU architecture based on RISC-V vector extension -- The functional simulator supports code correctness validation and data-dependent timing simulation +PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framework. +- We define a RISC-V-based NPU architecture and implement PyTorch compiler backend to run inference & training for PyTorch models. +- Achieved high speed and accuracy with our novel Tile-Level Simulation (TLS) with compiler-generated Tile-Operation Graph (TOG), exploiting deterministic tile compute latency. +- A generic and extensible NPU architecture based on RISC-V vector extension. +- The functional simulator supports code correctness validation and data-dependent timing simulation. For more details, please refer to our [paper](https://doi.org/10.1145/3725843.3756045)! @@ -31,6 +31,7 @@ PyTorchSim **supports**: - [Multi-tenancy](#multi-tenancy) - [Compiler optimizations](#compiler-optimizations) - [Mapping](#mapping) +- [L2 Cache](#l2-cache) (persistent cache) ## Model Zoo | Model | Source | Status | Note | @@ -87,12 +88,17 @@ To download the latest Docker image and set up the environment, use the followin # Run the Docker container docker run -it --ipc=host --name torchsim -w /workspace/PyTorchSim ghcr.io/psal-postech/torchsim-ci:latest bash ``` +### Manual Setting (Optional) +This script provides building [Gem5](https://github.com/PSAL-POSTECH/gem5.git), [LLVM](https://github.com/PSAL-POSTECH/llvm-project.git), and [Spike](https://github.com/PSAL-POSTECH/riscv-isa-sim.git) simulator from source code for specific experts. +```bash +bash script/build_from_source.sh +``` ### Run Examples The `tests` directory contains several AI workloads examples. ```bash python tests/test_matmul.py ``` -The result is stored to `TORCHSIM_DUMP_PATH`/`hash`/backendsim_result/. The log file contains detailed core, memory, and interconnect stats. +The result is stored to `TORCHSIM_DUMP_PATH/hash/togsim_result/`. 
The log file contains detailed core, memory, and interconnect stats. ### Run Your Own Model on PyTorchSim You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device. @@ -125,9 +131,9 @@ Wrapper Codegen Path = /tmp/torchinductor_root/yd/cyda7nhzv5mtakfhfcxtmmhtsv6kg7 [Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/fy6nnyudtno/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/fy6nnyudtno/cycle_bin --vlane 128 [Gem5Simulator] Simulation is still running... [SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=0000000000010400:10846 --base-path=/tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/fy6nnyudtno/validation_binary /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg0_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg1_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/buf0/0.raw -[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 -[BackendSimulator] Simulation is still running.. 
-[BackendSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/backendsim_result/0" +[TOGSimulator] cmd> /root/workspace/PyTorchSim/TOGSim/build/bin/Simulator --config /root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 +[TOGSimulator] Simulation is still running.. +[TOGSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0" ---------------------------- |Matmul Forward Test Passed| ---------------------------- @@ -137,25 +143,25 @@ Simulation consists of three steps 1. `Gem5Simulator` obatins compute latency for TOG. 2. `SpikeSimulator` verifies the output code. -3. `BackendSimulator` simulates a NPU architecture. +3. `TOGSimulator` simulates a NPU architecture. If you want to turn off the `SpikeSimulator` for fast simulation, you can set as below. ```bash -export TORCHSIM_VALIDATION_MODE=False +export TORCHSIM_FUNCTIONAL_MODE=False ``` Log contains memory & core stats. 
```bash [info] HBM2-CH_0: avg BW utilization 37% (255 reads, 128 writes) [info] Row hits: 359, Row misses: 26, Row conflicts: 0 [info] ========= Core stat ========= -[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active cycle 0, idle cycle 1014 -[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active cycle 128, idle cycle 886 -[info] Core [0] : TMA active cycle 3 TMA idle cycle 1011 DRAM BW 182.000 GB/s (6144) -[info] Core [0] : Vector Unit Utilization(%) 4.34, active cycle 44, idle_cycle 0 -[info] Core [0] : Numa hit count : 0, Numa miss count : 0 -[info] Core [0] : Total cycle 1014 -[info] Total execution cycle: 1014 -[info] Simulation time: 0.039296 seconds +[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active_cycles 0, idle_cycles 1014 +[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active_cycles 128, idle_cycles 886 +[info] Core [0] : DMA active_cycles 3 DMA idle_cycles 1011 DRAM BW 182.000 GB/s (6144) +[info] Core [0] : Vector Unit Utilization(%) 4.34, active_cycles 44, idle_cycle 0 +[info] Core [0] : NUMA local memory: 34 requests, remote memory: 0 requests +[info] Core [0] : Total_cycles 1014 +[info] Total execution cycles: 1014 +[info] Wall-clock time for simulation: 0.039296 seconds ``` The log is dumped in `TORCHSIM_DUMP_PATH` and you can set the path as below. ```bash @@ -175,61 +181,96 @@ opt_step() `tests/test_mlp.py` provides an example of MLP training. ## Multi-tenancy -Load generator supports multi-tenancy experiments. You can simply run `tests/test_scheduler.py` +Our load generator supports multi-tenancy experiments. You can run a simple example by executing `tests/test_scheduler.py`. ```bash python tests/test_scheduler.py ``` -Below is an example code of multi-tenancy -`target_model1` and `target_model2` is your own PyTorch model. -You can set the request arrival time and request queue index. 
Request queue is used for scheduling and you can set the number of queue to each core in [TOGSim configuration](#togsim-configuration) -```python -# Init scheduler -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`. +In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.json`). The compiled PyTorch models are then registered with a unique model id. + +```python3 +import os +import sys +import torch +from torchvision.models import resnet18 +from test_transformer import EncoderBlock +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' + +sys.path.append(base_path) +from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) + # Register compiled model -opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) -opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) -SchedulerDNNModel.register_model("resnet18", opt_model1) -SchedulerDNNModel.register_model("bert", opt_model2) - -# Init input data -model_input1 = torch.randn(1, 3, 224, 224) -model_input2 = torch.randn(128, 768) - -# Init request -new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request2 = Request("bert", [model_input2], [], request_queue_idx=1) -new_request3 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request4 = Request("bert", [model_input2], [], request_queue_idx=1) - -# Add request to scheduler -scheduler.add_request(new_request1, request_time=0) -scheduler.add_request(new_request2, 
request_time=0) -scheduler.add_request(new_request3, request_time=0) -scheduler.add_request(new_request4, request_time=0) +target_model0 = resnet18().eval() +target_model1 = EncoderBlock(768, 12).eval() +opt_model0 = torch.compile(target_model0.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) +opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device())) +SchedulerDNNModel.register_model("model0", opt_model0) +SchedulerDNNModel.register_model("model1", opt_model1) +``` + +The config file(`.json`) specifies two key items: +- `num_partition`: The total number of independent request queues to create. +- `partition`: Defines the hardware mapping, assigning each queue (identified by its index) to a specific physical core. +For example, the configuration below creates two scheduling queues (`0` and `1`) and maps `core_0` to queue `0` and `core_1` to queue `1`: +``` + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +``` +Next, DNN model requests are generated and submitted. We provide a `poisson_request_generator` utility, which generates request arrival times. +Each `Request` is created with its model name, data, and a request_queue_idx to specify its target queue, then added via `scheduler.add_request`. +As shown in the code, `model0` requests are queued to `request_queue_idx=0`, while `model1` requests are queued to `request_queue_idx=1`. 
+```python3 +# Load Generation +model0_lambda = 5.0 +model1_lambda = 3.0 +max_time = 1000.0 # [s] + +# Generate Possion distribution requests for model0 +for model0_request_time in poisson_request_generator(model0_lambda, total_time=max_time): + x = torch.randn(1, 3, 224, 224) + new_request = Request("model0", [x], [], request_queue_idx=0) + scheduler.add_request(new_request, request_time=model0_request_time) + +# Generate Possion distribution requests for model1 +for model1_request_time in poisson_request_generator(model1_lambda, total_time=max_time): + x = torch.randn(128, 768) + new_request = Request("model1", [x], [], request_queue_idx=1) + scheduler.add_request(new_request, request_time=model1_request_time) +``` + +Finally, `scheduler.schedule()` is called in a loop until all requests are processed. +```python3 # Run scheduler while not scheduler.is_finished(): scheduler.schedule() ``` + ## Compiler Optimizations -PyTorchSim compiler supports fusions +PyTorchSim compiler supports several fusion optimizations: - GEMM prologue fusion - GEMM epilogue fusion - GEMM reduction fusion - CONV epilogue fusion -Depending on tensor shape, use different convolution template +Depending on tensor shape, use different convolution template: - Single batch optimization - Multi-channel optimization ## Mapping -PyTorchSim provids three mapping strategies +PyTorchSim provides three mapping strategies. ### Heuristic-based mapping We adopt and modified heuristic-based mapping of [GEMMINI](https://github.com/ucb-bar/gemmini) by default, which maximizes the utilization of scratchpad memory. ### Auto-tuning Heuristic method is not optimal for some cases. PyTorchSim provides auto-tuning to find best mapping for GEMM, CONV, and vector operations. It reduces searching space by sorting of scratchpad memory utilization and pick top-k candiates. Searching parameters are tile shape and vector lane stride. 
```bash export AUTOTUNE=True +export AUTOTUNE_TEMPLATE=True ``` ### Manunal setting User can exploit third-party(e.g. Timeloop) mapping. Set the cheatsheet path and write down their own mapping. @@ -264,8 +305,27 @@ export TORCHSIM_TILE_M=512 export TORCHSIM_TILE_N=512 export TORCHSIM_TILE_K=512 ``` +## L2 Cache +It supports L2 cache as persistent cache. User can provide software-managed allocation/eviction strategy for tensors with persistent cache. + +Common Memory (CMEM) is a new feature introduced in the latest TPUs (newer than TPUv3). Multiple cores share this memory, which provides high bandwidth. Reusable tensors are stored and loaded from CMEM to avoid off-chip traffic. Our L2 cache can work like as CMEM + +To allocate a tensor in L2 cache, set the environment variable as shown below. The `tpuv4` directory provides example plans for L2 cache obtained from TPUv4 profiling. +```bash +export SRAM_BUFFER_PLAN_PATH=tpuv4/gemm_plan.py +``` +The L2 cache strategy file is composed as follows: +``` +plan = { + "arg0_1" +} +``` +In this example, only one input tensor is registered in L2 cache. You can refer to the tensor name from the wrapper code. After running the code, you can find the wrapper codegen path in the [result](#result) section. + +Last but not least, you must set `l2d_type` and `l2d_config` in the [TOGSim config](#togsim-configuration) to use L2 cache. The `l2d_config` follows the same configuration method as [AccelSim](https://github.com/accel-sim/accel-sim-framework). + ## Compiler Configuration -`PyTorchSimFrontend/extension_config.py` contains target hardware configuration to compile +`PyTorchSimFrontend/extension_config.py` contains target hardware configuration to compile. You can configure these options using environment variables. 
```bash @@ -284,23 +344,27 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ## TOGSim Configuration ![NPU_Core](./docs/npu_core.jpg) -`PyTorchSimBackend/configs` directory contains example NPU configuration files in the JSON format. +`TOGSim/configs` directory contains example NPU configuration files in the JSON format. ``` "num_cores" : 2, // Number of NPU cores - "core_freq" : 940, // Core's frequency (MHz) + "core_freq_mhz" : 940, // Core's frequency (MHz) "num_systolic_array_per_core" : 2, // Number of systolic array per core "dram_type" : "ramulator2", // DRAM type (ex. ramulator2, simple) - "dram_freq" : 940, // DRAM frequency (MHz) + "dram_freq_mhz" : 940, // DRAM frequency (MHz) "dram_channels": 32, // Number of DRAM channels "dram_req_size": 32, // DRAM request size (B) "dram_latency" : 10, // DRAM latency (cycle) "dram_nbl" : 2, // DRAM burst length size "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", // Ramulator2 config file path - "icnt_type" : "simple", // Interconnect type (ex. booksim, simple) - "icnt_latency" : 7, // Interconnect latency (cycle) - "icnt_freq" : 28000, // Interconnect frequency (MHz) + "l2d_type" : "datacache", + "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", // Interconnect type (ex. booksim, simple) + "icnt_latency" : 7, // Interconnect latency (cycle) + "icnt_freq_mhz" : 940, // Interconnect frequency (MHz) + "icnt_injection_ports_per_core" : 16 // Interconnect injection ports per core "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", // Booksim2 config file path "precision" : 4, // Element's precision in tensor (Byte) @@ -313,7 +377,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ``` You can set TOGSim config path as below. 
```bash -export TORCHSIM_CONFIG=/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json ``` ## Future Works Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon. @@ -346,11 +410,10 @@ If you use PyTorchSim for your research, please cite the following paper. @INPROCEEDINGS{yang2025pytorchsim, author={Yang, Wonhyuk and Shin, Yunseon and Woo, Okkyun and Park, Geonwoo and Ham, Hyungkyu and Kang, Jeehoon and Park, Jongse and Kim, Gwangsun}, title={PyTorchSim: A Comprehensive, Fast, and Accurate NPU Simulation Framework}, - booktitle={2025 58th IEEE/ACM International Symposium on Microarchitecture (MICRO)}, - volume={}, - number={}, - pages={}, + booktitle={Proceedings of the 58th IEEE/ACM International Symposium on Microarchitecture}, + pages={1363–1380}, year={2025}, - doi={10.1145/3725843.3756045} + doi={10.1145/3725843.3756045}, + series={MICRO '25} } -``` \ No newline at end of file +``` diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 834698a6..0b633fa9 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -5,7 +5,7 @@ from pathlib import Path import importlib.util from PyTorchSimFrontend.extension_codecache import hash_prefix -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def import_module_from_path(module_name, path): @@ -140,11 +140,11 @@ def __str__(self): def register_model(model_name : str, compiled_model): SchedulerDNNModel.MODEL_MAP[model_name] = compiled_model -class ExecutionEngine: +class PyTorchSimRunner: PARTITION_BUSY = 0 PARTITION_IDLE = 1 SELECT_NOTHING = 2 - def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: + def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: self.module = self.setup_device() 
self.num_partion = num_partion self.launch_model_dicts = [] @@ -156,11 +156,11 @@ def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: self.partition_state.append(self.PARTITION_IDLE) self.finish_req_dict = {} - self.backend_simulator = backend_simulator + self.tog_simulator = tog_simulator # Dry run for compile and create generator - os.environ["BACKENDSIM_DRYRUN"] = "1" - os.environ["BACKENDSIM_EAGER_MODE"] = "1" + os.environ["TOGSIM_DRYRUN"] = "1" + os.environ["TOGSIM_EAGER_MODE"] = "1" @staticmethod def setup_device(): @@ -171,7 +171,7 @@ def setup_device(): import torch.utils.cpp_extension module = torch.utils.cpp_extension.load( - name="extension_device", + name="npu", sources=[ str(source_file), ], @@ -179,7 +179,7 @@ def setup_device(): verbose=True, ) - torch.utils.rename_privateuse1_backend("extension_device") + torch.utils.rename_privateuse1_backend("npu") from torch._inductor.codegen.common import ( get_scheduling_for_device, get_wrapper_codegen_for_device, @@ -192,13 +192,13 @@ def setup_device(): MLIRScheduling ) register_backend_for_device( - "extension_device", MLIRScheduling, ExtensionWrapperCodegen + "npu", MLIRScheduling, ExtensionWrapperCodegen ) assert( - get_scheduling_for_device("extension_device") == MLIRScheduling + get_scheduling_for_device("npu") == MLIRScheduling ) assert( - get_wrapper_codegen_for_device("extension_device") + get_wrapper_codegen_for_device("npu") == ExtensionWrapperCodegen ) return module @@ -222,7 +222,7 @@ def is_all_idle(self): return all([self.is_partition_idle(i) for i in range(self.num_partion)]) def prepare_model(self, req_model: SchedulerDNNModel): - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "backend_result", req_model.model_name) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "togsim_result", req_model.model_name) os.makedirs(result_path, exist_ok=True) index = str(len(os.listdir(result_path))) @@ -244,7 +244,7 @@ def 
prepare_launch_kernel(self, kernel, inputs): onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - attribute_path = self.backend_simulator.create_attribute_file(attribute_path, inputs) + attribute_path = self.tog_simulator.create_attribute_file(attribute_path, inputs) return onnx_path, attribute_path def launch_kernel(self, current_cycle, partion_idx=0): @@ -260,11 +260,11 @@ def launch_kernel(self, current_cycle, partion_idx=0): else: onnx_path, attribute_path = kernel, inputs self.partition_state[partion_idx] = self.PARTITION_BUSY - return self.backend_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) + return self.tog_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) -class FIFOExecutionEngine(ExecutionEngine): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) +class FIFORunner(PyTorchSimRunner): + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) def select_kernel(self, partition_idx): while len(self.nested_launch_model_dicts[partition_idx]) or len(self.launch_model_dicts[partition_idx]): @@ -297,9 +297,9 @@ def select_kernel(self, partition_idx): # No proper kernel now return self.SELECT_NOTHING -class RRExecutionEngine(ExecutionEngine): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) +class RoundRobinRunner(PyTorchSimRunner): + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) self.next_pointer = None def select_kernel(self, partition_idx): @@ -347,7 +347,7 @@ class Scheduler: FIFO_ENGINE = 0 RR_ENGINE = 1 - def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, backend_config=extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) -> 
None: + def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG) -> None: self.current_cycle = 0 self.max_batch = max_batch self.num_request_queue = num_request_queue @@ -356,13 +356,13 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, self.request_queue.append([]) self.finish_queue : List[Request] = [] - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - self.backend_simulator = BackendSimulator(backend_path, backend_config) - self.backend_simulator.interactive_simulation() + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + self.tog_simulator = TOGSimulator(togsim_path, togsim_config) + self.tog_simulator.interactive_simulation() if engine_select == Scheduler.FIFO_ENGINE: - self.execution_engine = FIFOExecutionEngine(self.backend_simulator, self.num_request_queue) + self.execution_engine = FIFORunner(self.tog_simulator, self.num_request_queue) elif engine_select == Scheduler.RR_ENGINE: - self.execution_engine = RRExecutionEngine(self.backend_simulator, self.num_request_queue) + self.execution_engine = RoundRobinRunner(self.tog_simulator, self.num_request_queue) else: print(f"Not supporetd engine type {engine_select}") exit(1) @@ -469,8 +469,8 @@ def schedule(self): # Need to forward the time until next_arrival_time if self.execution_engine.is_all_idle(): - reason = self.backend_simulator.until(self.msec_to_cycle(next_time)) - self.current_cycle = self.backend_simulator.cycle() + reason = self.tog_simulator.until(self.msec_to_cycle(next_time)) + self.current_cycle = self.tog_simulator.cycle() else: self.run(next_time) return @@ -480,7 +480,7 @@ def run(self, until_time): def execute_cycle(): launch_ret_info = [] for i in range(self.execution_engine.num_partion): - if self.execution_engine.partition_state[i] == ExecutionEngine.PARTITION_IDLE: + if self.execution_engine.partition_state[i] == 
PyTorchSimRunner.PARTITION_IDLE: ret = self.execution_engine.launch_kernel(self.current_cycle, i) launch_ret_info.append(ret) @@ -490,12 +490,12 @@ def execute_cycle(): return [] # Schedule jobs and update the current time - result_list = self.backend_simulator.until(self.msec_to_cycle(until_time)) - self.current_cycle = self.backend_simulator.cycle() + result_list = self.tog_simulator.until(self.msec_to_cycle(until_time)) + self.current_cycle = self.tog_simulator.cycle() for core_idx in result_list: # Kernel is finished. So set idle state - self.execution_engine.partition_state[core_idx] = ExecutionEngine.PARTITION_IDLE + self.execution_engine.partition_state[core_idx] = PyTorchSimRunner.PARTITION_IDLE return result_list @@ -526,7 +526,7 @@ def is_request_queue_empty(self): def is_finished(self): if self.is_request_queue_empty() and self.execution_engine.is_all_idle(): - self.backend_simulator.wait() + self.tog_simulator.wait() return True return False @@ -534,7 +534,7 @@ def current_time(self): return self.cycle_to_msec(self.current_cycle) def cycle_to_msec(self, cycle): - freq = self.backend_simulator.get_core_freq() + freq = self.tog_simulator.get_core_freq() return cycle / (freq / 1000) def msec_to_cycle(self, msec): @@ -542,5 +542,5 @@ def msec_to_cycle(self, msec): if (msec == -1): return msec - freq = self.backend_simulator.get_core_freq() - return int(msec * (freq / 1000)) \ No newline at end of file + freq = self.tog_simulator.get_core_freq() + return int(msec * (freq / 1000)) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 81970cbe..c586c2fd 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -12,7 +12,7 @@ import torch import numpy as np -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs from PyTorchSimFrontend import extension_config TORCH_TO_NUMPY = { @@ -64,10 +64,10 @@ def dump_args(self, args, arg_attributes, load_path, dump_path): for 
(arg_name, arg_attribute), arg in zip(arg_attributes, args): size = arg_attribute[2] if arg_attribute[1] != torch.bool else (arg_attribute[2] + 7) // 8 array_size.append(size) - if LLVMKernelArgs.is_llvm_arg_in(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_in(arg_attribute[0]): index = self.write_arg(arg, load_path, arg_name) file_path.append(os.path.join(load_path, arg_name, f'{index}.raw')) - elif LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + elif MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): path = os.path.join(dump_path, arg_name) os.makedirs(path, exist_ok=True) file_path.append(os.path.join(path, f'{self.get_biggest_filename(path)}.raw')) @@ -101,15 +101,17 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True) os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True) run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}' - if not silent_mode: - print("[SpikeSimulator] cmd> ", run) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[Spike] cmd> ", run) + print("[Spike] Running Spike simulator") run_cmd = shlex.split(run) try: stdout_setting = subprocess.DEVNULL if silent_mode else None stderr_setting = subprocess.DEVNULL if silent_mode else None subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting) except subprocess.CalledProcessError as e: - print("[SpikeSimulator] Command failed with exit code", e.returncode) + if not silent_mode: + print("[Spike] Command failed with exit code", e.returncode) error_msg = "" if e.returncode == 200: error_msg = "INVALID_SPAD_ACCESS" @@ -120,7 +122,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= raise RuntimeError(f"{error_msg}") for (arg_name, arg_attribute), arg, path in zip(arg_attributes, args, 
file_path): - if LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): self.load_tensor(arg, arg_name, arg_attribute, path) if cleanup: @@ -154,7 +156,7 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[Gem5Simulator] Simulation is still running." + tail) + sys.stdout.write("\r[Gem5] Gem5 is running." + tail) time.sleep(1) print("") @@ -162,9 +164,10 @@ def show_progress(): gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)] try: # Create progress thread - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) or silent_mode + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) or silent_mode if not is_dryrun: - print("[Gem5Simulator] cmd> ", " ".join(gem5_cmd)) + if extension_config.CONFIG_DEBUG_MODE: + print("[Gem5] cmd> ", " ".join(gem5_cmd)) finished = False progress_thread = threading.Thread(target=show_progress) progress_thread.start() @@ -174,11 +177,11 @@ def show_progress(): else: output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) except subprocess.CalledProcessError as e: - print("[Gem5Simulator] Command failed with exit code", e.returncode) - print("[Gem5Simulator] Error output:", e.output) - finished = True - progress_thread.join() - assert(0) + print(f"[Gem5] Gem5 simulation failed with error: \"{e.output.decode()}\"") + if not is_dryrun: + finished = True + progress_thread.join() + raise RuntimeError(f"Gem5 Simulation Failed: \"{e.output.decode()}\"") with open(f"{dir_path}/stats.txt", "r") as stat_file: raw_list = stat_file.readlines() @@ -187,18 +190,18 @@ def show_progress(): cycle_list = cycle_list[:-1] return cycle_list -class BackendSimulator(): - BACKEND_RESULT_PATH_KEY = "BACKEND_RESULT_PATH" - FINISH_STR = "Simulation Finished" +class TOGSimulator(): + 
TOGSIM_RESULT_PATH_KEY = "TOGSIM_RESULT_PATH" + FINISH_STR = "Simulation finished" ALLOC_POOL = dict() # For eagermode buffer plan - def __init__(self, backend_path, config_path, vectorlane_size=-1) -> None: - self.base_dir = backend_path + def __init__(self, togsim_path, config_path, vectorlane_size=-1) -> None: + self.base_dir = togsim_path self.config_path = config_path self.config_json = self.load_json(self.config_path) self.process = None self.vectorlane_size = vectorlane_size - def get_backend_command(self): + def get_togsim_command(self): bin = os.path.join(self.base_dir, "build/bin/Simulator") config = os.path.join(self.base_dir, self.config_path) cmd = f"{bin} --config {config}" @@ -210,16 +213,16 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[BackendSimulator] Simulation is still running." + tail) + sys.stdout.write("\r[TOGSim] TOGSim is running." + tail) time.sleep(1) print("") - cmd = f"{self.get_backend_command()} --models_list {model_path}" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --models_list {model_path}" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" if attribute_path: cmd = f"{cmd} --attributes_list {attribute_path}" - if not silent_mode: - print("[BackendSimulator] cmd> ", cmd) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) # Create progress thread if not silent_mode: @@ -235,28 +238,26 @@ def show_progress(): if not silent_mode: finished = True progress_thread.join() - print("[BackendSimulator] Command failed with exit code", e.returncode) - print("[BackendSimulator] Error output:", e.output) + print("[TOGSim] Command failed with exit code", e.returncode) + print("[TOGSim] Error output:", e.output) assert 0 - result_path = 
extension_config.CONFIG_BACKEND_RESULT_PATH_KEY - if result_path is None: - result_path = os.path.join(os.path.dirname(model_path), "backendsim_result") - # Save result to result_path + result_path = os.path.join(os.path.dirname(model_path), "togsim_result") os.makedirs(result_path, exist_ok=True) file_name = str(len(os.listdir(result_path))) result_path = os.path.join(result_path, file_name) with open(result_path, "w") as f: f.write(result.decode()) - print(f'[BackendSimulator] Simulation of "{model_path}" is stored to "{result_path}"') + print(f'[TOGSim] Simulation of "{model_path}" is stored to "{result_path}"') return result_path def interactive_simulation(self): - cmd = f"{self.get_backend_command()} --mode interactive" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --mode interactive" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" - print("[BackendSimulator] cmd> ", cmd) + if extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) if self.process is None: self.process = subprocess.Popen( shlex.split(cmd), @@ -265,27 +266,27 @@ def interactive_simulation(self): universal_newlines=True ) else: - print("[BackendSimulator] Simulator is already running.") + print("[TOGSim] Simulator is already running.") def stop(self): if self.process: self.process.terminate() self.process.wait() self.process = None - print("[BackendSimulator] Simulator stopped.") + print("[TOGSim] Simulator stopped.") def wait(self): if self.process: - print("[BackendSimulator] Waiting for simulation to complete...") + print("[TOGSim] Waiting for simulation to complete...") self.quit() self.process.wait() self.process = None - print("[BackendSimulator] Simulation completed.") + print("[TOGSim] Simulation completed.") def send_command(self, command): if self.process: try: - if not 
extension_config.CONFIG_BACKENDSIM_DRYRUN: + if not extension_config.CONFIG_TOGSIM_DRYRUN: print(command, flush=True) self.process.stdin.write(command + '\n') self.process.stdin.flush() @@ -352,6 +353,8 @@ def create_attribute_file(self, attribute_path, inputs, **kwargs): with open(attribute_path, "w") as f: json.dump(json_content, f, indent=4) + f.flush() + os.fsync(f.fileno()) # There could be a race condition. return attribute_path def load_json(self, config_path): @@ -367,8 +370,8 @@ def load_json(self, config_path): raise ValueError(f"Invalid JSON format: {e}") def get_core_freq(self): - if "core_freq" in self.config_json: - return self.config_json["core_freq"] * 1000 * 1000 # MHz + if "core_freq_mhz" in self.config_json: + return self.config_json["core_freq_mhz"] * 1000 * 1000 # MHz else: raise KeyError("Key 'core_freq' not found in JSON.") @@ -403,13 +406,13 @@ def get_result_from_file(result_path): simulation_finished_idx = -1 simulation_finished = False for idx, line in enumerate(lines): - if BackendSimulator.FINISH_STR in line: + if TOGSimulator.FINISH_STR in line: simulation_finished = True simulation_finished_idx = idx break if simulation_finished_idx == -1: - print("[BackendSimulator] Treid to parsing wrong formated output file!") + print("[TOGSim] Tried to parsing wrong formated output file!") return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time total_stat_lines = lines[simulation_finished_idx:] @@ -440,15 +443,15 @@ def get_result_from_file(result_path): if 'DRAM: AVG BW Util' in line: avg_dram_bw = float(re.search(r'AVG BW Util (\d+\.?\d*)%', line).group(1)) - if 'Total execution cycle' in line: - total_cycle = int(re.search(r'Total execution cycle: (\d+)', line).group(1)) + if 'Total execution cycles' in line: + total_cycle = int(re.search(r'Total execution cycles: (\d+)', line).group(1)) # Parse total simulation time - if 'Simulation time' in line: - simulation_time = float(re.search(r'Simulation time: (\d+\.?\d*) seconds', 
line).group(1)) + if 'Wall-clock time for simulation' in line: + simulation_time = float(re.search(r'Wall-clock time for simulation: (\d+\.?\d*) seconds', line).group(1)) return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": - sim = BackendSimulator("/workspace/PyTorchSim/PyTorchSimBackend", "/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json") + sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") sim.interactive_simulation() sim.until(4000) \ No newline at end of file diff --git a/PyTorchSimBackend/CMakeLists.txt b/TOGSim/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/CMakeLists.txt rename to TOGSim/CMakeLists.txt diff --git a/PyTorchSimBackend/conanfile.txt b/TOGSim/conanfile.txt similarity index 100% rename from PyTorchSimBackend/conanfile.txt rename to TOGSim/conanfile.txt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet.icnt b/TOGSim/configs/booksim2_configs/anynet.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet.icnt rename to TOGSim/configs/booksim2_configs/anynet.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet_file b/TOGSim/configs/booksim2_configs/anynet_file similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet_file rename to TOGSim/configs/booksim2_configs/anynet_file diff --git a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt similarity index 75% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt index d18ff6e7..3102fecc 100644 --- a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt +++ b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt @@ -2,7 +2,7 @@ use_map = 0 
flit_size = 32 topology = anynet -network_file = /workspace/PyTorchSim/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net +network_file = /workspace/PyTorchSim/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net routing_function = min subnets = 1 routing_delay = 4 diff --git a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.net diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m16.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m16.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m1.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m1.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m2.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m2.icnt diff --git 
a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c2_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c2_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m4.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m4.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m2.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m2.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt 
b/TOGSim/configs/booksim2_configs/fly_c4_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py b/TOGSim/configs/booksim2_configs/make_anynet_topology.py similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py rename to TOGSim/configs/booksim2_configs/make_anynet_topology.py diff --git a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt rename to TOGSim/configs/booksim2_configs/mesh_sif-age.icnt diff --git 
a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt diff --git a/TOGSim/configs/heterogeneous_c2_simple_noc.json b/TOGSim/configs/heterogeneous_c2_simple_noc.json new file mode 100644 index 00000000..60f160a8 --- /dev/null +++ b/TOGSim/configs/heterogeneous_c2_simple_noc.json @@ -0,0 +1,29 @@ +{ + "core_type" : ["stonne", "ws_mesh"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 8, + "num_stonne_port" : 64, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml b/TOGSim/configs/ramulator2_configs/DDR4.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml rename to TOGSim/configs/ramulator2_configs/DDR4.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml b/TOGSim/configs/ramulator2_configs/HBM2.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml rename to TOGSim/configs/ramulator2_configs/HBM2.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml b/TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml similarity index 100% rename from 
PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml rename to TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg b/TOGSim/configs/ramulator_configs/ALDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/ALDRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg b/TOGSim/configs/ramulator_configs/DDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg rename to TOGSim/configs/ramulator_configs/DDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg b/TOGSim/configs/ramulator_configs/DDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg rename to TOGSim/configs/ramulator_configs/DDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg b/TOGSim/configs/ramulator_configs/DSARP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg rename to TOGSim/configs/ramulator_configs/DSARP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg b/TOGSim/configs/ramulator_configs/GDDR5-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg rename to TOGSim/configs/ramulator_configs/GDDR5-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg b/TOGSim/configs/ramulator_configs/HBM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg rename to TOGSim/configs/ramulator_configs/HBM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg b/TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg similarity index 100% rename from 
PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg rename to 
TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg b/TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg b/TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg b/TOGSim/configs/ramulator_configs/PCM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg rename to TOGSim/configs/ramulator_configs/PCM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg b/TOGSim/configs/ramulator_configs/SALP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg rename to TOGSim/configs/ramulator_configs/SALP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg b/TOGSim/configs/ramulator_configs/STTMRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg rename 
to TOGSim/configs/ramulator_configs/STTMRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg b/TOGSim/configs/ramulator_configs/TLDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/TLDRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg b/TOGSim/configs/ramulator_configs/WideIO-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg b/TOGSim/configs/ramulator_configs/WideIO2-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO2-config.cfg diff --git a/TOGSim/configs/stonne_big_c1_simple_noc.json b/TOGSim/configs/stonne_big_c1_simple_noc.json new file mode 100644 index 00000000..5d563fbe --- /dev/null +++ b/TOGSim/configs/stonne_big_c1_simple_noc.json @@ -0,0 +1,22 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 8, + "num_stonne_port" : 64, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycless": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/stonne_single_c1_simple_noc.json b/TOGSim/configs/stonne_single_c1_simple_noc.json new file mode 100644 index 00000000..304e84b3 --- 
/dev/null +++ b/TOGSim/configs/stonne_single_c1_simple_noc.json @@ -0,0 +1,22 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 1, + "num_stonne_port" : 8, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 700, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 8 +} \ No newline at end of file diff --git a/TOGSim/configs/stonne_validation_c1_simple_noc.json b/TOGSim/configs/stonne_validation_c1_simple_noc.json new file mode 100644 index 00000000..38d4244c --- /dev/null +++ b/TOGSim/configs/stonne_validation_c1_simple_noc.json @@ -0,0 +1,23 @@ +{ + "core_type" : ["stonne"], + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 1, + "num_stonne_port" : 32, + + "dram_type" : "simple", + "dram_freq_mhz" : 1000, + "dram_channels": 1, + "dram_req_size_byte": 32, + "dram_latency" : 100, + "dram_stats_print_period_cycles": 10000, + "l2d_type" : "datacache", + "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 8 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json new file mode 100644 index 00000000..58519aad --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json @@ -0,0 +1,19 @@ +{ + "num_cores" 
: 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :700, + "dram_channels": 16, + "dram_req_size_byte": 32, + + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json new file mode 100644 index 00000000..1257891c --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json @@ -0,0 +1,18 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycless": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json new file mode 100644 index 00000000..b92d8029 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + 
"icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json new file mode 100644 index 00000000..34896fc7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 8, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json new file mode 100644 index 00000000..59be9fd4 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1050, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 4, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :1200, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "l2d_type" : "datacache", + "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1050, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json new file mode 100644 index 
00000000..271e7e1c --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt" +} diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json similarity index 70% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json rename to TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json index d51e9c5f..7382c4c8 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json +++ b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json @@ -1,26 +1,25 @@ { "num_cores" : 2, - "core_freq" : 940, + "core_freq_mhz" : 940, "sram_size" : 65536, "core_print_interval" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", "dram_freq" : 940, - "dram_channels": 32, + "dram_channels": 8, "dram_req_size": 32, "dram_latency" : 10, - "dram_size" : 32, "dram_nbl" : 2, "dram_print_interval": 10000, "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - + "icnt_type" : "booksim2", - "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt", - + "icnt_latency" : 1, + "icnt_freq" : 940, + "icnt_injection_ports_per_core" : 16, + "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m8.icnt", + "precision" : 4, "scheduler" : "simple", 
"num_partition" : 2, diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json new file mode 100644 index 00000000..6561ffc0 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "dram_num_partitions" : 2, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", + "icnt_stats_print_period_cycles" : 10000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json new file mode 100644 index 00000000..fad63cc3 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json @@ -0,0 +1,20 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "dram_num_partitions" : 1, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 1000, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json new 
file mode 100644 index 00000000..2207f2b9 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json @@ -0,0 +1,18 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json new file mode 100644 index 00000000..76f51b40 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json @@ -0,0 +1,19 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json new file mode 100644 index 00000000..42e003c7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json @@ -0,0 +1,25 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 32, + "dram_req_size_byte": 32, + 
"dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +} diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json new file mode 100644 index 00000000..44ec72fe --- /dev/null +++ b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json @@ -0,0 +1,21 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1050, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 4, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :1200, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "l2d_type" : "datacache", + "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", + + "icnt_type" : "simple", + "icnt_latency" : 7, + "icnt_freq_mhz" : 1050, + "icnt_injection_ports_per_core" : 16 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json new file mode 100644 index 00000000..045407b7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git 
a/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json new file mode 100644 index 00000000..d8f95d70 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 2, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json new file mode 100644 index 00000000..a5fa9585 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 4, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_booksim.json b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json new file mode 100644 index 00000000..cf560171 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + 
"icnt_type" : "booksim2", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json new file mode 100644 index 00000000..8da61d72 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json new file mode 100644 index 00000000..c5f429f9 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json @@ -0,0 +1,18 @@ +{ + "core_type" : ["ws_mesh","ws_mesh"], + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json new file mode 100644 index 00000000..254520be --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 2, + 
"dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json new file mode 100644 index 00000000..e39867a7 --- /dev/null +++ b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json @@ -0,0 +1,17 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 1000, + "core_stats_print_period_cycles" : 100000, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 4, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency" : 1, + "icnt_freq_mhz" : 1000 +} \ No newline at end of file diff --git a/PyTorchSimBackend/extern/booksim b/TOGSim/extern/booksim similarity index 100% rename from PyTorchSimBackend/extern/booksim rename to TOGSim/extern/booksim diff --git a/PyTorchSimBackend/extern/onnx b/TOGSim/extern/onnx similarity index 100% rename from PyTorchSimBackend/extern/onnx rename to TOGSim/extern/onnx diff --git a/PyTorchSimBackend/extern/protobuf b/TOGSim/extern/protobuf similarity index 100% rename from PyTorchSimBackend/extern/protobuf rename to TOGSim/extern/protobuf diff --git a/PyTorchSimBackend/extern/ramulator2 b/TOGSim/extern/ramulator2 similarity index 100% rename from PyTorchSimBackend/extern/ramulator2 rename to TOGSim/extern/ramulator2 diff --git a/PyTorchSimBackend/extern/ramulator_custom/.gitignore b/TOGSim/extern/ramulator_custom/.gitignore similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/.gitignore rename to TOGSim/extern/ramulator_custom/.gitignore diff --git a/PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt 
b/TOGSim/extern/ramulator_custom/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt rename to TOGSim/extern/ramulator_custom/CMakeLists.txt diff --git a/PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp b/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp rename to TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp b/TOGSim/extern/ramulator_custom/src/Config.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp rename to TOGSim/extern/ramulator_custom/src/Config.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.h b/TOGSim/extern/ramulator_custom/src/Config.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.h rename to TOGSim/extern/ramulator_custom/src/Config.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Controller.h b/TOGSim/extern/ramulator_custom/src/Controller.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Controller.h rename to TOGSim/extern/ramulator_custom/src/Controller.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp b/TOGSim/extern/ramulator_custom/src/DDR4.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp rename to TOGSim/extern/ramulator_custom/src/DDR4.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h b/TOGSim/extern/ramulator_custom/src/DDR4.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h rename to TOGSim/extern/ramulator_custom/src/DDR4.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h b/TOGSim/extern/ramulator_custom/src/DRAM.h similarity index 100% rename from 
PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h rename to TOGSim/extern/ramulator_custom/src/DRAM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp b/TOGSim/extern/ramulator_custom/src/HBM.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp rename to TOGSim/extern/ramulator_custom/src/HBM.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.h b/TOGSim/extern/ramulator_custom/src/HBM.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.h rename to TOGSim/extern/ramulator_custom/src/HBM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Memory.h b/TOGSim/extern/ramulator_custom/src/Memory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Memory.h rename to TOGSim/extern/ramulator_custom/src/Memory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp b/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h b/TOGSim/extern/ramulator_custom/src/MemoryFactory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp b/TOGSim/extern/ramulator_custom/src/Ramulator.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp rename to TOGSim/extern/ramulator_custom/src/Ramulator.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp b/TOGSim/extern/ramulator_custom/src/Refresh.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp rename to TOGSim/extern/ramulator_custom/src/Refresh.cpp diff --git 
a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h b/TOGSim/extern/ramulator_custom/src/Refresh.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h rename to TOGSim/extern/ramulator_custom/src/Refresh.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp b/TOGSim/extern/ramulator_custom/src/Request.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp rename to TOGSim/extern/ramulator_custom/src/Request.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.h b/TOGSim/extern/ramulator_custom/src/Request.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.h rename to TOGSim/extern/ramulator_custom/src/Request.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h b/TOGSim/extern/ramulator_custom/src/Scheduler.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h rename to TOGSim/extern/ramulator_custom/src/Scheduler.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h b/TOGSim/extern/ramulator_custom/src/SpeedyController.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h rename to TOGSim/extern/ramulator_custom/src/SpeedyController.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp b/TOGSim/extern/ramulator_custom/src/StatType.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp rename to TOGSim/extern/ramulator_custom/src/StatType.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.h b/TOGSim/extern/ramulator_custom/src/StatType.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.h rename to TOGSim/extern/ramulator_custom/src/StatType.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h 
b/TOGSim/extern/ramulator_custom/src/Statistics.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h rename to TOGSim/extern/ramulator_custom/src/Statistics.h diff --git a/PyTorchSimBackend/extern/stonneCore b/TOGSim/extern/stonneCore similarity index 100% rename from PyTorchSimBackend/extern/stonneCore rename to TOGSim/extern/stonneCore diff --git a/PyTorchSimBackend/include/Cache.h b/TOGSim/include/Cache.h similarity index 100% rename from PyTorchSimBackend/include/Cache.h rename to TOGSim/include/Cache.h diff --git a/PyTorchSimBackend/include/Cache_defs.h b/TOGSim/include/Cache_defs.h similarity index 100% rename from PyTorchSimBackend/include/Cache_defs.h rename to TOGSim/include/Cache_defs.h diff --git a/PyTorchSimBackend/include/Cache_stats.h b/TOGSim/include/Cache_stats.h similarity index 100% rename from PyTorchSimBackend/include/Cache_stats.h rename to TOGSim/include/Cache_stats.h diff --git a/PyTorchSimBackend/include/Common.h b/TOGSim/include/Common.h similarity index 100% rename from PyTorchSimBackend/include/Common.h rename to TOGSim/include/Common.h diff --git a/PyTorchSimBackend/include/Core.h b/TOGSim/include/Core.h similarity index 85% rename from PyTorchSimBackend/include/Core.h rename to TOGSim/include/Core.h index a3d55fa2..e4d2f30a 100644 --- a/PyTorchSimBackend/include/Core.h +++ b/TOGSim/include/Core.h @@ -9,7 +9,7 @@ #include "Dram.h" #include "Tile.h" #include "SimulationConfig.h" -#include "TMA.h" +#include "DMA.h" class Core { public: @@ -27,9 +27,9 @@ class Core { virtual void pop_memory_request(); virtual mem_fetch* top_memory_request() { return _request_queue.front(); } virtual void push_memory_response(mem_fetch* response); - void check_tag() { _tma.check_table(); } - void inc_numa_hit() { _stat_numa_hit++; } - void inc_numa_miss() { _stat_numa_miss++; } + void check_tag() { _dma.check_table(); } + void inc_numa_local_access() { _stat_numa_local_access++; } + void inc_numa_remote_access() { 
_stat_numa_remote_access++; } std::queue>& get_compute_pipeline(int compute_type); enum { @@ -50,20 +50,18 @@ class Core { /* Core id & config file */ const uint32_t _id; const SimulationConfig _config; - size_t _sram_size; - size_t _used_sram_size; uint32_t _num_systolic_array_per_core; uint32_t _systolic_array_rr = 0; - /* TMA Unit */ - TMA _tma; + /* DMA Unit */ + DMA _dma; /* cycle */ cycle_type _core_cycle; cycle_type _stat_tot_vu_compute_cycle = 0; std::vector _stat_tot_sa_compute_cycle; - cycle_type _stat_tot_tma_cycle = 0; - cycle_type _stat_tot_tma_idle_cycle = 0; + cycle_type _stat_tot_dma_cycle = 0; + cycle_type _stat_tot_dma_idle_cycle = 0; cycle_type _stat_tot_vu_compute_idle_cycle = 0; std::vector _stat_tot_sa_compute_idle_cycle; std::vector _stat_inst_count; @@ -71,13 +69,13 @@ class Core { uint64_t _stat_tot_mem_response = 0; uint64_t _stat_gemm_inst = 0; uint64_t _stat_skip_dma = 0; - uint64_t _stat_numa_hit = 0; - uint64_t _stat_numa_miss = 0; + uint64_t _stat_numa_local_access = 0; + uint64_t _stat_numa_remote_access = 0; cycle_type _stat_vu_compute_cycle = 0; std::vector _stat_sa_compute_cycle; - cycle_type _stat_tma_cycle = 0; - cycle_type _stat_tma_idle_cycle = 0; + cycle_type _stat_dma_cycle = 0; + cycle_type _stat_dma_idle_cycle = 0; cycle_type _stat_vu_compute_idle_cycle = 0; std::vector _stat_sa_compute_idle_cycle; uint64_t _stat_mem_response = 0; diff --git a/PyTorchSimBackend/include/TMA.h b/TOGSim/include/DMA.h similarity index 94% rename from PyTorchSimBackend/include/TMA.h rename to TOGSim/include/DMA.h index f8355470..2f41c6f3 100644 --- a/PyTorchSimBackend/include/TMA.h +++ b/TOGSim/include/DMA.h @@ -1,8 +1,9 @@ -#ifndef TMA_H -#define TMA_H +#ifndef DMA_H +#define DMA_H #include #include +#include #include #include #include "Instruction.h" @@ -16,9 +17,9 @@ struct VectorCompare { } }; -class TMA { +class DMA { public: - TMA(uint32_t id, uint32_t dram_req_size); + DMA(uint32_t id, uint32_t dram_req_size); void 
issue_tile(std::shared_ptr inst); bool is_finished() { return _finished; } @@ -114,7 +115,7 @@ class TMA { } std::shared_ptr& get_current_inst() { return _current_inst; } - std::shared_ptr> get_memory_access(); + std::shared_ptr> get_memory_access(cycle_type core_cycle, int nr_req); uint32_t generate_mem_access_id(); const uint32_t get_max_dim() { return _max_dim; } @@ -130,5 +131,7 @@ class TMA { bool _finished=true; std::map, uint32_t>> tag_table; std::map, std::vector>>> waiters; + std::queue _pending_accesses; + bool _generated_once = false; }; #endif \ No newline at end of file diff --git a/PyTorchSimBackend/include/DelayQueue.h b/TOGSim/include/DelayQueue.h similarity index 100% rename from PyTorchSimBackend/include/DelayQueue.h rename to TOGSim/include/DelayQueue.h diff --git a/PyTorchSimBackend/include/Dram.h b/TOGSim/include/Dram.h similarity index 99% rename from PyTorchSimBackend/include/Dram.h rename to TOGSim/include/Dram.h index 5e51b96d..d28ac25f 100644 --- a/PyTorchSimBackend/include/Dram.h +++ b/TOGSim/include/Dram.h @@ -6,7 +6,7 @@ #include #include "Common.h" -#include "TMA.h" +#include "DMA.h" #include "ramulator2.hh" #include "Hashing.h" #include "Cache.h" diff --git a/PyTorchSimBackend/include/Hashing.h b/TOGSim/include/Hashing.h similarity index 100% rename from PyTorchSimBackend/include/Hashing.h rename to TOGSim/include/Hashing.h diff --git a/PyTorchSimBackend/include/Instruction.h b/TOGSim/include/Instruction.h similarity index 96% rename from PyTorchSimBackend/include/Instruction.h rename to TOGSim/include/Instruction.h index 4c14dd81..9fad13f4 100644 --- a/PyTorchSimBackend/include/Instruction.h +++ b/TOGSim/include/Instruction.h @@ -60,9 +60,7 @@ class Instruction : public std::enable_shared_from_this { std::vector get_trace_address() { return _trace_address; } bool load_indirect_index(const std::string& path, uint64_t*& indirect_index, const std::vector& tile_size); void set_trace_address(std::vector& trace_address) { _trace_address = 
trace_address; } - size_t get_free_sram_size() { return _free_sram_size; } addr_type get_base_dram_address() { return dram_addr; } - void set_free_sram_size(size_t sram_size) { _free_sram_size=sram_size; } void* get_owner() { return _owner; } void set_owner(void *owner) { _owner = owner;} void set_owner_ready_queue(std::list>* q) { _owner_ready_queue_ref = q; } @@ -103,7 +101,6 @@ class Instruction : public std::enable_shared_from_this { size_t _tile_numel; size_t _nr_waiting_request=0; size_t _precision=0; - size_t _free_sram_size=0; addr_type dram_addr; uint32_t _numa_id = 0; // For DMA instruction int _compute_type = 0; diff --git a/PyTorchSimBackend/include/Interconnect.h b/TOGSim/include/Interconnect.h similarity index 95% rename from PyTorchSimBackend/include/Interconnect.h rename to TOGSim/include/Interconnect.h index 8467b7aa..e6b325d0 100644 --- a/PyTorchSimBackend/include/Interconnect.h +++ b/TOGSim/include/Interconnect.h @@ -1,6 +1,6 @@ #ifndef INTERCONNECT_H #define INTERCONNECT_H -#include "TMA.h" +#include "DMA.h" #include "booksim2/Interconnect.hpp" #include #include @@ -51,8 +51,9 @@ class SimpleInterconnect : public Interconnect { mem_fetch* access; }; - std::vector> _in_buffers; + std::vector>> _in_buffers; std::vector> _out_buffers; + std::vector _rr_next_src; std::vector _busy_node; }; diff --git a/PyTorchSimBackend/include/IntervalTree.h b/TOGSim/include/IntervalTree.h similarity index 100% rename from PyTorchSimBackend/include/IntervalTree.h rename to TOGSim/include/IntervalTree.h diff --git a/PyTorchSimBackend/include/L2Cache.h b/TOGSim/include/L2Cache.h similarity index 100% rename from PyTorchSimBackend/include/L2Cache.h rename to TOGSim/include/L2Cache.h diff --git a/PyTorchSimBackend/include/Memfetch.h b/TOGSim/include/Memfetch.h similarity index 100% rename from PyTorchSimBackend/include/Memfetch.h rename to TOGSim/include/Memfetch.h diff --git a/PyTorchSimBackend/include/Model.h b/TOGSim/include/Model.h similarity index 100% rename from 
PyTorchSimBackend/include/Model.h rename to TOGSim/include/Model.h diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h similarity index 80% rename from PyTorchSimBackend/include/SimulationConfig.h rename to TOGSim/include/SimulationConfig.h index 8f011d00..64cfa223 100644 --- a/PyTorchSimBackend/include/SimulationConfig.h +++ b/TOGSim/include/SimulationConfig.h @@ -18,8 +18,7 @@ struct SimulationConfig { std::vector core_type; std::string stonne_config_path; uint32_t num_cores; - uint32_t core_freq; - uint32_t sram_size; + uint32_t core_freq_mhz; uint32_t core_print_interval = 0; uint32_t num_systolic_array_per_core = 1; uint32_t num_stonne_per_core = 1; @@ -28,7 +27,8 @@ struct SimulationConfig { /* DRAM config */ DramType dram_type; uint32_t dram_num_partitions = 1; - uint32_t dram_freq; + uint32_t dram_channels_per_partitions = 0; + uint32_t dram_freq_mhz; uint32_t dram_channels; uint32_t dram_req_size; uint32_t dram_latency; @@ -43,21 +43,20 @@ struct SimulationConfig { /* ICNT config */ IcntType icnt_type; - uint32_t icnt_node_per_core = 1; + uint32_t icnt_injection_ports_per_core = 1; std::string icnt_config_path; - uint32_t icnt_freq; + uint32_t icnt_freq_mhz; uint32_t icnt_latency; - uint32_t icnt_print_interval=0; + uint32_t icnt_stats_print_period_cycles=0; /* Sheduler config */ - uint32_t num_patition=1; + uint32_t num_partition=1; std::string scheduler_type; /* Core id, Partiton id mapping */ std::map partiton_map; /* Other configs */ - uint32_t precision; std::string layout; uint64_t align_address(uint64_t addr) { @@ -65,6 +64,6 @@ struct SimulationConfig { } float max_dram_bandwidth() { - return dram_freq * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s + return dram_freq_mhz * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s } }; \ No newline at end of file diff --git a/PyTorchSimBackend/include/Simulator.h b/TOGSim/include/Simulator.h similarity index 100% rename from 
PyTorchSimBackend/include/Simulator.h rename to TOGSim/include/Simulator.h diff --git a/PyTorchSimBackend/include/SparseCore.h b/TOGSim/include/SparseCore.h similarity index 100% rename from PyTorchSimBackend/include/SparseCore.h rename to TOGSim/include/SparseCore.h diff --git a/PyTorchSimBackend/include/Tile.h b/TOGSim/include/Tile.h similarity index 100% rename from PyTorchSimBackend/include/Tile.h rename to TOGSim/include/Tile.h diff --git a/PyTorchSimBackend/include/TileGraph.h b/TOGSim/include/TileGraph.h similarity index 100% rename from PyTorchSimBackend/include/TileGraph.h rename to TOGSim/include/TileGraph.h diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h similarity index 100% rename from PyTorchSimBackend/include/TileGraphParser.h rename to TOGSim/include/TileGraphParser.h diff --git a/PyTorchSimBackend/include/scheduler/Scheduler.h b/TOGSim/include/scheduler/Scheduler.h similarity index 100% rename from PyTorchSimBackend/include/scheduler/Scheduler.h rename to TOGSim/include/scheduler/Scheduler.h diff --git a/PyTorchSimBackend/src/CMakeLists.txt b/TOGSim/src/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/src/CMakeLists.txt rename to TOGSim/src/CMakeLists.txt diff --git a/PyTorchSimBackend/src/Cache.cc b/TOGSim/src/Cache.cc similarity index 100% rename from PyTorchSimBackend/src/Cache.cc rename to TOGSim/src/Cache.cc diff --git a/PyTorchSimBackend/src/Cache_stats.cc b/TOGSim/src/Cache_stats.cc similarity index 100% rename from PyTorchSimBackend/src/Cache_stats.cc rename to TOGSim/src/Cache_stats.cc diff --git a/PyTorchSimBackend/src/Common.cc b/TOGSim/src/Common.cc similarity index 73% rename from PyTorchSimBackend/src/Common.cc rename to TOGSim/src/Common.cc index 5581f8bd..b5c092b3 100644 --- a/PyTorchSimBackend/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -39,15 +39,14 @@ SimulationConfig initialize_config(json config) { for (int i=0; i(config, "core_print_interval"); + 
parsed_config.core_print_interval = get_config_value(config, "core_stats_print_period_cycles"); /* Stonne config */ if (config.contains("stonne_config_path")) @@ -63,20 +62,27 @@ SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented dram type {} ", (std::string)config["dram_type"])); - parsed_config.dram_freq = config["dram_freq"]; + parsed_config.dram_freq_mhz = config["dram_freq_mhz"]; if (config.contains("dram_latency")) parsed_config.dram_latency = config["dram_latency"]; - if (config.contains("dram_config_path")) - parsed_config.dram_config_path = config["dram_config_path"]; + if (config.contains("ramulator_config_path")) + parsed_config.dram_config_path = config["ramulator_config_path"]; parsed_config.dram_channels = config["dram_channels"]; - if (config.contains("dram_req_size")) - parsed_config.dram_req_size = config["dram_req_size"]; - if (config.contains("dram_print_interval")) - parsed_config.dram_print_interval = config["dram_print_interval"]; - if(config.contains("dram_nbl")) - parsed_config.dram_nbl = config["dram_nbl"]; - if (config.contains("dram_num_partitions")) + if (config.contains("dram_req_size_byte")) + parsed_config.dram_req_size = config["dram_req_size_byte"]; + if (config.contains("dram_stats_print_period_cycles")) + parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"]; + if(config.contains("dram_num_burst_length")) + parsed_config.dram_nbl = config["dram_num_burst_length"]; + if (config.contains("dram_num_partitions")) { parsed_config.dram_num_partitions = config["dram_num_partitions"]; + if (parsed_config.dram_channels % parsed_config.dram_num_partitions != 0) { + throw std::runtime_error("[Config] DRAM channels must be divisible by dram_num_partitions"); + } + } + parsed_config.dram_channels_per_partitions = + parsed_config.dram_channels / parsed_config.dram_num_partitions; + /* L2D config */ if (config.contains("l2d_type")) { @@ -104,19 +110,20 @@ 
SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented icnt type {} ", (std::string)config["icnt_type"])); - parsed_config.icnt_freq = config["icnt_freq"]; + parsed_config.icnt_freq_mhz = config["icnt_freq_mhz"]; if (config.contains("icnt_latency")) parsed_config.icnt_latency = config["icnt_latency"]; - if (config.contains("icnt_config_path")) - parsed_config.icnt_config_path = config["icnt_config_path"]; - if (config.contains("icnt_print_interval")) - parsed_config.icnt_print_interval = config["icnt_print_interval"]; - if (config.contains("icnt_node_per_core")) - parsed_config.icnt_node_per_core = config["icnt_node_per_core"]; + if (config.contains("booksim_config_path")) + parsed_config.icnt_config_path = config["booksim_config_path"]; + if (config.contains("icnt_stats_print_period_cycles")) + parsed_config.icnt_stats_print_period_cycles = config["icnt_stats_print_period_cycles"]; + if (config.contains("icnt_injection_ports_per_core")) + parsed_config.icnt_injection_ports_per_core = config["icnt_injection_ports_per_core"]; - parsed_config.scheduler_type = config["scheduler"]; + if (config.contains("scheduler")) + parsed_config.scheduler_type = config["scheduler"]; if (config.contains("num_partition")) - parsed_config.num_patition = config["num_partition"]; + parsed_config.num_partition = config["num_partition"]; if (config.contains("partition")) { for (int i=0; i& op) { void Core::issue(std::shared_ptr op) { if (op->get_instructions().size()){ - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}, Free size: {}", - _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size(), - op->get_instructions().back()->get_free_sram_size()); - } else { - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}", - _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size()); + spdlog::trace("[{}][Core {}][TILE_SCHEDULED]", + 
_core_cycle, _id); } - //_used_sram_size += op->get_required_sram_size(); for (const auto& inst : op->get_instructions()) { if (inst->is_ready()) op->enqueue_ready(inst); @@ -125,39 +118,38 @@ void Core::dma_cycle() { /* Set tag table of async dma load */ if (instruction->is_dma_read() && instruction->is_async_dma()) { auto& key = instruction->get_tag_id(); - assert(!_tma.get_tag_finish(instruction->subgraph_id, key)); - _tma.set_tag_finish(instruction->subgraph_id, key); - spdlog::trace("[Core {}][{}] {} ASYNC FINISHED, Used sram: {}, Release sram: {}, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(instruction->get_opcode()), - _used_sram_size, instruction->get_free_sram_size(), + assert(!_dma.get_tag_finish(instruction->subgraph_id, key)); + _dma.set_tag_finish(instruction->subgraph_id, key); + spdlog::trace("[{}][Core {}] {} ASYNC FINISHED, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(instruction->get_opcode()), instruction->subgraph_id, instruction->get_addr_name(), fmt::format("[{}]", fmt::join(instruction->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", "))); - for (auto & wait_inst : _tma.get_tag_waiter(instruction->subgraph_id, key)) { - _tma.mark_tag_used(instruction->subgraph_id, key); + for (auto & wait_inst : _dma.get_tag_waiter(instruction->subgraph_id, key)) { + _dma.mark_tag_used(instruction->subgraph_id, key); finish_instruction(wait_inst); } } _dma_finished_queue.erase(_dma_finished_queue.begin()); } - if (_tma.is_finished()) { + if (_dma.is_finished()) { /* Finish instruction when it is DMA store */ - if (_tma.get_current_inst() != nullptr) { - std::shared_ptr finished_inst = std::move(_tma.get_current_inst()); + if (_dma.get_current_inst() != nullptr) { + std::shared_ptr finished_inst = 
std::move(_dma.get_current_inst()); if (finished_inst->is_dma_write()) { /* Only DMA write operation is finished! */ finish_instruction(finished_inst); } else if (finished_inst->is_dma_read() && finished_inst->is_async_dma()) { /* Register tag table for async dma load */ - _tma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); + _dma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); finish_instruction(finished_inst); } else if(!finished_inst->is_dma_read()) { - spdlog::error("[Core {}][{}] TMA instruction in not valid", _id, _core_cycle); + spdlog::error("[{}][Core {}] DMA instruction is not valid", _core_cycle, _id); exit(EXIT_FAILURE); } else if (finished_inst->get_opcode() == Opcode::BAR) { - spdlog::trace("[Core {}][{}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(), fmt::format("[{}]", fmt::join(finished_inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(finished_inst->get_tag_idx_list(), ", ")), @@ -170,27 +162,27 @@ void Core::dma_cycle() { /* Issue new DMA operation */ if (!_ld_inst_queue.empty()) { std::shared_ptr inst = _ld_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _ld_inst_queue.pop(); } else if (!_st_inst_queue.empty()) { std::shared_ptr inst = _st_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _st_inst_queue.pop(); } else { - /* TMA is idle */ - _stat_tma_idle_cycle++; + /* DMA is idle */ + _stat_dma_idle_cycle++; return; } } /* Generate memfetch */ - auto access_vec = _tma.get_memory_access(); + auto access_vec = _dma.get_memory_access(_core_cycle, _config.icnt_injection_ports_per_core); for (auto access : *access_vec) { access->set_start_cycle(_core_cycle); _request_queue.push(access); } - /* Increase tma 
stat cycle */ - _stat_tma_cycle++; + /* Increase dma stat cycle */ + _stat_dma_cycle++; } void Core::cycle() { @@ -218,20 +210,20 @@ void Core::cycle() { /* Check another MOVIN with same tag is issued */ auto& key = inst->get_tag_id(); if (inst->is_sparse_inst()) { - _tma.register_tag(inst->subgraph_id, key); - _tma.set_tag_sparse(inst->subgraph_id, key); + _dma.register_tag(inst->subgraph_id, key); + _dma.set_tag_sparse(inst->subgraph_id, key); finish_instruction(inst); issued = true; _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; - } else if (inst->is_async_dma() && _tma.tag_key_exist(inst->subgraph_id, key)) { - bool finished = _tma.get_tag_finish(inst->subgraph_id, key); + } else if (inst->is_async_dma() && _dma.tag_key_exist(inst->subgraph_id, key)) { + bool finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished) finish_instruction(inst); else - _tma.register_tag_waiter(inst->subgraph_id, key, inst); - spdlog::trace("[Core {}][{}] {} SKIPPED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + _dma.register_tag_waiter(inst->subgraph_id, key, inst); + spdlog::trace("[{}][Core {}][SIKIPPED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -240,8 +232,8 @@ void Core::cycle() { _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; } else { - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + 
opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -252,8 +244,12 @@ void Core::cycle() { } } case Opcode::MOVOUT: - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), + inst->get_addr_name(), + fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); _st_inst_queue.push(inst); issued = true; break; @@ -269,13 +265,14 @@ void Core::cycle() { inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle; inst->bubble_cycle = bubble_cycle; } + if (inst->get_compute_cycle() == 0) { inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; instructions.erase(it); } else { - spdlog::trace("[Core {}][SA {}][{}] {}-{} ISSUED, finsh at {}", _id, _systolic_array_rr, _core_cycle, + spdlog::trace("[{}][Core {}][INST_ISSUED][SA {}] {}-{}, finsh at {}", _core_cycle, _id, _systolic_array_rr, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -288,7 +285,7 @@ void Core::cycle() { case Opcode::BAR: { auto& key = inst->get_tag_id(); - uint32_t finished = _tma.get_tag_finish(inst->subgraph_id, key); + uint32_t finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished == -1) { for (auto child_inst : inst->get_child_inst()) { if (child_inst->get_opcode() == Opcode::COMP && child_inst->get_compute_type() == MATMUL) { @@ -297,12 +294,12 @@ 
void Core::cycle() { } finish_instruction(inst); } else if (finished != 0) { - _tma.mark_tag_used(inst->subgraph_id, key); + _dma.mark_tag_used(inst->subgraph_id, key); finish_instruction(inst); } else { - _tma.register_tag_waiter(inst->subgraph_id, key, inst); + _dma.register_tag_waiter(inst->subgraph_id, key, inst); } - spdlog::trace("[Core {}][{}] {} ISSUED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -344,31 +341,26 @@ void Core::cycle() { } void Core::finish_instruction(std::shared_ptr& inst) { - size_t free_sram_size = inst->get_free_sram_size(); if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already finished!!", _id, _core_cycle, + spdlog::error("[{}][Core {}][ERROR] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::trace("[Core {}][{}] {}-{} FINISHED, Used sram: {}, Release sram: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), - _used_sram_size, inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}][INST_FINISHED] {}-{}", + _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_compute_type()); } else if (inst->get_opcode() != Opcode::BAR && inst->is_async_dma()){ - spdlog::trace("[Core {}][{}] {} ASYNC REGISTERED, Used sram: {}, Release sram: {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), _used_sram_size, - inst->get_free_sram_size(), 
inst->subgraph_id, inst->get_addr_name(), + spdlog::trace("[{}][Core {}][ASYNC] {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->subgraph_id, inst->get_addr_name(), inst->get_tag_id(), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); } else if ((inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) && !inst->is_async_dma()) { - spdlog::trace("[Core {}][{}] {} FINISHED, free_sram_size: {} addr_name: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), - inst->get_addr_name()); + spdlog::trace("[{}][Core {}][INST_FINISHED] {} addr_name: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name()); } - //_used_sram_size -= free_sram_size; } bool Core::running() { @@ -378,7 +370,7 @@ bool Core::running() { for (int i=0; i<_num_systolic_array_per_core;i++) running = running || !_sa_compute_pipeline.at(i).empty(); running = running || !_dma_waiting_queue.empty() || !_dma_finished_queue.empty(); - running = running || !_tma.empty(); + running = running || !_dma.empty(); running = running || !_ld_inst_queue.empty(); running = running || !_st_inst_queue.empty(); return running; @@ -419,43 +411,62 @@ void Core::print_stats() { std::vector sa_utilization; update_stats(); spdlog::info("===== Instructions count ====="); - for (int i=0; i < static_cast(Opcode::COUNT); i++) { - if (i == static_cast(Opcode::COMP)) - spdlog::info("Core [{}] : {} inst count {} (GEMM: {}, Vector: {}), skipped inst count {}", _id, opcode_to_string(static_cast(i)), _stat_inst_count.at(i), _stat_gemm_inst, _stat_inst_count.at(i) - _stat_gemm_inst, _stat_tot_skipped_inst.at(i)); - else - spdlog::info("Core [{}] : {} inst count {}, skipped inst count {}", _id, opcode_to_string(static_cast(i)), _stat_inst_count.at(i), 
_stat_tot_skipped_inst.at(i)); + for (int i = 0; i < static_cast(Opcode::COUNT); i++) { + auto opcode = static_cast(i); + auto inst = _stat_inst_count.at(i); + auto skipped = _stat_tot_skipped_inst.at(i); + auto name = opcode_to_string(opcode); + + if (opcode == Opcode::COMP) { + auto gemm = _stat_gemm_inst; + auto vector = inst - gemm; + if (skipped) + spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {}), skipped inst_count {}", + _id, name, inst, gemm, vector, skipped); + else + spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {})", + _id, name, inst, gemm, vector); + } + else { + if (skipped) + spdlog::info("Core [{}] : {:8} inst_count {}, skipped inst_count {}", + _id, name, inst, skipped); + else + spdlog::info("Core [{}] : {:8} inst_count {}", + _id, name, inst); + } } spdlog::info("========= Core stat ========="); for (int i=0; i<_num_systolic_array_per_core; i++) sa_utilization.push_back(static_cast(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i), _stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i)); - float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq / (_core_cycle * 1000); // B/cycle - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle, dram_bw, _stat_tot_mem_response); - spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, + float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq_mhz / (_core_cycle * 1000); // B/cycle + spdlog::info("Core [{}] : DMA active_cycles, {} DMA idle_cycles {}, 
DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response); + spdlog::info("Core [{}] : Vector unit utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, static_cast(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Numa hit count : {}, Numa miss count : {}", _id, _stat_numa_hit, _stat_numa_miss); - spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("Core [{}] : NUMA local memory: {} requests, remote memory: {} requests", _id, _stat_numa_local_access, _stat_numa_remote_access); + spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); } void Core::print_current_stats() { std::vector sa_utilization; for (int i=0; i<_num_systolic_array_per_core; i++) sa_utilization.push_back(static_cast(_stat_sa_compute_cycle.at(i) * 100) / _config.core_print_interval); - float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq / (_config.core_print_interval * 1000); // B/cycle + float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq_mhz / (_config.core_print_interval * 1000); // B/cycle auto level = spdlog::level::info; if(_id != 0) level = spdlog::level::debug; spdlog::info("========= Core stat ========="); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i), _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i)); - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tma_cycle, _stat_tma_idle_cycle, dram_bw, _stat_mem_response); - spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", 
_id, + spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response); + spdlog::info("Core [{}] : Vector unit Utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, static_cast(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); update_stats(); } @@ -468,13 +479,13 @@ void Core::update_stats() { } _stat_tot_vu_compute_cycle += _stat_vu_compute_cycle; - _stat_tot_tma_cycle += _stat_tma_cycle; - _stat_tot_tma_idle_cycle += _stat_tma_idle_cycle; + _stat_tot_dma_cycle += _stat_dma_cycle; + _stat_tot_dma_idle_cycle += _stat_dma_idle_cycle; _stat_tot_mem_response += +_stat_mem_response; _stat_vu_compute_cycle = 0; - _stat_tma_cycle = 0; - _stat_tma_idle_cycle = 0; + _stat_dma_cycle = 0; + _stat_dma_idle_cycle = 0; _stat_vu_compute_idle_cycle = 0; _stat_mem_response = 0; } \ No newline at end of file diff --git a/TOGSim/src/DMA.cc b/TOGSim/src/DMA.cc new file mode 100644 index 00000000..f8f21025 --- /dev/null +++ b/TOGSim/src/DMA.cc @@ -0,0 +1,83 @@ +#include "DMA.h" +#include "TileGraph.h" + +DMA::DMA(uint32_t id, uint32_t dram_req_size) { + _id = id; + _dram_req_size = dram_req_size; + _current_inst = nullptr; + _finished = true; +} + +void DMA::issue_tile(std::shared_ptr inst) { + _current_inst = std::move(inst); + std::vector& tile_size = _current_inst->get_tile_size(); + if (tile_size.size() <= 0 || tile_size.size() > get_max_dim()) { + spdlog::error("[DMA {}] issued tile is not supported format..", _id); + exit(EXIT_FAILURE); + } + _finished = false; +} + +std::shared_ptr> DMA::get_memory_access(cycle_type core_cycle, int nr_req) { + + if (!_generated_once) { + std::shared_ptr> addr_set = + _current_inst->get_dram_address(_dram_req_size); + + Tile* owner = 
(Tile*)_current_inst->get_owner(); + std::shared_ptr owner_subgraph = owner->get_owner(); + unsigned long long base_daddr = _current_inst->get_base_dram_address(); + + bool is_cacheable = + owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); + + spdlog::trace("[{}][Core {}][SRAM] Address: 0x{:016x}, Is_cacheable: {}", + core_cycle, _id, base_daddr, is_cacheable); + spdlog::trace("[{}][Core {}][NUMA] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", + core_cycle, _id, owner_subgraph->get_core_id(), + _current_inst->get_numa_id(), _current_inst->get_addr_name(), + _current_inst->is_dma_write()); + for (const auto& addr : *addr_set) { + mem_access_type acc_type = + _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W + : mem_access_type::GLOBAL_ACC_R; + mf_type type = + _current_inst->is_dma_write() ? mf_type::WRITE_REQUEST + : mf_type::READ_REQUEST; + + mem_fetch* access = new mem_fetch( + addr, acc_type, type, _dram_req_size, + _current_inst->get_numa_id(), + static_cast(_current_inst.get())); + + access->set_cacheable(is_cacheable); + _current_inst->inc_waiting_request(); + _pending_accesses.push(access); + } + _generated_once = true; + } + + if (nr_req == -1) + nr_req = _pending_accesses.size(); + + // Return pending accesses up to nr_req + auto access_vec = std::make_shared>(); + for (int i = 0; i < nr_req; i++) { + if (_pending_accesses.empty()) + break; + access_vec->push_back(_pending_accesses.front()); + _pending_accesses.pop(); + } + + if (_pending_accesses.empty()) { + _finished = true; + _generated_once = false; + } + + return access_vec; +} + +uint32_t DMA::generate_mem_access_id() { + static uint32_t id_counter{0}; + return id_counter++; +} \ No newline at end of file diff --git a/PyTorchSimBackend/src/DelayQueue.cc b/TOGSim/src/DelayQueue.cc similarity index 100% rename from PyTorchSimBackend/src/DelayQueue.cc rename to TOGSim/src/DelayQueue.cc diff --git a/PyTorchSimBackend/src/Dram.cc b/TOGSim/src/Dram.cc similarity 
index 97% rename from PyTorchSimBackend/src/Dram.cc rename to TOGSim/src/Dram.cc index ab074bda..089c582e 100644 --- a/PyTorchSimBackend/src/Dram.cc +++ b/TOGSim/src/Dram.cc @@ -17,10 +17,10 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _n_bl = config.dram_nbl; _req_size = config.dram_req_size; _n_partitions = config.dram_num_partitions; - _n_ch_per_partition = _n_ch / _n_partitions; + _n_ch_per_partition = config.dram_channels_per_partitions; _config = config; - spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}", config.max_dram_bandwidth(), config.dram_freq, _n_ch, _req_size); + spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size); /* Initialize DRAM Channels */ for (int ch = 0; ch < _n_ch; ch++) { m_to_crossbar_queue.push_back(std::queue()); diff --git a/PyTorchSimBackend/src/Hashing.cc b/TOGSim/src/Hashing.cc similarity index 100% rename from PyTorchSimBackend/src/Hashing.cc rename to TOGSim/src/Hashing.cc diff --git a/PyTorchSimBackend/src/Instruction.cc b/TOGSim/src/Instruction.cc similarity index 100% rename from PyTorchSimBackend/src/Instruction.cc rename to TOGSim/src/Instruction.cc diff --git a/PyTorchSimBackend/src/Interconnect.cc b/TOGSim/src/Interconnect.cc similarity index 77% rename from PyTorchSimBackend/src/Interconnect.cc rename to TOGSim/src/Interconnect.cc index 8a684ff7..ab2d5d89 100644 --- a/PyTorchSimBackend/src/Interconnect.cc +++ b/TOGSim/src/Interconnect.cc @@ -4,12 +4,15 @@ SimpleInterconnect::SimpleInterconnect(SimulationConfig config) : _latency(config.icnt_latency) { _cycles = 0; _config = config; - _n_nodes = config.num_cores + config.dram_channels; + _n_nodes = config.num_cores * _config.icnt_injection_ports_per_core + config.dram_channels; _in_buffers.resize(_n_nodes); _out_buffers.resize(_n_nodes); _busy_node.resize(_n_nodes); + 
_rr_next_src.resize(_n_nodes); for(int node = 0; node < _n_nodes; node++) { _busy_node[node] = false; + _in_buffers.at(node).resize(_n_nodes); + _rr_next_src[node] = 0; } } @@ -19,35 +22,36 @@ bool SimpleInterconnect::running() { } void SimpleInterconnect::cycle() { - for(int node = 0; node < _n_nodes; node++) { - int src_node = (_rr_start + node ) % _n_nodes; - if(!_in_buffers[src_node].empty() && _in_buffers[src_node].front().finish_cycle <= _cycles) { - uint32_t dest = _in_buffers[src_node].front().dest; - if(!_busy_node[dest]) { - _out_buffers[dest].push(_in_buffers[src_node].front().access); - _in_buffers[src_node].pop(); - _busy_node[dest] = true; - // spdlog::trace("PUSH TO OUTBUFFER {} {}", src_node, dest); + for(int dest = 0; dest < _n_nodes; dest++) { + int src_start = _rr_next_src[dest]; + bool pushed = false; + + for(int i = 0; i < _n_nodes; i++) { + int src = (src_start + i) % _n_nodes; + + if (!_in_buffers[src][dest].empty() && + _in_buffers[src][dest].front().finish_cycle <= _cycles) { + + _out_buffers[dest].push(_in_buffers[src][dest].front().access); + _in_buffers[src][dest].pop(); + _rr_next_src[dest] = (src + 1) % _n_nodes; + pushed = true; + break; } } } - - for(int node = 0; node < _n_nodes; node++) { - _busy_node[node] = false; - } - _rr_start = (_rr_start + 1) % _n_nodes; _cycles++; } void SimpleInterconnect::push(uint32_t src, uint32_t dest, mem_fetch* request) { SimpleInterconnect::Entity entity; - if(_in_buffers[src].empty()) + if(_in_buffers[src][dest].empty()) entity.finish_cycle = _cycles + _latency; else - entity.finish_cycle = _in_buffers[src].back().finish_cycle + 1; + entity.finish_cycle = _in_buffers[src][dest].back().finish_cycle + 1; entity.dest = dest; entity.access = request; - _in_buffers[src].push(entity); + _in_buffers[src][dest].push(entity); } bool SimpleInterconnect::is_full(uint32_t nid, mem_fetch* request) { @@ -72,11 +76,11 @@ void SimpleInterconnect::pop(uint32_t nid) { 
Booksim2Interconnect::Booksim2Interconnect(SimulationConfig config) { _config = config; - _n_nodes = config.num_cores * _config.icnt_node_per_core + config.dram_channels; - spdlog::info("Initialize Booksim2"); + _n_nodes = config.num_cores * _config.icnt_injection_ports_per_core + config.dram_channels; + spdlog::info("Initialize Booksim2"); char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); std::string onnxim_path = onnxim_path_env != NULL? - std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); _config_path = fs::path(onnxim_path).append("configs").append((std::string)config.icnt_config_path).string(); spdlog::info("Config path : {}", _config_path); diff --git a/PyTorchSimBackend/src/L2Cache.cc b/TOGSim/src/L2Cache.cc similarity index 100% rename from PyTorchSimBackend/src/L2Cache.cc rename to TOGSim/src/L2Cache.cc diff --git a/PyTorchSimBackend/src/Simulator.cc b/TOGSim/src/Simulator.cc similarity index 89% rename from PyTorchSimBackend/src/Simulator.cc rename to TOGSim/src/Simulator.cc index 6bc80286..41a2c7a5 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -3,9 +3,9 @@ Simulator::Simulator(SimulationConfig config) : _config(config), _core_cycles(0) { // Create dram object - _core_period = 1000000 / (config.core_freq); - _icnt_period = 1000000 / (config.icnt_freq); - _dram_period = 1000000 / (config.dram_freq); + _core_period = 1000000 / (config.core_freq_mhz); + _icnt_period = 1000000 / (config.icnt_freq_mhz); + _dram_period = 1000000 / (config.dram_freq_mhz); _core_time = 0; _dram_time = 0; _icnt_time = 0; @@ -14,20 +14,20 @@ Simulator::Simulator(SimulationConfig config) _n_cores = config.num_cores; _n_memories = config.dram_channels; _memory_req_size = config.dram_req_size; - _noc_node_per_core = config.icnt_node_per_core; + _noc_node_per_core = config.icnt_injection_ports_per_core; char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); 
std::string onnxim_path = onnxim_path_env != NULL? - std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); // Create core objects _cores.resize(_n_cores); for (int core_index = 0; core_index < _n_cores; core_index++) { if (config.core_type[core_index] == CoreType::WS_MESH) { - spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB, Systolic array per core: {}", - core_index, config.core_freq , config.sram_size, config.num_systolic_array_per_core); + spdlog::info("[Config/Core] Core {}: {} MHz, Systolic array per core: {}", + core_index, config.core_freq_mhz, config.num_systolic_array_per_core); _cores.at(core_index) = std::make_unique(core_index, _config); } else if(config.core_type[core_index] == CoreType::STONNE) { - spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq); + spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq_mhz); _cores.at(core_index) = std::make_unique(core_index, _config); } else { throw std::runtime_error(fmt::format("Not implemented Core type {} ", @@ -51,7 +51,7 @@ Simulator::Simulator(SimulationConfig config) } // Create interconnect object - spdlog::info("[Config/Interconnect] Inerconnect freq: {} MHz", config.icnt_freq); + spdlog::info("[Config/Interconnect] Interconnect freq: {} MHz", config.icnt_freq_mhz); if (config.icnt_type == IcntType::SIMPLE) { spdlog::info("[Config/Interconnect] SimpleInerconnect selected"); _icnt = std::make_unique(config); @@ -62,10 +62,10 @@ Simulator::Simulator(SimulationConfig config) spdlog::error("[Configuration] Invalid interconnect type...!"); exit(EXIT_FAILURE); } - _icnt_interval = config.icnt_print_interval; + _icnt_interval = config.icnt_stats_print_period_cycles; // Initialize Scheduler - for (int i=0; i(Scheduler(config, &_core_cycles, &_core_time, i))); } @@ -117,11 +117,11 @@ void Simulator::icnt_cycle() { mem_fetch 
*front = _cores[core_id]->top_memory_request(); front->set_core_id(core_id); if (!_icnt->is_full(port_id, front)) { - //int node_id = _dram->get_channel_id(front) / 16; - //if (core_id == node_id) - // _cores[core_id]->inc_numa_hit(); - //else - // _cores[core_id]->inc_numa_miss(); + int node_id = _dram->get_channel_id(front) / _config.dram_channels_per_partitions; + if (core_id == node_id) + _cores[core_id]->inc_numa_local_access(); + else + _cores[core_id]->inc_numa_remote_access(); _icnt->push(port_id , get_dest_node(front), front); _cores[core_id]->pop_memory_request(); _nr_from_core++; @@ -229,7 +229,7 @@ void Simulator::cycle() { if (IS_ICNT_CYCLE(_cycle_mask)) icnt_cycle(); } - spdlog::info("Simulation Finished"); + spdlog::info("Simulation finished"); for (auto &core: _cores) { core->check_tag(); } @@ -291,5 +291,5 @@ void Simulator::print_core_stat() for (int core_id = 0; core_id < _n_cores; core_id++) { _cores[core_id]->print_stats(); } - spdlog::info("Total execution cycle: {}", _core_cycles); -} \ No newline at end of file + spdlog::info("Total execution cycles: {}", _core_cycles); +} diff --git a/PyTorchSimBackend/src/SparseCore.cc b/TOGSim/src/SparseCore.cc similarity index 86% rename from PyTorchSimBackend/src/SparseCore.cc rename to TOGSim/src/SparseCore.cc index 64d3da55..d5629b9c 100644 --- a/PyTorchSimBackend/src/SparseCore.cc +++ b/TOGSim/src/SparseCore.cc @@ -27,14 +27,14 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) } Config stonneConfig = stonneCores.at(0)->getStonneConfig(); - unsigned int core_freq = config.core_freq; // MHz; + unsigned int core_freq_mhz = config.core_freq_mhz; // MHz; num_ms = stonneConfig.m_MSNetworkCfg.ms_size; r_port_nr = config.num_stonne_port; w_port_nr = config.num_stonne_port; - double compute_throughput = static_cast(num_ms) * core_freq / 1e3; // FLOPs/sec - double dn_bandwidth = static_cast(r_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s - double 
rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s + double compute_throughput = static_cast(num_ms) * core_freq_mhz / 1e3; // FLOPs/sec + double dn_bandwidth = static_cast(r_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s + double rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s for (int i=0; i tile) { } } if (selected_core_idx == -1) { - spdlog::error("[StonneCore {}] Faield to issue tile", _id); + spdlog::error("[StonneCore {}] Failed to issue tile", _id); exit(1); } stonneCores.at(selected_core_idx)->init(1); @@ -84,7 +84,7 @@ void SparseCore::issue(std::shared_ptr tile) { setTraceMode(selected_core_idx, is_trace_mode); percore_tiles.at(selected_core_idx).push_back(tile); coreBusy.at(selected_core_idx) = true; - spdlog::info("[StonneCore {}][{}] issued new tile (trace_mode: {})", _id, selected_core_idx, is_trace_mode); + spdlog::info("[{}][StonneCore {}/{}][Launch] New operation (trace_mode: {})", _core_cycle, _id, selected_core_idx, is_trace_mode); }; bool SparseCore::can_issue(const std::shared_ptr& op) { @@ -100,8 +100,8 @@ void SparseCore::checkStatus(uint32_t subcore_id) { int new_status = stonneCore->getMCFSMStats(); int compute_cycle = stonneCore->getMSStats().n_multiplications; if (traceCoreStatus.at(subcore_id) != new_status) { - spdlog::trace("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}", - _id, _core_cycle, traceCoreStatus.at(subcore_id), new_status, + spdlog::trace("[{}][StonneCore {}/{}][Transition] status {} -> {}, Load/Store: {}/{}, compute_cycle: {}", + _core_cycle, _id, subcore_id, traceCoreStatus.at(subcore_id), new_status, traceLoadTraffic.at(subcore_id).size(), traceStoreTraffic.at(subcore_id).size(), (compute_cycle - traceCoreCycle.at(subcore_id))/num_ms); if (traceLoadTraffic.at(subcore_id).size()) { TraceNode load_node = TraceNode(traceNodeList.at(subcore_id).size()+2, 
"load", TraceNode::StonneTraceLoad); @@ -151,14 +151,14 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { traceStoreTraffic.at(subcore_id).insert(target_addr); break; default: - spdlog::error("[SparseCore] Invalid request type from core"); + spdlog::error("[StonneCore] Invalid request type from core"); return; } req->request_time = _core_cycle; req->stonneId = subcore_id; std::tuple key = std::make_tuple(target_addr, acc_type, type, allocTrafficID()); registerMemfetch(key, [this, req, acc_type, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + spdlog::trace("[{}][StonneCore][DRAM Response] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ _core_cycle, _core_cycle - req->request_time, req->getAddress(), int(req->getcmd()), _config.dram_req_size); req->setReply(); stonneCores.at(req->stonneId)->pushResponse(req); @@ -168,7 +168,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { /* Finish stonne core */ if (coreBusy.at(subcore_id) && stonneCore->isFinished()) { stonneCore->finish(); - spdlog::info("[SparseCore][{}] Operation finished at {}", _id, _core_cycle); + spdlog::info("[{}][StonneCore {}/{}][Finish] Operation done", _core_cycle, _id, subcore_id); std::shared_ptr target_tile = percore_tiles.at(subcore_id).front(); SST_STONNE::StonneOpDesc *opDesc = static_cast(target_tile->get_custom_data()); if (opDesc->trace_path != "") @@ -239,7 +239,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_R; auto type = mf_type::READ_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -247,8 +247,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { 
std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -260,7 +260,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_W; auto type = mf_type::WRITE_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -268,8 +268,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -285,7 +285,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { inst->finish_cycle = _core_cycle + inst->get_compute_cycle(); else inst->finish_cycle = 
target_pipeline.back()->finish_cycle + inst->get_compute_cycle(); - spdlog::trace("[Core {}][{}][{}] {} ISSUED, finsh at {}", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}, finsh at {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode()), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -313,7 +313,7 @@ void SparseCore::cycle() { for (auto& req_pair : request_merge_table) { _request_queue.push(req_pair.second); request_merge_table.erase(req_pair.first); - spdlog::debug("[SparseCore][{}][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ + spdlog::debug("[{}][StonneCore][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ _core_cycle, _id, req_pair.second->get_addr(), int(req_pair.second->get_access_type()), int(req_pair.second->get_type()), _config.dram_req_size, nr_request); nr_request++; @@ -366,9 +366,9 @@ void SparseCore::print_current_stats() { } cycle_type nr_mul = percore_stat.at(i).n_multiplications; percore_stat.at(i).reset(); - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } void SparseCore::print_stats() { @@ -383,9 +383,9 @@ void SparseCore::print_stats() { percore_total_stat.at(i) += percore_stat.at(i); } cycle_type nr_mul = percore_total_stat.at(i).n_multiplications; - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } std::shared_ptr SparseCore::pop_finished_tile() { @@ -399,18 +399,18 @@ 
std::shared_ptr SparseCore::pop_finished_tile() { void SparseCore::finish_instruction(std::shared_ptr& inst) { if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already finished!!", _id, _core_cycle, + spdlog::error("[{}][StonneCore {}][Error] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::info("[StonneCore {}][{}] {} FINISHED", - _id, _core_cycle, opcode_to_string(inst->get_opcode())); + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", + _core_cycle, _id, opcode_to_string(inst->get_opcode())); } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) { - spdlog::info("[StonneCore {}][{}] {} FINISHED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode())); } } @@ -460,5 +460,5 @@ void SparseCore::dumpTrace(int stonne_core_id, const std::string& path) { outFile << traceNodeList.at(stonne_core_id)[i]; } outFile << "\n}" << std::endl; - spdlog::info("[StonneCore] Success to save trace dump file to \"{}\"", path); + spdlog::info("[{}][StonneCore] Success to save trace dump file to \"{}\"", _core_cycle, path); } diff --git a/PyTorchSimBackend/src/Tile.cc b/TOGSim/src/Tile.cc similarity index 100% rename from PyTorchSimBackend/src/Tile.cc rename to TOGSim/src/Tile.cc diff --git a/PyTorchSimBackend/src/TileGraph.cc b/TOGSim/src/TileGraph.cc similarity index 96% rename from PyTorchSimBackend/src/TileGraph.cc rename to TOGSim/src/TileGraph.cc index 33e995e9..120d49e2 100644 --- a/PyTorchSimBackend/src/TileGraph.cc +++ b/TOGSim/src/TileGraph.cc @@ -111,7 +111,6 @@ void TileGraph::allocate_subgraph(int core_id, int slot_id) { for (auto it = _subgraph_vec.begin(); it 
!= _subgraph_vec.end(); ++it) { if ((*it)->get_core_id() == -1 || (*it)->get_core_id() == core_id) { - spdlog::trace("[TileGraph] Core {} allocated new subgraph(affinity={}) (remains: {})", core_id, (*it)->get_core_id(), _subgraph_vec.size()-1); std::shared_ptr subgraph = *it; _cpu_graph_map[core_id][slot_id] = subgraph; _subgraph_vec.erase(it); diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc similarity index 98% rename from PyTorchSimBackend/src/TileGraphParser.cc rename to TOGSim/src/TileGraphParser.cc index 4a562724..42776a51 100644 --- a/PyTorchSimBackend/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -627,9 +627,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa } } } - /* Set last instruction's free sram size */ - if(parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); parent->append_child(child); /* Create new tile */ @@ -682,11 +679,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa tile_vec.back()->inc_required_sram_size(inst->get_tile_numel() * inst->get_precision()); } - /* Set last instruction's free sram size */ - std::shared_ptr parent = tile_vec.back(); - if (parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); - return tile_vec; } diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.cc b/TOGSim/src/helper/CommandLineParser.cc similarity index 100% rename from PyTorchSimBackend/src/helper/CommandLineParser.cc rename to TOGSim/src/helper/CommandLineParser.cc diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.h b/TOGSim/src/helper/CommandLineParser.h similarity index 100% rename from PyTorchSimBackend/src/helper/CommandLineParser.h rename to TOGSim/src/helper/CommandLineParser.h diff --git a/PyTorchSimBackend/src/main.cc b/TOGSim/src/main.cc similarity index 95% rename from PyTorchSimBackend/src/main.cc rename to 
TOGSim/src/main.cc index 214e7131..1af11257 100644 --- a/PyTorchSimBackend/src/main.cc +++ b/TOGSim/src/main.cc @@ -9,7 +9,7 @@ namespace fs = std::filesystem; namespace po = boost::program_options; -const char* env_value = std::getenv("BACKENDSIM_DRYRUN"); +const char* env_value = std::getenv("TOGSIM_DRYRUN"); bool isDryRun = (env_value != nullptr && std::string(env_value) == "1"); void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) { @@ -38,7 +38,7 @@ int until(Simulator *simulator, cycle_type until_cycle) { void interactive_mode(Simulator* simulator) { std::string command; - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; while (std::getline(std::cin, command)) { std::istringstream iss(command); @@ -79,7 +79,7 @@ void interactive_mode(Simulator* simulator) { spdlog::error("Error: unknown command {} Available commands are: launch, until, quit.", token); } if (isDryRun) - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; } simulator->cycle(); if (simulator->get_core_cycle()==0) @@ -149,6 +149,6 @@ int main(int argc, char** argv) { /* Simulation time measurement */ auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; - spdlog::info("Simulation time: {:2f} seconds", duration.count()); + spdlog::info("Wall-clock time for simulation: {:2f} seconds", duration.count()); return 0; } diff --git a/PyTorchSimBackend/src/scheduler/Scheduler.cc b/TOGSim/src/scheduler/Scheduler.cc similarity index 100% rename from PyTorchSimBackend/src/scheduler/Scheduler.cc rename to TOGSim/src/scheduler/Scheduler.cc diff --git a/experiments/BERT.py b/experiments/BERT.py index 3534505d..c5bb454e 100644 --- a/experiments/BERT.py +++ b/experiments/BERT.py @@ -9,7 +9,7 
@@ def run_BERT(size, input_seq, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request # from tests.test_transformer import EncoderBlock from tests.Fusion.test_transformer_fusion import EncoderBlock - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() hidden_dim = {'base': 768, 'large': 1024, 'xlarge': 2048} @@ -36,7 +36,7 @@ def run_BERT(size, input_seq, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def run_BERT(size, input_seq, config): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_BERT(size, input_seq, config) diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index a32cd0a6..28e6ad5e 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -export TORCHSIM_CONFIG=$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export 
TORCHSIM_CONFIG=$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR diff --git a/experiments/artifact/cycle_validation/summary_cycle.py b/experiments/artifact/cycle_validation/summary_cycle.py index 529d0161..c0f48ac3 100644 --- a/experiments/artifact/cycle_validation/summary_cycle.py +++ b/experiments/artifact/cycle_validation/summary_cycle.py @@ -88,7 +88,7 @@ def compute_mae(errors): name = file[:-4] with open(full_path, errors="ignore") as f: for line in f: - match = re.search(r"Total execution cycle:\s*([0-9]+)", line) + match = re.search(r"Total execution cycles:\s*([0-9]+)", line) if match: cycle_map[name] = int(match.group(1)) break diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh index 7d0c0da2..2b9625e9 100755 --- a/experiments/artifact/speedup/run_speedup.sh +++ b/experiments/artifact/speedup/run_speedup.sh @@ -1,7 +1,7 @@ #!/bin/bash LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs -CONFIG_DIR="$TORCHSIM_DIR/PyTorchSimBackend/configs" -SIMULATOR_BIN="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +CONFIG_DIR="$TORCHSIM_DIR/TOGSim/configs" +SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" configs=( "systolic_ws_128x128_c2_simple_noc_tpuv3.json" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh index 66829f02..4055b355 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh @@ -26,7 +26,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git 
a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh index 2f9718f1..83b3798a 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh @@ -27,7 +27,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh index 8ff7e2b6..f1467614 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh @@ -25,7 +25,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh index aa35735c..2ed3ca2a 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh @@ -33,7 +33,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/attention.py b/experiments/attention.py index e8f89dac..5a8c5f45 100644 --- 
a/experiments/attention.py +++ b/experiments/attention.py @@ -14,7 +14,7 @@ def attention(query, key, value): p_attn = scores.softmax(dim=-2) return torch.matmul(value.transpose(-1, -2), p_attn) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() query = torch.randn(size).to(device=device) key = torch.randn(size).to(device=device) @@ -36,7 +36,7 @@ def attention(query, key, value): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -50,7 +50,7 @@ def attention(query, key, value): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_attention(size, config) diff --git a/experiments/conv.py b/experiments/conv.py index e8b97906..c8ca9a37 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -15,7 +15,7 @@ def custom_conv2d(a, b, bias): conv2d.weight = torch.nn.Parameter(b) # conv2d.bias = torch.nn.Parameter(bias) return conv2d(a) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, 
engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() conv_input = torch.randn(batch_size, i_c, i_h, i_w).to(memory_format=torch.channels_last, device=device) conv_kernel = torch.randn(o_c, i_c, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device) @@ -37,7 +37,7 @@ def custom_conv2d(a, b, bias): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def custom_conv2d(a, b, bias): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_conv2d(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], config) \ No newline at end of file diff --git a/experiments/gemm.py b/experiments/gemm.py index a1fdcff6..67dc4f79 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -10,7 +10,7 @@ def run_matmul(input_size, hidden_size, output_size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request def custom_matmul(a, b): return torch.matmul(a, b) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() torch.manual_seed(0) 
input = torch.randn(input_size, hidden_size).to(device=device) @@ -31,7 +31,7 @@ def custom_matmul(a, b): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -45,10 +45,10 @@ def custom_matmul(a, b): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() run_matmul(size[0], size[1], size[2], config) diff --git a/experiments/layernorm.py b/experiments/layernorm.py index f149394e..0beaac6c 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -8,7 +8,7 @@ def run_layernorm(size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) opt_fn = torch.compile(dynamic=False)(torch.nn.LayerNorm(size[-1]).to(device=device)) @@ -27,7 +27,7 @@ def run_layernorm(size, config): import os import sys base_dir = 
os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -42,7 +42,7 @@ def run_layernorm(size, config): os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_layernorm(size, config) diff --git a/experiments/resnet18.py b/experiments/resnet18.py index 5d9dcf86..23d62e40 100644 --- a/experiments/resnet18.py +++ b/experiments/resnet18.py @@ -8,7 +8,7 @@ def run_resnet(batch, config): from torchvision.models import resnet18 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet18().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] 
# extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/resnet50.py b/experiments/resnet50.py index bd52afc1..60a46071 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -8,7 +8,7 @@ def run_resnet(batch, config): from torchvision.models import resnet50 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet50().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: 
+ del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/softmax.py b/experiments/softmax.py index 14d28fee..532ef091 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -8,7 +8,7 @@ def run_softmax(size, config, dim=1): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) opt_fn = torch.compile(dynamic=False)(torch.nn.Softmax(dim=dim).to(device=device)) @@ -27,7 +27,7 @@ def run_softmax(size, config, dim=1): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -41,7 +41,7 @@ def run_softmax(size, config, dim=1): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_softmax(size, config) diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh index 469cf766..22118b1e 100644 --- a/scripts/CompilerOpt_experiment/DMAopt.sh +++ b/scripts/CompilerOpt_experiment/DMAopt.sh @@ -1,5 +1,5 @@ #!/bin/bash -export 
TORCHSIM_CONFIG="/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" +export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" # None FG DMA export TORCHSIM_SUBTILE=0 diff --git a/scripts/ILS_experiment/test_matmul.py b/scripts/ILS_experiment/test_matmul.py index 09cc407d..667dfc66 100644 --- a/scripts/ILS_experiment/test_matmul.py +++ b/scripts/ILS_experiment/test_matmul.py @@ -60,7 +60,7 @@ def custom_matmul(bias, a, b): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul(device, *shape) diff --git a/scripts/build_from_source.sh b/scripts/build_from_source.sh new file mode 100644 index 00000000..fb9e82e3 --- /dev/null +++ b/scripts/build_from_source.sh @@ -0,0 +1,22 @@ +#!/bin/bash +home="/workspace" +cd $home + +# Gem5 +apt -y update && apt -y upgrade && apt -y install scons +git clone https://github.com/PSAL-POSTECH/gem5.git +cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) +export GEM5_PATH=$home/gem5/build/RISCV/gem5.opt +cd $home + +# LLVM +git clone https://github.com/PSAL-POSTECH/llvm-project.git +cd llvm-project && mkdir build && cd build && \ + cmake -DLLVM_ENABLE_PROJECTS=mlir -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/riscv-llvm -DLLVM_TARGETS_TO_BUILD=RISCV -G "Unix Makefiles" ../llvm && \ + make -j && make install +cd $home + +# Spike Simulator +git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \ + ../configure --prefix=$RISCV && make -j && make install +cd $home \ No newline at end of file diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh index 3dfba3d9..2989e4fd 100755 --- a/scripts/chiplet.sh +++ 
b/scripts/chiplet.sh @@ -14,16 +14,16 @@ fi GEMM_PATH="$1" INDEX_NAME="$2" -SIMULATOR_PATH="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +SIMULATOR_PATH="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" GEMM_DIR_NAME=$(basename "$GEMM_PATH") echo "GEMM Directory Name: $GEMM_DIR_NAME" CONFIG_LIST=( - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" ) CONFIG_LIST2=( - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" ) shift shift @@ -51,7 +51,7 @@ for CONFIG in "${CONFIG_LIST[@]}"; do # Run Simulator echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done done @@ -65,6 +65,6 @@ for CONFIG in "${CONFIG_LIST2[@]}"; do # Run Simulator # echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done wait \ No newline at end of file diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py index 168532f1..32f7ad50 100644 --- a/scripts/chiplet_prep.py +++ 
b/scripts/chiplet_prep.py @@ -61,8 +61,8 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() parser = argparse.ArgumentParser(description='Process folder argument.') parser.add_argument('size', type=int, help='Folder value', default=256) diff --git a/scripts/end2end.sh b/scripts/end2end.sh index 7ca5c93d..579b8c14 100755 --- a/scripts/end2end.sh +++ b/scripts/end2end.sh @@ -7,34 +7,34 @@ BASE_PATH=$1 # Input as the first argument total_sum=0 total_core=0 total_vector=0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - # echo "Processing folder: $backend_folder" +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + # echo "Processing folder: $togsim_folder" - # Find all files within the backendsim_result folder - mapfile -t files < <(find "$backend_folder" -type f) + # Find all files within the togsim_result folder + mapfile -t files < <(find "$togsim_folder" -type f) for file in "${files[@]}"; do # echo "Processing $file" - # Extract the last line containing "Total cycle" - total_cycle=$(grep "Total cycle" "$file" | tail -n 1 | sed -E 's/.*Total cycle ([0-9]+).*/\1/') + # Extract the last line containing "Total_cycles" + total_cycle=$(grep "Total_cycles" "$file" | tail -n 1 | sed -E 's/.*Total_cycles ([0-9]+).*/\1/') # echo "total_cycle: $total_cycle" - active_cycles=($(grep -o 'active cycle [0-9]*' "$file" | awk 
'{print $3}')) + active_cycles=($(grep -o 'active_cycles [0-9]*' "$file" | awk '{print $3}')) num_cycles=${#active_cycles[@]} if [ "$num_cycles" -ge 3 ]; then core_cycle=${active_cycles[$((num_cycles-3))]} else - echo "Error: cannot find core active cycle" + echo "Error: cannot find core active_cycles" fi if [[ "$num_cycles" -ge 1 ]]; then - # Extract the last two active cycles + # Extract the last two active_cycless vector_core_cycle=${active_cycles[$((num_cycles-1))]} else - echo "Error: cannot find vector core active cycle" + echo "Error: cannot find vector core active_cycles" fi echo "file: $file total_cycle: $total_cycle SA core_cycle: $core_cycle vector_core_cycle: $vector_core_cycle" diff --git a/scripts/get_tog_result.sh b/scripts/get_tog_result.sh index 9359e1e5..6fd399e0 100755 --- a/scripts/get_tog_result.sh +++ b/scripts/get_tog_result.sh @@ -3,8 +3,8 @@ total_cycles=0 # Read through input stream line by line while IFS= read -r line; do - # Check if the line contains both "[BackendSimulator]" and "stored" - if [[ "$line" == *"[BackendSimulator]"* && "$line" == *"stored"* ]]; then + # Check if the line contains both "[TOGSimulator]" and "stored" + if [[ "$line" == *"[TOGSimulator]"* && "$line" == *"stored"* ]]; then # Extract the file path from the line file_path=$(echo "$line" | sed -n 's/.*stored to "\(.*\)"$/\1/p') diff --git a/scripts/sim_time.sh b/scripts/sim_time.sh index 15c60736..95df5982 100755 --- a/scripts/sim_time.sh +++ b/scripts/sim_time.sh @@ -6,15 +6,15 @@ BASE_PATH=$1 # Input as the first argument # Initialize total_sum as string for awk processing total_sum=0.0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - mapfile -t files < <(find "$backend_folder" 
-type f) +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + mapfile -t files < <(find "$togsim_folder" -type f) for file in "${files[@]}"; do - sim_time=$(grep "Simulation time:" "$file" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+(\.[0-9]+)?).*/\1/') + sim_time=$(grep "Wall-clock time for simulation:" "$file" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+(\.[0-9]+)?).*/\1/') echo "file: $file total_cycle: $sim_time" if [[ -n "$sim_time" ]]; then diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh index 0b7bc6f5..94e00527 100755 --- a/scripts/sparsity_experiment/run.sh +++ b/scripts/sparsity_experiment/run.sh @@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8 export TORCHSIM_FORCE_TIME_N=8 OUTPUT_DIR="12GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -13,7 +13,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -21,7 +21,7 @@ python3 
/workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="12GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py 
--sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -45,7 +45,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py index 2f184f4c..be30795b 100644 --- a/scripts/stonne_experiment2/tog_gen.py +++ b/scripts/stonne_experiment2/tog_gen.py @@ -5,7 +5,7 @@ from collections import defaultdict sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) from AsmParser.tog_generator import tog_generator -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def extract_simulation_stats(result_path): @@ -19,9 +19,9 @@ def extract_simulation_stats(result_path): for line in lines: if "nr_multiplications" in line: nr_multiplications = line.strip().split(":")[-1].strip() - elif "Total execution cycle" in line: + elif "Total execution cycles" in line: total_cycle = line.strip().split(":")[-1].strip() - elif "Simulation time" in line: + elif "Wall-clock time for simulation" in line: sim_time = line.strip().split(":")[-1].replace("seconds", "").strip() return nr_multiplications, total_cycle, sim_time @@ -71,9 +71,9 @@ def extract_simulation_stats(result_path): if "outerPro" in path: continue tog_path = os.path.join(path, "tile_graph.onnx") - backend_path = 
os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json' - backsim = BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_validation_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = backsim.simulation(tog_path) nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path) sim_time, total_cycle = float(sim_time), int(total_cycle) diff --git a/test_extension_backend.py b/test_extension_backend.py index f0a9353a..5e6427ef 100644 --- a/test_extension_backend.py +++ b/test_extension_backend.py @@ -22,8 +22,8 @@ from tests.Fusion.test_matmul_activation import test_matmul_activation if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_vectoradd(device, (47, 10)) #test_vector_scalar_add(device, (10, 10)) diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py index 03d1b721..c5170209 100644 --- a/tests/Diffusion/test_diffusion.py +++ b/tests/Diffusion/test_diffusion.py @@ -553,8 +553,8 @@ def test_upsample2d( args = parser.parse_args() sys.path.append(os.environ.get("TORCHSIM_DIR", "/workspace/PyTorchSim")) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_upsample2d(device) diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py index a5e05182..ef753a67 100644 --- 
a/tests/Fusion/test_addmm_residual.py +++ b/tests/Fusion/test_addmm_residual.py @@ -43,8 +43,8 @@ def addmm_residual(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_addmm_residual(device, 32, 32, 32) test_addmm_residual(device, 128, 128, 128) diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py index 95bdf165..123376d1 100644 --- a/tests/Fusion/test_attention_fusion.py +++ b/tests/Fusion/test_attention_fusion.py @@ -75,8 +75,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_MHA(device) # test_Attention(device, head=16, seq=512, d_k=64) diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/Fusion/test_bmm_reduction.py index 42e38095..4f4d3ad6 100644 --- a/tests/Fusion/test_bmm_reduction.py +++ b/tests/Fusion/test_bmm_reduction.py @@ -42,8 +42,8 @@ def bmm(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_bmm_reduce(device) test_bmm_reduce(device, 12, 512) diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py index 42210b13..694f3bb9 100644 --- a/tests/Fusion/test_conv_fusion.py +++ b/tests/Fusion/test_conv_fusion.py @@ -101,8 
+101,8 @@ def custom_conv_bn_relu(a, b, bias, c, d, e, f): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() # Vanila test diff --git a/tests/Fusion/test_matmul_activation.py b/tests/Fusion/test_matmul_activation.py index 2381bd8c..2f1d014f 100644 --- a/tests/Fusion/test_matmul_activation.py +++ b/tests/Fusion/test_matmul_activation.py @@ -73,8 +73,8 @@ def test_matmul_activation(device, batch_size=16, input_size=32, output_size=8, import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul_activation(device) test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid") diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py index 31ea1b0d..df8cf969 100644 --- a/tests/Fusion/test_matmul_reduction.py +++ b/tests/Fusion/test_matmul_reduction.py @@ -89,8 +89,8 @@ def matmul_fused(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul_reduce(device, 3072, 512, 768) test_matmul_var_mean(device) diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/Fusion/test_matmul_scalar.py index 0dcb54f9..0815bb90 100644 --- a/tests/Fusion/test_matmul_scalar.py +++ b/tests/Fusion/test_matmul_scalar.py @@ -39,7 +39,7 @@ 
def matmul_fused(a, b, c): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul_scalar(device) diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py index 797f9e76..b27312a9 100644 --- a/tests/Fusion/test_prologue_fusion.py +++ b/tests/Fusion/test_prologue_fusion.py @@ -88,8 +88,8 @@ def bmm(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_elem_broadcast_fusion(device) test_elem_fusion(device) diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py index 0e500b5b..b1cceb2c 100644 --- a/tests/Fusion/test_transformer_fusion.py +++ b/tests/Fusion/test_transformer_fusion.py @@ -203,8 +203,8 @@ def test_EncoderBlock_validation(head=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_MHA(device) test_EncoderBlock(device) diff --git a/tests/MLP/test_mlp.py b/tests/MLP/test_mlp.py index 6f6c9444..31bcefdf 100644 --- a/tests/MLP/test_mlp.py +++ b/tests/MLP/test_mlp.py @@ -281,9 +281,9 @@ def train(model, device): return if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine + from Scheduler.scheduler import PyTorchSimRunner 
torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = ExecutionEngine.setup_device() + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_mlp(device) diff --git a/tests/MLP/test_mlp_cpu.py b/tests/MLP/test_mlp_cpu.py index 49f44650..112f5d07 100644 --- a/tests/MLP/test_mlp_cpu.py +++ b/tests/MLP/test_mlp_cpu.py @@ -399,7 +399,6 @@ def train(model, device): if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine # torch.set_printoptions(threshold=float('inf'), linewidth=600) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py index aa1af651..6a7747f7 100644 --- a/tests/Mixtral_8x7B/test_attention.py +++ b/tests/Mixtral_8x7B/test_attention.py @@ -163,8 +163,8 @@ def test_rmsnorm(device, seq=32): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_rmsnorm(device, seq=1) test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2) diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index c5ab8107..ae16f0b0 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -783,9 +783,9 @@ def evaluation(model, evaluation_loader): train(opt_model, train_loader) if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine + from Scheduler.scheduler import PyTorchSimRunner torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = ExecutionEngine.setup_device() + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_moe(device) diff --git a/tests/test_activation.py b/tests/test_activation.py index de3542c3..575fc7e8 100644 --- a/tests/test_activation.py +++ 
b/tests/test_activation.py @@ -88,8 +88,8 @@ def test_SwiGLU(device, size=(128, 128)): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_ReLU(device, (47, 10)) test_ReLU(device, (128, 128)) diff --git a/tests/test_add.py b/tests/test_add.py index 5e1ab15e..118632d5 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -58,8 +58,8 @@ def vectoradd(a, b): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_vectoradd(device, (1, 1)) test_vectoradd(device, (47, 10)) diff --git a/tests/test_batchnorm.py b/tests/test_batchnorm.py index f7abacf5..251805f5 100644 --- a/tests/test_batchnorm.py +++ b/tests/test_batchnorm.py @@ -37,8 +37,8 @@ def test_BatchNorm(device, size=(1, 16, 64, 64)): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_BatchNorm(device) test_BatchNorm(device, size=(1,64, 32, 32)) diff --git a/tests/test_bmm.py b/tests/test_bmm.py index 6d9279aa..d90410db 100644 --- a/tests/test_bmm.py +++ b/tests/test_bmm.py @@ -46,8 +46,8 @@ def bmm(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + 
module = PyTorchSimRunner.setup_device() device = module.custom_device() test_BMM(device) test_BMM(device, 2, 256, 128, 256) diff --git a/tests/test_cnn.py b/tests/test_cnn.py index aaad2836..54225747 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -53,7 +53,7 @@ def test_CNN(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_CNN(device) diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py index cf0dc1bb..c32b4364 100644 --- a/tests/test_compile_overhead.py +++ b/tests/test_compile_overhead.py @@ -21,7 +21,7 @@ # shutil.rmtree("/tmp/torchinductor") #except FileNotFoundError: # print("no cache") - scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py index 21bbfec7..e964319d 100644 --- a/tests/test_conv2d.py +++ b/tests/test_conv2d.py @@ -40,8 +40,8 @@ def custom_conv2d(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = 
PyTorchSimRunner.setup_device() device = module.custom_device() torch._dynamo.config.cache_size_limit = 64 test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0) diff --git a/tests/test_exponent.py b/tests/test_exponent.py index c95823cb..e60f8407 100644 --- a/tests/test_exponent.py +++ b/tests/test_exponent.py @@ -31,7 +31,7 @@ def exponent(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_exponent(device, size=(32, 32)) diff --git a/tests/test_hetro.py b/tests/test_hetro.py index 5e36d730..557ea5d6 100644 --- a/tests/test_hetro.py +++ b/tests/test_hetro.py @@ -26,7 +26,7 @@ def custom_matmul(a, b): K = args.K sparsity = args.sparsity mode = args.mode - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" print("M: ", M) print("N: ", N) @@ -36,7 +36,7 @@ def custom_matmul(a, b): with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - backend_config=config_path) + togsim_config=config_path) # Register compiled model opt_model1 = torch.compile(custom_matmul) diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py index b7b20074..c6afaf86 100644 --- a/tests/test_indirect_access.py +++ b/tests/test_indirect_access.py @@ -48,8 +48,8 @@ def test_embedding(device, vocab_size, dim): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = 
module.custom_device() test_indirect_vectoradd(device) #test_embedding(device, 1024, 2048) \ No newline at end of file diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py index 1cea9d9f..28e38d37 100644 --- a/tests/test_layernorm.py +++ b/tests/test_layernorm.py @@ -41,8 +41,8 @@ def test_LayerNorm(device, size=(64, 64)): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_LayerNorm(device) test_LayerNorm(device, shape) diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 6f41468b..cd30bd30 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -94,8 +94,8 @@ def custom_linear(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul(device, 32, 32, 32) test_matmul(device, 128, 128, 128) diff --git a/tests/test_mlp.py b/tests/test_mlp.py index b8118aa3..423d6e8e 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -109,8 +109,8 @@ def test_optimizer(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_mlp(device) test_mlp_inf(device, batch_size=1, input_size=256, hidden_size=512, output_size=256) diff --git a/tests/test_pool.py b/tests/test_pool.py index 304a5e7c..f5505dba 100644 --- a/tests/test_pool.py +++ b/tests/test_pool.py @@ -47,8 
+47,8 @@ def avgpool(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_maxpool(device, b=1, c=8, h=16, w=16) #test_maxpool(device, b=1, c=8, h=112, w=112) diff --git a/tests/test_reduce.py b/tests/test_reduce.py index e1a84b7f..4781112d 100644 --- a/tests/test_reduce.py +++ b/tests/test_reduce.py @@ -47,8 +47,8 @@ def reduce_sum(a, dim, keepdim): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_reduce_sum(device, (29, 47), 1, keepdim=True) test_reduce_sum(device, (17, 68), 0, keepdim=True) diff --git a/tests/test_resnet.py b/tests/test_resnet.py index 97c60528..c83f13ba 100644 --- a/tests/test_resnet.py +++ b/tests/test_resnet.py @@ -49,7 +49,7 @@ def test_resnet(device, batch=1, model_type='resnet18'): args = args.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_resnet(device, model_type=args.model_type) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index c64093a0..91bf0ad8 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -7,13 +7,13 @@ base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -config = 
f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' target_model1 = model1().eval() target_model2 = model2(768, 12).eval() # Init scheduler -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py index f3b54159..5a34d161 100644 --- a/tests/test_scheduler_batching.py +++ b/tests/test_scheduler_batching.py @@ -17,7 +17,7 @@ target_model1 = model1().eval() # Init scheduler - scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py index c7fdca06..beab1c54 100644 --- a/tests/test_single_perceptron.py +++ b/tests/test_single_perceptron.py @@ -82,7 +82,7 @@ def weight_update(a, b, lr): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from 
Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_single_perceptron(device) diff --git a/tests/test_softmax.py b/tests/test_softmax.py index 9fba41dd..e6e8cc1e 100644 --- a/tests/test_softmax.py +++ b/tests/test_softmax.py @@ -58,8 +58,8 @@ def test_softmax(device, size=(128, 128), dim=1): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_softmax(device, size=(64, 128)) test_softmax(device, size=(64, 128), dim=0) diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py index b2b16818..72eda0c8 100644 --- a/tests/test_sparse_core.py +++ b/tests/test_sparse_core.py @@ -80,9 +80,9 @@ def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, outp import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine + from Scheduler.scheduler import PyTorchSimRunner - module = ExecutionEngine.setup_device() + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_sparse_mlp(device, batch_size=8, input_size=16, hidden_size=32, output_size=64) diff --git a/tests/test_sparsity.py b/tests/test_sparsity.py index 3e079f83..a2493673 100644 --- a/tests/test_sparsity.py +++ b/tests/test_sparsity.py @@ -96,8 +96,8 @@ def test_mlp_inf(device, batch_size=64, input_size=64, hidden_size=32, output_si ) args = parser.parse_args() - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = 
module.custom_device() #test_dec_inf(device, sparsity=args.sparsity, block=args.block) diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py index 1cf0d3b3..c7abf0ae 100644 --- a/tests/test_spmm_scheduler.py +++ b/tests/test_spmm_scheduler.py @@ -25,7 +25,7 @@ output_size = args.output_size w1_sparsity = args.w1_sparsity w2_sparsity = args.w2_sparsity - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" print("batch_size: ", batch_size) print("input_size: ", input_size) @@ -37,7 +37,7 @@ with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - backend_config=config_path) + togsim_config=config_path) target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval() target_model2 = model2(768, 12).eval() diff --git a/tests/test_stonne.py b/tests/test_stonne.py index 5e4fe5fb..04ad05a8 100644 --- a/tests/test_stonne.py +++ b/tests/test_stonne.py @@ -54,7 +54,7 @@ def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128, spa args = parser.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_sparse_mm(device, args.sz, args.sz, args.sz, args.sparsity) \ No newline at end of file diff --git a/tests/test_topk.py b/tests/test_topk.py new file mode 100644 index 00000000..0d5c08ec --- /dev/null +++ b/tests/test_topk.py @@ -0,0 +1,54 @@ +import torch +import torch._dynamo +import torch.utils.cpp_extension + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = 
f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + exit(1) + +def test_topk(device, size=(128, 128), k=5, dim=-1, largest=True, sorted=True): + # dim 해석을 위해 양수 인덱스로 변환 + dim_ = dim if dim >= 0 else (len(size) + dim) + assert 0 <= dim_ < len(size), "dim이 텐서 차원 범위를 벗어났습니다." + assert k <= size[dim_], f"k(={k})는 size[dim](={size[dim_]}) 이하여야 합니다." + + def topk_fn(a): + return torch.topk(a, k, dim=dim, largest=largest, sorted=sorted) + + x = torch.randn(size) + x = x.to(device=device) + + opt_topk = torch.compile(dynamic=False)(topk_fn) + res_values, res_indices = opt_topk(x) + + ref_values, ref_indices = torch.topk(x.cpu(), k, dim=dim, largest=largest, sorted=sorted) + + test_result("TopK/values", res_values, ref_values) + test_result("TopK/indices", res_indices, ref_indices) + +if __name__ == "__main__": + import os + import sys + import argparse + sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) + + parser = argparse.ArgumentParser(description="Run TopK test with dynamic shape") + parser.add_argument('--shape', type=str, default="(512,768)") + args = parser.parse_args() + shape = tuple(map(int, args.shape.strip('()').split(','))) + + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() + device = module.custom_device() + test_topk(device, (128, 128), k=2, dim=-1) \ No newline at end of file diff --git a/tests/test_transcendental.py b/tests/test_transcendental.py index 5f296581..38c2f4f6 100644 --- a/tests/test_transcendental.py +++ b/tests/test_transcendental.py @@ -73,8 +73,8 @@ def cos(a): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + 
from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_tanh(device) test_exp(device) diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 4d45707e..a3ac55d7 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -119,8 +119,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_EncoderBlock(device) # test_Attention(device, head=16, seq=512, d_k=64) diff --git a/tests/test_transpose2D.py b/tests/test_transpose2D.py index 14f16fbb..af5aacf7 100644 --- a/tests/test_transpose2D.py +++ b/tests/test_transpose2D.py @@ -46,8 +46,8 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_Transpose2D(device, [64, 156]) test_Transpose2D_2(device, [16, 64]) diff --git a/tests/test_transpose3D.py b/tests/test_transpose3D.py index 937948c4..d6c1092d 100644 --- a/tests/test_transpose3D.py +++ b/tests/test_transpose3D.py @@ -61,8 +61,8 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_Transpose3D_1(device, [62, 34, 44]) test_Transpose3D_1(device, [62, 134, 144]) diff --git 
a/tests/test_vectorops.py b/tests/test_vectorops.py index 0677b7ae..ed895171 100644 --- a/tests/test_vectorops.py +++ b/tests/test_vectorops.py @@ -6,8 +6,8 @@ import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() # Target shape diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py index a5a31a85..148fe8fa 100644 --- a/tests/test_view3D_2D.py +++ b/tests/test_view3D_2D.py @@ -44,8 +44,8 @@ def view2D_3D(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_view3D_2D(device) test_view3D_2D(device, [12, 512, 64]) diff --git a/tests/test_vit.py b/tests/test_vit.py index 6f587127..aeb4f148 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -202,8 +202,8 @@ def test_encoder_block_with_class_token( shape = tuple(map(int, args.shape.strip('()').split(','))) sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_multihead_attention(device) #test_encoder_block(device, seq_len=197) diff --git a/tutorial/session1/HelloPyTorchSim.ipynb b/tutorial/session1/HelloPyTorchSim.ipynb new file mode 100644 index 00000000..dfb086a4 --- /dev/null +++ b/tutorial/session1/HelloPyTorchSim.ipynb @@ -0,0 +1,1216 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hello, 
PyTorchSim!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import os\n", + "import sys\n", + "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "sys.path.append(base_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One Touch Simulation\n", + "### Normal Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "cpu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/ro/croutbd6yxrzgdstfcplx7yrpn2do5frwhyx2md5r7rvrubdhdgd.py\n", + "[Gem5] Gem5 is running... \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0\"\n" + ] + } + ], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", + " if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", + " message = f\"|{name} Test Passed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " else:\n", + " message = f\"|{name} Test Failed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " print(\"npu out: \", npu_out.cpu())\n", + " print(\"cpu out: \", cpu_out)\n", + " exit(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------\n", + "|MatMul Test Passed|\n", + "--------------------\n" + ] + } + ], + "source": [ + "test_result(\"MatMul\", npu_out, cpu_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# from Scheduler.scheduler import PyTorchSimRunner\n", + "# npu_device = PyTorchSimRunner.setup_device().custom_device()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normal Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + 
"source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "torch.manual_seed(0)\n", + "cpu_input = torch.randn(128, 128).to(device)\n", + "cpu_weight = torch.randn(128, 128).to(device)\n", + "cpu_target = torch.randn(128, 128).to(device)\n", + "cpu_input.requires_grad = True\n", + "cpu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "cpu_out = opt_fn(cpu_input, cpu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "cpu_loss = loss_fn(cpu_out, cpu_target)\n", + "cpu_loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/5i/c5isqyualxbaqsmuhsux7oubvkypfmh4kvamqvgref6z3ypnrpw5.py\n", + "[Gem5] Gem5 is running... \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/19\"\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. 
", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 15\u001b[0m\n\u001b[1;32m 13\u001b[0m loss_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mCrossEntropyLoss()\n\u001b[1;32m 14\u001b[0m npu_loss \u001b[38;5;241m=\u001b[39m loss_fn(npu_out, npu_target)\n\u001b[0;32m---> 15\u001b[0m \u001b[43mnpu_loss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:266\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 261\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 263\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 266\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 267\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + 
"\u001b[0;31mRuntimeError\u001b[0m: 0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. " + ] + } + ], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "npu_device = PyTorchSimRunner.setup_device().custom_device()\n", + "torch.manual_seed(0)\n", + "npu_input = torch.randn(128, 128).to(npu_device)\n", + "npu_weight = torch.randn(128, 128).to(npu_device)\n", + "npu_target = torch.randn(128, 128).to(npu_device)\n", + "npu_input.requires_grad = True\n", + "npu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(npu_input, npu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "npu_loss = loss_fn(npu_out, npu_target)\n", + "npu_loss.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'test_result' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtest_result\u001b[49m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Input Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_input\u001b[38;5;241m.\u001b[39mgrad, cpu_input\u001b[38;5;241m.\u001b[39mgrad)\n\u001b[1;32m 2\u001b[0m test_result(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Weight Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_weight\u001b[38;5;241m.\u001b[39mgrad, cpu_weight\u001b[38;5;241m.\u001b[39mgrad)\n", + "\u001b[0;31mNameError\u001b[0m: name 'test_result' is not defined" + ] + } + ], + "source": [ + "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n", + 
"test_result(\"MatMul Weight Grad\", npu_weight.grad, cpu_weight.grad)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mapping\n", + "\n", + "Default mapping is based on heuristic." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/5z/c5z4ur2k2svn2gaawn776ev3t6gsa7esgu36la63523cqpbbt56d.py\n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n" + ] + } + ], + "source": [ + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:53:14.002] [info] Total execution cycle: 47158\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual Mapping\n", + "User can set tile size manually." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/mv/cmv6cp7oo3wwndv76iv3sib7r74tnbvodfwxi3rw33k7grlh3h4h.py\n", + "[Gem5] Gem5 is running. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running... \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/75hiq5mugpq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "\n", + "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"1\"\n", + "os.environ['TORCHSIM_TILE_M']=\"512\"\n", + "os.environ['TORCHSIM_TILE_N']=\"512\"\n", + "os.environ['TORCHSIM_TILE_K']=\"512\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:54:00.878] [info] Total execution cycle: 53704\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Autotune" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Auto-tune] Trying tile size: [1024, 1024, 256, 128, 1024, 256]\n", + "[Auto-tune] Trying tile size: [256, 1024, 1024, 128, 1024, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 256, 1024, 128, 256, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 1024, 128, 128, 1024, 128]\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to 
\"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/0\"\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/7j33rcic2qn/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/7j33rcic2qn/togsim_result/0\"\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/vsaamplubl5/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/vsaamplubl5/togsim_result/0\"\n", + "[Auto-tune] Optimal tile size: [1024, 1024, 128, 128, 1024, 128], cycles: 46423\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/3b/c3bebp4b4rp73grbvhbaq4xdxny7f5m7fgqkgpflp2cjn3x5uugr.py\n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"0\"\n", + "os.environ['AUTOTUNE_TEMPLATE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:54:53.051] [info] Total execution cycle: 46422\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Execution Mode\n", + "### Functional & Timing mode (Default)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py\n", + "[Gem5] Gem5 is running.. 
\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/4\"\n" + ] + } + ], + "source": [ + "torch._dynamo.reset()\n", + "os.environ['AUTOTUNE_TEMPLATE']=\"0\"\n", + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Functional only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"0\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Timing only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 7\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 8\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m 
\u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour 
compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, 
inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m 
\u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"0\"\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim Configuration\n", + "### Single Core" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[22], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m 
\u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour 
compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, 
inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m 
\u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:32:01.843] [info] Total execution cycle: 47126\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/11 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multi-Core" + ] + }, + { + 
"cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12\"\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 18:34:48.969] [info] Total execution cycle: 40736\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim log level\n", + "### log level info" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 
7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in 
skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which 
autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 
885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in 
\u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, 
arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_DUMP_PATH']=\"/workspace/PyTorchSim\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### log level trace" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/togsim_result/1\"\n" + ] + } + ], + "source": [ + "os.environ['BACKENDSIM_DEBUG_LEVEL']=\"trace\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torchvision.models import resnet18\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n", + "from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_BACKEND_CONFIG\n", + "\n", + "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=CONFIG_TORCHSIM_BACKEND_CONFIG)\n", + "device = scheduler.execution_engine.module.custom_device()\n", + "\n", + "model = resnet18().eval()\n", + "input = torch.randn(1, 3, 224, 224).to(device=device)\n", + "opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))\n", + "\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_fn)\n", + "request = Request(\"resnet18\", [input], [], request_queue_idx=0)\n", + "scheduler.add_request(request, request_time=0)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " with torch.no_grad():\n", + " scheduler.schedule()\n", + "\n", + "print(\"ResNet18 Simulation Done\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch 
extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 13:05:13.597] [info] [LoadConfig] Success to open \"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 0: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 1: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 0: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 1: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] Ramulator2 config: /root/workspace/PyTorchSim/PyTorchSimBackend/configs/../configs/ramulator2_configs/HBM2.yaml\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] DRAM Bandwidth 716 GB/s, Freq: 700 MHz, Channels: 32, Request_size: 32B\n", + "[2025-11-30 13:05:13.597] [info] [Config/L2Cache] No L2 cache\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] Interconnect freq: 20000 MHz\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] SimpleInerconnect selected\n", + "[0] BackendSim> [Reqest] Resnet18 request time: 0\n", + "[Request issue] partition: 0 batch size: 1\n", + "[Request-0 issue] partition: 0 arrival_time: 0 start_time: 0.0\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py\n", + "launch /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx /tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0 0 0\n", + "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open \"/tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0\"\n", + "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open 
\"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg0 address: 0xa3056c0\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg1 address: 0xc4a3d40\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"systolic_size\": \"128\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"stonneGraph\": \"0\"\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Register graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 at 0\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Tile Graph FIFO Scheduled\n", + "until -1\n", + "[2025-11-30 13:05:22.117] [info] HBM2-CH_0: BW utilization 0% (0 reads, 0 writes)\n", + "[2025-11-30 13:05:22.319] [info] [Scheduler 0] Graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 finish at 2424\n", + "[2025-11-30 13:05:22.319] [info] Total compute time 2424\n", + "cycle\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 33\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# Run scheduler\u001b[39;00m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m scheduler\u001b[38;5;241m.\u001b[39mis_finished():\n\u001b[0;32m---> 33\u001b[0m \u001b[43mscheduler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:475\u001b[0m, in \u001b[0;36mScheduler.schedule\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 473\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcurrent_cycle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbackend_simulator\u001b[38;5;241m.\u001b[39mcycle()\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 475\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnext_time\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:507\u001b[0m, in \u001b[0;36mScheduler.run\u001b[0;34m(self, until_time)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m until_time \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mis_any_idle(req_empty_info):\n\u001b[0;32m--> 507\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mexecute_cycle\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 508\u001b[0m req_empty_info \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest_empty(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion)]\n\u001b[1;32m 509\u001b[0m \u001b[38;5;66;03m# if result is not -1, schedule new request\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:484\u001b[0m, in \u001b[0;36mScheduler.run..execute_cycle\u001b[0;34m()\u001b[0m\n\u001b[1;32m 482\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m 
\u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion):\n\u001b[1;32m 483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mpartition_state[i] \u001b[38;5;241m==\u001b[39m PyTorchSimRunner\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[0;32m--> 484\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_cycle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 485\u001b[0m launch_ret_info\u001b[38;5;241m.\u001b[39mappend(ret)\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcheck_finish_request()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:254\u001b[0m, in \u001b[0;36mPyTorchSimRunner.launch_kernel\u001b[0;34m(self, current_cycle, partion_idx)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[1;32m 253\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx]\n\u001b[0;32m--> 254\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartion_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;241m==\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING:\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:290\u001b[0m, in \u001b[0;36mFIFORunner.select_kernel\u001b[0;34m(self, partition_idx)\u001b[0m\n\u001b[1;32m 287\u001b[0m nested_gen \u001b[38;5;241m=\u001b[39m kernel(\u001b[38;5;241m*\u001b[39minputs)\n\u001b[1;32m 288\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnested_launch_model_dicts[partition_idx] \u001b[38;5;241m=\u001b[39m {req : nested_gen}\n\u001b[1;32m 289\u001b[0m kernel, inputs \u001b[38;5;241m=\u001b[39m \\\n\u001b[0;32m--> 290\u001b[0m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnested_launch_model_dicts\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_idx\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m kernel, inputs\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 293\u001b[0m \u001b[38;5;66;03m# Retry\u001b[39;00m\n", + "File \u001b[0;32m/tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py:227\u001b[0m, in \u001b[0;36mConv2D_1_3_224_22464_3_7_7_2_2_3_3_1_1_3\u001b[0;34m(X, W, Y)\u001b[0m\n\u001b[1;32m 224\u001b[0m W \u001b[38;5;241m=\u001b[39m W\u001b[38;5;241m.\u001b[39mpermute(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mcontiguous() \u001b[38;5;66;03m# (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;66;03m# Launch 
kernel\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[43mmlir_kernel_1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mW\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m (mlir_kernel_1, (X, W, Y))\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:307\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dryrun_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdryrun_simulator\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 307\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfilelock\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FileLock\n\u001b[1;32m 309\u001b[0m lock_dir \u001b[38;5;241m=\u001b[39m get_lock_dir()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/concurrent/futures/_base.py:453\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 453\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 455\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/threading.py:320\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 321\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "from torchvision.models import resnet18\n", + "\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator\n", + "CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "\n", + "lambda_requests = 10\n", + "max_time = 30\n", + "\n", + "target_model1 = resnet18().eval()\n", + "\n", + "# Init scheduler\n", + "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f\"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\")\n", + "# Register compiled model\n", + "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n", + "\n", + "# Generate time stamp\n", + "for 
request_time in poisson_request_generator(lambda_requests, max_time):\n", + " # Init input data\n", + " model_input1 = torch.randn(1, 3, 224, 224)\n", + "\n", + " # Init request\n", + " new_request1 = Request(\"resnet18\", [model_input1], [], request_queue_idx=0)\n", + "\n", + " # Add request to scheduler\n", + " print(\"[Reqest] Resnet18 request time: \", request_time, flush=True)\n", + " scheduler.add_request(new_request1, request_time=request_time)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " scheduler.schedule()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compiler Optimization\n", + "### GeMM + ReLU fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/vr/cvrlybtkuzkk6pmnlfxu7o55375z24tajmiow6mszaen5t4ra6zo.py\n", + "[Gem5] Gem5 is running. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/5o2xythi5z3/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/5o2xythi5z3/togsim_result/0\"\n" + ] + } + ], + "source": [ + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cat: /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0: No such file or directory\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Disable fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/tl/ctlqjsvukam6d4kteerml7exwbt4paw7cjtjbxcwdlsd7e4koriq.py\n", + "[Gem5] Gem5 is running... \n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/37dfo4nczcq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/37dfo4nczcq/togsim_result/0\"\n" + ] + } + ], + "source": [ + "os.environ['TORCHSIM_COMPILER_OPTIMIZATION']=\"none\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 12:52:49.376] [info] Total execution cycle: 47164\n", + "[2025-11-30 12:52:52.444] [info] Total execution cycle: 58510\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/2 | grep \"Total execution cycle\"\n", + "!cat /tmp/torchinductor/tmp/37dfo4nczcq/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Single kernel mode (TODO: remove it?)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/sympy/core/assumptions.py:499\u001b[0m, in \u001b[0;36mmake_property..getit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_assumptions\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfact\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n", + "\u001b[0;31mKeyError\u001b[0m: 'extended_negative'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m model \u001b[38;5;241m=\u001b[39m resnet18()\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 9\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(model)\n\u001b[0;32m---> 10\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m 
\u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:655\u001b[0m, in \u001b[0;36mcatch_errors_wrapper..catch_errors\u001b[0;34m(frame, cache_entry, frame_state)\u001b[0m\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m hijacked_callback(frame, cache_entry, hooks, frame_state)\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_lock, _disable_current_modes():\n\u001b[0;32m--> 655\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mcallback\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:727\u001b[0m, in \u001b[0;36mconvert_frame.._convert_frame\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 725\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43minner_convert\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 729\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:383\u001b[0m, in \u001b[0;36mconvert_frame_assert.._convert_frame_assert\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 370\u001b[0m signpost_event(\n\u001b[1;32m 371\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 372\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_convert_frame_assert._compile\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 379\u001b[0m },\n\u001b[1;32m 380\u001b[0m )\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config\u001b[38;5;241m.\u001b[39mpatch(_patch_config_if_changed()):\n\u001b[0;32m--> 383\u001b[0m compiled_product \u001b[38;5;241m=\u001b[39m \u001b[43m_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_globals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_locals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_builtins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiler_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport_constraints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mframe_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mframe_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcompile_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_product\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:646\u001b[0m, in \u001b[0;36m_compile\u001b[0;34m(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_context(CompileContext(compile_id)):\n\u001b[1;32m 645\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 646\u001b[0m guarded_code \u001b[38;5;241m=\u001b[39m \u001b[43mcompile_inner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m guarded_code\n\u001b[1;32m 648\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 649\u001b[0m Unsupported,\n\u001b[1;32m 650\u001b[0m TorchRuntimeError,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 657\u001b[0m BisectValidationException,\n\u001b[1;32m 658\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:562\u001b[0m, in \u001b[0;36m_compile..compile_inner\u001b[0;34m(code, one_graph, hooks, transform)\u001b[0m\n\u001b[1;32m 560\u001b[0m CompileContext\u001b[38;5;241m.\u001b[39mget()\u001b[38;5;241m.\u001b[39mattempt \u001b[38;5;241m=\u001b[39m attempt\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 562\u001b[0m out_code \u001b[38;5;241m=\u001b[39m \u001b[43mtransform_code_object\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mRestartAnalysis \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py:1033\u001b[0m, in \u001b[0;36mtransform_code_object\u001b[0;34m(code, transformations, 
safe)\u001b[0m\n\u001b[1;32m 1030\u001b[0m instructions \u001b[38;5;241m=\u001b[39m cleaned_instructions(code, safe)\n\u001b[1;32m 1031\u001b[0m propagate_line_nums(instructions)\n\u001b[0;32m-> 1033\u001b[0m \u001b[43mtransformations\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstructions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcode_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1034\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m clean_and_assemble_instructions(instructions, keys, code_options)[\u001b[38;5;241m1\u001b[39m]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:151\u001b[0m, in \u001b[0;36mpreserve_global_state.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m cleanup \u001b[38;5;241m=\u001b[39m setup_compile_debug()\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 153\u001b[0m cleanup\u001b[38;5;241m.\u001b[39mclose()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:527\u001b[0m, in \u001b[0;36m_compile..transform\u001b[0;34m(instructions, code_options)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 526\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m tracing(tracer\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mtracing_context), tracer\u001b[38;5;241m.\u001b[39mset_current_tx():\n\u001b[0;32m--> 527\u001b[0m \u001b[43mtracer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 528\u001b[0m 
\u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mUnspecializeRestartAnalysis:\n\u001b[1;32m 529\u001b[0m speculation_log\u001b[38;5;241m.\u001b[39mclear()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2128\u001b[0m, in \u001b[0;36mInstructionTranslator.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 2127\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m-> 2128\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:818\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mpush_tx(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m (\n\u001b[1;32m 816\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minstruction_pointer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mshould_exit\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 819\u001b[0m ):\n\u001b[1;32m 820\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m BackendCompilerFailed:\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:781\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.step\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 777\u001b[0m unimplemented(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmissing: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minst\u001b[38;5;241m.\u001b[39mopname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 778\u001b[0m TracingContext\u001b[38;5;241m.\u001b[39mset_current_loc(\n\u001b[1;32m 779\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_filename, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlineno, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\n\u001b[1;32m 780\u001b[0m )\n\u001b[0;32m--> 781\u001b[0m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minst\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopname\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minst\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 783\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inst\u001b[38;5;241m.\u001b[39mopname \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 784\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m Unsupported:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2243\u001b[0m, in \u001b[0;36mInstructionTranslator.RETURN_VALUE\u001b[0;34m(self, inst)\u001b[0m\n\u001b[1;32m 2238\u001b[0m _step_logger()(\n\u001b[1;32m 2239\u001b[0m logging\u001b[38;5;241m.\u001b[39mINFO,\n\u001b[1;32m 2240\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorchdynamo done tracing 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (RETURN_VALUE)\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 2241\u001b[0m )\n\u001b[1;32m 2242\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE triggered compile\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 2243\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_subgraph\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2245\u001b[0m \u001b[43m \u001b[49m\u001b[43mreason\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mGraphCompileReason\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2246\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreturn_value\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mframe_summary\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph_break\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 2247\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2248\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_return_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 2249\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2250\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39madd_output_instructions([create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m)])\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:919\u001b[0m, in \u001b[0;36mOutputGraph.compile_subgraph\u001b[0;34m(self, tx, partial_convert, reason, compile_return_value)\u001b[0m\n\u001b[1;32m 916\u001b[0m append_prefix_insts()\n\u001b[1;32m 917\u001b[0m \u001b[38;5;66;03m# optimization to generate better code in a common case\u001b[39;00m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_output_instructions(\n\u001b[0;32m--> 919\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_and_call_fx_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mreversed\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstack_values\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroot\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 920\u001b[0m \u001b[38;5;241m+\u001b[39m [create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUNPACK_SEQUENCE\u001b[39m\u001b[38;5;124m\"\u001b[39m, arg\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(stack_values))]\n\u001b[1;32m 921\u001b[0m )\n\u001b[1;32m 922\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 923\u001b[0m graph_output_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_var(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgraph_out\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m 
\u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1087\u001b[0m, in \u001b[0;36mOutputGraph.compile_and_call_fx_graph\u001b[0;34m(self, tx, rv, root)\u001b[0m\n\u001b[1;32m 1084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtracing_context\u001b[38;5;241m.\u001b[39mfake_mode \u001b[38;5;241m=\u001b[39m backend_fake_mode\n\u001b[1;32m 1086\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrestore_global_state():\n\u001b[0;32m-> 1087\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_user_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1088\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m disable(compiled_fn)\n\u001b[1;32m 1090\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstats\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munique_graphs\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in 
\u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1140\u001b[0m, in \u001b[0;36mOutputGraph.call_user_compiler\u001b[0;34m(self, gm)\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mverify_correctness:\n\u001b[1;32m 1139\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m WrapperBackend(compiler_fn)\n\u001b[0;32m-> 1140\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1141\u001b[0m _step_logger()(logging\u001b[38;5;241m.\u001b[39mINFO, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdone 
compiler function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1142\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(compiled_fn), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompiler_fn did not return callable\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py:117\u001b[0m, in \u001b[0;36mwrap_backend_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 117\u001b[0m compiled_gm \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_gm\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/__init__.py:1662\u001b[0m, in \u001b[0;36m_TorchCompileInductorWrapper.__call__\u001b[0;34m(self, model_, inputs_)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_, inputs_):\n\u001b[1;32m 1660\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_inductor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompile_fx\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compile_fx\n\u001b[0;32m-> 1662\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompile_fx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mconfig_patches\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1168\u001b[0m, in \u001b[0;36mcompile_fx\u001b[0;34m(model_, example_inputs_, inner_compile, config_patches, decompositions)\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inference_compiler(unlifted_gm, example_inputs_)\n\u001b[1;32m 1165\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_fake_mode(fake_mode), torch\u001b[38;5;241m.\u001b[39m_guards\u001b[38;5;241m.\u001b[39mtracing(\n\u001b[1;32m 1166\u001b[0m tracing_context\n\u001b[1;32m 1167\u001b[0m ), compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m-> 1168\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43maot_autograd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[43mfw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43mbw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1172\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecompositions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecompositions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1173\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_fn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1174\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mkeep_inference_input_mutations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1175\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs_\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/backends/common.py:55\u001b[0m, in \u001b[0;36maot_autograd..compiler_fn\u001b[0;34m(gm, example_inputs)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# NB: NOT cloned!\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m enable_aot_logging(), patch_config:\n\u001b[0;32m---> 55\u001b[0m cg \u001b[38;5;241m=\u001b[39m \u001b[43maot_module_simplified\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot_autograd\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m disable(cg)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:887\u001b[0m, in \u001b[0;36maot_module_simplified\u001b[0;34m(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)\u001b[0m\n\u001b[1;32m 871\u001b[0m aot_config \u001b[38;5;241m=\u001b[39m AOTConfig(\n\u001b[1;32m 872\u001b[0m 
fw_compiler\u001b[38;5;241m=\u001b[39mfw_compiler,\n\u001b[1;32m 873\u001b[0m bw_compiler\u001b[38;5;241m=\u001b[39mbw_compiler,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 883\u001b[0m no_tangents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 884\u001b[0m )\n\u001b[1;32m 886\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m--> 887\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_aot_dispatcher_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 888\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunctional_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 889\u001b[0m \u001b[43m \u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 890\u001b[0m \u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 891\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;66;03m# TODO: There is something deeply wrong here; compiled_fn running with\u001b[39;00m\n\u001b[1;32m 894\u001b[0m \u001b[38;5;66;03m# the boxed calling convention, but aot_module_simplified somehow\u001b[39;00m\n\u001b[1;32m 895\u001b[0m \u001b[38;5;66;03m# historically returned a function that was not the boxed calling\u001b[39;00m\n\u001b[1;32m 896\u001b[0m \u001b[38;5;66;03m# convention. 
This should get fixed...\u001b[39;00m\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39mruntime_args):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:600\u001b[0m, in \u001b[0;36mcreate_aot_dispatcher_function\u001b[0;34m(flat_fn, flat_args, aot_config)\u001b[0m\n\u001b[1;32m 597\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m partial(aot_wrapper_dedupe, compiler_fn\u001b[38;5;241m=\u001b[39mcompiler_fn)\n\u001b[1;32m 598\u001b[0m \u001b[38;5;66;03m# You can put more passes here\u001b[39;00m\n\u001b[0;32m--> 600\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mfake_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m aot_config\u001b[38;5;241m.\u001b[39mis_export:\n\u001b[1;32m 602\u001b[0m mutated_user_inp_locs \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 603\u001b[0m idx \u001b[38;5;241m-\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 604\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m fw_metadata\u001b[38;5;241m.\u001b[39mmutated_inp_runtime_indices\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m idx \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 606\u001b[0m ]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:425\u001b[0m, in \u001b[0;36maot_wrapper_dedupe\u001b[0;34m(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ok:\n\u001b[0;32m--> 425\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mleaf_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(leaf_flat_args, fw_metadata):\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 429\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\\\u001b[39;00m\n\u001b[1;32m 430\u001b[0m \u001b[38;5;124;03mEncountered duplicate inputs that are mutated in the graph, but at least one input/output\u001b[39;00m\n\u001b[1;32m 431\u001b[0m \u001b[38;5;124;03mto the graph is a tensor subclass. This is not supported today. You can try to\u001b[39;00m\n\u001b[1;32m 432\u001b[0m \u001b[38;5;124;03mremove the aliasing yourself as a workaround, or otherwise file an issue on github.\"\"\"\u001b[39;00m\n\u001b[1;32m 433\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:630\u001b[0m, in \u001b[0;36maot_wrapper_synthetic_base\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)\u001b[0m\n\u001b[1;32m 628\u001b[0m \u001b[38;5;66;03m# Happy path: we don't need synthetic bases\u001b[39;00m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m synthetic_base_info \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 630\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 632\u001b[0m \u001b[38;5;66;03m# export path: ban synthetic bases for now, add later if requested.\u001b[39;00m\n\u001b[1;32m 633\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(flat_args, fw_metadata):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:295\u001b[0m, in 
\u001b[0;36maot_dispatch_autograd\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata)\u001b[0m\n\u001b[1;32m 292\u001b[0m tracing_context\u001b[38;5;241m.\u001b[39mfw_metadata \u001b[38;5;241m=\u001b[39m inner_meta\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m TracingContext\u001b[38;5;241m.\u001b[39mreport_output_strides() \u001b[38;5;28;01mas\u001b[39;00m fwd_output_strides:\n\u001b[0;32m--> 295\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m \u001b[43maot_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfw_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43madjusted_flat_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(compiled_fw_func, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 297\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m make_boxed_func(compiled_fw_func)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1100\u001b[0m, in \u001b[0;36mcompile_fx..fw_compiler_base\u001b[0;34m(model, example_inputs, is_inference)\u001b[0m\n\u001b[1;32m 1092\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m orig_output_end_idx \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m num_model_outputs\n\u001b[1;32m 1094\u001b[0m user_visible_outputs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 1095\u001b[0m n\u001b[38;5;241m.\u001b[39mname\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m n \u001b[38;5;129;01min\u001b[39;00m model_outputs[original_output_start_index:orig_output_end_idx]\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(n, torch\u001b[38;5;241m.\u001b[39mfx\u001b[38;5;241m.\u001b[39mNode)\n\u001b[1;32m 1098\u001b[0m }\n\u001b[0;32m-> 1100\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1101\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1102\u001b[0m \u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1103\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_fixed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfixed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1104\u001b[0m \u001b[43m \u001b[49m\u001b[43mcudagraphs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcudagraphs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1105\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mgraph_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgraph_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1106\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_inference\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_inference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1107\u001b[0m \u001b[43m \u001b[49m\u001b[43mboxed_forward_device_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforward_device\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1108\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_visible_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_visible_outputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1109\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py:83\u001b[0m, in \u001b[0;36mwrap_compiler_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 81\u001b[0m \u001b[38;5;66;03m# Call the compiler_fn - which is either aot_autograd or inductor\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;66;03m# with fake inputs\u001b[39;00m\n\u001b[0;32m---> 83\u001b[0m inner_compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 85\u001b[0m \u001b[38;5;66;03m# 
TODO: Failures here are troublesome because no real inputs,\u001b[39;00m\n\u001b[1;32m 86\u001b[0m \u001b[38;5;66;03m# need a different serialization strategy\u001b[39;00m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/debug.py:305\u001b[0m, in \u001b[0;36mDebugContext.wrap..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m DebugContext():\n\u001b[0;32m--> 305\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:320\u001b[0m, in \u001b[0;36mcompile_fx_inner\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 316\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m FxGraphCache\u001b[38;5;241m.\u001b[39mload(\n\u001b[1;32m 317\u001b[0m fx_codegen_and_compile, gm, example_inputs, graph_kwargs\n\u001b[1;32m 318\u001b[0m )\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m \u001b[43mfx_codegen_and_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 321\u001b[0m \u001b[43m \u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mgraph_kwargs\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[arg-type]\u001b[39;49;00m\n\u001b[1;32m 322\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 324\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFX codegen and compilation took \u001b[39m\u001b[38;5;132;01m%.3f\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m, time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start)\n\u001b[1;32m 326\u001b[0m \u001b[38;5;66;03m# Return the output strides to the caller via TracingContext\u001b[39;00m\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:535\u001b[0m, in \u001b[0;36mfx_codegen_and_compile\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 519\u001b[0m graph \u001b[38;5;241m=\u001b[39m GraphLowering(\n\u001b[1;32m 520\u001b[0m gm,\n\u001b[1;32m 521\u001b[0m \u001b[38;5;66;03m# example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 532\u001b[0m is_inference\u001b[38;5;241m=\u001b[39mis_inference,\n\u001b[1;32m 533\u001b[0m )\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_graph_handler(graph):\n\u001b[0;32m--> 535\u001b[0m \u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 536\u001b[0m output_strides: List[Optional[Tuple[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m]]] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m graph\u001b[38;5;241m.\u001b[39mgraph_outputs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;66;03m# We'll put the output strides in the compiled graph so we\u001b[39;00m\n\u001b[1;32m 539\u001b[0m \u001b[38;5;66;03m# can later return them to the caller via TracingContext\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:519\u001b[0m, in \u001b[0;36mGraphLowering.run\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;129m@dynamo_timed\u001b[39m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m--> 519\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/fx/interpreter.py:138\u001b[0m, in \u001b[0;36mInterpreter.run\u001b[0;34m(self, initial_env, enable_io_processing, *args)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 138\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv[node] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_node\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mextra_traceback:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:814\u001b[0m, in \u001b[0;36mGraphLowering.run_node\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 812\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayout_constraints\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 813\u001b[0m args, kwargs \u001b[38;5;241m=\u001b[39m layout_constraints[n\u001b[38;5;241m.\u001b[39mtarget](n, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 814\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_magic_method(n\u001b[38;5;241m.\u001b[39mtarget):\n\u001b[1;32m 816\u001b[0m \u001b[38;5;66;03m# TODO: this is sus, it probably should be handled in the\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;66;03m# lowerings themselves similarly to sym_size/sym-stride\u001b[39;00m\n\u001b[1;32m 818\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_magic_method\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:691\u001b[0m, in \u001b[0;36mGraphLowering.call_function\u001b[0;34m(self, target, args, kwargs)\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 690\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m via \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, lowerings[target])\n\u001b[0;32m--> 691\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mlowerings\u001b[49m\u001b[43m[\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 692\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 693\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_lowering.py:117\u001b[0m, in \u001b[0;36mconvolution\u001b[0;34m(x, weight, bias, stride, padding, dilation, transposed, output_padding, groups)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 116\u001b[0m mlir_template \u001b[38;5;241m=\u001b[39m MLIRConvTemplate([x, weight, bias], layout, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmlir_template\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39moutput_node()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_template.py:1189\u001b[0m, in \u001b[0;36mMLIRTemplate.generate\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 1184\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m patch\u001b[38;5;241m.\u001b[39mobject(V\u001b[38;5;241m.\u001b[39mgraph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mget_dtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fake_get_dtype(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node)):\n\u001b[1;32m 1185\u001b[0m kernel \u001b[38;5;241m=\u001b[39m MLIRTemplateKernel(kernel_name\u001b[38;5;241m=\u001b[39mkernel_name, input_nodes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minput_nodes, call_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayout\u001b[38;5;241m.\u001b[39msize, kernel_group\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1186\u001b[0m outer_func_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunction_name \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfunction_name\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1187\u001b[0m outer_func_render\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mouter_func_render \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mouter_func_render\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1188\u001b[0m kernel_arg_attributes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_arg_attributes() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mget_arg_attributes\u001b[39m\u001b[38;5;124m'\u001b[39m) 
\u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1189\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1191\u001b[0m kernel_hash_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmlir_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex_counter)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1192\u001b[0m extra_args \u001b[38;5;241m=\u001b[39m []\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py:238\u001b[0m, in \u001b[0;36mMLIRConvSingleBatchTemplate.render\u001b[0;34m(self, kernel, template_buffer_node, epilogue_nodes, tile_info, **kwargs)\u001b[0m\n\u001b[1;32m 229\u001b[0m kernel\u001b[38;5;241m.\u001b[39mepilogue_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\n\u001b[1;32m 230\u001b[0m output_node \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node\u001b[38;5;241m.\u001b[39mname,\n\u001b[1;32m 231\u001b[0m sram_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput_buffer\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 235\u001b[0m dim_aliasing \u001b[38;5;241m=\u001b[39m 
{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex0\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mc0\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex1\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_n\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex2\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mo_h\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex3\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_m\u001b[39m\u001b[38;5;124m\"\u001b[39m}\n\u001b[1;32m 236\u001b[0m )\n\u001b[1;32m 237\u001b[0m kernel\u001b[38;5;241m.\u001b[39mexception_nodes[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumel\u001b[39m\u001b[38;5;124m\"\u001b[39m : (I_W\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_W)\u001b[38;5;241m*\u001b[39m(I_H\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_H)\u001b[38;5;241m*\u001b[39mI_C\u001b[38;5;241m*\u001b[39mBATCH}\n\u001b[0;32m--> 238\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_template_from_string\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconv_template\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 239\u001b[0m 
kernel\u001b[38;5;241m.\u001b[39madd_loop_info([kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBATCH\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_C\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mI_C\u001b[39m\u001b[38;5;124m\"\u001b[39m]], [kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_M\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_N\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_K\u001b[39m\u001b[38;5;124m\"\u001b[39m]])\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m code\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/jinja2/environment.py:1299\u001b[0m, in \u001b[0;36mTemplate.render\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1296\u001b[0m ctx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_context(\u001b[38;5;28mdict\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n\u001b[1;32m 1298\u001b[0m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1299\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menvironment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconcat\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroot_render_func\u001b[49m\u001b[43m(\u001b[49m\u001b[43mctx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 1300\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1301\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menvironment\u001b[38;5;241m.\u001b[39mhandle_exception()\n", + "File \u001b[0;32m