From 38ebeebe5d2c1b09fe0b7714c97e090bdcee1917 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Tue, 28 Oct 2025 12:40:36 +0000 Subject: [PATCH 01/53] [docs] multi-tenancy load generator --- README.md | 73 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 56b58b28..c86b0685 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ # PyTorchSim: A Comprehensive, Fast, and Accurate NPU Simulation Framework [![Docker Image CI](https://github.com/PSAL-POSTECH/PyTorchSim/actions/workflows/docker-image.yml/badge.svg)](https://github.com/PSAL-POSTECH/PyTorchSim/actions/workflows/docker-image.yml) -PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framework -- We define a RISC-V-based NPU architecture and implement PyTorch compiler backend to run inference & training for PyTorch models -- Achieved high speed and accuracy with our novel Tile-Level Simulation (TLS) with compiler-generated Tile-Operation Graph (TOG), exploiting deterministic tile compute latency -- A generic and extensible NPU architecture based on RISC-V vector extension -- The functional simulator supports code correctness validation and data-dependent timing simulation +PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framework. +- We define a RISC-V-based NPU architecture and implement PyTorch compiler backend to run inference & training for PyTorch models. +- Achieved high speed and accuracy with our novel Tile-Level Simulation (TLS) with compiler-generated Tile-Operation Graph (TOG), exploiting deterministic tile compute latency. +- A generic and extensible NPU architecture based on RISC-V vector extension. +- The functional simulator supports code correctness validation and data-dependent timing simulation. For more details, please refer to our [paper](https://doi.org/10.1145/3725843.3756045)! 
@@ -92,7 +92,7 @@ The `tests` directory contains several AI workloads examples. ```bash python tests/test_matmul.py ``` -The result is stored to `TORCHSIM_DUMP_PATH`/`hash`/backendsim_result/. The log file contains detailed core, memory, and interconnect stats. +The result is stored to `TORCHSIM_DUMP_PATH/hash/backendsim_result/`. The log file contains detailed core, memory, and interconnect stats. ### Run Your Own Model on PyTorchSim You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device. @@ -180,39 +180,41 @@ Load generator supports multi-tenancy experiments. You can simply run `tests/tes python tests/test_scheduler.py ``` Below is an example code of multi-tenancy -`target_model1` and `target_model2` is your own PyTorch model. -You can set the request arrival time and request queue index. Request queue is used for scheduling and you can set the number of queue to each core in [TOGSim configuration](#togsim-configuration) +`target_model0` and `target_model1` is your own PyTorch model. +You can set the request arrival time and request queue index. Request queue is used for scheduling and you can set the number of queue to each core in [TOGSim configuration](#togsim-configuration). +`poisson_request_generator` generates arrival time of requests in a Poisson distribution with `lamda` and `max_time`. 
```python # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) # Register compiled model -opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) -opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) -SchedulerDNNModel.register_model("resnet18", opt_model1) -SchedulerDNNModel.register_model("bert", opt_model2) - -# Init input data -model_input1 = torch.randn(1, 3, 224, 224) -model_input2 = torch.randn(128, 768) - -# Init request -new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request2 = Request("bert", [model_input2], [], request_queue_idx=1) -new_request3 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request4 = Request("bert", [model_input2], [], request_queue_idx=1) - -# Add request to scheduler -scheduler.add_request(new_request1, request_time=0) -scheduler.add_request(new_request2, request_time=0) -scheduler.add_request(new_request3, request_time=0) -scheduler.add_request(new_request4, request_time=0) +opt_model0 = torch.compile(target_model0.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) +opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device())) +SchedulerDNNModel.register_model("model0", opt_model0) +SchedulerDNNModel.register_model("model1", opt_model1) + +# Load Generation +model0_lambda = 5.0 +model1_lambda = 3.0 +max_time = 1000.0 # [s] + +# Generate Possion distribution requests for model0 +for model0_request_time in poisson_request_generator(model0_lambda, total_time=max_time): + x = torch.randn(1, 3, 224, 224) + new_request = Request("model0", [x], [], request_queue_idx=0) + scheduler.add_request(new_request, request_time=model0_request_time) + +# Generate Possion distribution requests for model1 +for 
model1_request_time in poisson_request_generator(model1_lambda, total_time=max_time): + x = torch.randn(128, 768) + new_request = Request("model1", [x], [], request_queue_idx=0) + scheduler.add_request(new_request, request_time=model1_request_time) # Run scheduler while not scheduler.is_finished(): scheduler.schedule() ``` ## Compiler Optimizations -PyTorchSim compiler supports fusions +PyTorchSim compiler supports several fusion optimizations: - GEMM prologue fusion - GEMM epilogue fusion - GEMM reduction fusion @@ -223,7 +225,7 @@ Depending on tensor shape, use different convolution template - Multi-channel optimization ## Mapping -PyTorchSim provids three mapping strategies +PyTorchSim provides three mapping strategies. ### Heuristic-based mapping We adopt and modified heuristic-based mapping of [GEMMINI](https://github.com/ucb-bar/gemmini) by default, which maximizes the utilization of scratchpad memory. ### Auto-tuning @@ -265,7 +267,7 @@ export TORCHSIM_TILE_N=512 export TORCHSIM_TILE_K=512 ``` ## Compiler Configuration -`PyTorchSimFrontend/extension_config.py` contains target hardware configuration to compile +`PyTorchSimFrontend/extension_config.py` contains target hardware configuration to compile. You can configure these options using environment variables. ```bash @@ -346,11 +348,10 @@ If you use PyTorchSim for your research, please cite the following paper. 
@INPROCEEDINGS{yang2025pytorchsim, author={Yang, Wonhyuk and Shin, Yunseon and Woo, Okkyun and Park, Geonwoo and Ham, Hyungkyu and Kang, Jeehoon and Park, Jongse and Kim, Gwangsun}, title={PyTorchSim: A Comprehensive, Fast, and Accurate NPU Simulation Framework}, - booktitle={2025 58th IEEE/ACM International Symposium on Microarchitecture (MICRO)}, - volume={}, - number={}, - pages={}, + booktitle={Proceedings of the 58th IEEE/ACM International Symposium on Microarchitecture}, + pages={1363–1380}, year={2025}, - doi={10.1145/3725843.3756045} + doi={10.1145/3725843.3756045}, + series={MICRO '25} } ``` \ No newline at end of file From ed18bb6f0d44207ec65107df1e4679be3620bcec Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 28 Oct 2025 23:43:28 +0900 Subject: [PATCH 02/53] Improve README with multi-tenancy examples and details Updated README to enhance clarity and provide code examples for multi-tenancy. --- README.md | 54 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index c86b0685..80329b16 100644 --- a/README.md +++ b/README.md @@ -175,23 +175,51 @@ opt_step() `tests/test_mlp.py` provides an example of MLP training. ## Multi-tenancy -Load generator supports multi-tenancy experiments. You can simply run `tests/test_scheduler.py` +Our load generator supports multi-tenancy experiments. You can run a simple example by executing `tests/test_scheduler.py`. ```bash python tests/test_scheduler.py ``` -Below is an example code of multi-tenancy -`target_model0` and `target_model1` is your own PyTorch model. -You can set the request arrival time and request queue index. Request queue is used for scheduling and you can set the number of queue to each core in [TOGSim configuration](#togsim-configuration). -`poisson_request_generator` generates arrival time of requests in a Poisson distribution with `lamda` and `max_time`. 
-```python -# Init scheduler +Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`. +In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSim config file(`.json`). The compiled PyTorch models are then registered with a unique model id. + +```python3 +import os +import sys +import torch +from torchvision.models import resnet18 +from test_transformer import EncoderBlock +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +config = f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' + +sys.path.append(base_path) +from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + # Register compiled model +target_model0 = resnet18().eval() +target_model1 = EncoderBlock(768, 12).eval() opt_model0 = torch.compile(target_model0.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device())) SchedulerDNNModel.register_model("model0", opt_model0) SchedulerDNNModel.register_model("model1", opt_model1) +``` +The config file(`.json`) specifies two key items: +- `num_partition`: The total number of independent request queues to create. +- `partition`: Defines the hardware mapping, assigning each queue (identified by its index) to a specific physical core. +For example, the configuration below creates two scheduling queues (`0` and `1`) and maps `core_0` to queue `0` and `core_1` to queue `1`: +``` + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +``` + +Next, DNN model requests are generated and submitted. We provide a `poisson_request_generator` utility, which generates request arrival times. 
+Each `Request` is created with its model name, data, and a request_queue_idx to specify its target queue, then added via `scheduler.add_request`. +As shown in the code, `model0` requests are queued to `request_queue_idx=0`, while `model1` requests are queued to `request_queue_idx=1`. +```python3 # Load Generation model0_lambda = 5.0 model1_lambda = 3.0 @@ -202,17 +230,21 @@ for model0_request_time in poisson_request_generator(model0_lambda, total_time=m x = torch.randn(1, 3, 224, 224) new_request = Request("model0", [x], [], request_queue_idx=0) scheduler.add_request(new_request, request_time=model0_request_time) - + # Generate Possion distribution requests for model1 for model1_request_time in poisson_request_generator(model1_lambda, total_time=max_time): x = torch.randn(128, 768) - new_request = Request("model1", [x], [], request_queue_idx=0) + new_request = Request("model1", [x], [], request_queue_idx=1) scheduler.add_request(new_request, request_time=model1_request_time) +``` +Finally, `scheduler.schedule()` is called in a loop until all requests are processed. +```python3 # Run scheduler while not scheduler.is_finished(): scheduler.schedule() ``` + ## Compiler Optimizations PyTorchSim compiler supports several fusion optimizations: - GEMM prologue fusion @@ -220,7 +252,7 @@ PyTorchSim compiler supports several fusion optimizations: - GEMM reduction fusion - CONV epilogue fusion -Depending on tensor shape, use different convolution template +Depending on tensor shape, use different convolution template: - Single batch optimization - Multi-channel optimization @@ -354,4 +386,4 @@ If you use PyTorchSim for your research, please cite the following paper. 
doi={10.1145/3725843.3756045}, series={MICRO '25} } -``` \ No newline at end of file +``` From 0a6f24804bbf88ad5d2c1e86e2490168327b0a24 Mon Sep 17 00:00:00 2001 From: OkkyunWoo Date: Wed, 15 Oct 2025 11:48:34 +0000 Subject: [PATCH 03/53] [Frontend] Change ExecutionEngine to PyTorchSimExecutionEngine --- Scheduler/scheduler.py | 12 ++++++------ experiments/gemm.py | 4 ++-- scripts/ILS_experiment/test_matmul.py | 4 ++-- scripts/chiplet_prep.py | 4 ++-- test_extension_backend.py | 4 ++-- tests/Diffusion/test_diffusion.py | 4 ++-- tests/Fusion/test_addmm_residual.py | 4 ++-- tests/Fusion/test_attention_fusion.py | 4 ++-- tests/Fusion/test_bmm_reduction.py | 4 ++-- tests/Fusion/test_conv_fusion.py | 4 ++-- tests/Fusion/test_matmul_activation.py | 4 ++-- tests/Fusion/test_matmul_reduction.py | 4 ++-- tests/Fusion/test_matmul_scalar.py | 4 ++-- tests/Fusion/test_prologue_fusion.py | 4 ++-- tests/Fusion/test_transformer_fusion.py | 4 ++-- tests/MLP/test_mlp.py | 4 ++-- tests/MLP/test_mlp_cpu.py | 1 - tests/Mixtral_8x7B/test_attention.py | 4 ++-- tests/MoE/test_moe.py | 4 ++-- tests/test_activation.py | 4 ++-- tests/test_add.py | 4 ++-- tests/test_batchnorm.py | 4 ++-- tests/test_bmm.py | 4 ++-- tests/test_cnn.py | 4 ++-- tests/test_conv2d.py | 4 ++-- tests/test_exponent.py | 4 ++-- tests/test_indirect_access.py | 4 ++-- tests/test_layernorm.py | 4 ++-- tests/test_matmul.py | 4 ++-- tests/test_mlp.py | 4 ++-- tests/test_pool.py | 4 ++-- tests/test_reduce.py | 4 ++-- tests/test_resnet.py | 4 ++-- tests/test_single_perceptron.py | 4 ++-- tests/test_softmax.py | 4 ++-- tests/test_sparse_core.py | 4 ++-- tests/test_sparsity.py | 4 ++-- tests/test_stonne.py | 4 ++-- tests/test_transcendental.py | 4 ++-- tests/test_transformer.py | 4 ++-- tests/test_transpose2D.py | 4 ++-- tests/test_transpose3D.py | 4 ++-- tests/test_vectorops.py | 4 ++-- tests/test_view3D_2D.py | 4 ++-- tests/test_vit.py | 4 ++-- 45 files changed, 92 insertions(+), 93 deletions(-) diff --git 
a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 834698a6..149b08cf 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -140,7 +140,7 @@ def __str__(self): def register_model(model_name : str, compiled_model): SchedulerDNNModel.MODEL_MAP[model_name] = compiled_model -class ExecutionEngine: +class PyTorchSimExecutionEngine: PARTITION_BUSY = 0 PARTITION_IDLE = 1 SELECT_NOTHING = 2 @@ -262,7 +262,7 @@ def launch_kernel(self, current_cycle, partion_idx=0): self.partition_state[partion_idx] = self.PARTITION_BUSY return self.backend_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) -class FIFOExecutionEngine(ExecutionEngine): +class FIFOExecutionEngine(PyTorchSimExecutionEngine): def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: super().__init__(backend_simulator, num_partion) @@ -297,7 +297,7 @@ def select_kernel(self, partition_idx): # No proper kernel now return self.SELECT_NOTHING -class RRExecutionEngine(ExecutionEngine): +class RRExecutionEngine(PyTorchSimExecutionEngine): def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: super().__init__(backend_simulator, num_partion) self.next_pointer = None @@ -480,7 +480,7 @@ def run(self, until_time): def execute_cycle(): launch_ret_info = [] for i in range(self.execution_engine.num_partion): - if self.execution_engine.partition_state[i] == ExecutionEngine.PARTITION_IDLE: + if self.execution_engine.partition_state[i] == PyTorchSimExecutionEngine.PARTITION_IDLE: ret = self.execution_engine.launch_kernel(self.current_cycle, i) launch_ret_info.append(ret) @@ -495,7 +495,7 @@ def execute_cycle(): for core_idx in result_list: # Kernel is finished. 
So set idle state - self.execution_engine.partition_state[core_idx] = ExecutionEngine.PARTITION_IDLE + self.execution_engine.partition_state[core_idx] = PyTorchSimExecutionEngine.PARTITION_IDLE return result_list @@ -543,4 +543,4 @@ def msec_to_cycle(self, msec): return msec freq = self.backend_simulator.get_core_freq() - return int(msec * (freq / 1000)) \ No newline at end of file + return int(msec * (freq / 1000)) diff --git a/experiments/gemm.py b/experiments/gemm.py index a1fdcff6..9fa02717 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -48,7 +48,7 @@ def custom_matmul(a, b): if 'BACKENDSIM_SPIKE_ONLY' in os.environ: del os.environ['BACKENDSIM_SPIKE_ONLY'] - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() run_matmul(size[0], size[1], size[2], config) diff --git a/scripts/ILS_experiment/test_matmul.py b/scripts/ILS_experiment/test_matmul.py index 09cc407d..256ad962 100644 --- a/scripts/ILS_experiment/test_matmul.py +++ b/scripts/ILS_experiment/test_matmul.py @@ -60,7 +60,7 @@ def custom_matmul(bias, a, b): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_matmul(device, *shape) diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py index 168532f1..8df7d205 100644 --- a/scripts/chiplet_prep.py +++ b/scripts/chiplet_prep.py @@ -61,8 +61,8 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = 
ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() parser = argparse.ArgumentParser(description='Process folder argument.') parser.add_argument('size', type=int, help='Folder value', default=256) diff --git a/test_extension_backend.py b/test_extension_backend.py index f0a9353a..9da2e03d 100644 --- a/test_extension_backend.py +++ b/test_extension_backend.py @@ -22,8 +22,8 @@ from tests.Fusion.test_matmul_activation import test_matmul_activation if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() #test_vectoradd(device, (47, 10)) #test_vector_scalar_add(device, (10, 10)) diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py index 03d1b721..243105c8 100644 --- a/tests/Diffusion/test_diffusion.py +++ b/tests/Diffusion/test_diffusion.py @@ -553,8 +553,8 @@ def test_upsample2d( args = parser.parse_args() sys.path.append(os.environ.get("TORCHSIM_DIR", "/workspace/PyTorchSim")) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() #test_upsample2d(device) diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py index a5e05182..2bea7c80 100644 --- a/tests/Fusion/test_addmm_residual.py +++ b/tests/Fusion/test_addmm_residual.py @@ -43,8 +43,8 @@ def addmm_residual(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import 
PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_addmm_residual(device, 32, 32, 32) test_addmm_residual(device, 128, 128, 128) diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py index 95bdf165..3b4550ef 100644 --- a/tests/Fusion/test_attention_fusion.py +++ b/tests/Fusion/test_attention_fusion.py @@ -75,8 +75,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_MHA(device) # test_Attention(device, head=16, seq=512, d_k=64) diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/Fusion/test_bmm_reduction.py index 42e38095..1a682798 100644 --- a/tests/Fusion/test_bmm_reduction.py +++ b/tests/Fusion/test_bmm_reduction.py @@ -42,8 +42,8 @@ def bmm(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() #test_bmm_reduce(device) test_bmm_reduce(device, 12, 512) diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py index 42210b13..ed2b471e 100644 --- a/tests/Fusion/test_conv_fusion.py +++ b/tests/Fusion/test_conv_fusion.py @@ -101,8 +101,8 @@ def custom_conv_bn_relu(a, b, bias, c, d, e, f): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import 
PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() # Vanila test diff --git a/tests/Fusion/test_matmul_activation.py b/tests/Fusion/test_matmul_activation.py index 2381bd8c..2e9bc4c4 100644 --- a/tests/Fusion/test_matmul_activation.py +++ b/tests/Fusion/test_matmul_activation.py @@ -73,8 +73,8 @@ def test_matmul_activation(device, batch_size=16, input_size=32, output_size=8, import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_matmul_activation(device) test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid") diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py index 31ea1b0d..ea3ed638 100644 --- a/tests/Fusion/test_matmul_reduction.py +++ b/tests/Fusion/test_matmul_reduction.py @@ -89,8 +89,8 @@ def matmul_fused(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_matmul_reduce(device, 3072, 512, 768) test_matmul_var_mean(device) diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/Fusion/test_matmul_scalar.py index 0dcb54f9..044b1729 100644 --- a/tests/Fusion/test_matmul_scalar.py +++ b/tests/Fusion/test_matmul_scalar.py @@ -39,7 +39,7 @@ def matmul_fused(a, b, c): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from 
Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_matmul_scalar(device) diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py index 797f9e76..0f23d570 100644 --- a/tests/Fusion/test_prologue_fusion.py +++ b/tests/Fusion/test_prologue_fusion.py @@ -88,8 +88,8 @@ def bmm(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_elem_broadcast_fusion(device) test_elem_fusion(device) diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py index 0e500b5b..410cf005 100644 --- a/tests/Fusion/test_transformer_fusion.py +++ b/tests/Fusion/test_transformer_fusion.py @@ -203,8 +203,8 @@ def test_EncoderBlock_validation(head=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() #test_MHA(device) test_EncoderBlock(device) diff --git a/tests/MLP/test_mlp.py b/tests/MLP/test_mlp.py index 6f6c9444..f62dbe5f 100644 --- a/tests/MLP/test_mlp.py +++ b/tests/MLP/test_mlp.py @@ -281,9 +281,9 @@ def train(model, device): return if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine + from Scheduler.scheduler import PyTorchSimExecutionEngine torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = ExecutionEngine.setup_device() + module = PyTorchSimExecutionEngine.setup_device() device = 
module.custom_device() test_mlp(device) diff --git a/tests/MLP/test_mlp_cpu.py b/tests/MLP/test_mlp_cpu.py index 49f44650..112f5d07 100644 --- a/tests/MLP/test_mlp_cpu.py +++ b/tests/MLP/test_mlp_cpu.py @@ -399,7 +399,6 @@ def train(model, device): if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine # torch.set_printoptions(threshold=float('inf'), linewidth=600) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py index aa1af651..24ba9501 100644 --- a/tests/Mixtral_8x7B/test_attention.py +++ b/tests/Mixtral_8x7B/test_attention.py @@ -163,8 +163,8 @@ def test_rmsnorm(device, seq=32): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_rmsnorm(device, seq=1) test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2) diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index c5ab8107..ab1e118c 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -783,9 +783,9 @@ def evaluation(model, evaluation_loader): train(opt_model, train_loader) if __name__ == "__main__": - from Scheduler.scheduler import ExecutionEngine + from Scheduler.scheduler import PyTorchSimExecutionEngine torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = ExecutionEngine.setup_device() + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_moe(device) diff --git a/tests/test_activation.py b/tests/test_activation.py index de3542c3..bc45b313 100644 --- a/tests/test_activation.py +++ b/tests/test_activation.py @@ -88,8 +88,8 @@ def test_SwiGLU(device, size=(128, 128)): args = parser.parse_args() shape = tuple(map(int, 
args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_ReLU(device, (47, 10)) test_ReLU(device, (128, 128)) diff --git a/tests/test_add.py b/tests/test_add.py index 5e1ab15e..7bdf4ed8 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -58,8 +58,8 @@ def vectoradd(a, b): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_vectoradd(device, (1, 1)) test_vectoradd(device, (47, 10)) diff --git a/tests/test_batchnorm.py b/tests/test_batchnorm.py index f7abacf5..7992969b 100644 --- a/tests/test_batchnorm.py +++ b/tests/test_batchnorm.py @@ -37,8 +37,8 @@ def test_BatchNorm(device, size=(1, 16, 64, 64)): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_BatchNorm(device) test_BatchNorm(device, size=(1,64, 32, 32)) diff --git a/tests/test_bmm.py b/tests/test_bmm.py index 6d9279aa..39b462a5 100644 --- a/tests/test_bmm.py +++ b/tests/test_bmm.py @@ -46,8 +46,8 @@ def bmm(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = 
module.custom_device() test_BMM(device) test_BMM(device, 2, 256, 128, 256) diff --git a/tests/test_cnn.py b/tests/test_cnn.py index aaad2836..acd6d05d 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -53,7 +53,7 @@ def test_CNN(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_CNN(device) diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py index 21bbfec7..cc59d075 100644 --- a/tests/test_conv2d.py +++ b/tests/test_conv2d.py @@ -40,8 +40,8 @@ def custom_conv2d(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() torch._dynamo.config.cache_size_limit = 64 test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0) diff --git a/tests/test_exponent.py b/tests/test_exponent.py index c95823cb..25a70882 100644 --- a/tests/test_exponent.py +++ b/tests/test_exponent.py @@ -31,7 +31,7 @@ def exponent(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_exponent(device, size=(32, 32)) diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py index b7b20074..6331cb20 100644 --- a/tests/test_indirect_access.py +++ 
b/tests/test_indirect_access.py @@ -48,8 +48,8 @@ def test_embedding(device, vocab_size, dim): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_indirect_vectoradd(device) #test_embedding(device, 1024, 2048) \ No newline at end of file diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py index 1cea9d9f..e743e2df 100644 --- a/tests/test_layernorm.py +++ b/tests/test_layernorm.py @@ -41,8 +41,8 @@ def test_LayerNorm(device, size=(64, 64)): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() #test_LayerNorm(device) test_LayerNorm(device, shape) diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 6f41468b..463dc0b1 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -94,8 +94,8 @@ def custom_linear(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_matmul(device, 32, 32, 32) test_matmul(device, 128, 128, 128) diff --git a/tests/test_mlp.py b/tests/test_mlp.py index b8118aa3..6401918e 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -109,8 +109,8 @@ def test_optimizer(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler 
import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_mlp(device) test_mlp_inf(device, batch_size=1, input_size=256, hidden_size=512, output_size=256) diff --git a/tests/test_pool.py b/tests/test_pool.py index 304a5e7c..48be7c19 100644 --- a/tests/test_pool.py +++ b/tests/test_pool.py @@ -47,8 +47,8 @@ def avgpool(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() #test_maxpool(device, b=1, c=8, h=16, w=16) #test_maxpool(device, b=1, c=8, h=112, w=112) diff --git a/tests/test_reduce.py b/tests/test_reduce.py index e1a84b7f..1c02dfc0 100644 --- a/tests/test_reduce.py +++ b/tests/test_reduce.py @@ -47,8 +47,8 @@ def reduce_sum(a, dim, keepdim): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_reduce_sum(device, (29, 47), 1, keepdim=True) test_reduce_sum(device, (17, 68), 0, keepdim=True) diff --git a/tests/test_resnet.py b/tests/test_resnet.py index 97c60528..5fc598a9 100644 --- a/tests/test_resnet.py +++ b/tests/test_resnet.py @@ -49,7 +49,7 @@ def test_resnet(device, batch=1, model_type='resnet18'): args = args.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + 
module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_resnet(device, model_type=args.model_type) diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py index c7fdca06..1ad7c252 100644 --- a/tests/test_single_perceptron.py +++ b/tests/test_single_perceptron.py @@ -82,7 +82,7 @@ def weight_update(a, b, lr): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_single_perceptron(device) diff --git a/tests/test_softmax.py b/tests/test_softmax.py index 9fba41dd..4707c65d 100644 --- a/tests/test_softmax.py +++ b/tests/test_softmax.py @@ -58,8 +58,8 @@ def test_softmax(device, size=(128, 128), dim=1): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_softmax(device, size=(64, 128)) test_softmax(device, size=(64, 128), dim=0) diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py index b2b16818..ae11a532 100644 --- a/tests/test_sparse_core.py +++ b/tests/test_sparse_core.py @@ -80,9 +80,9 @@ def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, outp import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine + from Scheduler.scheduler import PyTorchSimExecutionEngine - module = ExecutionEngine.setup_device() + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_sparse_mlp(device, batch_size=8, input_size=16, 
hidden_size=32, output_size=64) diff --git a/tests/test_sparsity.py b/tests/test_sparsity.py index 3e079f83..cdf314f3 100644 --- a/tests/test_sparsity.py +++ b/tests/test_sparsity.py @@ -96,8 +96,8 @@ def test_mlp_inf(device, batch_size=64, input_size=64, hidden_size=32, output_si ) args = parser.parse_args() - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() #test_dec_inf(device, sparsity=args.sparsity, block=args.block) diff --git a/tests/test_stonne.py b/tests/test_stonne.py index 5e4fe5fb..64d17aaa 100644 --- a/tests/test_stonne.py +++ b/tests/test_stonne.py @@ -54,7 +54,7 @@ def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128, spa args = parser.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_sparse_mm(device, args.sz, args.sz, args.sz, args.sparsity) \ No newline at end of file diff --git a/tests/test_transcendental.py b/tests/test_transcendental.py index 5f296581..4afda089 100644 --- a/tests/test_transcendental.py +++ b/tests/test_transcendental.py @@ -73,8 +73,8 @@ def cos(a): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_tanh(device) test_exp(device) diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 4d45707e..77d917fe 100644 --- a/tests/test_transformer.py +++ 
b/tests/test_transformer.py @@ -119,8 +119,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_EncoderBlock(device) # test_Attention(device, head=16, seq=512, d_k=64) diff --git a/tests/test_transpose2D.py b/tests/test_transpose2D.py index 14f16fbb..4ef4b0eb 100644 --- a/tests/test_transpose2D.py +++ b/tests/test_transpose2D.py @@ -46,8 +46,8 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_Transpose2D(device, [64, 156]) test_Transpose2D_2(device, [16, 64]) diff --git a/tests/test_transpose3D.py b/tests/test_transpose3D.py index 937948c4..b2fa6ccb 100644 --- a/tests/test_transpose3D.py +++ b/tests/test_transpose3D.py @@ -61,8 +61,8 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_Transpose3D_1(device, [62, 34, 44]) test_Transpose3D_1(device, [62, 134, 144]) diff --git a/tests/test_vectorops.py b/tests/test_vectorops.py index 0677b7ae..840b1062 100644 --- a/tests/test_vectorops.py +++ b/tests/test_vectorops.py @@ -6,8 +6,8 @@ import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', 
default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() # Target shape diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py index a5a31a85..50888696 100644 --- a/tests/test_view3D_2D.py +++ b/tests/test_view3D_2D.py @@ -44,8 +44,8 @@ def view2D_3D(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() test_view3D_2D(device) test_view3D_2D(device, [12, 512, 64]) diff --git a/tests/test_vit.py b/tests/test_vit.py index 6f587127..be1f723a 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -202,8 +202,8 @@ def test_encoder_block_with_class_token( shape = tuple(map(int, args.shape.strip('()').split(','))) sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimExecutionEngine + module = PyTorchSimExecutionEngine.setup_device() device = module.custom_device() #test_multihead_attention(device) #test_encoder_block(device, seq_len=197) From 216f4dedeea084c4f3f8a26c895e3cc92416f71a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 29 Oct 2025 04:28:11 +0000 Subject: [PATCH 04/53] [Refactor] Rename execution engine as a PyTorchSimRunner --- Scheduler/scheduler.py | 14 +++---- experiments/gemm.py | 4 +- scripts/ILS_experiment/test_matmul.py | 4 +- scripts/chiplet_prep.py | 4 +- test_extension_backend.py | 4 +- tests/Diffusion/test_diffusion.py | 4 +- tests/Fusion/test_addmm_residual.py | 4 +- 
tests/Fusion/test_attention_fusion.py | 4 +- tests/Fusion/test_bmm_reduction.py | 4 +- tests/Fusion/test_conv_fusion.py | 4 +- tests/Fusion/test_matmul_activation.py | 4 +- tests/Fusion/test_matmul_reduction.py | 4 +- tests/Fusion/test_matmul_scalar.py | 4 +- tests/Fusion/test_prologue_fusion.py | 4 +- tests/Fusion/test_transformer_fusion.py | 4 +- tests/MLP/test_mlp.py | 4 +- tests/Mixtral_8x7B/test_attention.py | 4 +- tests/MoE/test_moe.py | 4 +- tests/test_activation.py | 4 +- tests/test_add.py | 4 +- tests/test_batchnorm.py | 4 +- tests/test_bmm.py | 4 +- tests/test_cnn.py | 4 +- tests/test_conv2d.py | 4 +- tests/test_exponent.py | 4 +- tests/test_indirect_access.py | 4 +- tests/test_layernorm.py | 4 +- tests/test_matmul.py | 4 +- tests/test_mlp.py | 4 +- tests/test_pool.py | 4 +- tests/test_reduce.py | 4 +- tests/test_resnet.py | 4 +- tests/test_single_perceptron.py | 4 +- tests/test_softmax.py | 4 +- tests/test_sparse_core.py | 4 +- tests/test_sparsity.py | 4 +- tests/test_stonne.py | 4 +- tests/test_topk.py | 54 +++++++++++++++++++++++++ tests/test_transcendental.py | 4 +- tests/test_transformer.py | 4 +- tests/test_transpose2D.py | 4 +- tests/test_transpose3D.py | 4 +- tests/test_vectorops.py | 4 +- tests/test_view3D_2D.py | 4 +- tests/test_vit.py | 4 +- 45 files changed, 147 insertions(+), 93 deletions(-) create mode 100644 tests/test_topk.py diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 149b08cf..10358321 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -140,7 +140,7 @@ def __str__(self): def register_model(model_name : str, compiled_model): SchedulerDNNModel.MODEL_MAP[model_name] = compiled_model -class PyTorchSimExecutionEngine: +class PyTorchSimRunner: PARTITION_BUSY = 0 PARTITION_IDLE = 1 SELECT_NOTHING = 2 @@ -262,7 +262,7 @@ def launch_kernel(self, current_cycle, partion_idx=0): self.partition_state[partion_idx] = self.PARTITION_BUSY return self.backend_simulator.launch(onnx_path, attribute_path, 
current_cycle, partion_idx) -class FIFOExecutionEngine(PyTorchSimExecutionEngine): +class FIFORunner(PyTorchSimRunner): def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: super().__init__(backend_simulator, num_partion) @@ -297,7 +297,7 @@ def select_kernel(self, partition_idx): # No proper kernel now return self.SELECT_NOTHING -class RRExecutionEngine(PyTorchSimExecutionEngine): +class RoundRobinRunner(PyTorchSimRunner): def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: super().__init__(backend_simulator, num_partion) self.next_pointer = None @@ -360,9 +360,9 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, self.backend_simulator = BackendSimulator(backend_path, backend_config) self.backend_simulator.interactive_simulation() if engine_select == Scheduler.FIFO_ENGINE: - self.execution_engine = FIFOExecutionEngine(self.backend_simulator, self.num_request_queue) + self.execution_engine = FIFORunner(self.backend_simulator, self.num_request_queue) elif engine_select == Scheduler.RR_ENGINE: - self.execution_engine = RRExecutionEngine(self.backend_simulator, self.num_request_queue) + self.execution_engine = RoundRobinRunner(self.backend_simulator, self.num_request_queue) else: print(f"Not supporetd engine type {engine_select}") exit(1) @@ -480,7 +480,7 @@ def run(self, until_time): def execute_cycle(): launch_ret_info = [] for i in range(self.execution_engine.num_partion): - if self.execution_engine.partition_state[i] == PyTorchSimExecutionEngine.PARTITION_IDLE: + if self.execution_engine.partition_state[i] == PyTorchSimRunner.PARTITION_IDLE: ret = self.execution_engine.launch_kernel(self.current_cycle, i) launch_ret_info.append(ret) @@ -495,7 +495,7 @@ def execute_cycle(): for core_idx in result_list: # Kernel is finished. 
So set idle state - self.execution_engine.partition_state[core_idx] = PyTorchSimExecutionEngine.PARTITION_IDLE + self.execution_engine.partition_state[core_idx] = PyTorchSimRunner.PARTITION_IDLE return result_list diff --git a/experiments/gemm.py b/experiments/gemm.py index 9fa02717..e7a639ad 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -48,7 +48,7 @@ def custom_matmul(a, b): if 'BACKENDSIM_SPIKE_ONLY' in os.environ: del os.environ['BACKENDSIM_SPIKE_ONLY'] - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() run_matmul(size[0], size[1], size[2], config) diff --git a/scripts/ILS_experiment/test_matmul.py b/scripts/ILS_experiment/test_matmul.py index 256ad962..667dfc66 100644 --- a/scripts/ILS_experiment/test_matmul.py +++ b/scripts/ILS_experiment/test_matmul.py @@ -60,7 +60,7 @@ def custom_matmul(bias, a, b): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul(device, *shape) diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py index 8df7d205..32f7ad50 100644 --- a/scripts/chiplet_prep.py +++ b/scripts/chiplet_prep.py @@ -61,8 +61,8 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() parser = 
argparse.ArgumentParser(description='Process folder argument.') parser.add_argument('size', type=int, help='Folder value', default=256) diff --git a/test_extension_backend.py b/test_extension_backend.py index 9da2e03d..5e6427ef 100644 --- a/test_extension_backend.py +++ b/test_extension_backend.py @@ -22,8 +22,8 @@ from tests.Fusion.test_matmul_activation import test_matmul_activation if __name__ == "__main__": - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_vectoradd(device, (47, 10)) #test_vector_scalar_add(device, (10, 10)) diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py index 243105c8..c5170209 100644 --- a/tests/Diffusion/test_diffusion.py +++ b/tests/Diffusion/test_diffusion.py @@ -553,8 +553,8 @@ def test_upsample2d( args = parser.parse_args() sys.path.append(os.environ.get("TORCHSIM_DIR", "/workspace/PyTorchSim")) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_upsample2d(device) diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py index 2bea7c80..ef753a67 100644 --- a/tests/Fusion/test_addmm_residual.py +++ b/tests/Fusion/test_addmm_residual.py @@ -43,8 +43,8 @@ def addmm_residual(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_addmm_residual(device, 32, 32, 32) test_addmm_residual(device, 128, 128, 128) diff 
--git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py index 3b4550ef..123376d1 100644 --- a/tests/Fusion/test_attention_fusion.py +++ b/tests/Fusion/test_attention_fusion.py @@ -75,8 +75,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_MHA(device) # test_Attention(device, head=16, seq=512, d_k=64) diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/Fusion/test_bmm_reduction.py index 1a682798..4f4d3ad6 100644 --- a/tests/Fusion/test_bmm_reduction.py +++ b/tests/Fusion/test_bmm_reduction.py @@ -42,8 +42,8 @@ def bmm(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_bmm_reduce(device) test_bmm_reduce(device, 12, 512) diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py index ed2b471e..694f3bb9 100644 --- a/tests/Fusion/test_conv_fusion.py +++ b/tests/Fusion/test_conv_fusion.py @@ -101,8 +101,8 @@ def custom_conv_bn_relu(a, b, bias, c, d, e, f): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() # Vanila test diff --git a/tests/Fusion/test_matmul_activation.py 
b/tests/Fusion/test_matmul_activation.py index 2e9bc4c4..2f1d014f 100644 --- a/tests/Fusion/test_matmul_activation.py +++ b/tests/Fusion/test_matmul_activation.py @@ -73,8 +73,8 @@ def test_matmul_activation(device, batch_size=16, input_size=32, output_size=8, import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul_activation(device) test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid") diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py index ea3ed638..df8cf969 100644 --- a/tests/Fusion/test_matmul_reduction.py +++ b/tests/Fusion/test_matmul_reduction.py @@ -89,8 +89,8 @@ def matmul_fused(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul_reduce(device, 3072, 512, 768) test_matmul_var_mean(device) diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/Fusion/test_matmul_scalar.py index 044b1729..0815bb90 100644 --- a/tests/Fusion/test_matmul_scalar.py +++ b/tests/Fusion/test_matmul_scalar.py @@ -39,7 +39,7 @@ def matmul_fused(a, b, c): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul_scalar(device) diff --git 
a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py index 0f23d570..b27312a9 100644 --- a/tests/Fusion/test_prologue_fusion.py +++ b/tests/Fusion/test_prologue_fusion.py @@ -88,8 +88,8 @@ def bmm(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_elem_broadcast_fusion(device) test_elem_fusion(device) diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py index 410cf005..b1cceb2c 100644 --- a/tests/Fusion/test_transformer_fusion.py +++ b/tests/Fusion/test_transformer_fusion.py @@ -203,8 +203,8 @@ def test_EncoderBlock_validation(head=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_MHA(device) test_EncoderBlock(device) diff --git a/tests/MLP/test_mlp.py b/tests/MLP/test_mlp.py index f62dbe5f..31bcefdf 100644 --- a/tests/MLP/test_mlp.py +++ b/tests/MLP/test_mlp.py @@ -281,9 +281,9 @@ def train(model, device): return if __name__ == "__main__": - from Scheduler.scheduler import PyTorchSimExecutionEngine + from Scheduler.scheduler import PyTorchSimRunner torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = PyTorchSimExecutionEngine.setup_device() + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_mlp(device) diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py index 24ba9501..6a7747f7 100644 --- 
a/tests/Mixtral_8x7B/test_attention.py +++ b/tests/Mixtral_8x7B/test_attention.py @@ -163,8 +163,8 @@ def test_rmsnorm(device, seq=32): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_rmsnorm(device, seq=1) test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2) diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index ab1e118c..ae16f0b0 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -783,9 +783,9 @@ def evaluation(model, evaluation_loader): train(opt_model, train_loader) if __name__ == "__main__": - from Scheduler.scheduler import PyTorchSimExecutionEngine + from Scheduler.scheduler import PyTorchSimRunner torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = PyTorchSimExecutionEngine.setup_device() + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_moe(device) diff --git a/tests/test_activation.py b/tests/test_activation.py index bc45b313..575fc7e8 100644 --- a/tests/test_activation.py +++ b/tests/test_activation.py @@ -88,8 +88,8 @@ def test_SwiGLU(device, size=(128, 128)): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_ReLU(device, (47, 10)) test_ReLU(device, (128, 128)) diff --git a/tests/test_add.py b/tests/test_add.py index 7bdf4ed8..118632d5 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -58,8 +58,8 @@ def vectoradd(a, b): args = parser.parse_args() shape = tuple(map(int, 
args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_vectoradd(device, (1, 1)) test_vectoradd(device, (47, 10)) diff --git a/tests/test_batchnorm.py b/tests/test_batchnorm.py index 7992969b..251805f5 100644 --- a/tests/test_batchnorm.py +++ b/tests/test_batchnorm.py @@ -37,8 +37,8 @@ def test_BatchNorm(device, size=(1, 16, 64, 64)): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_BatchNorm(device) test_BatchNorm(device, size=(1,64, 32, 32)) diff --git a/tests/test_bmm.py b/tests/test_bmm.py index 39b462a5..d90410db 100644 --- a/tests/test_bmm.py +++ b/tests/test_bmm.py @@ -46,8 +46,8 @@ def bmm(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_BMM(device) test_BMM(device, 2, 256, 128, 256) diff --git a/tests/test_cnn.py b/tests/test_cnn.py index acd6d05d..54225747 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -53,7 +53,7 @@ def test_CNN(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = 
module.custom_device() test_CNN(device) diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py index cc59d075..e964319d 100644 --- a/tests/test_conv2d.py +++ b/tests/test_conv2d.py @@ -40,8 +40,8 @@ def custom_conv2d(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() torch._dynamo.config.cache_size_limit = 64 test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0) diff --git a/tests/test_exponent.py b/tests/test_exponent.py index 25a70882..e60f8407 100644 --- a/tests/test_exponent.py +++ b/tests/test_exponent.py @@ -31,7 +31,7 @@ def exponent(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_exponent(device, size=(32, 32)) diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py index 6331cb20..c6afaf86 100644 --- a/tests/test_indirect_access.py +++ b/tests/test_indirect_access.py @@ -48,8 +48,8 @@ def test_embedding(device, vocab_size, dim): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_indirect_vectoradd(device) #test_embedding(device, 1024, 2048) \ No newline at end of file diff --git a/tests/test_layernorm.py 
b/tests/test_layernorm.py index e743e2df..28e38d37 100644 --- a/tests/test_layernorm.py +++ b/tests/test_layernorm.py @@ -41,8 +41,8 @@ def test_LayerNorm(device, size=(64, 64)): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_LayerNorm(device) test_LayerNorm(device, shape) diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 463dc0b1..cd30bd30 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -94,8 +94,8 @@ def custom_linear(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_matmul(device, 32, 32, 32) test_matmul(device, 128, 128, 128) diff --git a/tests/test_mlp.py b/tests/test_mlp.py index 6401918e..423d6e8e 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -109,8 +109,8 @@ def test_optimizer(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_mlp(device) test_mlp_inf(device, batch_size=1, input_size=256, hidden_size=512, output_size=256) diff --git a/tests/test_pool.py b/tests/test_pool.py index 48be7c19..f5505dba 100644 --- a/tests/test_pool.py +++ b/tests/test_pool.py @@ -47,8 +47,8 @@ def avgpool(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', 
default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_maxpool(device, b=1, c=8, h=16, w=16) #test_maxpool(device, b=1, c=8, h=112, w=112) diff --git a/tests/test_reduce.py b/tests/test_reduce.py index 1c02dfc0..4781112d 100644 --- a/tests/test_reduce.py +++ b/tests/test_reduce.py @@ -47,8 +47,8 @@ def reduce_sum(a, dim, keepdim): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_reduce_sum(device, (29, 47), 1, keepdim=True) test_reduce_sum(device, (17, 68), 0, keepdim=True) diff --git a/tests/test_resnet.py b/tests/test_resnet.py index 5fc598a9..c83f13ba 100644 --- a/tests/test_resnet.py +++ b/tests/test_resnet.py @@ -49,7 +49,7 @@ def test_resnet(device, batch=1, model_type='resnet18'): args = args.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_resnet(device, model_type=args.model_type) diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py index 1ad7c252..beab1c54 100644 --- a/tests/test_single_perceptron.py +++ b/tests/test_single_perceptron.py @@ -82,7 +82,7 @@ def weight_update(a, b, lr): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = 
PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_single_perceptron(device) diff --git a/tests/test_softmax.py b/tests/test_softmax.py index 4707c65d..e6e8cc1e 100644 --- a/tests/test_softmax.py +++ b/tests/test_softmax.py @@ -58,8 +58,8 @@ def test_softmax(device, size=(128, 128), dim=1): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_softmax(device, size=(64, 128)) test_softmax(device, size=(64, 128), dim=0) diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py index ae11a532..72eda0c8 100644 --- a/tests/test_sparse_core.py +++ b/tests/test_sparse_core.py @@ -80,9 +80,9 @@ def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, outp import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine + from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimExecutionEngine.setup_device() + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_sparse_mlp(device, batch_size=8, input_size=16, hidden_size=32, output_size=64) diff --git a/tests/test_sparsity.py b/tests/test_sparsity.py index cdf314f3..a2493673 100644 --- a/tests/test_sparsity.py +++ b/tests/test_sparsity.py @@ -96,8 +96,8 @@ def test_mlp_inf(device, batch_size=64, input_size=64, hidden_size=32, output_si ) args = parser.parse_args() - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() 
device = module.custom_device() #test_dec_inf(device, sparsity=args.sparsity, block=args.block) diff --git a/tests/test_stonne.py b/tests/test_stonne.py index 64d17aaa..04ad05a8 100644 --- a/tests/test_stonne.py +++ b/tests/test_stonne.py @@ -54,7 +54,7 @@ def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128, spa args = parser.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_sparse_mm(device, args.sz, args.sz, args.sz, args.sparsity) \ No newline at end of file diff --git a/tests/test_topk.py b/tests/test_topk.py new file mode 100644 index 00000000..0d5c08ec --- /dev/null +++ b/tests/test_topk.py @@ -0,0 +1,54 @@ +import torch +import torch._dynamo +import torch.utils.cpp_extension + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + exit(1) + +def test_topk(device, size=(128, 128), k=5, dim=-1, largest=True, sorted=True): + # dim 해석을 위해 양수 인덱스로 변환 + dim_ = dim if dim >= 0 else (len(size) + dim) + assert 0 <= dim_ < len(size), "dim이 텐서 차원 범위를 벗어났습니다." + assert k <= size[dim_], f"k(={k})는 size[dim](={size[dim_]}) 이하여야 합니다." 
+ + def topk_fn(a): + return torch.topk(a, k, dim=dim, largest=largest, sorted=sorted) + + x = torch.randn(size) + x = x.to(device=device) + + opt_topk = torch.compile(dynamic=False)(topk_fn) + res_values, res_indices = opt_topk(x) + + ref_values, ref_indices = torch.topk(x.cpu(), k, dim=dim, largest=largest, sorted=sorted) + + test_result("TopK/values", res_values, ref_values) + test_result("TopK/indices", res_indices, ref_indices) + +if __name__ == "__main__": + import os + import sys + import argparse + sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) + + parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape") + parser.add_argument('--shape', type=str, default="(512,768)") + args = parser.parse_args() + shape = tuple(map(int, args.shape.strip('()').split(','))) + + from Scheduler.scheduler import ExecutionEngine + module = ExecutionEngine.setup_device() + device = module.custom_device() + test_topk(device, (128, 128), k=2, dim=-1) \ No newline at end of file diff --git a/tests/test_transcendental.py b/tests/test_transcendental.py index 4afda089..38c2f4f6 100644 --- a/tests/test_transcendental.py +++ b/tests/test_transcendental.py @@ -73,8 +73,8 @@ def cos(a): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_tanh(device) test_exp(device) diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 77d917fe..a3ac55d7 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -119,8 +119,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import 
PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_EncoderBlock(device) # test_Attention(device, head=16, seq=512, d_k=64) diff --git a/tests/test_transpose2D.py b/tests/test_transpose2D.py index 4ef4b0eb..af5aacf7 100644 --- a/tests/test_transpose2D.py +++ b/tests/test_transpose2D.py @@ -46,8 +46,8 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_Transpose2D(device, [64, 156]) test_Transpose2D_2(device, [16, 64]) diff --git a/tests/test_transpose3D.py b/tests/test_transpose3D.py index b2fa6ccb..d6c1092d 100644 --- a/tests/test_transpose3D.py +++ b/tests/test_transpose3D.py @@ -61,8 +61,8 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_Transpose3D_1(device, [62, 34, 44]) test_Transpose3D_1(device, [62, 134, 144]) diff --git a/tests/test_vectorops.py b/tests/test_vectorops.py index 840b1062..ed895171 100644 --- a/tests/test_vectorops.py +++ b/tests/test_vectorops.py @@ -6,8 +6,8 @@ import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = 
module.custom_device() # Target shape diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py index 50888696..148fe8fa 100644 --- a/tests/test_view3D_2D.py +++ b/tests/test_view3D_2D.py @@ -44,8 +44,8 @@ def view2D_3D(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() test_view3D_2D(device) test_view3D_2D(device, [12, 512, 64]) diff --git a/tests/test_vit.py b/tests/test_vit.py index be1f723a..aeb4f148 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -202,8 +202,8 @@ def test_encoder_block_with_class_token( shape = tuple(map(int, args.shape.strip('()').split(','))) sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimExecutionEngine - module = PyTorchSimExecutionEngine.setup_device() + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() device = module.custom_device() #test_multihead_attention(device) #test_encoder_block(device, seq_len=197) From 84b8b21b0aae3e467fa32c3bc0f67b6b91dc435a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 31 Oct 2025 06:25:55 +0000 Subject: [PATCH 05/53] [Refactor] Rearchitect MLIRMultiDim class --- PyTorchSimFrontend/extension_codecache.py | 56 +- PyTorchSimFrontend/extension_config.py | 3 - PyTorchSimFrontend/mlir/mlir_autotune.py | 25 + .../mlir/mlir_codegen_backend.py | 190 +++-- PyTorchSimFrontend/mlir/mlir_common.py | 706 +++++++++--------- PyTorchSimFrontend/mlir/mlir_template.py | 20 +- Simulator/simulator.py | 22 +- 7 files changed, 551 insertions(+), 471 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 20152e9f..1e756f96 100644 --- 
a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -142,6 +142,10 @@ class SpadOverflowError(Exception): def __init__(self, message="SPAD overflow occurred."): super().__init__(message) +class TileSizeError(Exception): + def __init__(self, message="SPAD overflow occurred."): + super().__init__(message) + class MLIRCodeCache: cache = dict() clear = staticmethod(cache.clear) # Todo: Cache @@ -278,8 +282,12 @@ def task(): loop_size = kwargs["loop_size"] else: loop_size = [] + + # In the autotune mode, skip validation to speed up + autotune = kwargs.get('autotune', False) + validate = kwargs.get('validate', False) if not autotune else False + def dummy_simulator(*args, **kwargs): - validate = kwargs.get('validate', False) # Wait for compilation key = future.result() from filelock import FileLock @@ -311,37 +319,29 @@ def dummy_simulator(*args, **kwargs): return result def dryrun_simulator(*args, **kwargs): - autotune = kwargs.get('autotune', False) key = future.result() - # Run simulator pass - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key)) - # Dump arguments and meta data - dump_metadata(args, arg_attributes, result_path) - runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: - return + from filelock import FileLock + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + # Run simulator pass + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key)) + # Dump arguments and meta data + dump_metadata(args, arg_attributes, result_path) + runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) + if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + return - if autotune: - onnx_path = os.path.join(result_path, "tile_graph.onnx") - attribute_path = os.path.join(runtime_path, "attribute") - 
backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) - backsim.vectorlane_size = vectorlane_size - attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size) - result_path_2 = backsim.simulation(onnx_path, attribute_path) - result = BackendSimulator.get_result_from_file(result_path_2) - return result_path, runtime_path, result - - # Todo. Support valude dependent mode for graph mode - if False: # extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: - funcsim = FunctionalSimulator(result_path, key) - funcsim.run_spike(args, arg_attributes, - runtime_path, self.validation_binary_name, - vectorlane_size=vectorlane_size, spad_info=spad_info, - cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) + # Todo. Support valude dependent mode for graph mode + if False: # extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: + funcsim = FunctionalSimulator(result_path, key) + funcsim.run_spike(args, arg_attributes, + runtime_path, self.validation_binary_name, + vectorlane_size=vectorlane_size, spad_info=spad_info, + cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) return result_path, runtime_path, None - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) + is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) and not autotune target_simulator = dryrun_simulator if is_dryrun else dummy_simulator target_simulator.arg_attributes = arg_attributes target_simulator.future = future diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 59f3818c..80675682 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -18,9 +18,6 @@ # Tile size config CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') -# DUMP PATH -CONFIG_BACKEND_RESULT_PATH_KEY = os.getenv("BACKEND_RESULT_PATH_KEY") - 
CONFIG_TORCHSIM_DUMP_PATH = os.environ.get('TORCHSIM_DUMP_PATH', default = f"{tempfile.gettempdir()}/torchinductor") CONFIG_TORCHSIM_DUMP_FILE = int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index af101f44..54aed9c0 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -1,8 +1,12 @@ import functools import torch +import os import dataclasses from torch._inductor.autotune_process import BenchmarkRequest from torch._inductor.autotune_process import TensorMeta +from torch._inductor.codecache import get_hash, write +from PyTorchSimFrontend import extension_config +from Simulator.simulator import BackendSimulator from typing import ( Any, @@ -15,6 +19,14 @@ TYPE_CHECKING, Union, ) + +# FIXME. Avoid circular import +def hash_prefix(hash_value): + return hash_value[1:12] + +def get_write_path(src_code): + return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(get_hash(src_code.strip()))) + @dataclasses.dataclass class MLIRBenchmarkRequest(): def __init__( @@ -46,6 +58,18 @@ def make_run_fn( ) -> Callable[[], None]: from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile custom_async_compile = CustomAsyncCompile() + + # Check already cached result. 
+ write_path = get_write_path(self.source_code) + key, _ = write(self.source_code, "mlir", specified_dir=write_path) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "backendsim_result/0") + if os.path.exists(result_path): + result = BackendSimulator.get_result_from_file(result_path) + def cached_run_fn(*args, **kwargs): + return result + return cached_run_fn + + # Run a candidate code run_method = custom_async_compile.mlir( self.source_code, vectorlane_size=self.extra_args["vector_lane"], loop_size=None, spad_info=self.extra_args["spad_info"], @@ -56,6 +80,7 @@ def make_run_fn( tensor for tensor in list(input_tensors) + list(output_tensors) ] + # Generate partial function. return functools.partial( run_method, diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 21d2868e..09ee129b 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1076,8 +1076,8 @@ def load(self, name: str, index: sympy.Expr): # Extract sram info local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, buffer=apply_buffer) - vlane_split_axis = local_tile_desc.vlane_split_axis - vlane_stride = local_tile_desc.vlane_stride + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride tile_numel_per_lane = local_tile_desc.get_numel_per_lane() tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) tile_stride = local_tile_desc.get_tile_stride() @@ -1123,8 +1123,8 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): # Prepare dma instruction local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index) - vlane_split_axis = local_tile_desc.vlane_split_axis - vlane_stride = local_tile_desc.vlane_stride + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride dram_shape = 
mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) @@ -1271,8 +1271,8 @@ def store_reduction(self, name, index, value): # Tile is always reuduced in inner loop local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix) - vlane_split_axis = local_tile_desc.vlane_split_axis - vlane_stride = local_tile_desc.vlane_stride + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) @@ -1354,15 +1354,15 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): self.register_var_info(div_vec, [compute_vec_size, "index"]) self.register_var_info(mod_vec, [compute_vec_size, "index"]) dim = ops.modular(ops.div(vector_index, div_vec), mod_vec) - if idx == tile_desc.vlane_split_axis: # Need to add vector lane offset - offset = tile_desc.vlane_stride #* strides[idx] - outer_sz = tile_size[idx] // tile_desc.vlane_stride + if idx == tile_desc.vmap.vlane_split_axis: # Need to add vector lane offset + offset = tile_desc.vmap.vlane_stride #* strides[idx] + outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride nr_vector_lane = self.get_const_cse(self.vector_lane, "index") nr_vector_lane_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{nr_vector_lane} : index to vector<{compute_vec_size}xindex>") self.register_var_info(nr_vector_lane_vec, [compute_vec_size, "index"]) - vlane_stride_coeff = self.get_const_cse(tile_desc.vlane_stride, "index") + vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index") vlane_outer_coeff = self.get_const_cse(outer_sz, "index") vlane_stride_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_stride_coeff} : index to 
vector<{compute_vec_size}xindex>") vlane_outer_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_outer_coeff} : index to vector<{compute_vec_size}xindex>") @@ -1432,9 +1432,9 @@ def index_expr(self, index, dtype): # FIXME. This is a temporary solution to get tile stride of the reduction case tile_desc = mlir_common.MLIRMultiDimTile( base_tile_desc.get_tile_size(), - base_tile_desc.vector_lane, - base_tile_desc.vlane_split_axis, - base_tile_desc.vlane_stride, + base_tile_desc.vmap.vector_lane, + base_tile_desc.vmap.vlane_split_axis, + base_tile_desc.vmap.vlane_stride, base_tile_desc.get_compute_vec_size(), ) axis_order = list(range(len(tile_desc.get_tile_size()))) @@ -1536,32 +1536,76 @@ def codegen_loops(self): def make_choices(self, nodes, kernel_name): choices = [] initial_tile_size = self.kernel_group.tile_desc.get_tile_size() - previous_ranges = self.ranges - prevent_infinite_loop = 0 - if len(initial_tile_size) < 2: - return choices # Can't autotune for 1-D tile size + prev_ranges = self.ranges + prev_tail_threshold = self.kernel_group.tile_desc.tail_ratio_threshold + + # Allow more tail ratio during autotuning + self.kernel_group.tile_desc.tail_ratio_threshold = 0.3 + + if prev_ranges == [1] or len(prev_ranges) == 0: + return choices + #if len(initial_tile_size) < 2: + # return choices # Can't autotune for 1-D tile size + for vlane_stride in [2, 4, 8]: - os.environ['TORCHSIM_VECTOR_LANE_STRIDE'] = str(vlane_stride) - previous_tile_size = initial_tile_size - increase_dim = -2 # increase the first dimension - while previous_tile_size[increase_dim] * 2 <= previous_ranges[increase_dim] and previous_tile_size[increase_dim] <= 2 ** 13 and prevent_infinite_loop < 10: - incrase_dim = -1 # only increase the last dimension - prevent_infinite_loop += 1 - while previous_tile_size[incrase_dim] * 2 <= previous_ranges[incrase_dim] and previous_tile_size[incrase_dim] <= 2 ** 13: + self.kernel_group.tile_desc.set_tile_size(initial_tile_size) + 
self.kernel_group.tile_desc.vmap.vlane_stride = vlane_stride + prevent_infinite_loop = 0 + + # Get the dimension to increase + candidate_axes = [ + axis for axis, constr in enumerate(self.kernel_group.tile_desc.tile_constraint) + if not constr.fixed + ] + search_space = set() + + # Try initial tile size + self.reset(None) + src_code = super().codegen_nodes(nodes, kernel_name) + current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) + search_space.add(current_tile_sz) + + print(f"[Auto-tune] Trying tile size: {current_tile_sz}, vlane_stride: {vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") + self._prepare_simulator_headers(src_code) + bench_runner = self.run_bench(nodes, kernel_name, src_code) + choices.append((bench_runner, src_code, self.kernel_group)) + + while prevent_infinite_loop < 10 and candidate_axes: + for axis in list(candidate_axes): + prev_tile_sz = self.kernel_group.tile_desc.get_tile_size() + + # If tile size is maximized for this axis, remove from candidate axes + if prev_tile_sz[axis] >= prev_ranges[axis] * 2 or prev_tile_sz[axis] >= 2 ** 13: + candidate_axes.remove(axis) + self.reset(None) + continue + + # Try increase tile size for this axis + try: + self.kernel_group.tile_desc.scale_tile_dim(axis, prev_ranges[axis], 2) + except extension_codecache.TileSizeError as e: + # Failed to find proper tile size + candidate_axes.remove(axis) + self.reset(None) + continue + + self.reset(None) src_code = super().codegen_nodes(nodes, kernel_name) - if self.stop_autotune: - print(f"[Auto-tune] Skipping autotuning due to enough tile size: {self.kernel_group.tile_desc.get_tile_size()}") - break - print(f"[Auto-tune] Trying tile size: {self.kernel_group.tile_desc.get_tile_size()}, vlane_stride: {vlane_stride}") - previous_tile_size = self.kernel_group.tile_desc.get_tile_size() + current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) + + # If tile size is converged for this axis, remove from candidate 
axes + if current_tile_sz in search_space: + candidate_axes.remove(axis) + continue + + # Add this choice + search_space.add(current_tile_sz) + print(f"[Auto-tune] Trying tile size: {current_tile_sz}, vlane_stride: {vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") self._prepare_simulator_headers(src_code) bench_runner = self.run_bench(nodes, kernel_name, src_code) choices.append((bench_runner, src_code, self.kernel_group)) - self.reset(f"tile_size_{incrase_dim}") - previous_tile_size[incrase_dim] = initial_tile_size[incrase_dim] - self.kernel_group.tile_desc.set_tile_size(previous_tile_size) - self.reset(f"tile_size_{increase_dim}") - self.reset("vlane_stride") + prevent_infinite_loop += 1 + self.kernel_group.tile_desc.prev_tail_threshold = prev_tail_threshold return choices def autotune(self, nodes, kernel_name): @@ -1570,25 +1614,11 @@ def get_cycle(choice): for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY): # TODO: make simple try: # bench_runner = self.run_bench(nodes, kernel_name, src_code) - if int(os.environ.get('BACKENDSIM_DRYRUN', default=False)): - _, _, out = bench_runner(autotune=1) - else: - out = bench_runner(validate=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE) + out = bench_runner(validate=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, autotune=True) return out[-1] except (extension_codecache.SpadOverflowError, RuntimeError) as e: return float("inf") - #if isinstance(e, RuntimeError) and str(e) != "STACK_OVERFLOW": - # print(f"Benchmark[trial-{n_try}] failed with unexpected error: {e}") - # return float("inf") - #print(f"Benchmark failed due to spad overflow with tile size: {self.kernel_group.tile_desc.get_tile_size()}") - #self.kernel_group = kernel_group # Reset to the original tile desc - #self.reset("spad_overflow") - #src_code = super().codegen_nodes(nodes, kernel_name) - #bench_runner = self.run_bench(nodes, kernel_name, src_code) - #kernel_group = self.kernel_group - 
#self._prepare_simulator_headers(src_code) return float("inf") # Exceeded maximum number of autotuning attempts - choices = self.make_choices(nodes, kernel_name) if len(choices) == 0: # can't autotune @@ -1598,7 +1628,7 @@ def get_cycle(choice): max_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") - print(f"[Auto-tune] Optimal tile size: {choices[max_idx][2].tile_desc.get_tile_size()}, vlane_stride: {choices[max_idx][2].tile_desc.vlane_stride}, cycles: {results[max_idx]}") + print(f"[Auto-tune] Optimal tile size: {choices[max_idx][2].tile_desc.get_tile_size()}, vlane_stride: {choices[max_idx][2].tile_desc.vmap.vlane_stride}, cycles: {results[max_idx]}") optimal_src_code = choices[max_idx][1] return optimal_src_code @@ -1664,78 +1694,73 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims) - if kg_tile_desc.vlane_split_axis in local_dims: - local_vlane_split_axis = local_dims.index(kg_tile_desc.vlane_split_axis) + if kg_tile_desc.vmap.vlane_split_axis in local_dims: + local_vlane_split_axis = local_dims.index(kg_tile_desc.vmap.vlane_split_axis) else: local_vlane_split_axis = max(len(local_dims) - 1, 0) # Case 0. Tile is 0-D scalar if len(local_dims) == 0: if not store_reduction: - local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vlane_stride]) # Force it to use vector instruction. - local_tile_desc.vlane_split_axis = local_vlane_split_axis # last axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.set_tile_size([kg_tile_desc.get_used_vlane() * kg_tile_desc.vmap.vlane_stride]) # Force it to use vector instruction. 
+ local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis # last axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: local_tile_desc.set_tile_size([1]) - local_tile_desc.vlane_split_axis = 0 - local_tile_desc.vlane_stride = 1 + local_tile_desc.vmap.vlane_split_axis = 0 + local_tile_desc.vmap.vlane_stride = 1 dram_stride = [0] # Edge case # Case 1. Tile is 1-D vector type elif len(local_dims) == 1 and len(local_dims) <= self.reduction_depth: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(local_dims[0])]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 2. Tile is 1-D vector type with reduction elif len(local_dims) == 1 and len(local_dims) == self.reduction_depth + 1: local_tile_desc.set_tile_size([1, kg_tile_desc.get_dim_size(local_dims[0])]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis + 1 - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + 1 + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 3. 
Tile is 2-D tile elif len(local_dims) == 2: is_reduction = self.reduction_depth == 1 and not store_reduction if is_reduction: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], [1, 0]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 3. Tile is 3-D tile elif len(local_dims) == 3: is_reduction = self.reduction_depth < 3 and not store_reduction if is_reduction: axis_order = [1, 2, 0] if self.get_nr_rdim()==1 else [2, 1, 0] local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims], axis_order) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride # Case 4. Tile is 4-D tile (e.g., Convolution epilogue) elif len(local_dims) == 4: is_reduction = self.reduction_depth < 3 and not store_reduction if is_reduction: raise NotImplementedError("Currently not implemented... 
;)") local_tile_desc.set_tile_size([kg_tile_desc.get_dim_size(dim) for dim in local_dims]) - local_tile_desc.vlane_split_axis = local_vlane_split_axis - local_tile_desc.vlane_stride = kg_tile_desc.vlane_stride + local_tile_desc.vmap.vlane_split_axis = local_vlane_split_axis + local_tile_desc.vmap.vlane_stride = kg_tile_desc.vmap.vlane_stride else: raise NotImplementedError("Currently not implemented... ;)") if len(implicit_local_dims)!=0 and len(local_dims) != len(implicit_local_dims) and self.is_modular_indexing(index): - tile_size = local_tile_desc.get_tile_size() - new_tile_size = [] - new_vlane_split_axis = local_tile_desc.vlane_split_axis - implicit_dim_size = list(kg_tile_desc.implicit_dim_size.values()) - for i, target_dim_size in enumerate(implicit_dim_size): - new_tile_size += [1]*(len(target_dim_size)-1) + tile_size[i:i+1] - if local_tile_desc.vlane_split_axis >= i: - new_vlane_split_axis += len(target_dim_size)-1 - # Update - local_tile_desc.set_tile_size(new_tile_size) - local_tile_desc.vlane_split_axis = new_vlane_split_axis + for axis_constraints in self.kernel_group.tile_desc.implicit_dim_size.values(): + if len(axis_constraints) <= 1: + continue + sorted_constraints = sorted(axis_constraints, key=lambda c: int(c.args[1])) + for constraint in sorted_constraints[1:]: + index = index.replace(constraint.original_expr, 0) # Calculate dram stride dram_stride = [0] * local_tile_desc.get_nr_dim() @@ -1780,6 +1805,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe new_tile_sizes = list(self.kernel_group.tile_desc.get_tile_size()) new_tile_sizes[dim_idx] = new_size self.kernel_group.tile_desc.set_tile_size(new_tile_sizes) + self.kernel_group.tile_desc.tile_constraint[dim_idx].fixed = True # Send recompile signal self.reset("recompile") diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 73996351..67d5380f 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ 
b/PyTorchSimFrontend/mlir/mlir_common.py @@ -1,5 +1,7 @@ import dataclasses import math +from dataclasses import dataclass +from typing import Optional, Iterable from typing import Dict from typing import List from collections import defaultdict @@ -13,7 +15,7 @@ from torch._inductor.virtualized import V from torch._inductor.ir import MultiOutputLayout from torch._inductor.dependencies import MemoryDep, StarDep, WeakDep -from torch.utils._sympy.functions import ModularIndexing +from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Mod import sympy import contextlib @@ -32,6 +34,7 @@ unique, ) from PyTorchSimFrontend import extension_config +from PyTorchSimFrontend import extension_codecache from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") @@ -209,169 +212,72 @@ def set_info(outer, inner, arg_type): set_info(outer, inner, self.MLIR_ARGS_VAR) return arg_defs, call_args, arg_attributes, buffer_types -class MLIRMultiDimTile(): - def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None, vec_size=None): - self.name = "" - self._tile_size = list(tile_size) - self._tile_stride = None - self.tile_axis_order = list(range(len(tile_size))) - self.vec_size = vec_size - self.update_tile_stride() - - # Vector lane mapping config +class VectorLaneMapping(): + def __init__(self, vector_lane: int, forced_vec_size: int, vlane_split_axis: int, vlane_stride: int): self.vector_lane = vector_lane self.vlane_split_axis = vlane_split_axis self.vlane_stride = vlane_stride - self.implicit_dim_size = None - self.nr_rdim = 0 - - # Dram offset - self.offset = sympy.Integer(0) + self.forced_vec_size = forced_vec_size - def set_name(self, name: str): - self.name = name - - def set_tile_size(self, tile_size, tile_axis_order=None): - self._tile_size = tile_size - if tile_axis_order is None: - self.tile_axis_order = list(range(len(tile_size))) - else: - 
self.tile_axis_order = tile_axis_order - self.update_tile_stride() - - def set_tile_size_stride(self, tile_size, tile_stride): - self._tile_size = tile_size - self._tile_stride = tile_stride - - def get_name(self) -> str: - return self.name - - def get_tile_size(self): - return self._tile_size - - def get_numel(self): - """ - Return size of multi-dimensional tile - """ - size = 1 - for dim_size in self._tile_size: - size *= dim_size - return size - - def get_numel_per_lane(self): - tile_size_per_lane = self.get_tile_size_per_lane() - size = 1 - for dim_size in tile_size_per_lane: - size *= dim_size - return size - - def update_tile_stride(self): - strides = [1] * len(self._tile_size) - init = 1 - - original_indices = list(range(len(self.tile_axis_order))) - sorted_pairs = sorted( - zip(self.tile_axis_order, self._tile_size, original_indices), - key=lambda x: x[0], reverse=True + def get_used_vlane(self, tile_size: list[int]): + return min( + math.ceil(tile_size[self.vlane_split_axis] / self.vlane_stride), + self.vector_lane ) - for _, size, original_indices in sorted_pairs: - strides[original_indices] = init - init *= size - self._tile_stride = strides - def get_tile_stride(self): - return self._tile_stride + def get_tile_size_per_lane(self, tile_size: list[int]): + per_lane = tile_size.copy() + used = self.get_used_vlane(tile_size) + if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(per_lane): + raise AssertionError("Not allowed split_axis") + per_lane[self.vlane_split_axis] = math.ceil(per_lane[self.vlane_split_axis] / used) + return per_lane - def get_tile_stride_per_lane(self): - tile_stride = list(self.get_tile_stride()) # original strides - tile_size = list(self.get_tile_size()) # original tile size - split_axis = self.vlane_split_axis + def get_numel_per_lane(self, tile_size: list[int]): + return math.prod(self.get_tile_size_per_lane(tile_size)) - tile_size_per_lane = self.get_tile_size_per_lane() - coeff = 
tile_size[split_axis]//tile_size_per_lane[split_axis] + def get_tile_stride_per_lane(self, tile_size: list[int], tile_stride: list[int]): + tile_stride = tile_stride.copy() # original strides + get_tile_size_per_lane = self.get_tile_size_per_lane(tile_size) + coeff = tile_size[self.vlane_split_axis]//get_tile_size_per_lane[self.vlane_split_axis] # Propagate stride according to per-lane tile size for i in range(len(tile_stride)): - if tile_stride[i] > tile_stride[split_axis]: + if tile_stride[i] > tile_stride[self.vlane_split_axis]: tile_stride[i] = tile_stride[i] // coeff return tile_stride - def get_tile_size_per_lane(self): - tile_size_per_lane = list(self._tile_size) - if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(tile_size_per_lane): - raise AssertionError("Not allowed split_axis") - used_vlane = self.get_used_vlane() - tile_size_per_lane[self.vlane_split_axis] = \ - self.div_round_up(tile_size_per_lane[self.vlane_split_axis], used_vlane) - return tile_size_per_lane - - def get_nr_dim(self): - """ - Return number of dimensions - """ - return len(self._tile_size) - - def get_dim_size(self, index): - if isinstance(index, int): - return self._tile_size[index] - elif "index" in str(index): - return self._tile_size[int(str(index)[5:])] - raise NotImplementedError("Unsupported format of index") - - def get_mlir_shape(self, dtype): - str_tile_size = [str(dim) for dim in self._tile_size] - shape = "x".join(str_tile_size) - return f"memref<{shape}x{dtype}, 1>" - - def get_mlir_vshape(self, mlir_dtype): - return f"vector<{self.get_compute_vec_size()}x{mlir_dtype}>" if self.get_compute_vec_size() > 1 else f"{mlir_dtype}" - - def get_used_vlane(self): - """ - Return number of used vector lane - """ - if self.vlane_split_axis < 0 or self.vlane_split_axis >= len(self._tile_size): - raise AssertionError("Not allowed split_axis") - return min(self.div_round_up(self._tile_size[self.vlane_split_axis], self.vlane_stride), self.vector_lane) - - def 
get_vlane_stride(self): - return self.vlane_stride - - def get_compute_vec_size(self): - # Granule size used in compute loop - if self.vec_size is not None: - return self.vec_size - if self.nr_rdim: - assert self.nr_rdim!=0 - val = self.get_numel_per_lane() // self.get_reduction_numel() - if self.get_numel_per_lane() >= val * 8: - return val*8 - elif self.get_numel_per_lane() >= val * 4: - return val*4 - elif self.get_numel_per_lane() >= val * 2: - return val*2 + def get_compute_vec_size(self, tile_size: list[int], reduction_numel: int, nr_rdim: int) -> int: + if self.forced_vec_size is not None: + return self.forced_vec_size + + per_lane = self.get_numel_per_lane(tile_size) + stride = self.vlane_stride + if nr_rdim: + val = per_lane // max(reduction_numel, 1) + for mult in [8, 4, 2]: + if per_lane >= val * mult: + return val * mult return val - if (self.get_numel_per_lane() // self.vlane_stride) >= 8: - return self.vlane_stride * 8 - if (self.get_numel_per_lane() // self.vlane_stride) >= 4: - return self.vlane_stride * 4 - if (self.get_numel_per_lane() // self.vlane_stride) >= 2: - return self.vlane_stride * 2 - return self.vlane_stride + for mult in [8, 4, 2]: + if (per_lane // stride) >= mult: + return stride * mult + return stride - @staticmethod - def div_round_up(size, round_val): - return (size + round_val - 1) // round_val +class TileAdjustMixin(): + def __init__(self): + self.tail_ratio_threshold = 0.01 def apply_divisor(self, axis: int, divisor: int, mode: str = "split"): - # Apply divisor to tile size at given axis. - # This method based on axis order. 
+ """Split or pad a given axis of the tile.""" old_size = self._tile_size[axis] - if divisor == 1: + if divisor <= 1: return - padded = self.div_round_up(old_size, divisor) * divisor - outer = self.div_round_up(old_size, divisor) - inner = divisor + + padded = math.ceil(old_size / divisor) * divisor + outer = math.ceil(old_size / divisor) + inner = divisor + if mode == "pad": self._tile_size[axis] = padded self.update_tile_stride() @@ -382,54 +288,276 @@ def apply_divisor(self, axis: int, divisor: int, mode: str = "split"): new_sizes.insert(axis + 1, inner) self._tile_size = new_sizes - # Update tile_axis_order old_order_val = self.tile_axis_order[axis] new_order = list(self.tile_axis_order) new_order.insert(axis + 1, old_order_val + 0.1) - sorted_pairs = sorted( - zip(range(len(new_order)), new_order), - key=lambda x: x[1] - ) - self.tile_axis_order = [idx for idx, _ in sorted_pairs] + self.tile_axis_order = [idx for idx, _ in sorted( + zip(range(len(new_order)), new_order), key=lambda x: x[1] + )] self.update_tile_stride() - if self.vlane_split_axis == axis: - self.vlane_split_axis = axis - elif self.vlane_split_axis > axis: - self.vlane_split_axis += 1 + # Adjust split axis for vmap + if self.vmap.vlane_split_axis > axis: + self.vmap.vlane_split_axis += 1 return - else: - raise ValueError(f"Unknown mode: {mode}. Supported modes are 'pad' and 'split'.") - def get_reduction_numel(self): - return reduce(mul, self.get_tile_size()[-1*self.nr_rdim:], 1) + raise ValueError(f"Unknown mode: {mode}. 
Supported: 'pad', 'split'.") - def is_dim_dividable(self, dim_sizes): + def is_dim_dividable(self, dim_sizes: list[int]) -> bool: if len(dim_sizes) != len(self._tile_size): - raise ValueError("dim_sizes must match the tile size dimensions.") - dim_sizes_cpy = [int(d) for d in dim_sizes] - remain = dim_sizes_cpy[self.vlane_split_axis] % self.vlane_stride + raise ValueError("dim_sizes must match the tile size dimensions") + + dim_sizes_cpy = list(dim_sizes) + axis, stride = self.vmap.vlane_split_axis, self.vmap.vlane_stride + remain = dim_sizes_cpy[axis] % stride if remain: - dim_sizes_cpy[self.vlane_split_axis] += self.vlane_stride - remain + dim_sizes_cpy[axis] += stride - remain + return all(d % t == 0 for d, t in zip(dim_sizes_cpy, self._tile_size)) - def adjust_tile_to_divisible(self, dim_sizes): + def adjust_tile_to_divisible(self, dim_sizes: list[int]) -> list[int]: + """Adjust current tile to be divisible by given dimensions.""" + if len(dim_sizes) != len(self._tile_size): + raise ValueError("dim_sizes must match the tile size dimensions") + def _adjust_one(dim_size, tile_size): for candidate in range(tile_size, 0, -1): if dim_size % candidate == 0: return candidate return 1 - if len(dim_sizes) != len(self._tile_size): - raise ValueError("dim_sizes must match the tile size dimensions.") candidate_tile_size = [_adjust_one(d, t) for d, t in zip(dim_sizes, self._tile_size)] - # FIXME. Is this the only solution? 
- # Round up - remain = candidate_tile_size[self.vlane_split_axis] % self.vlane_stride + for i in range(len(candidate_tile_size)): + self.tile_constraint[i].must_divide_dim = True + + axis, stride = self.vmap.vlane_split_axis, self.vmap.vlane_stride + remain = candidate_tile_size[axis] % stride + if remain: - candidate_tile_size[self.vlane_split_axis] += self.vlane_stride - remain + candidate_tile_size[axis] += stride - remain + self.tile_constraint[axis].must_divide_dim = False return candidate_tile_size + def scale_tile_dim(self, axis, dim_sz, scale_factor=2): + axis_constrinat = self.tile_constraint[axis] + current_sz = self._tile_size[axis] + new_sz = axis_constrinat.adjust(current_sz, int(current_sz * scale_factor), dim_sz) + self._tile_size[axis] = new_sz + self.update_tile_stride() + return current_sz != new_sz + + def decrease_tile_size(self, dim_size): + tile_size = self._tile_size + vlane_split_axis, vlane_stride, vector_lane = self.vmap.vlane_split_axis, self.vmap.vlane_stride, self.vmap.vector_lane + tile_size = list(tile_size) + + # Decrease vlane_split_axis when it is too large + if tile_size[vlane_split_axis] > 2 * vlane_stride * vector_lane: + if self.scale_tile_dim(vlane_split_axis, dim_size[vlane_split_axis], scale_factor=0.5): + return + + for i in range(len(tile_size)): + if i == vlane_split_axis: + continue + if tile_size[i] > 1: + if self.scale_tile_dim(i, dim_size[i], scale_factor=0.5): + return + + # Decrease vlane_split_axis at the end to maximize the vlane usage + self.scale_tile_dim(vlane_split_axis, dim_size[vlane_split_axis], scale_factor=0.5) + return + + def trim_large_tail(self, ranges: list[int]): + for i, (dim_range, tile_range) in enumerate(zip(ranges, self._tile_size)): + ALPHA = 1.0 + BETA = 0.5 + constraint = self.tile_constraint[i] + if constraint.fixed: + continue + elif constraint.must_divide_dim: + BETA = 0 + + padding_ratio = TileAdjustMixin.get_padding_ratio(tile_range, dim_range) + if padding_ratio < 
self.tail_ratio_threshold: + continue + best_tile = tile_range + best_cost = ( + ALPHA * padding_ratio + + BETA * (dim_range / tile_range) + ) + + min_tile = 1 + for candidate in range(tile_range - 1, min_tile - 1, -1): + new_candidate = constraint.adjust(tile_range, candidate, dim_range) + ratio = TileAdjustMixin.get_padding_ratio(new_candidate, dim_range) + iter_penalty = (dim_range / new_candidate) + + cost = ALPHA * ratio + BETA * iter_penalty + if cost < best_cost: + best_tile, best_cost = new_candidate, cost + self._tile_size[i] = best_tile + + def select_vlane_axis(self): + best_vlane_split_axis = 0 + best_used_vlane = math.ceil(self._tile_size[0] / self.vmap.vlane_stride) + for i, dim in enumerate(self._tile_size[1:len(self._tile_size)-self.nr_rdim]): + used_vlane = math.ceil(dim / self.vmap.vlane_stride) + if used_vlane > best_used_vlane: + best_used_vlane = used_vlane + best_vlane_split_axis = i+1 + self.vmap.vlane_split_axis = best_vlane_split_axis + + def pad_vlane_tile(self): + vlane_split_axis, vlane_stride, vector_lane = self.vmap.vlane_split_axis, self.vmap.vlane_stride, self.vmap.vector_lane + used_vlane = min(math.ceil(self._tile_size[vlane_split_axis] / vlane_stride), vector_lane) + padded_size = used_vlane * vlane_stride + self._tile_size[vlane_split_axis] = math.ceil(self._tile_size[vlane_split_axis] / padded_size) * padded_size + + def apply_constraints(self, constraints, ranges): + for idx, (axis_constraints, axis_size) in enumerate(zip(constraints.values(), ranges)): + for const in axis_constraints: + if const.args[1] == 1: + continue + divider = int(const.args[1]) + + if not self.tile_constraint[idx].fixed: + self.tile_constraint[idx].fixed = True + self._tile_size[idx] = divider + elif self.tile_constraint[idx].fixed and self._tile_size[idx] > divider: + self._tile_size[idx] = divider + self.update_tile_stride() + + @staticmethod + def init_tile_size(ranges, vlane_stride, vector_lane): + nr_dim = len(ranges) + tile_size = [1] * nr_dim + if 
len(tile_size) == 2: + tile_size[-1] = vlane_stride * vector_lane + tile_size[-2] = 2 * vector_lane + elif len(tile_size) == 0: # Scalar + tile_size = [1] + ranges = [1] + elif len(tile_size) == 1 and ranges[0]==1: + tile_size[0] = 1 + elif len(tile_size) == 1: + tile_size[0] = 2 * vlane_stride * vector_lane + elif len(tile_size) == 3: + tile_size[-1] = vector_lane + tile_size[-2] = 4 * vector_lane + tile_size[-3] = 2 + elif len(tile_size) == 4: + tile_size[-1] = vector_lane + tile_size[-2] = 4 * vector_lane + tile_size[-3] = 2 + tile_size[-4] = 1 + else: + raise NotImplementedError("dummy tile size fail!") + return tile_size + + @staticmethod + def get_padding_ratio(tile_range: int, dim_range: int) -> float: + if tile_range <= 0 or dim_range <= 0: + raise ValueError("tile_range and dim_range must be positive integers") + tail = dim_range % tile_range + padding = (tile_range - tail) % tile_range + return float(padding / dim_range) + +@dataclass +class TileConstraint: + multiple_of: int = 1 + must_divide_dim: bool = False + fixed: bool = False + + def adjust(self, old: int, new: int, dim: int) -> int: + if self.fixed: + return old # Fixed tile size + + tail = new % self.multiple_of + new -= tail + if not self.must_divide_dim: + return max(new, self.multiple_of) + + while new > 0: + if dim % new == 0: + return new + new -= self.multiple_of + raise extension_codecache.TileSizeError("Cannot find suitable tile size under the given constraints.") + +class MLIRMultiDimTile(TileAdjustMixin): + def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=None, forced_vec_size=None): + super().__init__() + self.name = "" + self._tile_size = list(tile_size) + self._tile_stride = None + self.tile_constraint = [TileConstraint(vlane_stride) for _ in tile_size] + self.tile_axis_order = list(range(len(tile_size))) + self.update_tile_stride() + + # Vector lane mapping config + self.vmap = VectorLaneMapping( + vector_lane=vector_lane, + 
forced_vec_size=forced_vec_size, + vlane_split_axis=vlane_split_axis, + vlane_stride=vlane_stride + ) + + self.implicit_dim_size = None + self.nr_rdim = 0 + self.offset = sympy.Integer(0) # Dram offset + + def set_name(self, name: str): self.name = name + def get_name(self) -> str: return self.name + def get_tile_size(self): return list(self._tile_size) + def get_tile_stride(self): return list(self._tile_stride) + def get_numel(self) -> int :return math.prod(self._tile_size) + def get_nr_dim(self) -> str: return len(self._tile_size) + def get_reduction_numel(self): return reduce(mul, self.get_tile_size()[-1*self.nr_rdim:], 1) + + def set_tile_size(self, tile_size, tile_axis_order=None, constraints=None): + self._tile_size = list(tile_size) + self.tile_axis_order = list(range(len(tile_size))) if tile_axis_order is None else tile_axis_order + self.update_tile_stride() + + def set_tile_size_stride(self, tile_size, tile_stride): + self._tile_size = list(tile_size) + self._tile_stride = list(tile_stride) + + def update_tile_stride(self): + strides = [1] * len(self._tile_size) + init = 1 + + original_indices = list(range(len(self.tile_axis_order))) + sorted_pairs = sorted( + zip(self.tile_axis_order, self._tile_size, original_indices), + key=lambda x: x[0], reverse=True + ) + for _, size, original_indices in sorted_pairs: + strides[original_indices] = init + init *= size + self._tile_stride = strides + + def get_dim_size(self, index): + if isinstance(index, int): + return self._tile_size[index] + elif "index" in str(index): + return self._tile_size[int(str(index)[5:])] + raise NotImplementedError("Unsupported format of index") + + # Vector mapping delegation + def get_tile_size_per_lane(self): return self.vmap.get_tile_size_per_lane(self._tile_size) + def get_used_vlane(self): return self.vmap.get_used_vlane(self._tile_size) + def get_numel_per_lane(self): return self.vmap.get_numel_per_lane(self._tile_size) + def get_tile_stride_per_lane(self): return 
self.vmap.get_tile_stride_per_lane(self._tile_size, self._tile_stride) + def get_compute_vec_size(self): return self.vmap.get_compute_vec_size(self._tile_size, self.get_reduction_numel(), self.nr_rdim) + + # Helper functions for codegen + def get_mlir_shape(self, dtype): + shape = "x".join([str(dim) for dim in self._tile_size]) + return f"memref<{shape}x{dtype}, 1>" + + def get_mlir_vshape(self, mlir_dtype): + return f"vector<{self.get_compute_vec_size()}x{mlir_dtype}>" if self.get_compute_vec_size() > 1 else f"{mlir_dtype}" + class MLIRWrapperKenrelGroup(cpp.KernelGroup): def __init__(self): super().__init__() @@ -525,191 +653,96 @@ def call_kernel(self, kernel_name): def is_modular_indexing(self, expr): return "ModularIndexing" in str(expr) - def compute_tile_size(self, nodes, vars, reduction_vars): - # Handle implict dims. Input operand could have larger dimension space. - implicit_ranges = False - target_operand : MemoryDep = None - implicit_dim_size = defaultdict(list) - for read_operand in nodes[0].read_writes.reads: - read_operand : MemoryDep - if isinstance(read_operand, StarDep) or isinstance(read_operand, WeakDep): # FIXME: WeakDep & StarDep are not supported (MoE case) - continue - read_index = read_operand.index - for arg in read_index.args: - if "ModularIndexing" in str(arg) or "//" in str(arg): - implicit_ranges = True - target_operand = read_operand - break - - if implicit_ranges: - #print("This operation contain implicit dimension space!") - linearized_stride = [1] * len(target_operand.var_names) - for i in range(len(target_operand[3])-2, -1, -1): - linearized_stride[i] = linearized_stride[i+1] * target_operand[3][i+1] - - linearized_index = sympy.Integer(0) - for dim, stride in zip(target_operand[2], linearized_stride): - linearized_index += stride * dim - - new_dim_expression = [] - new_dim_size = [] - for arg in target_operand.index.args: + def implicit_dim_ops(self, nodes): + target_patterns = (ModularIndexing, FloorDiv, Mod) + target_operands = 
[] + for target_node in nodes: + for read_operand in target_node.read_writes.reads: + read_operand: MemoryDep + if isinstance(read_operand, StarDep) or isinstance(read_operand, WeakDep): + continue + read_index = read_operand.index + for arg_expr in read_index.args: + if arg_expr.atoms(*target_patterns): + target_operands.append(read_operand) + return target_operands + + def extract_dividers(self, implicit_ops): + # When a specific axis is processed, the key constraint to verify is the divider. + # The tile size must be forced to match the divider size. + dim_dividers = defaultdict(set) + for operand in implicit_ops: + subs_map = { + s: sympy.symbols(s.name.replace("c", "index", 1)) + for s in operand.index.free_symbols + } + rev_subs_map = { + sympy.symbols(s.name.replace("c", "index", 1)) : s + for s in operand.index.free_symbols + } + new_index = operand.index.subs(subs_map) + for arg in new_index.args: if len(arg.free_symbols) != 1: raise NotImplementedError("Not supporting this view operation...!") - if arg.is_Mul and arg.args[0].is_number: arg = arg.args[1] if isinstance(arg, ModularIndexing): modular_expr = ModularIndexing(arg.args[0], arg.args[1], arg.args[2]) + modular_expr.original_expr = arg elif arg.is_symbol: - modular_expr = ModularIndexing(arg, 1, target_operand.ranges[arg]) + modular_expr = ModularIndexing(arg, 1, operand.ranges[rev_subs_map[arg]]) + modular_expr.original_expr = arg elif "//" in str(arg): - modular_expr = ModularIndexing(arg.args[0], arg.args[1], target_operand.ranges[arg.args[0]]//arg.args[1]) + modular_expr = ModularIndexing(arg.args[0], arg.args[1], operand.ranges[rev_subs_map[arg.args[0]]]//arg.args[1]) + modular_expr.original_expr = arg else: raise NotImplementedError("What is this case?") - new_dim_expression.append(modular_expr) - new_dim_size.append(modular_expr.args[2]) - implicit_dim_size[int(str(modular_expr.args[0])[1:])].append(int(modular_expr.args[2])) - - # Sanity check - for dim, sub_dims in 
implicit_dim_size.items(): - sz = reduce(mul, sub_dims, 1) - if sz != target_operand[3][dim]: - raise NotImplementedError("Not supporting type...") - - vlane_split_axis = len(vars) - 1 # Set split_axis as a last normal loop not reduction loop - - # FIXME: Naive decrease tile size - def decrease_tile_size(tile_size, vlane_split_axis): - is_decreased = False - - # Decrease vlane_split_axis when it is too large - if tile_size[vlane_split_axis] > vlane_stride * self.vector_lane: - tile_size[vlane_split_axis] = int(tile_size[vlane_split_axis] // 2) - return tile_size - - for i in range(len(tile_size)): - if i == vlane_split_axis: - continue - if tile_size[i] > 1: - tile_size[i] = int(tile_size[i] // 2) - is_decreased = True - break - - # Decrease vlane_split_axis at the end to maximize the vlane usage - if not is_decreased: - if tile_size[vlane_split_axis] > 1: - tile_size[vlane_split_axis] = int(tile_size[vlane_split_axis] // 2) - return tile_size - - # Dummy tile size - def dummy_tile_size(): - tile_size = [1] * (len(vars) + len(reduction_vars)) - if len(tile_size) == 2: - tile_size[-1] = vlane_stride * self.vector_lane - tile_size[-2] = 2 * self.vector_lane - elif len(tile_size) == 0: # Scalar - tile_size = [1] - self.ranges = [1] - elif len(tile_size) == 1: - tile_size[0] = 2 * vlane_stride * self.vector_lane - elif len(tile_size) == 3: - tile_size[-1] = self.vector_lane - tile_size[-2] = 4 * self.vector_lane - tile_size[-3] = 2 - elif len(tile_size) == 4: - tile_size[-1] = self.vector_lane - tile_size[-2] = 4 * self.vector_lane - tile_size[-3] = 2 - tile_size[-4] = 1 - else: - raise NotImplementedError("dummy tile size fail!") - return tile_size + dim_dividers[modular_expr.args[0]].add(modular_expr) + return dim_dividers + def compute_tile_size(self, nodes, vars, reduction_vars): + vlane_split_axis = len(vars) - 1 vlane_stride = extension_config.CONFIG_VECTOR_LANE_STRIDE - if self.recodegen is None: - tile_size = dummy_tile_size() - else: + + # Set initial tile 
size & vector lane mapping + if self.kernel_group.tile_desc is None: + tile_size = MLIRMultiDimTile.init_tile_size(self.ranges, vlane_stride, self.vector_lane) + init_tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane, vlane_split_axis, vlane_stride) + init_tile_desc.nr_rdim = len(reduction_vars) + self.kernel_group.set_tile_info(init_tile_desc) + + # Handle edge case + if len(self.ranges)==1 and self.ranges[0] == 1: # Scalar case 2 + self.kernel_group.tile_desc.vmap.vlane_stride = 1 + self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 + elif vlane_split_axis == -1: # Reduction only case + self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 + self.kernel_group.tile_desc.vmap.vlane_stride = self.kernel_group.tile_desc.get_tile_size()[0] + + # Handle implict dims. Input operand could be high dimension tensor. + # Note: https://github.com/PSAL-POSTECH/PyTorchSim/issues/173 + implicit_ops = self.implicit_dim_ops(nodes) + if implicit_ops: + tile_constraints = self.extract_dividers(implicit_ops) + self.kernel_group.tile_desc.apply_constraints(tile_constraints, self.ranges) + self.kernel_group.tile_desc.implicit_dim_size = tile_constraints + + # Check recodegen reason + if self.recodegen is not None: if self.recodegen == "spad_overflow": - tile_size = self.kernel_group.tile_desc.get_tile_size() - decrease_tile_size(tile_size, vlane_split_axis) - elif self.recodegen == "vlane_stride": - tile_size = dummy_tile_size() - elif "tile_size" in self.recodegen: - dim = int(self.recodegen.split("_")[-1]) - tile_size = self.kernel_group.tile_desc.get_tile_size() # TODO: - tile_size[dim] = tile_size[dim] * 2 + self.kernel_group.tile_desc.decrease_tile_size(self.ranges) elif self.recodegen == "recompile": return self.kernel_group.tile_desc else: raise NotImplementedError(f"Unknown recodegen reason: {self.recodegen}") - # FIXME: Not considering removed buffers - n_buffer = sum( - len(node.read_writes.reads) + len(node.read_writes.writes) - for node in nodes - ) - - 
spad_overflow = True - # Find proper tile size - while spad_overflow: - # Adjust tile size to avoid too much paddings - for i in range(1, len(tile_size)+1): - target_range = self.ranges[-i] - if implicit_ranges: - target_range = implicit_dim_size[len(tile_size)-i][-1] - - if tile_size[-i] > target_range: - remains = (target_range % vlane_stride) - self.stop_autotune = True - tile_size[-i] = target_range - if remains: - tile_size[-i] += vlane_stride - remains - - # Adjust tile size - for i in range(len(vars)): - if tile_size[i] >= self.vector_lane: # maximize used vector lane - vlane_split_axis = i - used_vlane = min((tile_size[vlane_split_axis] + vlane_stride - 1) // vlane_stride, self.vector_lane) - padded_size = used_vlane * vlane_stride - tile_size[vlane_split_axis] = ((tile_size[vlane_split_axis] + padded_size - 1) // padded_size) * padded_size - - # Check spad overflow - spad_usage_per_vlane = n_buffer * math.prod(tile_size) * self.precision // used_vlane - if spad_usage_per_vlane >= self.spad_info["spad_size"]: - new_tile_size = decrease_tile_size(tile_size.copy(), vlane_split_axis) - if new_tile_size == tile_size: - raise NotImplementedError("Error: Cannot find proper tile size") - tile_size = new_tile_size - spad_overflow = True - self.stop_autotune = True # for auto-tune - continue - else: - spad_overflow = False - - # Maximize the utilizaiotn of vectorlane - if len(reduction_vars): - minimum_stride = max(self.roundup_vectorlane(tile_size[vlane_split_axis]) // self.vector_lane, 2) - vlane_stride = min(minimum_stride, 8) - - # Handle scalar case - if len(self.ranges)==1 and self.ranges[0] == 1: - vlane_stride = 1 - vlane_split_axis = 0 - tile_size[0] = 1 - elif vlane_split_axis == -1: - vlane_split_axis = 0 - vlane_stride = tile_size[0] - - # Select tile info. 
- # Note: Kernel Group have to share same tile desc for fusion - tile_desc = MLIRMultiDimTile(tile_size, self.vector_lane) - tile_desc.vlane_split_axis = vlane_split_axis - tile_desc.vlane_stride = vlane_stride - tile_desc.implicit_dim_size = implicit_dim_size - tile_desc.nr_rdim = len(reduction_vars) - return tile_desc + # Adjust tile size & vector lane mapping + self.kernel_group.tile_desc.trim_large_tail(self.ranges) + self.kernel_group.tile_desc.select_vlane_axis() + self.kernel_group.tile_desc.pad_vlane_tile() + self.kernel_group.tile_desc.update_tile_stride() + return self.kernel_group.tile_desc def codegen_nodes(self, nodes, kernel_name): recompile_try = 0 @@ -724,7 +757,6 @@ def codegen_nodes(self, nodes, kernel_name): tile_desc = self.compute_tile_size(nodes, vars, reduction_vars) self.compute_body_loop.size = tile_desc.get_numel_per_lane() self.compute_body_loop.step = tile_desc.get_compute_vec_size() - self.kernel_group.set_tile_info(tile_desc) try: _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs() with self as kernel: diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 820d5c0d..4f75dd84 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -685,8 +685,8 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com sram_var = tile_desc.get_name() tile_shape = tile_desc.get_mlir_shape(mlir_dtype) tile_stride = tile_desc.get_tile_stride() - vlane_split_axis = tile_desc.vlane_split_axis - vlane_stride = tile_desc.vlane_stride + vlane_split_axis = tile_desc.vmap.vlane_split_axis + vlane_stride = tile_desc.vmap.vlane_stride zero_cse = self.get_const_cse(0, "index") sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim()) @@ -734,8 +734,8 @@ def load_epilogue(self, name: str, index: sympy.Expr): # Want to use tile_desc from epilogue_info index_var = self.parse_indices(index) dram_stride = 
[index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] - vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - vlane_stride = self.kernel_group.tile_desc.vlane_stride + vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) tile_stride = self.kernel_group.tile_desc.get_tile_stride() @@ -793,8 +793,8 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): index_var = self.parse_indices(index) dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] - vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - vlane_stride = self.kernel_group.tile_desc.vlane_stride + vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) tile_stride = self.kernel_group.tile_desc.get_tile_stride() @@ -859,8 +859,8 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value): vec_size = self.compute_body_loop.step type_name = mlir_common.DTYPE_TO_MLIR[dtype] new_tile_size = self.kernel_group.tile_desc.get_tile_size()[:-1] + [vec_size] - new_vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - new_vlane_stride = self.kernel_group.tile_desc.vlane_stride + new_vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + new_vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride local_tile_desc = mlir_common.MLIRMultiDimTile(new_tile_size, self.vector_lane, new_vlane_split_axis, new_vlane_stride, vec_size) tile_shape = local_tile_desc.get_mlir_shape(type_name) @@ -906,8 +906,8 @@ def store_reduction_epilogue(self, name, index, value): index_var = self.parse_indices(index, self.reductions_suffix, comments="// Store reduction") dram_stride = [index.coeff(sympy.Symbol(val)) for 
val in self.dim_aliasing.values()][:-1] # Assume that there is only one reduction axis - vlane_split_axis = self.kernel_group.tile_desc.vlane_split_axis - vlane_stride = self.kernel_group.tile_desc.vlane_stride + vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis + vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride # Create final buffer descriptor nr_outer_loop = self.reduction_nr_outer_loop diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 81970cbe..4faf1c85 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -109,7 +109,8 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= stderr_setting = subprocess.DEVNULL if silent_mode else None subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting) except subprocess.CalledProcessError as e: - print("[SpikeSimulator] Command failed with exit code", e.returncode) + if not silent_mode: + print("[SpikeSimulator] Command failed with exit code", e.returncode) error_msg = "" if e.returncode == 200: error_msg = "INVALID_SPAD_ACCESS" @@ -174,11 +175,11 @@ def show_progress(): else: output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) except subprocess.CalledProcessError as e: - print("[Gem5Simulator] Command failed with exit code", e.returncode) - print("[Gem5Simulator] Error output:", e.output) - finished = True - progress_thread.join() - assert(0) + print(f"[Gem5Simulator] Gem5 simulation failed with error: \"{e.output.decode()}\"") + if not is_dryrun: + finished = True + progress_thread.join() + raise RuntimeError(f"GEM5 Simulation Failed: \"{e.output.decode()}\"") with open(f"{dir_path}/stats.txt", "r") as stat_file: raw_list = stat_file.readlines() @@ -238,11 +239,8 @@ def show_progress(): print("[BackendSimulator] Command failed with exit code", e.returncode) print("[BackendSimulator] Error output:", e.output) assert 0 - result_path = extension_config.CONFIG_BACKEND_RESULT_PATH_KEY - if result_path 
is None: - result_path = os.path.join(os.path.dirname(model_path), "backendsim_result") - # Save result to result_path + result_path = os.path.join(os.path.dirname(model_path), "backendsim_result") os.makedirs(result_path, exist_ok=True) file_name = str(len(os.listdir(result_path))) result_path = os.path.join(result_path, file_name) @@ -352,6 +350,8 @@ def create_attribute_file(self, attribute_path, inputs, **kwargs): with open(attribute_path, "w") as f: json.dump(json_content, f, indent=4) + f.flush() + os.fsync(f.fileno()) # There could be a race condition. return attribute_path def load_json(self, config_path): @@ -449,6 +449,6 @@ def get_result_from_file(result_path): return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": - sim = BackendSimulator("/workspace/PyTorchSim/PyTorchSimBackend", "/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json") + sim = BackendSimulator("/workspace/PyTorchSim/PyTorchSimBackend", "/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") sim.interactive_simulation() sim.until(4000) \ No newline at end of file From f10038e4c49c93c921abe135cc1b4144b94ac16e Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Fri, 3 Oct 2025 07:56:15 +0000 Subject: [PATCH 06/53] [Frontend] Template autotune --- PyTorchSimFrontend/extension_config.py | 2 + .../mlir/mlir_codegen_backend.py | 18 +- .../mlir/mlir_conv_mt_template.py | 2 +- .../mlir/mlir_conv_sb_template.py | 2 +- .../mlir/mlir_conv_sbs_template.py | 2 +- PyTorchSimFrontend/mlir/mlir_conv_template.py | 2 +- PyTorchSimFrontend/mlir/mlir_gemm_template.py | 2 +- PyTorchSimFrontend/mlir/mlir_scheduling.py | 97 +-------- PyTorchSimFrontend/mlir/mlir_template.py | 184 ++++++++++++++++-- 9 files changed, 184 insertions(+), 127 deletions(-) diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 80675682..7eddfcb9 100644 
--- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -46,7 +46,9 @@ # AUTOTUNE config CONFIG_AUTOTUNE = int(os.environ.get('AUTOTUNE', default=True)) +CONFIG_AUTOTUNE_TEMPLATE = int(os.environ.get('AUTOTUNE_TEMPLATE', default=True)) CONFIG_MAX_AUTOTUNE_TRY = int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) +CONFIG_AUTOTUNE_TOPK = int(os.environ.get('AUTOTUNE_TOPK', default=3)) # For block sparse CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0)) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 09ee129b..d54963c2 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -17,8 +17,7 @@ sympy_product ) from torch.utils._sympy.functions import ModularIndexing, FloorDiv -import PyTorchSimFrontend.extension_codecache as extension_codecache - +from PyTorchSimFrontend import extension_codecache from PyTorchSimFrontend import extension_config from . 
import mlir_common from .mlir_common import LoopLevel, LoopNest @@ -1608,9 +1607,9 @@ def make_choices(self, nodes, kernel_name): self.kernel_group.tile_desc.prev_tail_threshold = prev_tail_threshold return choices - def autotune(self, nodes, kernel_name): + def autotune(self, *args): def get_cycle(choice): - bench_runner, src_code, kernel_group = choice + bench_runner = choice[0] for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY): # TODO: make simple try: # bench_runner = self.run_bench(nodes, kernel_name, src_code) @@ -1619,7 +1618,7 @@ def get_cycle(choice): except (extension_codecache.SpadOverflowError, RuntimeError) as e: return float("inf") return float("inf") # Exceeded maximum number of autotuning attempts - choices = self.make_choices(nodes, kernel_name) + choices = self.make_choices(*args) if len(choices) == 0: # can't autotune return None @@ -1635,14 +1634,11 @@ def get_cycle(choice): def codegen_nodes(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) - if not extension_config.CONFIG_AUTOTUNE or extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: - return src_code - else: + if extension_config.CONFIG_AUTOTUNE and extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: optimal_src_code = self.autotune(nodes, kernel_name) - if optimal_src_code: + if optimal_src_code is not None: return optimal_src_code - else: - return src_code + return src_code def _prepare_simulator_headers(self, src_code): write_path = extension_codecache.get_write_path(src_code) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 6dd17576..ddbdf793 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -342,7 +342,7 @@ def compute_stride(shape): def codegen_header(self, code, extra_headers): write_path = extension_codecache.get_write_path(code) if not os.path.exists(write_path): - 
os.makedirs(write_path) + os.makedirs(write_path, exist_ok=True) spike_write_path = os.path.join(write_path, "global_var.h") gem5_write_path = os.path.join(write_path, "gem5_global_var.h") if not os.path.exists(spike_write_path): diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index 8b1bf7c5..46cdb4d0 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -338,7 +338,7 @@ def compute_stride(shape): def codegen_header(self, code, extra_headers): write_path = extension_codecache.get_write_path(code) if not os.path.exists(write_path): - os.makedirs(write_path) + os.makedirs(write_path, exist_ok=True) spike_write_path = os.path.join(write_path, "global_var.h") gem5_write_path = os.path.join(write_path, "gem5_global_var.h") if not os.path.exists(spike_write_path): diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index 2284c86c..006d5112 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -339,7 +339,7 @@ def compute_stride(shape): def codegen_header(self, code, extra_headers): write_path = extension_codecache.get_write_path(code) if not os.path.exists(write_path): - os.makedirs(write_path) + os.makedirs(write_path, exist_ok=True) spike_write_path = os.path.join(write_path, "global_var.h") gem5_write_path = os.path.join(write_path, "gem5_global_var.h") if not os.path.exists(spike_write_path): diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 890b76b7..c744258c 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -346,7 +346,7 @@ def compute_stride(shape): def codegen_header(self, code, extra_headers): write_path = extension_codecache.get_write_path(code) if not os.path.exists(write_path): 
- os.makedirs(write_path) + os.makedirs(write_path, exist_ok=True) spike_write_path = os.path.join(write_path, "global_var.h") gem5_write_path = os.path.join(write_path, "gem5_global_var.h") if not os.path.exists(spike_write_path): diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index ae793c06..119debd9 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -334,7 +334,7 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no def codegen_header(self, code, extra_headers): write_path = extension_codecache.get_write_path(code) if not os.path.exists(write_path): - os.makedirs(write_path) + os.makedirs(write_path, exist_ok=True) spike_write_path = os.path.join(write_path, "global_var.h") gem5_write_path = os.path.join(write_path, "gem5_global_var.h") if not os.path.exists(spike_write_path): diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 2bbdb41d..7b7b179b 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -4,8 +4,11 @@ from functools import reduce import operator from sympy import symbols, sympify, Symbol +from collections import OrderedDict +from concurrent.futures import ThreadPoolExecutor from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel +from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest from torch._inductor import config from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode, BaseSchedulerNode @@ -259,85 +262,6 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False) return kernel_name - def codegen_template_code(self, kernel, render, template_node, prologue_nodes, epilogue_nodes): - with 
kernel: - _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() - for node in [template_node, *prologue_nodes, *epilogue_nodes]: - node.mark_run() - # Partial codgen template nodes - partial_code = render() - - # Swap load/store functions - kernel.load = kernel.load_epilogue - kernel.store = kernel.store_epilogue - kernel.store_reduction = kernel.store_reduction_epilogue - kernel.reduction = kernel.reduction_epilogue - - # Codegen prologue nodes - if prologue_nodes: - # Flush created varaibles, since template fusion doen't share variable - with kernel.prologue_buffer_group.as_local(): - _, (group, reduction_group) = max( - [prologue_nodes[-1]], key=lambda x: int(x.is_reduction()) - ).group - prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True) - kernel.kernel_group.set_tile_info(prologue_tile_desc) - vars, reduction_vars = kernel.set_ranges(group, reduction_group) - for node in prologue_nodes: - # Reuse created spad - read_list = sorted([i.name for i in node.read_writes.reads]) - candidate_found = False - # Why? There is a case that memdep.get_size() != data.get_size() - buf_dict = {} - buf_dict.update({val.name : val for val in V.graph.buffers}) - buf_dict.update(V.graph.graph_inputs) - for candidate_read in read_list: - if candidate_read in buf_dict and reduce(operator.mul, buf_dict[candidate_read].get_size(), 1) == node.node.get_numel(): - prologue_input_arg = candidate_read - candidate_found = True - break - assert(candidate_found) - assert(len(node.read_writes.writes)==1) - prologue_output_arg = list(node.read_writes.writes)[0].name - template_buf = self.kernel_group.args.input_buffers[prologue_output_arg] - target_buf = f"{template_buf}_buffer" # FIXME. How to pass spad buffer name? 
- - # To skip the dma code gen - kernel.buffer_names[prologue_input_arg] = target_buf - kernel.buffer_names[prologue_output_arg] = target_buf - - # Edge delete - kernel.kernel_group.args.input_buffers = { - (arg if buf != template_buf else prologue_input_arg): buf - for arg, buf in kernel.kernel_group.args.input_buffers.items() - } - node.codegen((vars, reduction_vars)) - - # Codegen epilogue nodes - tile_desc = kernel.set_tile_size(kernel.epilogue_info) - kernel.kernel_group.set_tile_info(tile_desc) - kernel.call_ranges = None - if epilogue_nodes: - with kernel.epilogue_buffer_group.as_local(): - _, (group, reduction_group) = max( - epilogue_nodes, key=lambda x: int(x.is_reduction()) - ).group - vars, reduction_vars = kernel.set_ranges(group, reduction_group) - for node in epilogue_nodes: - node.codegen((vars, reduction_vars)) - - with V.set_kernel_handler(kernel): - src_code = ( - partial_code - if isinstance(partial_code, str) - else partial_code.finalize() - ) - - # For consistency, white space could make wrong write_path - buffer = IndentedBuffer() - buffer.splice(src_code) - return buffer.getvalue() - def codegen_template(self, template_node, epilogue_nodes): # Handle prologue pattern prologue_nodes = [] @@ -350,24 +274,13 @@ def codegen_template(self, template_node, epilogue_nodes): epilogue_nodes = epilogue_nodes[i+1:] break - _, (numel, rnumel) = template_node.group + # Generate template code template_buffer = template_node.node kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() - - src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes) - wrapper = V.graph.wrapper_code - - if src_code in wrapper.src_to_kernel: # [CONV] check inner function is already defined - kernel_name = wrapper.src_to_kernel[src_code] - kernel, 
render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_name=kernel_name) # update kernel name - src_code = self.codegen_template_code(kernel, render, template_node, prologue_nodes, epilogue_nodes) + src_code = kernel.codegen_nodes(render, codegen_header, template_node, prologue_nodes, epilogue_nodes) with V.set_kernel_handler(kernel): - spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n" - spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({kernel.spad_info['spad_size']*kernel.vector_lane})));" - codegen_header(src_code, (kernel.header.getvalue()+spad_end_symbol+spad_section_end_symbol, kernel.gem5_header.getvalue())) - kernel.meta_kernel() kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info, kernel.loop_size, origins={str(i) for i in template_node.node.origins}) self.define_function(kernel) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 4f75dd84..762d2a93 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -6,6 +6,8 @@ import contextlib import math import sympy +from functools import reduce +import operator from collections import OrderedDict from typing import List, Optional @@ -25,7 +27,7 @@ from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common -from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR +from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR, CONFIG_AUTOTUNE_TEMPLATE, CONFIG_AUTOTUNE, CONFIG_BACKENDSIM_SPIKE_ONLY from . 
import mlir_common class IndentedBufferGroup: @@ -93,7 +95,8 @@ def __init__(self, kernel_group = None, outer_func_name=None, outer_func_render=None, - kernel_arg_attributes=None) -> None: + kernel_arg_attributes=None, + reason=None) -> None: super().__init__(kernel_group if kernel_group is not None else mlir_common.MLIRWrapperKenrelGroup()) self.kernel_name = kernel_name self.input_nodes = input_nodes @@ -125,6 +128,16 @@ def __init__(self, self.reduction_mean = [] # Dim info self.dim_aliasing = {} + self.autotune_idx = 0 + self.reason = reason + + def reset(self, reason): + self.__init__( + self.kernel_name, self.input_nodes, + self.call_size, self.kernel_group, + self.outer_func_name, self.outer_func_render, + self.kernel_arg_attributes, reason + ) def add_loop_info(self, mat_size, tile_size): for idx, (loop_size, stride) in enumerate(zip(mat_size, tile_size)): @@ -185,7 +198,8 @@ def gemmini_gemm_mapping(self, M, N, K): return inner_I, inner_J, inner_K - def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False): + def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False, is_conv=False): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 # double buffer @@ -249,6 +263,11 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p max_used_spad_size = used_spad_size maximize_i_j = tile_M * tile_N mapping = (tile_M, tile_N, tile_K) + if check_spad_size: + tile_candidates.append((used_spad_size, (tile_M, tile_N, tile_K))) + if CONFIG_AUTOTUNE_TEMPLATE and not is_conv: + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + mapping = tile_candidates[self.autotune_idx][1] if self.autotune_idx < len(tile_candidates) else mapping return mapping def search_mapping_space(self, mapping, idx, increment, stride, dilation, 
n_extra_node=0): @@ -288,13 +307,14 @@ def pseudo_auto_tune(self, mapping, stride, dilation, O_H, O_W, n_extra_node=0): return mapping def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 # double buffer max_spad_per_lane = spad_size_per_lane // 2 # double buffer max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False) + M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True) max_k_h_w = 1 # maximize kernel size max_o_h_w = 1 # maximize output size K = min(K, self.vector_lane) @@ -312,27 +332,34 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w and max_o_h_w <= o_h * o_w: - max_used_spad_size = used_spad_size - max_k_h_w = k_h * k_w - max_o_h_w = o_h * o_w - mapping = (k_h, k_w, o_h, o_w, M, N, K) + check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) + if check_spad_size: + tile_candidates.append((used_spad_size, (k_h, k_w, o_h, o_w, M, N, K))) + if max_used_spad_size < used_spad_size and max_k_h_w <= k_h * k_w and max_o_h_w <= o_h * o_w: + max_used_spad_size = used_spad_size + max_k_h_w = k_h * k_w + max_o_h_w = o_h * o_w + mapping = (k_h, k_w, o_h, o_w, M, N, K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") # FIXME: this should be implemented 
with auto-tuning mapping = self.pseudo_auto_tune(mapping, stride, dilation, O_H, O_W, n_extra_node=n_extra_node) + if CONFIG_AUTOTUNE_TEMPLATE: + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + mapping = tile_candidates[self.autotune_idx][1] if self.autotune_idx < len(tile_candidates) else mapping return mapping def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False) + M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False, is_conv=True) max_k_h_w = K_W for o_h in sympy.divisors(O_H): for o_w in sympy.divisors(O_W): @@ -347,22 +374,29 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h: - max_used_spad_size = used_spad_size - max_k_h_w = k_h - mapping = (k_h, K_W, o_h, o_w, M, N, K) + check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) + if check_spad_size: + tile_candidates.append((used_spad_size, (k_h, K_W, o_h, o_w, M, N, K))) + if max_used_spad_size < used_spad_size and max_k_h_w <= k_h: + max_used_spad_size = used_spad_size + max_k_h_w = k_h + mapping = (k_h, K_W, o_h, o_w, M, N, K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid 
mapping") + if CONFIG_AUTOTUNE_TEMPLATE: + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + mapping = tile_candidates[self.autotune_idx][1] if self.autotune_idx < len(tile_candidates) else mapping return mapping def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane max_spad_size = spad_size // 2 max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False) + M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True) max_k_h_w = 1 for o_h in sympy.divisors(O_H): for k_h in sympy.divisors(K_H): @@ -377,12 +411,18 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * k_w, K) output_size_per_lane = self.get_spad_size_per_lane(M * o_h * (1 + n_extra_node), N) used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and max_used_spad_size < used_spad_size and used_spad_size_per_lane < max_spad_per_lane and max_k_h_w <= k_h * k_w: - max_used_spad_size = used_spad_size - max_k_h_w = k_h * k_w - mapping = (k_h, k_w, o_h, M, M, N, K) + check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) + if check_spad_size: + tile_candidates.append((used_spad_size, (k_h, k_w, o_h, M, M, N, K))) + if max_used_spad_size < used_spad_size and max_k_h_w <= k_h * k_w: + max_used_spad_size = used_spad_size + max_k_h_w = k_h * k_w + mapping = (k_h, k_w, o_h, M, M, N, K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") + if CONFIG_AUTOTUNE_TEMPLATE: + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], 
reverse=True) + mapping = tile_candidates[self.autotune_idx][1] if self.autotune_idx < len(tile_candidates) else mapping return mapping def meta_kernel(self): @@ -407,6 +447,112 @@ def call_kernel(self, kernel_name): kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args, cuda=False) + def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes): + with self as kernel: + _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() + for node in [template_node, *prologue_nodes, *epilogue_nodes]: + node.mark_run() + + # Partial codgen template nodes + partial_code = render() + + # Swap load/store functions + kernel.load = kernel.load_epilogue + kernel.store = kernel.store_epilogue + kernel.store_reduction = kernel.store_reduction_epilogue + kernel.reduction = kernel.reduction_epilogue + + # Codegen prologue nodes + if prologue_nodes: + # Flush created varaibles, since template fusion doen't share variable + with kernel.prologue_buffer_group.as_local(): + _, (group, reduction_group) = max( + [prologue_nodes[-1]], key=lambda x: int(x.is_reduction()) + ).group + prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True) + kernel.kernel_group.set_tile_info(prologue_tile_desc) + vars, reduction_vars = kernel.set_ranges(group, reduction_group) + for node in prologue_nodes: + # Reuse created spad + read_list = sorted([i.name for i in node.read_writes.reads]) + candidate_found = False + # Why? 
There is a case that memdep.get_size() != data.get_size() + buf_dict = {} + buf_dict.update({val.name : val for val in V.graph.buffers}) + buf_dict.update(V.graph.graph_inputs) + for candidate_read in read_list: + if candidate_read in buf_dict and reduce(operator.mul, buf_dict[candidate_read].get_size(), 1) == node.node.get_numel(): + prologue_input_arg = candidate_read + candidate_found = True + break + assert(candidate_found) + assert(len(node.read_writes.writes)==1) + prologue_output_arg = list(node.read_writes.writes)[0].name + template_buf = self.kernel_group.args.input_buffers[prologue_output_arg] + target_buf = f"{template_buf}_buffer" # FIXME. How to pass spad buffer name? + + # To skip the dma code gen + kernel.buffer_names[prologue_input_arg] = target_buf + kernel.buffer_names[prologue_output_arg] = target_buf + + # Edge delete + kernel.kernel_group.args.input_buffers = { + (arg if buf != template_buf else prologue_input_arg): buf + for arg, buf in kernel.kernel_group.args.input_buffers.items() + } + node.codegen((vars, reduction_vars)) + + # Codegen epilogue nodes + tile_desc = kernel.set_tile_size(kernel.epilogue_info) + kernel.kernel_group.set_tile_info(tile_desc) + kernel.call_ranges = None + if epilogue_nodes: + with kernel.epilogue_buffer_group.as_local(): + _, (group, reduction_group) = max( + epilogue_nodes, key=lambda x: int(x.is_reduction()) + ).group + vars, reduction_vars = kernel.set_ranges(group, reduction_group) + for node in epilogue_nodes: + node.codegen((vars, reduction_vars)) + + with V.set_kernel_handler(kernel): + src_code = ( + partial_code + if isinstance(partial_code, str) + else partial_code.finalize() + ) + + # For consistency, white space could make wrong write_path + buffer = IndentedBuffer() + buffer.splice(src_code) + return buffer.getvalue() + + def make_choices(self, render, template_node, prologue_nodes, epilogue_nodes): + choices = [] + for i in range(3): + self.autotune_idx = i + self.reset(reason=None) + src_code = 
self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes) + bench_runner = self.run_bench([template_node], self.kernel_name, src_code) + choices.append((bench_runner, src_code, self.kernel_group)) + return choices + + def codegen_nodes(self, render, codegen_header, template_node, prologue_nodes, epilogue_nodes): + src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes) + + if False:# CONFIG_AUTOTUNE_TEMPLATE and not CONFIG_BACKENDSIM_SPIKE_ONLY: + src_code = self.autotune(render, template_node, prologue_nodes, epilogue_nodes) + + with V.set_kernel_handler(self): + self._prepare_simulator_headers(src_code, codegen_header) + self.meta_kernel() + return src_code + + def _prepare_simulator_headers(self, src_code, codegen_header): + spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n" + spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));" + codegen_header(src_code, (self.header.getvalue()+spad_end_symbol+spad_section_end_symbol, self.gem5_header.getvalue())) + def codegen_prologue_body(self): body = IndentedBuffer() with self.prologue_buffer_group.as_local(): From fe22e9b133b3da2e6686bc9053f38c5815c25511 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 5 Nov 2025 03:20:54 +0000 Subject: [PATCH 07/53] [Cleanup] Remove codegen_headers --- PyTorchSimFrontend/mlir/mlir_bmm_template.py | 25 ++--- .../mlir/mlir_conv_mt_template.py | 21 +--- .../mlir/mlir_conv_sb_template.py | 21 +--- .../mlir/mlir_conv_sbs_template.py | 21 +--- PyTorchSimFrontend/mlir/mlir_conv_template.py | 22 +---- PyTorchSimFrontend/mlir/mlir_gemm_template.py | 95 ++++++++++--------- .../mlir/mlir_maxpool_template.py | 13 --- PyTorchSimFrontend/mlir/mlir_scheduling.py | 4 +- PyTorchSimFrontend/mlir/mlir_template.py | 52 ++++++---- 9 files changed, 105 insertions(+), 169 deletions(-) diff --git 
a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py index 79e03bd5..0c6583a7 100644 --- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py @@ -6,8 +6,6 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common BMM_TEMPLATE = r""" @@ -184,14 +182,10 @@ def render(self, # Select tile size n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 - TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node) - SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or prologue_nodes else kernel.vector_lane - SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane - SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_extra_node, 0, len(prologue_nodes)) TOG_latency = M if TILE_M > M else TILE_M kernel.loop_size = [TOG_latency, TILE_N, TILE_K] - TILE_K = TILE_K // 2 if prologue_nodes else TILE_K # Select template code nr_reduction_nodes = [node for node in epilogue_nodes if node.is_reduction()] if epilogue_nodes is not None else [] @@ -329,13 +323,10 @@ def render(self, kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = 
os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) \ No newline at end of file + def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): + TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node) + SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or n_prologue_node else kernel.vector_lane + SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + TILE_K = TILE_K // 2 if n_prologue_node else TILE_K + return TILE_M,TILE_N,TILE_K,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index ddbdf793..26657712 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -7,10 +7,7 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" @@ -185,8 +182,9 @@ def render(self, # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, 
K_W, O_H, O_W) + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -294,8 +292,7 @@ def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 SUB_TILE_K = TILE_K - TOG_latency = O_W if TILE_M > O_W else TILE_M - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency + return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K def outer_func_render(self, kernel_name, input_args): X, W = self.input_nodes[0], self.input_nodes[1] @@ -338,15 +335,3 @@ def compute_stride(shape): arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path, exist_ok=True) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py 
b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index 46cdb4d0..856d4c09 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -7,10 +7,7 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" @@ -186,8 +183,9 @@ def render(self, # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] # Prepare tile descriptors @@ -290,8 +288,7 @@ def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane SUB_TILE_K = TILE_K - TOG_latency = O_W if TILE_M > O_W else TILE_M - return 
TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency + return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K def outer_func_render(self, kernel_name, input_args): X, W = self.input_nodes[0], self.input_nodes[1] @@ -334,15 +331,3 @@ def compute_stride(shape): arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path, exist_ok=True) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index 006d5112..14b7d432 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -7,10 +7,7 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" @@ -186,8 +183,9 @@ def render(self, # Select tile size 
adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -291,8 +289,7 @@ def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane SUB_TILE_K = TILE_K - TOG_latency = O_W if TILE_M > O_W else TILE_M - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency + return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K def outer_func_render(self, kernel_name, input_args): X, W = self.input_nodes[0], self.input_nodes[1] @@ -335,15 +332,3 @@ def compute_stride(shape): arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path, exist_ok=True) - spike_write_path = os.path.join(write_path, "global_var.h") - 
gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index c744258c..ff426ceb 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -7,8 +7,6 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common from torch._inductor.codecache import get_hash from PyTorchSimFrontend import extension_config @@ -190,8 +188,8 @@ def render(self, # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K, TOG_latency = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) - SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + TOG_latency = BATCH if TILE_M > BATCH else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -297,9 +295,7 @@ def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + 
(TILE_K_W - 1) * self.dilation[1] SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N - TOG_latency = BATCH if TILE_M > BATCH else TILE_M - TOG_latency = 8 if TOG_latency < 8 else TOG_latency - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K,TOG_latency + return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K def outer_func_render(self, kernel_name, input_args): X, W = self.input_nodes[0], self.input_nodes[1] @@ -342,15 +338,3 @@ def compute_stride(shape): arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) return arg_attributes - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path, exist_ok=True) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) - self.hash_value = get_hash(code.strip()) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index 119debd9..9d3d3acf 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -8,8 +8,6 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode -from torch._inductor.codecache import write_atomic -import 
PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir import mlir_common @@ -114,30 +112,13 @@ def render(self, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, prologue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): - if template_buffer_node is not None: - self.output_node = template_buffer_node - - # Extract input arguments info - X, W, Y = self.input_nodes[0], self.input_nodes[1], self.output_node - X_tensor = empty_strided(X.layout.size, X.layout.stride) - W_tensor = empty_strided(W.layout.size, W.layout.stride) - if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2: - raise NotImplementedError("Please report this case to us...") - - # Extract fusion info - n_epilogue_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 - n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 - n_extra_read = set() - if epilogue_nodes is not None: - for enode in epilogue_nodes: - n_extra_read.update(enode.node.get_read_names()) - if self.output_node.name in n_extra_read: - n_extra_read.remove(self.output_node.name) - - # Select tile size - M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1] - TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) + X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + if tile_info is None: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) + else: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info # Select template code if (M == 0) or (N == 0) or (K == 0): # exception for MoE @@ -281,6 +262,41 @@ def render(self, kernel.add_loop_info([kernel.render_options["M"], 
kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code + def get_tile_candidates(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + prologue_nodes: Optional[List[IRNode]] = None, + **kwargs): + X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) + return [[TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K]] + + def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + + # Extract input arguments info + X, W, Y = self.input_nodes[0], self.input_nodes[1], self.output_node + X_tensor = empty_strided(X.layout.size, X.layout.stride) + W_tensor = empty_strided(W.layout.size, W.layout.stride) + if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2: + raise NotImplementedError("Please report this case to us...") + + # Extract fusion info + n_epilogue_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 + n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 + n_extra_read = set() + if epilogue_nodes is not None: + for enode in epilogue_nodes: + n_extra_read.update(enode.node.get_read_names()) + if self.output_node.name in n_extra_read: + n_extra_read.remove(self.output_node.name) + + # Select tile size + M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1] + return X,W,Y,M,N,K,n_epilogue_node,n_prologue_node,len(n_extra_read) + def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): # Check cheat sheet cheatsheet_path = extension_config.CONFIG_GEMM_CHEATSHEET_PATH @@ -292,19 
+308,21 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no data = json.load(f) gemm_shape = f"{M}_{K}_{N}" - if gemm_shape in data: + if extension_config.CONFIG_MANUAL_TILE_SIZE: + # case 1: use manual tile size + TILE_M = extension_config.CONFIG_TILE_M + TILE_N = extension_config.CONFIG_TILE_N + TILE_K = extension_config.CONFIG_TILE_K + elif gemm_shape in data: + # case 2: cached tile size tile_info = data[gemm_shape] TILE_M = tile_info["TILE_M"] TILE_N = tile_info["TILE_N"] TILE_K = tile_info["TILE_K"] - else: # case 2: use gemm_combination_mapping + else: + # case 3: use gemm_combination_mapping min_tile = (n_extra_node + n_prologue_node) == 0 - TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=True) - # case 3: use manual tile size - if extension_config.CONFIG_MANUAL_TILE_SIZE: - TILE_M = extension_config.CONFIG_TILE_M - TILE_N = extension_config.CONFIG_TILE_N - TILE_K = extension_config.CONFIG_TILE_K + TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True) # Edge case if (M == 0) or (N == 0) or (K == 0): @@ -330,14 +348,3 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no SUB_TILE_N = TILE_N SUB_TILE_K = TILE_K return TILE_M,TILE_N,TILE_K, SUB_TILE_M,SUB_TILE_N,SUB_TILE_K - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path, exist_ok=True) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py 
b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py index 6f605d56..a779e598 100644 --- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py +++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py @@ -6,8 +6,6 @@ from torch._inductor.ir import Buffer from torch._inductor.ir import IRNode from torch._inductor.ir import ReinterpretView -from torch._inductor.codecache import write_atomic -import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir import mlir_common import sympy @@ -99,14 +97,3 @@ def render(self, code = self._template_from_string(TEMPLATE).render(**kernel.render_options) kernel.add_loop_info([X.get_numel()], [kernel.vector_lane, kernel.vector_lane]) return code - - def codegen_header(self, code, extra_headers): - write_path = extension_codecache.get_write_path(code) - if not os.path.exists(write_path): - os.makedirs(write_path) - spike_write_path = os.path.join(write_path, "global_var.h") - gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, extra_headers[0]) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, extra_headers[1]) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 7b7b179b..26b90401 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -276,9 +276,9 @@ def codegen_template(self, template_node, epilogue_nodes): # Generate template code template_buffer = template_node.node - kernel, render, codegen_header = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) + kernel, tile_candidates, render = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) _, _, _, kernel.buffer_types = 
self.kernel_group.args.mlir_argdefs() - src_code = kernel.codegen_nodes(render, codegen_header, template_node, prologue_nodes, epilogue_nodes) + src_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) with V.set_kernel_handler(kernel): kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info, diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 762d2a93..07ebec51 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -20,7 +20,9 @@ from torch._inductor.autotune_process import TensorMeta from torch._inductor.virtualized import V, NullHandler, _ops as ops from torch._inductor.utils import IndentedBuffer +from torch._inductor.codecache import write_atomic +import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, reduction_partial_combine_vec, reduction_combine_vec, is_welford_reduction @@ -447,14 +449,14 @@ def call_kernel(self, kernel_name): kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args, cuda=False) - def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes): + def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info): with self as kernel: _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() for node in [template_node, *prologue_nodes, *epilogue_nodes]: node.mark_run() # Partial codgen template nodes - partial_code = render() + partial_code = render(kwargs={**render.keywords['kwargs'], 'tile_info': tile_info}) # Swap load/store functions kernel.load = kernel.load_epilogue @@ -522,36 +524,42 @@ def 
codegen_template_code(self, render, template_node, prologue_nodes, epilogue_ else partial_code.finalize() ) - # For consistency, white space could make wrong write_path - buffer = IndentedBuffer() - buffer.splice(src_code) - return buffer.getvalue() + # For consistency, white space could make wrong write_path + buffer = IndentedBuffer() + buffer.splice(src_code) + src_code = buffer.getvalue() + self._prepare_simulator_headers(src_code) + return src_code - def make_choices(self, render, template_node, prologue_nodes, epilogue_nodes): + def make_choices(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): choices = [] - for i in range(3): - self.autotune_idx = i - self.reset(reason=None) - src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes) + for tile_info in tile_candidates: + src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) bench_runner = self.run_bench([template_node], self.kernel_name, src_code) choices.append((bench_runner, src_code, self.kernel_group)) + self.reset(reason=None) return choices - def codegen_nodes(self, render, codegen_header, template_node, prologue_nodes, epilogue_nodes): - src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes) - - if False:# CONFIG_AUTOTUNE_TEMPLATE and not CONFIG_BACKENDSIM_SPIKE_ONLY: - src_code = self.autotune(render, template_node, prologue_nodes, epilogue_nodes) + def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): + src_code = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) with V.set_kernel_handler(self): - self._prepare_simulator_headers(src_code, codegen_header) self.meta_kernel() return src_code - def _prepare_simulator_headers(self, src_code, codegen_header): + def _prepare_simulator_headers(self, src_code): spad_end_symbol = f"int spad_end[0] __attribute__ 
((section(\".spad\")));\n" spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));" - codegen_header(src_code, (self.header.getvalue()+spad_end_symbol+spad_section_end_symbol, self.gem5_header.getvalue())) + + write_path = extension_codecache.get_write_path(src_code) + if not os.path.exists(write_path): + os.makedirs(write_path, exist_ok=True) + spike_write_path = os.path.join(write_path, "global_var.h") + gem5_write_path = os.path.join(write_path, "gem5_global_var.h") + if not os.path.exists(spike_write_path): + write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol) + if not os.path.exists(gem5_write_path): + write_atomic(gem5_write_path, self.gem5_header.getvalue()) def codegen_prologue_body(self): body = IndentedBuffer() @@ -1256,7 +1264,8 @@ def make_kernel_render( template=self, kwargs=kwargs ) - return kernel, render, self.codegen_header + tile_candidates = self.get_tile_candidates(**kwargs) + return kernel, tile_candidates, render return MLIRTemplateCaller( kernel_hash_name, @@ -1268,5 +1277,8 @@ def make_kernel_render( self, ) + def get_tile_candidates(self, **kwargs): + return [] + def render(self, **kwargs) -> str: raise NotImplementedError \ No newline at end of file From f400d6800e4d7d0aacdf2e86f9359950e7c53215 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 5 Nov 2025 04:35:49 +0000 Subject: [PATCH 08/53] [Cleanup] Refactor conv template + autotune primitive --- PyTorchSimFrontend/mlir/mlir_conv_common.py | 120 ++++++++++++++++++ .../mlir/mlir_conv_mt_template.py | 102 ++------------- .../mlir/mlir_conv_sb_template.py | 103 ++------------- .../mlir/mlir_conv_sbs_template.py | 103 ++------------- PyTorchSimFrontend/mlir/mlir_conv_template.py | 104 ++------------- .../mlir/mlir_maxpool_template.py | 1 + 6 files changed, 161 insertions(+), 372 deletions(-) create mode 100644 PyTorchSimFrontend/mlir/mlir_conv_common.py 
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py new file mode 100644 index 00000000..e6379597 --- /dev/null +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -0,0 +1,120 @@ +import os +import math +from typing import List, Optional + +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs +from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel +from torch._inductor.ir import IRNode +from PyTorchSimFrontend import extension_config + +class MLIRConvCommonTemplate(MLIRTemplate): + WRAPPER_TEMPLATE = None + def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): + super().__init__("kernel", input_nodes, layout, input_reorder) + self.stride = kwargs["stride"] + self.padding = kwargs["padding"] + self.dilation = kwargs["dilation"] + self.weight_shape = [str(i) for i in input_nodes[1].layout.size] + self.input_shape = [str(i) for i in input_nodes[0].layout.size] + self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ + + "_".join([str(i) for i in self.stride]) \ + + "_" + "_".join([str(i) for i in self.padding]) \ + + "_" + "_".join([str(i) for i in self.dilation]) + self.kernel_args = ['X', 'W', 'Bias', 'Y'] + + def get_padded_input_size(self, X): + input_padded = list(X.layout.size) + input_padded[2] += 2 * self.padding[0] + input_padded[3] += 2 * self.padding[1] + return math.prod(input_padded) + + def render(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, + **kwargs): + raise NotImplementedError() + + def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): + raise NotImplementedError() + + def extract_info(self, kernel, template_buffer_node, epilogue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + self.kernel = 
kernel + self.epilogue_nodes = epilogue_nodes + + X, W = self.input_nodes[0], self.input_nodes[1] + Y = self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + if epilogue_nodes is not None: + extra_node_rw = { + item.name for epilogue_node in epilogue_nodes + for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes + if item.name != Y.name + } + n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 + + BATCH, I_C, I_H, I_W = X.layout.size + O_C, _, K_H, K_W = W.layout.size + O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] + O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] + PADDING_H=self.padding[0] + PADDING_W=self.padding[1] + STRIDE_H=self.stride[0] + STRIDE_W=self.stride[1] + return X,W,Y,Bias,n_extra_node,BATCH,I_C,I_H,I_W,O_C,K_H,K_W,O_H,O_W,PADDING_H,PADDING_W,STRIDE_H,STRIDE_W + + def get_tile_candidates(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + **kwargs): + # Extract input arguments info + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) + return [self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)] + + def outer_func_render(self, kernel_name, input_args): + X, W = self.input_nodes[0], self.input_nodes[1] + Y = self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + options = dict( + kernel=self.kernel, + KERNEL_NAME=kernel_name, + FUNC_NAME=self.function_name + f"_{len(input_args)}", + INPUT=X, + WEIGHT=W, + BIAS=Bias, + OUTPUT=Y, + PADDING_H=self.padding[0], + PADDING_W=self.padding[1], + 
VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, + BACKENDSIM_EAGER_MODE=eager_mode, + input_reorder=self.input_reorder + ) + code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options) + return code, self.function_name + f"_{len(input_args)}" + + def get_arg_attributes(self): + arg_attributes = [] + + X = self.input_nodes[0] + X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] + X_shape[0] += 2 * self.padding[0] + X_shape[1] += 2 * self.padding[1] + + def compute_stride(shape): + stride = [1] * len(shape) + for i in range(len(shape)-2, -1, -1): + stride[i] = stride[i+1] * shape[i+1] + return stride + + X_stride = compute_stride(X_shape) + arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) + + return arg_attributes diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 26657712..3facedd5 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -1,10 +1,7 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode from PyTorchSimFrontend.mlir import mlir_common @@ -101,7 +98,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class MLIRConvMultiTileTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -127,62 +125,24 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvMultiTileTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, 
**kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - 
STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency @@ -293,45 +253,3 @@ def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_K = TILE_K return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - 
VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index 856d4c09..6f3492c6 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -1,14 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode from PyTorchSimFrontend.mlir import mlir_common -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Single Batch Conv2D kernel @@ -102,7 +98,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class MLIRConvSingleBatchTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -128,62 +125,24 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class 
MLIRConvSingleBatchTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - 
PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency @@ -289,45 +248,3 @@ def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane SUB_TILE_K = TILE_K return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - 
INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index 14b7d432..53292858 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -1,14 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode from PyTorchSimFrontend.mlir import mlir_common -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Single Batch Conv2D (Stride != 1) kernel @@ -102,7 +98,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class MLIRConvSingleBatchStridedTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = 
list(X.shape) @@ -128,62 +125,24 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvSingleBatchStridedTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None 
else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N TOG_latency = O_W if TILE_M > O_W else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency @@ -290,45 +249,3 @@ def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane SUB_TILE_K = TILE_K return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', 
default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index ff426ceb..6fa3be53 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -1,15 +1,10 @@ -import os -import math from sympy import Symbol, Number from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate +from PyTorchSimFrontend.mlir.mlir_conv_common import MLIRConvCommonTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode from PyTorchSimFrontend.mlir import mlir_common -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Conv2D kernel @@ -107,7 +102,8 @@ } """ -WRAPPER_TEMPLATE = r""" +class 
MLIRConvTemplate(MLIRConvCommonTemplate): + WRAPPER_TEMPLATE = r""" def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Padding input padded_shape = list(X.shape) @@ -133,62 +129,24 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: yield ({{KERNEL_NAME}}, ) {%- endif %} """ - -class MLIRConvTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - self.weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.input_shape = [str(i) for i in input_nodes[0].layout.size] - self.function_name = "Conv2D_" + "_".join(self.input_shape) + "_".join(self.weight_shape)+ "_" \ - + "_".join([str(i) for i in self.stride]) \ - + "_" + "_".join([str(i) for i in self.padding]) \ - + "_" + "_".join([str(i) for i in self.dilation]) - self.kernel_args = ['X', 'W', 'Bias', 'Y'] - - def get_padded_input_size(self, X): - input_padded = list(X.layout.size) - input_padded[2] += 2 * self.padding[0] - input_padded[3] += 2 * self.padding[1] - return math.prod(input_padded) + super().__init__(input_nodes, layout, input_reorder, **kwargs) def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): # Extract input arguments info - if template_buffer_node is not None: - self.output_node = template_buffer_node - self.kernel = kernel - self.epilogue_nodes = epilogue_nodes - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - if epilogue_nodes is not None: - extra_node_rw = { - item.name for epilogue_node in epilogue_nodes - for item in epilogue_node.read_writes.reads | epilogue_node.read_writes.writes - if item.name != Y.name - } - n_extra_node = len(extra_node_rw) if epilogue_nodes is not None else 0 - - 
BATCH, I_C, I_H, I_W = X.layout.size - O_C, _, K_H, K_W = W.layout.size - O_H = Y.layout.size[2] if template_buffer_node is None else template_buffer_node.layout.size[2] - O_W = Y.layout.size[3] if template_buffer_node is None else template_buffer_node.layout.size[3] - PADDING_H=self.padding[0] - PADDING_W=self.padding[1] - STRIDE_H=self.stride[0] - STRIDE_W=self.stride[1] + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + if tile_info is None: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + else: + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info TOG_latency = BATCH if TILE_M > BATCH else TILE_M TOG_latency = 8 if TOG_latency < 8 else TOG_latency kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -296,45 +254,3 @@ def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K - - def outer_func_render(self, kernel_name, input_args): - X, W = self.input_nodes[0], 
self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) - options = dict( - kernel=self.kernel, - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", - INPUT=X, - WEIGHT=W, - BIAS=Bias, - OUTPUT=Y, - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, - input_reorder=self.input_reorder - ) - code = self._template_from_string(WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" - - def get_arg_attributes(self): - arg_attributes = [] - - X = self.input_nodes[0] - X_shape = [X.get_size()[i] for i in (2, 3, 0, 1)] - X_shape[0] += 2 * self.padding[0] - X_shape[1] += 2 * self.padding[1] - - def compute_stride(shape): - stride = [1] * len(shape) - for i in range(len(shape)-2, -1, -1): - stride[i] = stride[i+1] * shape[i+1] - return stride - - X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) - - return arg_attributes diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py index a779e598..2cca36b6 100644 --- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py +++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py @@ -40,6 +40,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): if template_buffer_node is not None: self.output_node = template_buffer_node From 8c96a5a57ec6888040905aaf0ac5521192d0da83 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 5 Nov 2025 05:29:22 +0000 Subject: [PATCH 09/53] [Autotune] Connect autotune template --- PyTorchSimFrontend/extension_codecache.py | 2 +- 
PyTorchSimFrontend/extension_config.py | 2 +- PyTorchSimFrontend/mlir/mlir_autotune.py | 3 +- PyTorchSimFrontend/mlir/mlir_bmm_template.py | 94 +++++++++++-------- .../mlir/mlir_codegen_backend.py | 29 ++++-- PyTorchSimFrontend/mlir/mlir_common.py | 5 +- PyTorchSimFrontend/mlir/mlir_conv_common.py | 2 +- .../mlir/mlir_conv_mt_template.py | 23 +++-- .../mlir/mlir_conv_sb_template.py | 20 ++-- .../mlir/mlir_conv_sbs_template.py | 20 ++-- PyTorchSimFrontend/mlir/mlir_conv_template.py | 22 +++-- PyTorchSimFrontend/mlir/mlir_gemm_template.py | 51 +++++----- PyTorchSimFrontend/mlir/mlir_template.py | 87 +++++------------ 13 files changed, 186 insertions(+), 174 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 1e756f96..ca669361 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -299,7 +299,7 @@ def dummy_simulator(*args, **kwargs): # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate: + if not autotune and (extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate): funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 7eddfcb9..fa5d22b5 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -48,7 +48,7 @@ CONFIG_AUTOTUNE = int(os.environ.get('AUTOTUNE', default=True)) CONFIG_AUTOTUNE_TEMPLATE = int(os.environ.get('AUTOTUNE_TEMPLATE', default=True)) CONFIG_MAX_AUTOTUNE_TRY = int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) -CONFIG_AUTOTUNE_TOPK = int(os.environ.get('AUTOTUNE_TOPK', default=3)) +CONFIG_AUTOTUNE_TEMPLATE_TOPK = 
int(os.environ.get('AUTOTUNE_TEMPLATE_TOPK', default=4)) # For block sparse CONFIG_BLOCK_SPARSE = int(os.environ.get('BLOCK_SPARSE', default=0)) diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index 54aed9c0..537809de 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -74,7 +74,8 @@ def cached_run_fn(*args, **kwargs): self.source_code, vectorlane_size=self.extra_args["vector_lane"], loop_size=None, spad_info=self.extra_args["spad_info"], vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"], - origins="Unknown", silent_mode=True) + origins="Unknown", silent_mode=True, + validate=self.extra_args['validate'], autotune=self.extra_args['autotune']) args = [ tensor diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py index 0c6583a7..9a12076a 100644 --- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py @@ -160,29 +160,13 @@ def render(self, template_buffer_node = None, epilogue_nodes: Optional[List[IRNode]] = None, prologue_nodes: Optional[List[IRNode]] = None, + tile_info = None, **kwargs): - if template_buffer_node is not None: - self.output_node = template_buffer_node - - # Extract input arguments info - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - W_tensor = empty_strided(W.layout.size, W.layout.stride) - X_tensor = empty_strided(X.layout.size, X.layout.stride) - if len(W_tensor.size()) > 3 or len(W_tensor.size()) == 2: - W_tensor = W_tensor.view([-1, W_tensor.shape[-2], W_tensor.shape[-1]]) - if len(X_tensor.size()) > 3 or len(X_tensor.size()) == 2: - X_tensor = X_tensor.view([-1, X_tensor.shape[-2], X_tensor.shape[-1]]) - B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2] - - W_stride = W_tensor.stride() 
- X_stride = X_tensor.stride() - - # Select tile size - n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 - TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_extra_node, 0, len(prologue_nodes)) + X, W, Y, Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + if tile_info is None: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node)[0] + else: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info TOG_latency = M if TILE_M > M else TILE_M kernel.loop_size = [TOG_latency, TILE_N, TILE_K] @@ -190,17 +174,17 @@ def render(self, # Select template code nr_reduction_nodes = [node for node in epilogue_nodes if node.is_reduction()] if epilogue_nodes is not None else [] if nr_reduction_nodes: - template = BMM_REDUCTION_TEMPLATE - epilogue_dim_aliasing = {"index0":"index0", "index1":"index2", "index2": "index1"} - nr_rdim = 1 + template = BMM_REDUCTION_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index2", "index2": "index1"} + nr_rdim = 1 elif prologue_nodes: - template = BMM_PROLOGUE_TEMPLATE - epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} - nr_rdim = 0 + template = BMM_PROLOGUE_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} + nr_rdim = 0 else: - template = BMM_TEMPLATE - epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} - nr_rdim = 0 + template = BMM_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2"} + nr_rdim = 0 # Prepare tile descriptors vlane_stride = 1 @@ -323,10 +307,46 @@ def render(self, kernel.add_loop_info([kernel.render_options["M"], kernel.render_options["N"], kernel.render_options["K"]], [kernel.render_options["TILE_M"], 
kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code + def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + + # Extract input arguments info + X, W = self.input_nodes[0], self.input_nodes[1] + Y = self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + W_tensor = empty_strided(W.layout.size, W.layout.stride) + X_tensor = empty_strided(X.layout.size, X.layout.stride) + if len(W_tensor.size()) > 3 or len(W_tensor.size()) == 2: + W_tensor = W_tensor.view([-1, W_tensor.shape[-2], W_tensor.shape[-1]]) + if len(X_tensor.size()) > 3 or len(X_tensor.size()) == 2: + X_tensor = X_tensor.view([-1, X_tensor.shape[-2], X_tensor.shape[-1]]) + B, M, N, K = X_tensor.size()[0], X_tensor.size()[1], W_tensor.size()[2], X_tensor.size()[2] + + W_stride = W_tensor.stride() + X_stride = X_tensor.stride() + + # Select tile size + n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 + n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 + return X,W,Y,Bias,W_tensor,X_tensor,B,M,N,K,n_extra_node, n_prologue_node + + def get_tile_candidates(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + prologue_nodes: Optional[List[IRNode]] = None, + **kwargs): + X, W, Y, Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + return self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node) + def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): - TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node) - SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or n_prologue_node else kernel.vector_lane - SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or 
prologue_nodes else kernel.vector_lane - SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane - TILE_K = TILE_K // 2 if n_prologue_node else TILE_K - return TILE_M,TILE_N,TILE_K,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + tile_candidates = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node) + for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or n_prologue_node else kernel.vector_lane + SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + SUB_TILE_K = TILE_K # if (TILE_K < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + TILE_K = TILE_K // 2 if n_prologue_node else TILE_K + tile_candidates[idx] = TILE_M,TILE_N,TILE_K,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index d54963c2..b3352ea6 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1564,10 +1564,10 @@ def make_choices(self, nodes, kernel_name): current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) search_space.add(current_tile_sz) - print(f"[Auto-tune] Trying tile size: {current_tile_sz}, vlane_stride: {vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") + print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") self._prepare_simulator_headers(src_code) bench_runner = self.run_bench(nodes, kernel_name, src_code) - choices.append((bench_runner, src_code, self.kernel_group)) + choices.append((bench_runner, src_code, current_tile_sz, self.kernel_group.tile_desc.vmap.vlane_stride)) while prevent_infinite_loop < 10 and candidate_axes: for axis in list(candidate_axes): @@ 
-1592,6 +1592,13 @@ def make_choices(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) + # FIXME. How to intergrate this constraint to tile system? + pad = self.kernel_group.tile_desc.vmap.get_used_vlane(current_tile_sz) * self.kernel_group.tile_desc.vmap.vlane_stride + vlane_size = current_tile_sz[self.kernel_group.tile_desc.vmap.vlane_split_axis] + if vlane_size > pad and vlane_size % pad: + prevent_infinite_loop += 1 + continue + # If tile size is converged for this axis, remove from candidate axes if current_tile_sz in search_space: candidate_axes.remove(axis) @@ -1599,10 +1606,10 @@ def make_choices(self, nodes, kernel_name): # Add this choice search_space.add(current_tile_sz) - print(f"[Auto-tune] Trying tile size: {current_tile_sz}, vlane_stride: {vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") + print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") self._prepare_simulator_headers(src_code) bench_runner = self.run_bench(nodes, kernel_name, src_code) - choices.append((bench_runner, src_code, self.kernel_group)) + choices.append((bench_runner, src_code, self.kernel_group.tile_desc.get_tile_size(), self.kernel_group.tile_desc.vmap.vlane_stride)) prevent_infinite_loop += 1 self.kernel_group.tile_desc.prev_tail_threshold = prev_tail_threshold return choices @@ -1612,8 +1619,7 @@ def get_cycle(choice): bench_runner = choice[0] for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY): # TODO: make simple try: - # bench_runner = self.run_bench(nodes, kernel_name, src_code) - out = bench_runner(validate=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, autotune=True) + out = bench_runner() return out[-1] except (extension_codecache.SpadOverflowError, RuntimeError) as e: return float("inf") @@ 
-1627,14 +1633,21 @@ def get_cycle(choice): max_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") - print(f"[Auto-tune] Optimal tile size: {choices[max_idx][2].tile_desc.get_tile_size()}, vlane_stride: {choices[max_idx][2].tile_desc.vmap.vlane_stride}, cycles: {results[max_idx]}") + self._log_autotune_result(choices[max_idx], results[max_idx]) optimal_src_code = choices[max_idx][1] return optimal_src_code + def _log_autotune_result(self, best_choice, best_cycle): + print( + f"[Auto-tune] Optimal tile size: {list(best_choice[2])}, " + f"vlane_stride: {best_choice[3]}, " + f"cycles: {best_cycle}" + ) + def codegen_nodes(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) - if extension_config.CONFIG_AUTOTUNE and extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if extension_config.CONFIG_AUTOTUNE and not extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: optimal_src_code = self.autotune(nodes, kernel_name) if optimal_src_code is not None: return optimal_src_code diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 67d5380f..2644f125 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -408,6 +408,7 @@ def select_vlane_axis(self): self.vmap.vlane_split_axis = best_vlane_split_axis def pad_vlane_tile(self): + # FIXME. this doesn't follow tile constraints... 
vlane_split_axis, vlane_stride, vector_lane = self.vmap.vlane_split_axis, self.vmap.vlane_stride, self.vmap.vector_lane used_vlane = min(math.ceil(self._tile_size[vlane_split_axis] / vlane_stride), vector_lane) padded_size = used_vlane * vlane_stride @@ -790,7 +791,9 @@ def run_bench(self, nodes, kernel_name, src_code): "vector_lane" : self.vector_lane, "spad_info": self.spad_info, "vlen" : self.vlen, - "arg_attributes" : arg_attributes + "arg_attributes" : arg_attributes, + "validate" : extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, + "autotune" : True, }, source_code=src_code, ) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index e6379597..52979d73 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -75,7 +75,7 @@ def get_tile_candidates(self, **kwargs): # Extract input arguments info X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) - return [self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)] + return self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) def outer_func_render(self, kernel_name, input_args): X, W = self.input_nodes[0], self.input_nodes[1] diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 3facedd5..26018a94 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -140,7 +140,7 @@ def render(self, # Select tile size adn template conv_template = CONV_TEMPLATE if tile_info is None: - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, 
O_W) + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] else: TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N @@ -242,14 +242,13 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) - SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_K = TILE_K - - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + tile_candidates = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + 
SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index 6f3492c6..a2959b4d 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -140,7 +140,7 @@ def render(self, # Select tile size adn template conv_template = CONV_TEMPLATE if tile_info is None: - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] else: TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N @@ -240,11 +240,13 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 
1) * self.dilation[1] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - SUB_TILE_K = TILE_K - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + SUB_TILE_M = TILE_I_W if TILE_I_W < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index 53292858..afbe9289 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -140,7 +140,7 @@ def render(self, # Select tile size adn template conv_template = CONV_TEMPLATE if tile_info is None: - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + TILE_K_H, TILE_K_W, TILE_O_H, 
TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] else: TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N @@ -241,11 +241,13 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - SUB_TILE_K = TILE_K - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N 
< kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 6fa3be53..777d0a7b 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -144,7 +144,7 @@ def render(self, # Select tile size adn template conv_template = CONV_TEMPLATE if tile_info is None: - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] else: TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info TOG_latency = BATCH if TILE_M > BATCH else TILE_M @@ -245,12 +245,14 @@ def render(self, return code def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) - SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - SUB_TILE_K = TILE_K - TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] - TILE_I_W = 1 + (TILE_O_W - 
1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] - SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 - SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N - return TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + tile_candidates = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) + for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] + TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] + SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W = 1, 1, 1, 1 + SUB_TILE_M = TILE_M if TILE_M < kernel.vector_lane else kernel.vector_lane + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + SUB_TILE_K = TILE_K + SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N + tile_candidates[idx] = TILE_K_H,TILE_K_W,TILE_O_H,TILE_O_W,TILE_M,TILE_N,TILE_K,TILE_I_H,TILE_I_W,SUB_TILE_I_H,SUB_TILE_I_W,SUB_TILE_K_H,SUB_TILE_K_W,SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index 9d3d3acf..0830b4e6 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -116,7 +116,7 @@ def render(self, **kwargs): X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) if tile_info is None: - TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, 
n_prologue_node)[0] else: TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info @@ -269,8 +269,7 @@ def get_tile_candidates(self, prologue_nodes: Optional[List[IRNode]] = None, **kwargs): X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) - TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) - return [[TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K]] + return self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): if template_buffer_node is not None: @@ -313,38 +312,44 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no TILE_M = extension_config.CONFIG_TILE_M TILE_N = extension_config.CONFIG_TILE_N TILE_K = extension_config.CONFIG_TILE_K + tile_candidates = [[TILE_M, TILE_N, TILE_K]] elif gemm_shape in data: # case 2: cached tile size tile_info = data[gemm_shape] TILE_M = tile_info["TILE_M"] TILE_N = tile_info["TILE_N"] TILE_K = tile_info["TILE_K"] + tile_candidates = [[TILE_M, TILE_N, TILE_K]] else: # case 3: use gemm_combination_mapping min_tile = (n_extra_node + n_prologue_node) == 0 - TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True) + tile_candidates = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True) # Edge case if (M == 0) or (N == 0) or (K == 0): TILE_M, TILE_N, TILE_K = 1, 1, 1 + tile_candidates = [[TILE_M, TILE_N, TILE_K]] - # Calculate Sub Tile Size for fine-grained DMA - if extension_config.CONFIG_SUBTILE: - # Case 1: adjust selective fine-grained DMA (SFG-DMA) - SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane - if (TILE_M == M and TILE_N == N and 
TILE_N <= 512): - SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane - else: # Avoid Row Conflict of weights + full_tile_candidates = [] + for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): + # Calculate Sub Tile Size for fine-grained DMA + if extension_config.CONFIG_SUBTILE: + # Case 1: adjust selective fine-grained DMA (SFG-DMA) + SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane + if (TILE_M == M and TILE_N == N and TILE_N <= 512): + SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane + else: # Avoid Row Conflict of weights + SUB_TILE_N = TILE_N + SUB_TILE_K = TILE_K + # Case 2: use manual sub tile size (FG-DMA) + if extension_config.CONFIG_MANUAL_SUBTILE_SIZE: + SUB_TILE_M = extension_config.CONFIG_SUBTILE_M + SUB_TILE_N = extension_config.CONFIG_SUBTILE_N + SUB_TILE_K = extension_config.CONFIG_SUBTILE_K + # Case 3: None Subtile + else: + SUB_TILE_M = TILE_M SUB_TILE_N = TILE_N - SUB_TILE_K = TILE_K - # Case 2: use manual sub tile size (FG-DMA) - if extension_config.CONFIG_MANUAL_SUBTILE_SIZE: - SUB_TILE_M = extension_config.CONFIG_SUBTILE_M - SUB_TILE_N = extension_config.CONFIG_SUBTILE_N - SUB_TILE_K = extension_config.CONFIG_SUBTILE_K - # Case 3: None Subtile - else: - SUB_TILE_M = TILE_M - SUB_TILE_N = TILE_N - SUB_TILE_K = TILE_K - return TILE_M,TILE_N,TILE_K, SUB_TILE_M,SUB_TILE_N,SUB_TILE_K + SUB_TILE_K = TILE_K + full_tile_candidates.append([TILE_M,TILE_N,TILE_K, SUB_TILE_M,SUB_TILE_N,SUB_TILE_K]) + return full_tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 07ebec51..50fa6204 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -29,7 +29,7 @@ from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common -from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR, 
CONFIG_AUTOTUNE_TEMPLATE, CONFIG_AUTOTUNE, CONFIG_BACKENDSIM_SPIKE_ONLY +from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR, CONFIG_AUTOTUNE_TEMPLATE_TOPK from . import mlir_common class IndentedBufferGroup: @@ -130,7 +130,6 @@ def __init__(self, self.reduction_mean = [] # Dim info self.dim_aliasing = {} - self.autotune_idx = 0 self.reason = reason def reset(self, reason): @@ -267,46 +266,10 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p mapping = (tile_M, tile_N, tile_K) if check_spad_size: tile_candidates.append((used_spad_size, (tile_M, tile_N, tile_K))) - if CONFIG_AUTOTUNE_TEMPLATE and not is_conv: - tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) - mapping = tile_candidates[self.autotune_idx][1] if self.autotune_idx < len(tile_candidates) else mapping - return mapping - - def search_mapping_space(self, mapping, idx, increment, stride, dilation, n_extra_node=0): - if idx == 0 or idx == 1 or idx == 4 or idx == 5 or idx == 6: - raise NotImplementedError("Only O_H and O_W are supported for search_mapping_space") - spad_size_per_lane = self.spad_info["spad_size"] - spad_size = spad_size_per_lane * self.vector_lane - max_spad_size = spad_size // 2 # double buffer - max_spad_per_lane = spad_size_per_lane // 2 # double buffer - - mapping = list(mapping) - mapping[idx] += increment - k_h, k_w, o_h, o_w, M, N, K = mapping - i_h = 1 + (o_h - 1) * stride[0] + (k_h - 1) * dilation[0] - i_w = 1 + (o_w - 1) * stride[1] + (k_w - 1) * dilation[1] - weight_size = k_w * k_h * K * N - input_size = i_w * i_h * M * K - output_size = o_w * o_h * M * N - used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision - weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N) - input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) - output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) - used_spad_size_per_lane 
= (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision - if used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane: - mapping = (k_h, k_w, o_h, o_w, M, N, K) - else: - mapping[idx] -= increment - - return mapping - def pseudo_auto_tune(self, mapping, stride, dilation, O_H, O_W, n_extra_node=0): - # pseudo auto-tune - if mapping[2] == 1 and not (O_H == 1): - mapping = self.search_mapping_space(mapping, 2, 1, stride, dilation, n_extra_node=n_extra_node) - if mapping[3] == 1 and not (O_W == 1): - mapping = self.search_mapping_space(mapping, 3, 1, stride, dilation, n_extra_node=n_extra_node) - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): tile_candidates = [] @@ -316,7 +279,7 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation max_spad_per_lane = spad_size_per_lane // 2 # double buffer max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True) + M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] max_k_h_w = 1 # maximize kernel size max_o_h_w = 1 # maximize output size K = min(K, self.vector_lane) @@ -345,13 +308,9 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") - # FIXME: this should be implemented with auto-tuning - mapping = self.pseudo_auto_tune(mapping, stride, dilation, O_H, O_W, n_extra_node=n_extra_node) - if CONFIG_AUTOTUNE_TEMPLATE: - tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) - mapping = tile_candidates[self.autotune_idx][1] if self.autotune_idx < len(tile_candidates) else mapping - - 
return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): tile_candidates = [] @@ -361,7 +320,7 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False, is_conv=True) + M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] max_k_h_w = K_W for o_h in sympy.divisors(O_H): for o_w in sympy.divisors(O_W): @@ -385,10 +344,9 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, mapping = (k_h, K_W, o_h, o_w, M, N, K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") - if CONFIG_AUTOTUNE_TEMPLATE: - tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) - mapping = tile_candidates[self.autotune_idx][1] if self.autotune_idx < len(tile_candidates) else mapping - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): tile_candidates = [] @@ -398,7 +356,7 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True) + M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] max_k_h_w = 1 for o_h in sympy.divisors(O_H): for k_h in sympy.divisors(K_H): @@ -422,10 +380,9 @@ def 
conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio mapping = (k_h, k_w, o_h, M, M, N, K) if max_used_spad_size == 0: raise RuntimeError("Cannot find a valid mapping") - if CONFIG_AUTOTUNE_TEMPLATE: - tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) - mapping = tile_candidates[self.autotune_idx][1] if self.autotune_idx < len(tile_candidates) else mapping - return mapping + tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + return tile_candidates def meta_kernel(self): wrapper = V.graph.wrapper_code @@ -534,12 +491,20 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_ def make_choices(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): choices = [] for tile_info in tile_candidates: + print(f"[Auto-tune] Trying tile size: {list(tile_info)}") src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) bench_runner = self.run_bench([template_node], self.kernel_name, src_code) - choices.append((bench_runner, src_code, self.kernel_group)) + choices.append((bench_runner, src_code, tile_info)) self.reset(reason=None) return choices + def _log_autotune_result(self, best_choice, best_cycle): + tile_size = best_choice[2] + print( + f"[Auto-tune] Optimal tile size: {list(tile_size)}, " + f"cycles: {best_cycle}" + ) + def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): src_code = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) @@ -1264,7 +1229,7 @@ def make_kernel_render( template=self, kwargs=kwargs ) - tile_candidates = self.get_tile_candidates(**kwargs) + tile_candidates = self.get_tile_candidates(**kwargs)[:CONFIG_AUTOTUNE_TEMPLATE_TOPK] return kernel, tile_candidates, render return MLIRTemplateCaller( From 3692365be9631792dd7bdd9deba3159e0c46fa66 Mon Sep 17 
00:00:00 2001 From: Wonhyuk Yang Date: Thu, 6 Nov 2025 04:17:40 +0000 Subject: [PATCH 10/53] [Fix] Fix wrong divder in reduction fusion --- PyTorchSimFrontend/mlir/mlir_bmm_template.py | 1 + PyTorchSimFrontend/mlir/mlir_gemm_template.py | 1 + PyTorchSimFrontend/mlir/mlir_template.py | 18 +++++++----------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py index 9a12076a..178ea987 100644 --- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py @@ -301,6 +301,7 @@ def render(self, dram_idx = Y_idx, dram_tile_desc = Y_tile_desc, nr_rdim = nr_rdim, + r_dim_size = M, dim_aliasing = epilogue_dim_aliasing ) code = self._template_from_string(template).render(**kernel.render_options) diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index 0830b4e6..c2120e7b 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -256,6 +256,7 @@ def render(self, dram_idx = Y_idx, dram_tile_desc = Y_tile_desc, nr_rdim = nr_rdim, + r_dim_size = M, dim_aliasing = epilogue_dim_aliasing ) code = self._template_from_string(template).render(**kernel.render_options) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 50fa6204..e6e9dd0c 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -892,7 +892,7 @@ def load_epilogue(self, name: str, index: sympy.Expr): vshape = f"vector<{vsize}x{mlir_dtype}>" if compute_vec_size > 1: - offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.reduction_axis_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})") + offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, 
%{self.reduction_loop_idx})") compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"]) operation = "affine.vector_load" line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" @@ -1077,12 +1077,7 @@ def store_reduction_epilogue(self, name, index, value): if self.welford_reduce_out is not None: # NOTE: It not a real welford algorithm... We just used E(X^2) - E(X)^2 - divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.reduction_axis_size)} : f32") - if self.reduction_axis_size - 1 > 0: - divider2 = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.reduction_axis_size-1)} : f32") - else: - divider2 = divider - + divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.r_dim_size)} : f32") if self.buffer_types[name][1] > 1: divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to {new_reduced_shape}") else: @@ -1121,15 +1116,16 @@ def set_tile_size(self, template_fusion_info, prologue=False): if 'nr_rdim' in template_fusion_info and template_fusion_info['nr_rdim']==1: tile_desc.nr_rdim = 1 numel_per_lane = tile_desc.get_numel_per_lane() - reduction_axis_size = tile_desc.get_tile_size()[-1] - nr_outer_loop = (numel_per_lane + reduction_axis_size-1) // reduction_axis_size + r_tile_size = tile_desc.get_tile_size()[-1] + nr_outer_loop = (numel_per_lane + r_tile_size-1) // r_tile_size tile_desc.vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... 
self.reduction_fusion = True - self.reduction_axis_size = tile_desc.get_tile_size()[-1] + self.r_tile_size = tile_desc.get_tile_size()[-1] + self.r_dim_size = template_fusion_info['r_dim_size'] self.reduction_nr_outer_loop = nr_outer_loop self.reduction_loop_idx = "reduce_loop_idx" - self.compute_body_loop.size = reduction_axis_size + self.compute_body_loop.size = r_tile_size self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop) else: From 3660e2d79af87c69dbecaa4dfb95261ff6ecace4 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 10 Nov 2025 12:34:12 +0000 Subject: [PATCH 11/53] [CI] Add accuracy validation --- .github/workflows/pytorchsim_test.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index c27df48a..bc356d85 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -662,3 +662,24 @@ jobs: -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py + + test_accuracy: + name: Run test_accuracy + runs-on: self-hosted + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Run run_cycle.sh + run: | + echo "Running run_cycle.sh" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh From 1ed9ca65cacbb0597e418961bf46a4a46774f78f Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 12 Nov 2025 03:25:24 +0000 Subject: 
[PATCH 12/53] [Fix] fix typo in backendsim + simlator.py (#176, #177) Reported-by: journewki --- PyTorchSimBackend/include/SimulationConfig.h | 2 +- PyTorchSimBackend/src/Common.cc | 2 +- PyTorchSimBackend/src/Simulator.cc | 2 +- Simulator/simulator.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h index 8f011d00..06a41c9f 100644 --- a/PyTorchSimBackend/include/SimulationConfig.h +++ b/PyTorchSimBackend/include/SimulationConfig.h @@ -50,7 +50,7 @@ struct SimulationConfig { uint32_t icnt_print_interval=0; /* Sheduler config */ - uint32_t num_patition=1; + uint32_t num_partition=1; std::string scheduler_type; /* Core id, Partiton id mapping */ diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc index 5581f8bd..687f32f5 100644 --- a/PyTorchSimBackend/src/Common.cc +++ b/PyTorchSimBackend/src/Common.cc @@ -116,7 +116,7 @@ SimulationConfig initialize_config(json config) { parsed_config.scheduler_type = config["scheduler"]; if (config.contains("num_partition")) - parsed_config.num_patition = config["num_partition"]; + parsed_config.num_partition = config["num_partition"]; if (config.contains("partition")) { for (int i=0; i(Scheduler(config, &_core_cycles, &_core_time, i))); } diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 4faf1c85..bd048538 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -409,7 +409,7 @@ def get_result_from_file(result_path): break if simulation_finished_idx == -1: - print("[BackendSimulator] Treid to parsing wrong formated output file!") + print("[BackendSimulator] Tried to parsing wrong formated output file!") return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time total_stat_lines = lines[simulation_finished_idx:] From 9936bb08b37b4776aa4750569da32c3e84fde72e Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Wed, 12 Nov 2025 01:31:21 +0000 Subject: [PATCH 13/53] 
[doc] add CMEM in readme --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index 80329b16..15161d04 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ PyTorchSim **supports**: - [Multi-tenancy](#multi-tenancy) - [Compiler optimizations](#compiler-optimizations) - [Mapping](#mapping) +- [L2 Cache](#l2-cache) (persistent cache) ## Model Zoo | Model | Source | Status | Note | @@ -298,6 +299,25 @@ export TORCHSIM_TILE_M=512 export TORCHSIM_TILE_N=512 export TORCHSIM_TILE_K=512 ``` +## L2 Cache +PyTorchSim supports an L2 cache that acts as a persistent cache. Users can provide a software-managed allocation/eviction strategy for tensors kept in the persistent cache. + +Common Memory (CMEM) is a new feature introduced in the latest TPUs (newer than TPUv3). Multiple cores share this memory, which provides high bandwidth. Reusable tensors are stored and loaded from CMEM to avoid off-chip traffic. Our L2 cache can work like CMEM. + +To allocate a tensor in L2 cache, set the environment variable as shown below. The `tpuv4` directory provides example plans for L2 cache obtained from TPUv4 profiling. +```bash +export SRAM_BUFFER_PLAN_PATH=tpuv4/gemm_plan.py +``` +The L2 cache strategy file is composed as follows: +``` +plan = { + "arg0_1" +} +``` +In this example, only one input tensor is registered in L2 cache. You can refer to the tensor name from the wrapper code. After running the code, you can find the wrapper codegen path in the [result](#result) section. + +Last but not least, you must set `l2d_type` and `l2d_config` in the [TOGSim config](#togsim-configuration) to use L2 cache. The `l2d_config` follows the same configuration method as [AccelSim](https://github.com/accel-sim/accel-sim-framework). + ## Compiler Configuration `PyTorchSimFrontend/extension_config.py` contains target hardware configuration to compile.
@@ -332,6 +352,9 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing "dram_nbl" : 2, // DRAM burst length size "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", // Ramulator2 config file path + "l2d_type" : "datacache", + "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", + "icnt_type" : "simple", // Interconnect type (ex. booksim, simple) "icnt_latency" : 7, // Interconnect latency (cycle) "icnt_freq" : 28000, // Interconnect frequency (MHz) From 2eb2ea36efc0bcdf96d2c6d5b7034e815d159616 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Wed, 12 Nov 2025 01:37:09 +0000 Subject: [PATCH 14/53] [doc] autotune template readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 15161d04..958ea15c 100644 --- a/README.md +++ b/README.md @@ -265,6 +265,7 @@ We adopt and modified heuristic-based mapping of [GEMMINI](https://github.com/uc Heuristic method is not optimal for some cases. PyTorchSim provides auto-tuning to find best mapping for GEMM, CONV, and vector operations. It reduces searching space by sorting of scratchpad memory utilization and pick top-k candiates. Searching parameters are tile shape and vector lane stride. ```bash export AUTOTUNE=True +export AUTOTUNE_TEMPLATE=True ``` ### Manunal setting User can exploit third-party(e.g. Timeloop) mapping. Set the cheatsheet path and write down their own mapping. 
From fd2bdc19128a74d451b34ad36c66dd5eec4cba67 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Wed, 12 Nov 2025 02:54:10 +0000 Subject: [PATCH 15/53] [environment] build from source handle #175 issue --- README.md | 5 +++++ scripts/build_from_source.sh | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 scripts/build_from_source.sh diff --git a/README.md b/README.md index 958ea15c..4289195e 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,11 @@ To download the latest Docker image and set up the environment, use the followin # Run the Docker container docker run -it --ipc=host --name torchsim -w /workspace/PyTorchSim ghcr.io/psal-postech/torchsim-ci:latest bash ``` +### Manual Setting (Optional) +For expert users, this script builds the [Gem5](https://github.com/PSAL-POSTECH/gem5.git), [LLVM](https://github.com/PSAL-POSTECH/llvm-project.git), and [Spike](https://github.com/PSAL-POSTECH/riscv-isa-sim.git) simulators from source. +```bash +bash scripts/build_from_source.sh +``` ### Run Examples The `tests` directory contains several AI workloads examples. 
```bash diff --git a/scripts/build_from_source.sh b/scripts/build_from_source.sh new file mode 100644 index 00000000..d9806069 --- /dev/null +++ b/scripts/build_from_source.sh @@ -0,0 +1,22 @@ +#!/bin/bash +home="/workspace" +cd $home + +# Gem5 +apt -y update && apt -y upgrade && apt -y install scons +git clone https://github.com/PSAL-POSTECH/gem5.git +cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) +export GEM5_PATH=$home/gem5/release/gem5.opt +cd $home + +# LLVM +git clone https://github.com/PSAL-POSTECH/llvm-project.git +cd llvm-project && mkdir build && cd build && \ + cmake -DLLVM_ENABLE_PROJECTS=mlir -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/riscv-llvm -DLLVM_TARGETS_TO_BUILD=RISCV -G "Unix Makefiles" ../llvm && \ + make -j && make install +cd $home + +# Spike Simulator +git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \ + ../configure --prefix=$RISCV && make -j && make install +cd $home \ No newline at end of file From ce9950f7d9dd0ea118b38bacc31fe5394d6e654b Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 12 Nov 2025 05:54:38 +0000 Subject: [PATCH 16/53] [Performance] fix renamed variable access --- PyTorchSimFrontend/mlir/mlir_template.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index e6e9dd0c..4e9b74d2 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -1118,7 +1118,7 @@ def set_tile_size(self, template_fusion_info, prologue=False): numel_per_lane = tile_desc.get_numel_per_lane() r_tile_size = tile_desc.get_tile_size()[-1] nr_outer_loop = (numel_per_lane + r_tile_size-1) // r_tile_size - tile_desc.vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... + tile_desc.vmap.forced_vec_size = nr_outer_loop * 32 # Why? 
Emprically selected, other option failed to functionality... self.reduction_fusion = True self.r_tile_size = tile_desc.get_tile_size()[-1] @@ -1129,7 +1129,7 @@ def set_tile_size(self, template_fusion_info, prologue=False): self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop) else: - tile_desc.vec_size=64 + tile_desc.vmap.forced_vec_size = 64 if prologue: self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane() From a6e6ea9eefd179b45799e356f07c12493c11423c Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 13 Nov 2025 02:21:21 +0000 Subject: [PATCH 17/53] [CI] Store validation results as an artifact --- .github/workflows/pytorchsim_test.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index bc356d85..7de66163 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -682,4 +682,11 @@ jobs: -e TORCHSIM_DUMP_PATH=/dump \ -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ - ${{ inputs.image_name }} PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh + ${{ inputs.image_name }} PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && cp /workspace/PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out + + - name: Upload Accuracy Report Artifact + uses: actions/upload-artifact@v4 + with: + name: accuracy-report + path: /tmp/torchsim-ci/${GITHUB_SHA}/summary_cycle.out + if-no-files-found: error \ No newline at end of file From ca6a75862d29026cf0d0eae6a68b8614ec9b8fb8 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 13 Nov 2025 02:22:33 +0000 Subject: [PATCH 18/53] [Fix] Keep loop_size variable in the choice list --- .github/workflows/pytorchsim_test.yml | 7 ++++++- 
PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 8 ++++---- PyTorchSimFrontend/mlir/mlir_template.py | 11 ++++++++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index 7de66163..e515bc1b 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -674,6 +674,9 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Prepare volume directory + run: mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} + - name: Run run_cycle.sh run: | echo "Running run_cycle.sh" @@ -682,7 +685,9 @@ jobs: -e TORCHSIM_DUMP_PATH=/dump \ -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ - ${{ inputs.image_name }} PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && cp /workspace/PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out + ${{ inputs.image_name }} bash -c \ + "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && \ + cp PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out" - name: Upload Accuracy Report Artifact uses: actions/upload-artifact@v4 diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index b3352ea6..6400f9c9 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1627,15 +1627,15 @@ def get_cycle(choice): choices = self.make_choices(*args) if len(choices) == 0: # can't autotune - return None + return [None, None] with ThreadPoolExecutor(max_workers=8) as executor: results = list(executor.map(get_cycle, choices)) max_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") self._log_autotune_result(choices[max_idx], results[max_idx]) - optimal_src_code = 
choices[max_idx][1] - return optimal_src_code + optimal_src_code, loop_size = choices[max_idx][1], choices[max_idx][-1] + return optimal_src_code, loop_size def _log_autotune_result(self, best_choice, best_cycle): print( @@ -1648,7 +1648,7 @@ def codegen_nodes(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) if extension_config.CONFIG_AUTOTUNE and not extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: - optimal_src_code = self.autotune(nodes, kernel_name) + optimal_src_code = self.autotune(nodes, kernel_name)[0] if optimal_src_code is not None: return optimal_src_code return src_code diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 4e9b74d2..f9f25b93 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -29,7 +29,7 @@ from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common -from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR, CONFIG_AUTOTUNE_TEMPLATE_TOPK +from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR, CONFIG_AUTOTUNE_TEMPLATE_TOPK, CONFIG_AUTOTUNE_TEMPLATE from . 
import mlir_common class IndentedBufferGroup: @@ -494,7 +494,7 @@ def make_choices(self, tile_candidates, render, template_node, prologue_nodes, e print(f"[Auto-tune] Trying tile size: {list(tile_info)}") src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) bench_runner = self.run_bench([template_node], self.kernel_name, src_code) - choices.append((bench_runner, src_code, tile_info)) + choices.append((bench_runner, src_code, tile_info, self.loop_size)) self.reset(reason=None) return choices @@ -506,7 +506,12 @@ def _log_autotune_result(self, best_choice, best_cycle): ) def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): - src_code = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + if CONFIG_AUTOTUNE_TEMPLATE and len(tile_candidates): + src_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + self.loop_size = loop_size + else: + tile_info = tile_candidates[0] if tile_candidates else None + src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) with V.set_kernel_handler(self): self.meta_kernel() From 0fcc6183721b16885157dcce063039459e7f04e5 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 20 Nov 2025 12:15:42 +0000 Subject: [PATCH 19/53] [Fix] Use correct vec_size #183 --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 6400f9c9..f1a417f4 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1287,7 +1287,7 @@ def store_reduction(self, name, index, value): # mean reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1) divider = self.cse.generate(self.reductions_suffix, f"arith.constant 
{float(reduction_numel)} : f32") - if self.buffer_types[name][1] > 1: + if compute_vec_size > 1: divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>") else: divider_vec = divider From 85a46a6e0154bd47b5aa94ceb1803303a33a38b9 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 21 Nov 2025 03:58:43 +0000 Subject: [PATCH 20/53] [CI] Use self-hosted runner for later stage of building --- .github/workflows/docker-image.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 61eb96e1..eba48da2 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -6,7 +6,7 @@ on: jobs: build-and-test: - runs-on: ubuntu-latest + runs-on: self-hosted permissions: contents: read @@ -35,6 +35,7 @@ jobs: context: . file: ./Dockerfile push: true + no-cache: true tags: ghcr.io/psal-postech/torchsim-test:${{ github.sha }} # Step 4: Wait for GHCR propagation From 0d306c5e81a9afe05ae8dd67172168991ef56a16 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 21 Nov 2025 04:25:55 +0000 Subject: [PATCH 21/53] [Build] Remove deprecated submodule --- .gitmodules | 3 --- PyTorchSimBackend/extern/torch2timeloop | 1 - 2 files changed, 4 deletions(-) delete mode 160000 PyTorchSimBackend/extern/torch2timeloop diff --git a/.gitmodules b/.gitmodules index f65e5f2b..8edc7036 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,9 +7,6 @@ [submodule "PyTorchSimBackend/extern/booksim"] path = PyTorchSimBackend/extern/booksim url = https://github.com/PSAL-POSTECH/booksim.git -[submodule "PyTorchSimBackend/extern/torch2timeloop"] - path = PyTorchSimBackend/extern/torch2timeloop - url = https://github.com/Accelergy-Project/pytorch2timeloop-converter.git [submodule "PyTorchSimBackend/extern/ramulator2"] path = PyTorchSimBackend/extern/ramulator2 url = https://github.com/PSAL-POSTECH/ramulator2 diff 
--git a/PyTorchSimBackend/extern/torch2timeloop b/PyTorchSimBackend/extern/torch2timeloop deleted file mode 160000 index 62aa1754..00000000 --- a/PyTorchSimBackend/extern/torch2timeloop +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 62aa175421165cc9cd7dfb182a02fc3e26c01e3a From 3238d72f3d103d8af9a0d39f3c80ffa31f26c9d6 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 21 Nov 2025 12:11:16 +0000 Subject: [PATCH 22/53] [Cleanup] Remove deprecated llvm folders --- PyTorchSimFrontend/extension_codecache.py | 15 - .../llvm/llvm_caller_codegen.py | 236 -------------- PyTorchSimFrontend/llvm/llvm_common.py | 304 ------------------ PyTorchSimFrontend/mlir/mlir_autotune.py | 4 - .../mlir/mlir_caller_codegen.py | 185 ++++++++++- .../mlir/mlir_codegen_backend.py | 30 +- PyTorchSimFrontend/mlir/mlir_common.py | 29 -- .../mlir/mlir_conv_mt_template.py | 1 - PyTorchSimFrontend/mlir/mlir_gemm_template.py | 1 - .../mlir/mlir_maxpool_template.py | 2 - PyTorchSimFrontend/mlir/mlir_scheduling.py | 5 +- PyTorchSimFrontend/mlir/mlir_template.py | 4 +- Simulator/simulator.py | 8 +- 13 files changed, 214 insertions(+), 610 deletions(-) delete mode 100644 PyTorchSimFrontend/llvm/llvm_caller_codegen.py delete mode 100644 PyTorchSimFrontend/llvm/llvm_common.py diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index ca669361..4995a7d4 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -27,21 +27,6 @@ def dump_metadata(args, arg_attributes, path): file.write(f'{arg_name}=({arg_attribute[0]}, {arg.dtype}, {arg.shape})\n') return -def parse_stack_sizes(file_path): - meta_path = file_path.split(".")[0]+".meta" - cmd = ["riscv64-unknown-elf-objcopy", "--dump-section", f".stack_sizes={meta_path}", file_path, "/dev/null"] - subprocess.run(cmd, check=True) - - with open(meta_path, 'rb') as f: - stack_sizes_data = list(f.read()) - if len(stack_sizes_data) <= 17: - raise ValueError("Invalid 
.stack_sizes section size") - - stack_size_bytes = stack_sizes_data[8:-9] - stack_size = int.from_bytes(stack_size_bytes, byteorder='little') - return stack_size - - def llvm_compile_command(input, output): opt_output = f"{input[:-3]}_opt.ll" return [re.sub(r"[ \n]+", " ", diff --git a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py b/PyTorchSimFrontend/llvm/llvm_caller_codegen.py deleted file mode 100644 index 3690f533..00000000 --- a/PyTorchSimFrontend/llvm/llvm_caller_codegen.py +++ /dev/null @@ -1,236 +0,0 @@ -import os -import subprocess -import shlex -import re - -from torch._inductor.utils import IndentedBuffer -from torch._inductor.codegen import cpp -from torch._inductor.codecache import write_atomic - -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs - -class LLVMKernelCallerCodeGen(): - """ - Generate C that calls the llvm kernel. - """ - - def __init__(self, validation, arg_attributes): - super().__init__() - self.code = IndentedBuffer() - self.ending = ";" - self.open_bracket = "{" - self.closed_bracket = "}" - self.newline = "\n" - self.kernel_name = "kernel" - self.validation = validation - self.n_arg = len(arg_attributes) - self.arg_attributes = arg_attributes - self.arg_use_count = 1 - self.load_args = {} - self.kernel_start_addr = "" - self.kernel_end_addr = "" - - def get_argv_idx(self): - self.arg_use_count += 1 - return self.arg_use_count-1 - - def write_header(self): - self.writeline('#include ') - self.writeline('#include ') - self.writeline("#include ") - if self.validation: - self.writeline("#include ") - self.writeline('#include ') - self.writeline('#include ') - - def is_in_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_in(value) - - def is_out_arg(self, arg_name): - value = self.arg_attributes[arg_name][0] - return LLVMKernelArgs.is_llvm_arg_out(value) - - def load_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_in_arg(arg_name): - 
argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name] - self.load_args[arg_name] = argv_idx - self.writeline(f'if(load_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def dump_arg(self): - for i, arg_name in enumerate(self.arg_attributes.keys()): - if self.is_out_arg(arg_name): - argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name] - self.writeline(f'if(dump_arg({arg_name}, sizeof({arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - def write_exit(self): - self.writeline(f'return 0{self.ending}') - - def generate_kernel_declare(self): - args_type_p = [f'{cpp.DTYPE_TO_CPP[arg_type[1]]}*' for arg_type in self.arg_attributes.values()] - - self.writeline(f"void {self.kernel_name}({', '.join(args_type_p)}){self.ending}{self.newline}") - - def generate_args_define(self): - for arg_name, (_, arg_type, arg_shape) in self.arg_attributes.items(): - self.writeline(f'{cpp.DTYPE_TO_CPP[arg_type]} {arg_name}[atoi(argv[{self.get_argv_idx()}])] __attribute__ ((aligned (4096))){self.ending}') - self.writeline(self.newline) - - def generate_load_dump_fn(self): - self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000000){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - 
self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') - self.writeline(f'if (fd == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - - self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') - with self.code.indent(): - self.writeline(f'return -1{self.ending}') - self.writeline(self.closed_bracket) - self.writeline(f'close(fd){self.ending}') - self.writeline(f'return 0{self.ending}') - self.writeline(self.closed_bracket) - - def generate_main(self): - self.writeline(f'{self.newline}int main(int argc, char *argv[]) {self.open_bracket}{self.newline}') - with self.code.indent(): - if self.validation: - self.load_arg() - self.writeline(self.newline) - - self.writeline(f"{self.kernel_name}({', '.join(list(self.arg_attributes))}){self.ending}{self.newline}") - - if self.validation: - self.dump_arg() - - self.write_exit() - self.writeline(self.closed_bracket) - - def writeline(self, line): - self.code.writeline(line) - - def generate_wrapper_file(self, path, name): - self.dump_path = path - - self.write_header() - self.generate_kernel_declare() - - if self.validation: - self.generate_load_dump_fn() - self.generate_main() - - write_path = os.path.join(path, name+".c",) - write_atomic(write_path, self.code.getvalue()) - return - - def add_extention(self, name, extension): - return name + "." 
+ extension - - def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): - main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) - main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) - kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) - kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) - - main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' - kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' - - target = os.path.join(write_path, binary_name) - link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' - - main_compile_cmd = shlex.split(main_compile) - kernel_compile_cmd = shlex.split(kernel_compile) - link_cmd = shlex.split(link) - - try: - subprocess.check_call(main_compile_cmd) - subprocess.check_call(kernel_compile_cmd) - subprocess.check_call(link_cmd) - except subprocess.CalledProcessError as e: - print("Command failed with exit code", e.returncode) - print("Error output:", e.output) - assert(0) - - def parse_stack_sizes(self, file_path, vlenb=256): - with open(file_path, 'r') as f: - stack_sizes_data = f.readlines() - - in_proc = False - stack_base = None - dynamic_expr = None - max_offset = 0 - - for line in stack_sizes_data: - line = line.strip() - if line.startswith(".cfi_startproc"): - in_proc = True - continue - elif line.startswith(".cfi_endproc") and in_proc: - if dynamic_expr: - total_stack = eval(dynamic_expr, {"vlenb": vlenb}) - return total_stack - elif stack_base: - return stack_base - else: - return max_offset - - # Skip outer function - if not in_proc: - continue - - if line.startswith(".cfi_def_cfa_offset"): - stack_base = int(line.split()[-1]) - - if ".cfi_escape" in line and "#" in line: - comment = line.split("#")[-1].strip() - m = re.search(r"sp \+ 
(\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) - if m: - base, scale = int(m.group(1)), int(m.group(2)) - dynamic_expr = f"{base} + {scale} * vlenb" - - def get_spad_size(self, binary_path): - cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if result.returncode != 0: - raise RuntimeError(f"Readelf error: {result.stderr}") - - output = result.stdout - spad_start = None - spad_end = None - for line in output.splitlines(): - if '.spad' in line and 'SECTION' in line: - parts = line.split() - spad_start = int(parts[1], 16) - elif 'spad_end' in line: - parts = line.split() - spad_end = int(parts[1], 16) - - if spad_start is None or spad_end is None: - return 0 - spad_size = spad_end - spad_start - return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_common.py b/PyTorchSimFrontend/llvm/llvm_common.py deleted file mode 100644 index 1c76b826..00000000 --- a/PyTorchSimFrontend/llvm/llvm_common.py +++ /dev/null @@ -1,304 +0,0 @@ -import torch -from torch._inductor.codegen import common -from torch._inductor.virtualized import V -import sympy - -from typing import Callable - -import sympy - -import torch.fx -from torch.utils._sympy.value_ranges import ValueRanges - -from torch._inductor.utils import ( - free_symbol_startswith, - get_sympy_Expr_dtype, - IndentedBuffer, - sympy_subs, - unique, -) - -schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") - -DTYPE_TO_LLVM = { - torch.float32: "float", - torch.float64: "double", - torch.float16: "half", - torch.int64: "i64", - torch.int32: "i32", - torch.int16: "i16", - torch.int8: "i8", - torch.uint8: "i8", - torch.bool: "i8", - torch.bfloat16: "bfloat", -} - -DTYPE_SIZE = { - torch.float32: 4, - torch.float64: 8, - torch.float16: 2, - torch.int64: 8, - torch.int32: 4, - torch.int16: 2, - torch.int8: 1, - torch.uint8: 1, - torch.bool: 1, - torch.bfloat16: 2, -} - -DTYPE_LOWP_FP = [ - 
torch.bfloat16, - torch.float16, -] - -class LLVMKernelArgs(common.KernelArgs): - LLVM_ARGS_IN = 0x01 - LLVM_ARGS_OUT = 0x02 - LLVM_ARGS_INOUT = 0x04 - LLVM_ARGS_VAR = 0x08 - - @staticmethod - def is_llvm_arg_in(value): - return (LLVMKernelArgs.LLVM_ARGS_IN & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - @staticmethod - def is_llvm_arg_out(value): - return (LLVMKernelArgs.LLVM_ARGS_OUT & value) | (LLVMKernelArgs.LLVM_ARGS_INOUT & value) - - def llvm_argdefs(self, only_args=False): - buffer_types = {x.get_name(): [x.get_dtype(), x.get_numel()] for x in V.graph.buffers} - for name, val in V.graph.graph_inputs.items(): - if isinstance(val, sympy.Expr): - buffer_types[name] = [get_sympy_Expr_dtype(val), 1] - else: - buffer_types[name] = [val.get_dtype(), val.get_numel()] - buffer_types.update( - {name: val.dtype for name, val in V.graph.constants.items()} - ) - - call_args = [] - arg_defs = [] - arg_attributes = {} - for inplaced in unique(self.inplace_buffers.values()): - if self._buffer_is_marked_removed(inplaced): - continue - outer = inplaced.other_names[-1] - inner = inplaced.inner_name - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_INOUT] + buffer_types[outer] - for outer, inner in self.input_buffers.items(): - if outer in self.inplace_buffers: - continue - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_IN] + buffer_types[outer] - for outer, inner in self.output_buffers.items(): - if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner): - continue - arg_defs.append(f"ptr %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_OUT] + buffer_types[outer] - for outer, inner in self.sizevars.items(): - arg_defs.append(f"ptr readonly %{inner}") - if not only_args: - call_args.append(outer) - arg_attributes[outer] = [self.LLVM_ARGS_VAR] + 
buffer_types[outer] - return arg_defs, call_args, arg_attributes - -class BaseLLVMKernel(common.Kernel): - newvar_prefix = "%" - name_prefix = "body" - vector_prefix = "vector_body" - suffix = "" - overrides = None - load_format = None - store_format = None - - def __init__(self, args=None): - super().__init__(args) - self.vector_compute = IndentedBuffer() - self.reductions_suffix = IndentedBuffer() - self.cse = common.CSE(self.newvar_prefix, self.suffix, self.name_prefix) - self.vector_cse = common.CSE(self.newvar_prefix, self.suffix, self.vector_prefix) - self.tile_size = None - self.tile_shape = {} - - def load(self, name: str, index: sympy.Expr): - raise NotImplementedError() - - def store_reduction(self, name, index, value): - raise NotImplementedError() - - def store(self, name, index, value, mode=None): - raise NotImplementedError() - - def reduction(self, dtype, src_dtype, reduction_type, value): - raise NotImplementedError() - - def widening(self, args, buf_bounds): - if not args[0] in self.tile_shape or not args[1] in self.tile_shape: - return args, [1, 1] - tile_shape0 = self.tile_shape[args[0]] - tile_shape1 = self.tile_shape[args[1]] - vec_len0 = tile_shape0[0] * tile_shape0[1] - vec_len1 = tile_shape1[0] * tile_shape1[1] - if tile_shape0 != tile_shape1: - temp = list(args) - idx = 0 if tile_shape0[0] != tile_shape1[0] else 1 - if tile_shape0[idx] > tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape1[idx-1]}" for i in range(vec_len0)] - else: - indexes = [f"i32 {i//tile_shape1[idx-1]}" for i in range(vec_len0)] - line = f"shufflevector <{vec_len1} x float> %{args[1]}, <{vec_len1} x float> undef, <{vec_len0} x i32> <{', '.join(indexes)}>" - temp[1] = self.cse.generate(self.compute, line, bounds=buf_bounds) - elif tile_shape0[idx] < tile_shape1[idx]: - if idx == 0: - indexes = [f"i32 {i%tile_shape0[idx-1]}" for i in range(vec_len1)] - else: - indexes = [f"i32 {i//tile_shape0[idx-1]}" for i in range(vec_len1)] - line = f"shufflevector 
<{vec_len0} x float> %{args[0]}, <{vec_len0} x float> undef, <{vec_len1} x i32> <{', '.join(indexes)}>" - temp[0] = self.cse.generate(self.compute, line, bounds=buf_bounds) - args = tuple(temp) - return args, max(tile_shape0, tile_shape1) - - def __enter__(self): - class CSEProxy: - self.name = "CSEProxy" - - @staticmethod - def __getattr__(name: str) -> Callable[..., common.CSEVariable]: # type: ignore[misc] - def inner(*args, **kwargs): - # TritonTemplateKernel has no current_node - buf_bounds = ValueRanges.unknown() - if hasattr(V.interpreter, "current_node"): - fx_node = V.interpreter.current_node - assert isinstance(self.node_to_bounds, dict) - buf_bounds = self.node_to_bounds.get( - fx_node, ValueRanges.unknown() - ) - - vector_csevar = None - if isinstance(args[0], list): - vector_args = (args[0][0], args[1][0]) - vector_csevar = self.vector_cse.generate( - self.vector_compute, - getattr(parent_handler, "vector_" + name)(*vector_args, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - vector_csevar.update_on_args(name, vector_args, kwargs) - args = (args[0][1], args[1][1]) - if len(args) == 2: - args, tile_shape = self.widening(args, buf_bounds) - elif len(args) == 1: - tile_shape = self.tile_shape[args[0]] - else: - assert(0) # not implemented yet. 
- vec_len = tile_shape[0] * tile_shape[1] - csevar = self.cse.generate( - self.compute, - getattr(parent_handler, name)(*args, tile_size=vec_len, **kwargs), # type: ignore[has-type] - bounds=buf_bounds, - ) - self.tile_shape[csevar] = tile_shape - csevar.update_on_args(name, args, kwargs) - if vector_csevar is not None: - return [vector_csevar, csevar] - return csevar - - return inner - - @staticmethod - def indirect_indexing(index_var, size, check=True): - # Skip CSE since this doesn't return an expression - return self.indirect_indexing(index_var, size, check) # type: ignore[attr-defined] - - @staticmethod - def load(name: str, index: sympy.Expr): - if name in self.cse.invalidated_stores: - # A load from an invalidated store requires us to - # keep the actual buffer around - V.kernel.must_keep_buffers.add(name) - if free_symbol_startswith(index, "%"): - return self.indirect_load(name, index) - store_cache = self.cse.store_cache - if name in store_cache: - return store_cache[name] - return self.load(name, index) - - @staticmethod - def store(name, index, value, mode=None): - self.store_buffer_names.add(name) - if mode is None: - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - if name not in V.graph.removed_buffers: - return self.store(name, index, value, mode=mode) - - @staticmethod - def store_reduction(name, index, value): - self.store_buffer_names.add(name) - self.cse.store_cache[name] = value - if self.current_node: - for other_name in self.current_node.get_mutations(): - self.cse.store_cache[other_name] = value - - if name not in V.graph.removed_buffers: - return self.store_reduction(name, index, value) - - @staticmethod - def reduction(dtype, src_dtype, reduction_type, value): - return self.reduction(dtype, src_dtype, reduction_type, value) - - @staticmethod - def bucketize( - values, - offsets_name: str, - offsets_size: sympy.Expr, - 
indexing_dtype: torch.dtype, - right: bool, - ): - """ - [Note: Inductor bucketize op] - - Given values (tensor) and offsets_name (reference to the name of a 1D - tensor), calculate the bucket that each value belongs to. - - e.g. for values [-1, 0, 1, 2, 3, 4, 5, 9], offsets [0, 4, 4, 8], right=True - return = [ 0, 1, 1, 1, 1, 3, 3, 4]. - - When right == False, bucket i refers to range (offsets[i], offsets[i+1]]. - When right == True, bucket i refers to range [offsets[i], offsets[i+1]). - - Offsets must be non-decreasing or the result is undefined. - """ - return self.bucketize( - values, offsets_name, offsets_size, indexing_dtype, right - ) - - super().__enter__() - assert self.overrides - parent_handler = self.overrides(V.get_ops_handler()) - self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) - self.exit_stack.enter_context(V.set_kernel_handler(self)) - return self - - def rename_indexing(self, index) -> sympy.Expr: - # adds the necessary kernel args for index expressions - # and renames variables in index expressions to kernel arg names - if isinstance(index, (list, tuple)): - return [self.rename_indexing(x) for x in index] - index = V.graph.sizevars.simplify(index) - sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name) - replacements = { - x: self.args.size(x) - for x in sorted_symbols - if x.name.startswith("s") or x.name.startswith("ps") - } - return sympy_subs(index, replacements) diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index 537809de..1027ccd9 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -2,7 +2,6 @@ import torch import os import dataclasses -from torch._inductor.autotune_process import BenchmarkRequest from torch._inductor.autotune_process import TensorMeta from torch._inductor.codecache import get_hash, write from PyTorchSimFrontend import extension_config @@ -11,12 +10,9 @@ from typing import ( Any, Callable, - Dict, 
Iterable, List, Optional, - Sequence, - TYPE_CHECKING, Union, ) diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py index 3fff9958..dff6b0fd 100644 --- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py +++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py @@ -1,16 +1,46 @@ +import os +import subprocess +import shlex +import re import torch -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs -from PyTorchSimFrontend.llvm.llvm_caller_codegen import LLVMKernelCallerCodeGen -from PyTorchSimFrontend.mlir.mlir_common import DTYPE_TO_C +from torch._inductor.utils import IndentedBuffer +from torch._inductor.codecache import write_atomic +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs, DTYPE_TO_C -class MLIRKernelCallerCodeGen(LLVMKernelCallerCodeGen): +class MLIRKernelCallerCodeGen(): + """ + Generate C that calls the llvm kernel. + """ def __init__(self, validation, arg_attributes, cycle_sim=False): - super().__init__(validation, arg_attributes) + super().__init__() + self.code = IndentedBuffer() + self.ending = ";" + self.open_bracket = "{" + self.closed_bracket = "}" + self.newline = "\n" + self.kernel_name = "kernel" + self.validation = validation + self.n_arg = len(arg_attributes) + self.arg_attributes = arg_attributes + self.arg_use_count = 1 + self.load_args = {} + self.kernel_start_addr = "" + self.kernel_end_addr = "" self.cycle_sim = cycle_sim + def get_argv_idx(self): + self.arg_use_count += 1 + return self.arg_use_count-1 + def write_header(self): - super().write_header() + self.writeline('#include ') + self.writeline('#include ') + self.writeline("#include ") + if self.validation: + self.writeline("#include ") + self.writeline('#include ') + self.writeline('#include ') global_var_header = "gem5_global_var.h" if self.cycle_sim else "global_var.h" self.writeline(f"#include \"{global_var_header}\"") @@ -42,6 +72,9 @@ def dump_arg(self): self.writeline(f'return -1{self.ending}') 
self.writeline(self.closed_bracket) + def write_exit(self): + self.writeline(f'return 0{self.ending}') + def generate_kernel_declare(self): # memref to llvm arguments (memref -> ptr, ptr, i64, , ) allocated pointer, aligned pointer, offset, size, stride args_type_p = [f'{DTYPE_TO_C[arg_type[1]]}*, {DTYPE_TO_C[arg_type[1]]}*, int64_t, int64_t, int64_t' for (_, arg_type) in self.arg_attributes] @@ -86,4 +119,142 @@ def generate_main(self): self.dump_arg() self.write_exit() - self.writeline(self.closed_bracket) \ No newline at end of file + self.writeline(self.closed_bracket) + + def generate_load_dump_fn(self): + self.writeline(f'{self.newline}int load_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000000){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (read(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'{self.newline}int dump_arg(void *arg, size_t size, const char *path) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'int fd = open(path, 0x00000001 | 0x00000040, 0644){self.ending}') + self.writeline(f'if (fd == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + + self.writeline(f'if (write(fd, arg, size) == -1) {self.open_bracket}') + with self.code.indent(): + self.writeline(f'return -1{self.ending}') + self.writeline(self.closed_bracket) + self.writeline(f'close(fd){self.ending}') + self.writeline(f'return 0{self.ending}') + self.writeline(self.closed_bracket) + + + def 
writeline(self, line): + self.code.writeline(line) + + def generate_wrapper_file(self, path, name): + self.dump_path = path + + self.write_header() + self.generate_kernel_declare() + + if self.validation: + self.generate_load_dump_fn() + self.generate_main() + + write_path = os.path.join(path, name+".c",) + write_atomic(write_path, self.code.getvalue()) + return + + def add_extention(self, name, extension): + return name + "." + extension + + def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""): + main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) + main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) + kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) + kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) + + main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' + kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' + + target = os.path.join(write_path, binary_name) + link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' + + main_compile_cmd = shlex.split(main_compile) + kernel_compile_cmd = shlex.split(kernel_compile) + link_cmd = shlex.split(link) + + try: + subprocess.check_call(main_compile_cmd) + subprocess.check_call(kernel_compile_cmd) + subprocess.check_call(link_cmd) + except subprocess.CalledProcessError as e: + print("Command failed with exit code", e.returncode) + print("Error output:", e.output) + assert(0) + + def parse_stack_sizes(self, file_path, vlenb=256): + with open(file_path, 'r') as f: + stack_sizes_data = f.readlines() + + in_proc = False + stack_base = None + dynamic_expr = None + max_offset = 0 + + for line in stack_sizes_data: + line = line.strip() + if line.startswith(".cfi_startproc"): + in_proc = True + continue + elif 
line.startswith(".cfi_endproc") and in_proc: + if dynamic_expr: + total_stack = eval(dynamic_expr, {"vlenb": vlenb}) + return total_stack + elif stack_base: + return stack_base + else: + return max_offset + + # Skip outer function + if not in_proc: + continue + + if line.startswith(".cfi_def_cfa_offset"): + stack_base = int(line.split()[-1]) + + if ".cfi_escape" in line and "#" in line: + comment = line.split("#")[-1].strip() + m = re.search(r"sp \+ (\d+)\s*\+\s*(\d+)\s*\*\s*vlenb", comment) + if m: + base, scale = int(m.group(1)), int(m.group(2)) + dynamic_expr = f"{base} + {scale} * vlenb" + + def get_spad_size(self, binary_path): + cmd = ["riscv64-unknown-elf-readelf", "-s", binary_path] + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Readelf error: {result.stderr}") + + output = result.stdout + spad_start = None + spad_end = None + for line in output.splitlines(): + if '.spad' in line and 'SECTION' in line: + parts = line.split() + spad_start = int(parts[1], 16) + elif 'spad_end' in line: + parts = line.split() + spad_end = int(parts[1], 16) + + if spad_start is None or spad_end is None: + return 0 + spad_size = spad_end - spad_start + return spad_size \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index f1a417f4..beeac439 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -7,10 +7,12 @@ import torch from collections import defaultdict from concurrent.futures import ThreadPoolExecutor +from torch._dynamo.testing import rand_strided +from torch._inductor.autotune_process import TensorMeta from torch._dynamo.utils import dynamo_timed from torch._inductor.codegen import cpp, wrapper, common, memory_planning from torch._inductor.virtualized import V, _ops as ops -from torch._inductor.codecache import write_atomic, 
write +from torch._inductor.codecache import write_atomic from torch._inductor.utils import ( IndentedBuffer, is_welford_reduction, @@ -21,6 +23,7 @@ from PyTorchSimFrontend import extension_config from . import mlir_common from .mlir_common import LoopLevel, LoopNest +from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest def reduction_init(reduction_type, dtype): if dtype in cpp.DTYPE_LOWP_FP: @@ -1637,6 +1640,31 @@ def get_cycle(choice): optimal_src_code, loop_size = choices[max_idx][1], choices[max_idx][-1] return optimal_src_code, loop_size + def run_bench(self, nodes, kernel_name, src_code): + _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() + input_call_args = tuple(self.args.input_buffers.keys()) + output_call_args = tuple(self.args.output_buffers.keys()) + full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) + full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) + + bmreq = MLIRBenchmarkRequest( + kernel_name=kernel_name, + input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), + output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), + extra_args={ + "vector_lane" : self.vector_lane, + "spad_info": self.spad_info, + "vlen" : self.vlen, + "arg_attributes" : arg_attributes, + "validate" : extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, + "autotune" : True, + }, + source_code=src_code, + ) + dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] + dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] + return bmreq.make_run_fn(dummy_inputs, dummy_outputs) + def _log_autotune_result(self, best_choice, best_cycle): print( f"[Auto-tune] Optimal tile size: {list(best_choice[2])}, " diff --git a/PyTorchSimFrontend/mlir/mlir_common.py 
b/PyTorchSimFrontend/mlir/mlir_common.py index 2644f125..c655dde3 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -1,15 +1,12 @@ import dataclasses import math from dataclasses import dataclass -from typing import Optional, Iterable from typing import Dict from typing import List from collections import defaultdict from functools import reduce from operator import mul import torch -from torch._dynamo.testing import rand_strided -from torch._inductor.autotune_process import TensorMeta from torch._inductor.codegen import common from torch._inductor.codegen import cpp from torch._inductor.virtualized import V @@ -35,7 +32,6 @@ ) from PyTorchSimFrontend import extension_config from PyTorchSimFrontend import extension_codecache -from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") DTYPE_TO_MLIR = { @@ -776,31 +772,6 @@ def codegen_nodes(self, nodes, kernel_name): self.meta_kernel() return src_code - def run_bench(self, nodes, kernel_name, src_code): - _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() - input_call_args = tuple(self.args.input_buffers.keys()) - output_call_args = tuple(self.args.output_buffers.keys()) - full_input_nodes = tuple([V.graph.get_buffer(k) for k in input_call_args]) - full_output_nodes = tuple([V.graph.get_buffer(k) for k in output_call_args]) - - bmreq = MLIRBenchmarkRequest( - kernel_name=kernel_name, - input_tensor_meta=TensorMeta.from_irnodes(full_input_nodes), - output_tensor_meta=TensorMeta.from_irnodes(full_output_nodes), - extra_args={ - "vector_lane" : self.vector_lane, - "spad_info": self.spad_info, - "vlen" : self.vlen, - "arg_attributes" : arg_attributes, - "validate" : extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - "autotune" : True, - }, - source_code=src_code, - ) - dummy_inputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, 
extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.input_tensor_meta] - dummy_outputs = [rand_strided(meta.sizes,meta.strides,dtype=meta.dtype, extra_size=meta.offset).to(device=nodes[0].get_device()) for meta in bmreq.output_tensor_meta] - return bmreq.make_run_fn(dummy_inputs, dummy_outputs) - def codegen_kernel(self, kernel_name): arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs() arg_defs = ",\n".ljust(25).join(arg_defs) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 26018a94..f013af56 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -5,7 +5,6 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode from PyTorchSimFrontend.mlir import mlir_common -from PyTorchSimFrontend import extension_config CONV_TEMPLATE = r""" // Multi Channel Tile Conv2D kernel diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index c2120e7b..6271b548 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -1,4 +1,3 @@ -import os import json from pathlib import Path from torch import empty_strided diff --git a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py index 2cca36b6..3658f992 100644 --- a/PyTorchSimFrontend/mlir/mlir_maxpool_template.py +++ b/PyTorchSimFrontend/mlir/mlir_maxpool_template.py @@ -1,11 +1,9 @@ -import os from typing import List, Optional, cast from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import Buffer from torch._inductor.ir import IRNode -from torch._inductor.ir import ReinterpretView from PyTorchSimFrontend.mlir import mlir_common import sympy diff --git 
a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 26b90401..9af84446 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -3,12 +3,9 @@ import sympy from functools import reduce import operator -from sympy import symbols, sympify, Symbol -from collections import OrderedDict -from concurrent.futures import ThreadPoolExecutor +from sympy import symbols, sympify from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel -from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest from torch._inductor import config from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode, BaseSchedulerNode diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index f9f25b93..2ad2e6a6 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -13,8 +13,8 @@ from typing import List, Optional from unittest.mock import patch -from torch._inductor.codegen.common import Kernel, KernelTemplate, ChoiceCaller, OpOverrides, CSE, DeferredLine -from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, View +from torch._inductor.codegen.common import KernelTemplate, ChoiceCaller, CSE, DeferredLine +from torch._inductor.ir import Buffer, IRNode, TemplateBuffer from torch._inductor.select_algorithm import PartialRender from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller from torch._inductor.autotune_process import TensorMeta diff --git a/Simulator/simulator.py b/Simulator/simulator.py index bd048538..b226259f 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -12,7 +12,7 @@ import torch import numpy as np -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs from PyTorchSimFrontend import 
extension_config TORCH_TO_NUMPY = { @@ -64,10 +64,10 @@ def dump_args(self, args, arg_attributes, load_path, dump_path): for (arg_name, arg_attribute), arg in zip(arg_attributes, args): size = arg_attribute[2] if arg_attribute[1] != torch.bool else (arg_attribute[2] + 7) // 8 array_size.append(size) - if LLVMKernelArgs.is_llvm_arg_in(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_in(arg_attribute[0]): index = self.write_arg(arg, load_path, arg_name) file_path.append(os.path.join(load_path, arg_name, f'{index}.raw')) - elif LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + elif MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): path = os.path.join(dump_path, arg_name) os.makedirs(path, exist_ok=True) file_path.append(os.path.join(path, f'{self.get_biggest_filename(path)}.raw')) @@ -121,7 +121,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= raise RuntimeError(f"{error_msg}") for (arg_name, arg_attribute), arg, path in zip(arg_attributes, args, file_path): - if LLVMKernelArgs.is_llvm_arg_out(arg_attribute[0]): + if MLIRKernelArgs.is_mlir_arg_out(arg_attribute[0]): self.load_tensor(arg, arg_name, arg_attribute, path) if cleanup: From 18b769a67408d2b0879c0c1e232e076ecaeba11d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 21 Nov 2025 12:13:45 +0000 Subject: [PATCH 23/53] [CI] debug --- .github/workflows/pytorchsim_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index e515bc1b..a8e49e63 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -688,6 +688,7 @@ jobs: ${{ inputs.image_name }} bash -c \ "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && \ cp PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out" + ls /tmp/torchsim-ci/${GITHUB_SHA} - name: Upload Accuracy Report Artifact uses: actions/upload-artifact@v4 From 
dbc0bec81b2a4daf9309dd36a849f1199aec14e8 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Sat, 29 Nov 2025 11:01:29 +0000 Subject: [PATCH 24/53] [fix] gem5 build pass --- scripts/build_from_source.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_from_source.sh b/scripts/build_from_source.sh index d9806069..fb9e82e3 100644 --- a/scripts/build_from_source.sh +++ b/scripts/build_from_source.sh @@ -6,7 +6,7 @@ cd $home apt -y update && apt -y upgrade && apt -y install scons git clone https://github.com/PSAL-POSTECH/gem5.git cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) -export GEM5_PATH=$home/gem5/release/gem5.opt +export GEM5_PATH=$home/gem5/build/RISCV/gem5.opt cd $home # LLVM From 1e3f4452c5012c3283d973f17585227ca78fe160 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Sat, 29 Nov 2025 11:24:02 +0000 Subject: [PATCH 25/53] [refactor] config renaming --- .../configs/heterogeneous_c2_simple_noc.json | 23 +++++------- .../configs/stonne_big_c1_simple_noc.json | 23 +++++------- .../configs/stonne_single_c1_simple_noc.json | 22 +++++------- .../stonne_validation_c1_simple_noc.json | 17 ++++----- .../systolic_ws_128x128_c1_booksim_tpuv2.json | 27 +++++++------- ...stolic_ws_128x128_c1_simple_noc_tpuv2.json | 26 ++++++-------- ...stolic_ws_128x128_c1_simple_noc_tpuv3.json | 27 +++++--------- ...c_ws_128x128_c1_simple_noc_tpuv3_half.json | 27 ++++++-------- ...stolic_ws_128x128_c1_simple_noc_tpuv4.json | 27 ++++++-------- .../systolic_ws_128x128_c2_booksim_tpuv3.json | 26 ++++++-------- .../systolic_ws_128x128_c2_chiplet_tpuv3.json | 28 +++++++-------- ...lic_ws_128x128_c2_chiplet_tpuv3_xnuma.json | 27 ++++++-------- ...stolic_ws_128x128_c2_simple_noc_tpuv2.json | 26 ++++++-------- ...stolic_ws_128x128_c2_simple_noc_tpuv3.json | 23 +++++------- ...128x128_c2_simple_noc_tpuv3_partition.json | 23 +++++------- ...stolic_ws_128x128_c2_simple_noc_tpuv4.json | 25 ++++++------- .../systolic_ws_8x8_c1_12G_simple_noc.json | 25 ++++++------- 
.../systolic_ws_8x8_c1_24G_simple_noc.json | 25 ++++++------- .../systolic_ws_8x8_c1_48G_simple_noc.json | 25 ++++++------- .../configs/systolic_ws_8x8_c1_booksim.json | 26 ++++++-------- .../systolic_ws_8x8_c1_simple_noc.json | 25 +++++++------ .../systolic_ws_8x8_c2_12G_simple_noc.json | 25 ++++++------- .../systolic_ws_8x8_c2_24G_simple_noc.json | 25 ++++++------- .../systolic_ws_8x8_c2_48G_simple_noc.json | 25 ++++++------- PyTorchSimBackend/src/Common.cc | 36 +++++++++---------- 25 files changed, 260 insertions(+), 374 deletions(-) diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json index 8f196e81..b3254182 100644 --- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json +++ b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json @@ -2,30 +2,25 @@ "core_type" : ["stonne", "ws_mesh"], "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 940, + "core_log_print_interval_cycle" : 10000, "num_stonne_per_core" : 8, "num_stonne_port" : 64, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", - "dram_freq" : 940, + "dram_freq_mhz" : 940, "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", + "icnt_freq_mhz" : 15000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - "precision" : 4, - "scheduler" : 
"simple", "num_partition" : 2, "partition": { "core_0":0, diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json index c7ef15f7..f94aeffa 100644 --- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json @@ -2,29 +2,24 @@ "core_type" : ["stonne"], "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 940, + "core_log_print_interval_cycle" : 10000, "num_stonne_per_core" : 8, "num_stonne_port" : 64, "dram_type" : "ramulator2", - "dram_freq" : 940, + "dram_freq_mhz" : 940, "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", + "icnt_freq_mhz" : 15000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - "precision" : 4, - "scheduler" : "simple", "num_partition" : 1, "partition": { "core_0":0 diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json index 2293e197..f777c8d3 100644 --- a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json @@ -2,28 +2,24 @@ "core_type" : ["stonne"], "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 1, - 
"core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 700, + "core_log_print_interval_cycle" : 10000, "num_stonne_per_core" : 1, "num_stonne_port" : 8, "dram_type" : "ramulator2", - "dram_freq" : 700, + "dram_freq_mhz" : 700, "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", + "icnt_freq_mhz" : 7000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - "precision" : 4, - "scheduler" : "simple", "num_partition" : 1, "partition": { "core_0":0 diff --git a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json index 08548638..68d83d01 100644 --- a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json @@ -2,28 +2,25 @@ "core_type" : ["stonne"], "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 1000, + "core_log_print_interval_cycle" : 10000, "num_stonne_per_core" : 1, "num_stonne_port" : 32, "dram_type" : "simple", - "dram_freq" : 1000, + "dram_freq_mhz" : 1000, "dram_channels": 1, - "dram_req_size": 32, + "dram_req_size_byte": 32, "dram_latency" : 100, - "dram_print_interval": 10000, + "dram_log_print_interval_cycle": 10000, "l2d_type" : "datacache", "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", "icnt_type" : "simple", 
"icnt_latency" : 7, - "icnt_freq" : 7000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", + "icnt_freq_mhz" : 7000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", - "precision" : 4, - "scheduler" : "simple", "num_partition" : 1, "partition": { "core_0":0 diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json index 5d7b0d35..c45ea65e 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json @@ -1,27 +1,24 @@ { "num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 700, + "core_log_print_interval_cycle" : 10000, "dram_type" : "ramulator2", - "dram_freq" :700, + "dram_freq_mhz" :700, "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, + "dram_req_size_byte": 32, + "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "icnt_type" : "booksim2", "icnt_latency" : 1, - "icnt_freq" : 8000, + "icnt_freq_mhz" : 8000, "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", - - "precision" : 4, - "scheduler" : "simple", + "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", + "num_partition" : 2, "partition": { "core_0":0, diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json index 38acafc0..082a9010 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json @@ -1,26 +1,22 @@ { 
"num_cores" : 1, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 700, + "core_log_print_interval_cycle" : 10000, "dram_type" : "ramulator2", - "dram_freq" : 700, + "dram_freq_mhz" : 700, "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, + "dram_req_size_byte": 32, "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 10000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", + "icnt_freq_mhz" : 10000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", + "num_partition" : 2, "partition": { "core_0":0, diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json index 7348d5bc..b7cf129d 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json @@ -1,29 +1,18 @@ { "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 940, + "core_log_print_interval_cycle" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", - "dram_freq" : 940, + "dram_freq_mhz" : 940, "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", 
"icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 1, - "partition": { - "core_0": 0 - } + "icnt_freq_mhz" : 15000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json index 69ec8bd0..7e53a9e4 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json @@ -1,27 +1,22 @@ { "num_cores" : 1, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 940, + "core_log_print_interval_cycle" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", - "dram_freq" : 940, + "dram_freq_mhz" : 940, "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 15000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", + "icnt_freq_mhz" : 15000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", + "num_partition" : 1, "partition": { "core_0": 0 diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json index bff4e224..47d90bf2 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json @@ 
-1,29 +1,24 @@ { "num_cores" : 1, - "core_freq" : 1050, - "sram_size" : 16777216, - "core_print_interval" : 10000, + "core_freq_mhz" : 1050, + "core_log_print_interval_cycle" : 10000, "num_systolic_array_per_core" : 4, "dram_type" : "ramulator2", - "dram_freq" :1200, + "dram_freq_mhz" :1200, "dram_channels": 16, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "l2d_type" : "datacache", "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", - + "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 19200, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", + "icnt_freq_mhz" : 19200, + "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", + "num_partition" : 1, "partition": { "core_0":0 diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json index d51e9c5f..76f48f67 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json @@ -1,31 +1,25 @@ { "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 940, + "core_log_print_interval_cycle" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", - "dram_freq" : 940, + "dram_freq_mhz" : 940, "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + 
"dram_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "booksim2", - "icnt_latency" : 7, - "icnt_freq" : 28000, + "icnt_freq_mhz" : 28000, "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt", + "booksim_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt", - "precision" : 4, - "scheduler" : "simple", "num_partition" : 2, "partition": { "core_0":0, "core_1":0 } -} \ No newline at end of file +} diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json index b2661894..9375fea1 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json @@ -1,30 +1,26 @@ { "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 940, + "core_log_print_interval_cycle" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", - "dram_freq" : 940, + "dram_freq_mhz" : 940, "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, "dram_num_partitions" : 2, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + "icnt_type" : "booksim2", "icnt_latency" : 1, - "icnt_freq" : 1000, + "icnt_freq_mhz" : 1000, "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", "icnt_print_interval" : 10000, - - "precision" : 4, - "scheduler" : "simple", + + "num_partition" : 2, "partition": { "core_0":0, diff --git 
a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json index 922ede5b..662c1e00 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json @@ -1,29 +1,24 @@ { "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 940, + "core_log_print_interval_cycle" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", - "dram_freq" : 940, + "dram_freq_mhz" : 940, "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, "dram_num_partitions" : 1, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + "icnt_type" : "booksim2", "icnt_latency" : 1, - "icnt_freq" : 1000, + "icnt_freq_mhz" : 1000, "icnt_node_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - - "precision" : 4, - "scheduler" : "simple", + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", + "num_partition" : 2, "partition": { "core_0":0, diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json index 034542fe..712cb32f 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json @@ -1,26 +1,22 @@ { "num_cores" : 2, - "core_freq" : 700, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 700, + "core_log_print_interval_cycle" : 10000, "dram_type" : "ramulator2", - "dram_freq" :700, + 
"dram_freq_mhz" :700, "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, + "dram_req_size_byte": 32, "dram_size" : 16, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 20000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "precision" : 4, - "scheduler" : "simple", + "icnt_freq_mhz" : 20000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", + "num_partition" : 2, "partition": { "core_0":0, diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json index 82f42c00..4865b1ad 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json @@ -1,27 +1,22 @@ { "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 940, + "core_log_print_interval_cycle" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", - "dram_freq" : 940, + "dram_freq_mhz" : 940, "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", + "icnt_freq_mhz" : 28000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - 
"precision" : 4, - "scheduler" : "simple", "num_partition" : 2, "partition": { "core_0":0, diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json index 132a52e6..5aad699e 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json @@ -1,27 +1,22 @@ { "num_cores" : 2, - "core_freq" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, + "core_freq_mhz" : 940, + "core_log_print_interval_cycle" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", - "dram_freq" : 940, + "dram_freq_mhz" : 940, "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 28000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", + "icnt_freq_mhz" : 28000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - "precision" : 4, - "scheduler" : "simple", "num_partition" : 2, "partition": { "core_0":0, diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json index a93e8ae2..eb7d76e8 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json @@ -1,30 +1,25 @@ { "num_cores" : 2, - "core_freq" : 1050, - "sram_size" : 32768, - "core_print_interval" : 10000, + "core_freq_mhz" : 1050, + "core_log_print_interval_cycle" 
: 10000, "num_systolic_array_per_core" : 4, "dram_type" : "ramulator2", - "dram_freq" :1200, + "dram_freq_mhz" :1200, "dram_channels": 32, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_size" : 32, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_log_print_interval_cycle": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "l2d_type" : "datacache", "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", - + "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 48000, + "icnt_freq_mhz" : 48000, "icnt_node_per_core" : 1, - "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", + "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - "precision" : 4, - "scheduler" : "simple", "num_partition" : 1, "partition": { "core_0":0, diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json index e9a64f2e..9afe18d1 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json @@ -1,24 +1,19 @@ { "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, + "core_freq_mhz" : 1000, + "core_log_print_interval_cycle" : 100000, "dram_type" : "ramulator2", - "dram_freq" :800, + "dram_freq_mhz" :800, "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, + "dram_req_size_byte": 64, "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - + "dram_num_burst_length" : 4, + "dram_log_print_interval_cycle": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq" : 1000, - "icnt_config_path" : 
"../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" + "icnt_freq_mhz" : 1000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt" } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json index 37e18b35..3f1435e8 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json @@ -1,24 +1,19 @@ { "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, + "core_freq_mhz" : 1000, + "core_log_print_interval_cycle" : 100000, "dram_type" : "ramulator2", - "dram_freq" :800, + "dram_freq_mhz" :800, "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, + "dram_req_size_byte": 64, "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - + "dram_num_burst_length" : 4, + "dram_log_print_interval_cycle": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" + "icnt_freq_mhz" : 8000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt" } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json index 49225d77..7037a045 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json @@ -1,24 +1,19 @@ { "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, + "core_freq_mhz" : 1000, + "core_log_print_interval_cycle" : 100000, "dram_type" : 
"ramulator2", - "dram_freq" :800, + "dram_freq_mhz" :800, "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, + "dram_req_size_byte": 64, "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - + "dram_num_burst_length" : 4, + "dram_log_print_interval_cycle": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" + "icnt_freq_mhz" : 8000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt" } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json index 4ea2c6ff..4fab1fd7 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json @@ -1,26 +1,22 @@ { "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, + "core_freq_mhz" : 1000, + "core_log_print_interval_cycle" : 100000, "dram_type" : "ramulator2", - "dram_freq" :800, + "dram_freq_mhz" :800, "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, + "dram_req_size_byte": 64, "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - + "dram_num_burst_length" : 4, + "dram_log_print_interval_cycle": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + "icnt_type" : "booksim2", "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple", + "icnt_freq_mhz" : 8000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", + "num_partition" : 2, "partition": { 
"core_0":0, diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json index 8aee751b..29955051 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json @@ -1,24 +1,23 @@ { "num_cores" : 1, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, + "core_freq_mhz" : 1000, + "core_log_print_interval_cycle" : 100000, "dram_type" : "ramulator2", - "dram_freq" :800, + "dram_freq_mhz" :800, "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, + "dram_req_size_byte": 64, + "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + "dram_num_burst_length" : 4, + "dram_log_print_interval_cycle": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", + "icnt_freq_mhz" : 8000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - "precision" : 4, - "scheduler" : "simple" + + } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json index f76fec32..3e6844fd 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json @@ -1,25 +1,20 @@ { "core_type" : ["ws_mesh","ws_mesh"], "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, + "core_freq_mhz" : 1000, + "core_log_print_interval_cycle" : 100000, "dram_type" : "ramulator2", - "dram_freq" :800, + "dram_freq_mhz" :800, "dram_channels": 1, - "dram_req_size": 64, - "dram_latency" : 10, + "dram_req_size_byte": 64, "dram_size" : 16, - "dram_nbl" : 4, 
- "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - + "dram_num_burst_length" : 4, + "dram_log_print_interval_cycle": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m4.icnt", - - "precision" : 4, - "scheduler" : "simple" + "icnt_freq_mhz" : 8000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c2_m4.icnt" } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json index 7571b830..d0927648 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json @@ -1,24 +1,19 @@ { "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, + "core_freq_mhz" : 1000, + "core_log_print_interval_cycle" : 100000, "dram_type" : "ramulator2", - "dram_freq" :800, + "dram_freq_mhz" :800, "dram_channels": 2, - "dram_req_size": 64, - "dram_latency" : 10, + "dram_req_size_byte": 64, "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - + "dram_num_burst_length" : 4, + "dram_log_print_interval_cycle": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c2_m8.icnt", - - "precision" : 4, - "scheduler" : "simple" + "icnt_freq_mhz" : 8000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c2_m8.icnt" } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json index be163336..095ea820 100644 --- 
a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json @@ -1,24 +1,19 @@ { "num_cores" : 2, - "core_freq" : 1000, - "sram_size" : 256, - "core_print_interval" : 100000, + "core_freq_mhz" : 1000, + "core_log_print_interval_cycle" : 100000, "dram_type" : "ramulator2", - "dram_freq" :800, + "dram_freq_mhz" :800, "dram_channels": 4, - "dram_req_size": 64, - "dram_latency" : 10, + "dram_req_size_byte": 64, "dram_size" : 16, - "dram_nbl" : 4, - "dram_print_interval": 100000, - "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - + "dram_num_burst_length" : 4, + "dram_log_print_interval_cycle": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq" : 8000, - "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "precision" : 4, - "scheduler" : "simple" + "icnt_freq_mhz" : 8000, + "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt" } \ No newline at end of file diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc index 5581f8bd..fd75a3e5 100644 --- a/PyTorchSimBackend/src/Common.cc +++ b/PyTorchSimBackend/src/Common.cc @@ -39,15 +39,14 @@ SimulationConfig initialize_config(json config) { for (int i=0; i(config, "core_print_interval"); + parsed_config.core_print_interval = get_config_value(config, "core_log_print_interval_cycle"); /* Stonne config */ if (config.contains("stonne_config_path")) @@ -63,18 +62,18 @@ SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented dram type {} ", (std::string)config["dram_type"])); - parsed_config.dram_freq = config["dram_freq"]; + parsed_config.dram_freq = config["dram_freq_mhz"]; if (config.contains("dram_latency")) parsed_config.dram_latency = config["dram_latency"]; - if (config.contains("dram_config_path")) - parsed_config.dram_config_path = 
config["dram_config_path"]; + if (config.contains("ramulator_config_path")) + parsed_config.dram_config_path = config["ramulator_config_path"]; parsed_config.dram_channels = config["dram_channels"]; - if (config.contains("dram_req_size")) - parsed_config.dram_req_size = config["dram_req_size"]; - if (config.contains("dram_print_interval")) - parsed_config.dram_print_interval = config["dram_print_interval"]; - if(config.contains("dram_nbl")) - parsed_config.dram_nbl = config["dram_nbl"]; + if (config.contains("dram_req_size_byte")) + parsed_config.dram_req_size = config["dram_req_size_byte"]; + if (config.contains("dram_log_print_interval_cycle")) + parsed_config.dram_print_interval = config["dram_log_print_interval_cycle"]; + if(config.contains("dram_num_burst_length")) + parsed_config.dram_nbl = config["dram_num_burst_length"]; if (config.contains("dram_num_partitions")) parsed_config.dram_num_partitions = config["dram_num_partitions"]; @@ -104,17 +103,18 @@ SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented icnt type {} ", (std::string)config["icnt_type"])); - parsed_config.icnt_freq = config["icnt_freq"]; + parsed_config.icnt_freq = config["icnt_freq_mhz"]; if (config.contains("icnt_latency")) parsed_config.icnt_latency = config["icnt_latency"]; - if (config.contains("icnt_config_path")) - parsed_config.icnt_config_path = config["icnt_config_path"]; - if (config.contains("icnt_print_interval")) - parsed_config.icnt_print_interval = config["icnt_print_interval"]; + if (config.contains("booksim_config_path")) + parsed_config.icnt_config_path = config["booksim_config_path"]; + if (config.contains("icnt_log_print_interval_cycle")) + parsed_config.icnt_print_interval = config["icnt_log_print_interval_cycle"]; if (config.contains("icnt_node_per_core")) parsed_config.icnt_node_per_core = config["icnt_node_per_core"]; - parsed_config.scheduler_type = config["scheduler"]; + if (config.contains("scheduler")) + 
parsed_config.scheduler_type = config["scheduler"]; if (config.contains("num_partition")) parsed_config.num_patition = config["num_partition"]; if (config.contains("partition")) { From 87a87a4a115042e3cc62a1b43a66c2ddbe7061de Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Sat, 29 Nov 2025 12:47:55 +0000 Subject: [PATCH 26/53] [refactor] environment variable --- PyTorchSimFrontend/extension_codecache.py | 14 +++++++------- PyTorchSimFrontend/extension_config.py | 8 ++++---- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +- PyTorchSimFrontend/mlir/mlir_common.py | 2 +- PyTorchSimFrontend/mlir/mlir_conv_common.py | 2 +- PyTorchSimFrontend/mlir/mlir_template.py | 4 ++-- README.md | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index ca669361..ffbbc2be 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -180,7 +180,7 @@ def load(cls, source_code, else: link_option = "" # Generate LLVM kernel calller and binary for validation - if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: + if extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: # Use custom malloc to avoid size error new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free" cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen) @@ -197,7 +197,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, arg_attributes) + val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, arg_attributes) val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name, new_link_option) @@ -228,7 +228,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - if 
extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return key # Generate MLIR kernel calller and binary for cycle calculation @@ -299,13 +299,13 @@ def dummy_simulator(*args, **kwargs): # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if not autotune and (extension_config.CONFIG_TORCHSIM_VALIDATION_MODE or validate): + if not autotune and (extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE or validate): funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, vectorlane_size=vectorlane_size, spad_info=spad_info, cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS, silent_mode=silent_mode) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return onnx_path = os.path.join(result_path, "tile_graph.onnx") @@ -329,11 +329,11 @@ def dryrun_simulator(*args, **kwargs): # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: return # Todo. 
Support valude dependent mode for graph mode - if False: # extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: + if False: # extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index fa5d22b5..7aa594c1 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -21,7 +21,8 @@ CONFIG_TORCHSIM_DUMP_PATH = os.environ.get('TORCHSIM_DUMP_PATH', default = f"{tempfile.gettempdir()}/torchinductor") CONFIG_TORCHSIM_DUMP_FILE = int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) -CONFIG_TORCHSIM_VALIDATION_MODE = int(os.environ.get('TORCHSIM_VALIDATION_MODE', default=True)) +CONFIG_TORCHSIM_FUNCTIONAL_MODE = int(os.environ.get('TORCHSIM_FUNCTIONAL_MODE', default=True)) +CONFIG_TORCHSIM_TIMING_MODE = int(os.environ.get("TORCHSIM_TIMING_MODE", True)) CONFIG_CLEANUP_DUMP_ARGS = int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) # LLVM PATH @@ -34,7 +35,6 @@ # Backendsim config CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG', default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') -CONFIG_BACKENDSIM_SPIKE_ONLY = int(os.environ.get("BACKENDSIM_SPIKE_ONLY", False)) CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False)) CONFIG_BACKENDSIM_DRYRUN = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) CONFIG_BACKENDSIM_DEBUG_LEVEL = os.environ.get("BACKENDSIM_DEBUG_LEVEL", "") @@ -45,8 +45,8 @@ default=f"{CONFIG_TORCHSIM_DIR}/gem5_script/script_systolic.py") # AUTOTUNE config -CONFIG_AUTOTUNE = int(os.environ.get('AUTOTUNE', default=True)) -CONFIG_AUTOTUNE_TEMPLATE = int(os.environ.get('AUTOTUNE_TEMPLATE', default=True)) +CONFIG_AUTOTUNE = int(os.environ.get('AUTOTUNE', default=False)) +CONFIG_AUTOTUNE_TEMPLATE = 
int(os.environ.get('AUTOTUNE_TEMPLATE', default=False)) CONFIG_MAX_AUTOTUNE_TRY = int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) CONFIG_AUTOTUNE_TEMPLATE_TOPK = int(os.environ.get('AUTOTUNE_TEMPLATE_TOPK', default=4)) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index b3352ea6..ddb215e3 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1647,7 +1647,7 @@ def _log_autotune_result(self, best_choice, best_cycle): def codegen_nodes(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) - if extension_config.CONFIG_AUTOTUNE and not extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY: + if extension_config.CONFIG_AUTOTUNE and extension_config.CONFIG_TORCHSIM_TIMING_MODE: optimal_src_code = self.autotune(nodes, kernel_name) if optimal_src_code is not None: return optimal_src_code diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 2644f125..cfc13230 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -792,7 +792,7 @@ def run_bench(self, nodes, kernel_name, src_code): "spad_info": self.spad_info, "vlen" : self.vlen, "arg_attributes" : arg_attributes, - "validate" : extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, + "validate" : extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, "autotune" : True, }, source_code=src_code, diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index 52979d73..a4cd14c6 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -93,7 +93,7 @@ def outer_func_render(self, kernel_name, input_args): OUTPUT=Y, PADDING_H=self.padding[0], PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, + 
VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, BACKENDSIM_EAGER_MODE=eager_mode, input_reorder=self.input_reorder ) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index e6e9dd0c..9215d7f5 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -29,7 +29,7 @@ from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common -from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR, CONFIG_AUTOTUNE_TEMPLATE_TOPK +from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_DIR, CONFIG_AUTOTUNE_TEMPLATE_TOPK, CONFIG_AUTOTUNE_TEMPLATE from . import mlir_common class IndentedBufferGroup: @@ -506,7 +506,7 @@ def _log_autotune_result(self, best_choice, best_cycle): ) def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): - src_code = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + src_code = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) if CONFIG_AUTOTUNE_TEMPLATE else self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_candidates[0]) with V.set_kernel_handler(self): self.meta_kernel() diff --git a/README.md b/README.md index 4289195e..0407fd0d 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ Simulation consists of three steps If you want to turn off the `SpikeSimulator` for fast simulation, you can set as below. ```bash -export TORCHSIM_VALIDATION_MODE=False +export TORCHSIM_FUNCTIONAL_MODE=False ``` Log contains memory & core stats. 
```bash From a44bbdde1186047bb41b437a92dcdc6620ac8871 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Sat, 29 Nov 2025 12:58:00 +0000 Subject: [PATCH 27/53] [Refactor] code & logs --- PyTorchSimBackend/include/Core.h | 18 ++- PyTorchSimBackend/include/{TMA.h => DMA.h} | 10 +- PyTorchSimBackend/include/Dram.h | 2 +- PyTorchSimBackend/include/Instruction.h | 3 - PyTorchSimBackend/include/Interconnect.h | 2 +- PyTorchSimBackend/include/SimulationConfig.h | 2 - PyTorchSimBackend/src/Core.cc | 123 +++++++++---------- PyTorchSimBackend/src/{TMA.cc => DMA.cc} | 18 +-- PyTorchSimBackend/src/Dram.cc | 2 +- PyTorchSimBackend/src/Simulator.cc | 4 +- PyTorchSimBackend/src/SparseCore.cc | 28 ++--- PyTorchSimBackend/src/TileGraph.cc | 1 - PyTorchSimBackend/src/TileGraphParser.cc | 8 -- PyTorchSimBackend/src/main.cc | 2 +- README.md | 4 +- 15 files changed, 99 insertions(+), 128 deletions(-) rename PyTorchSimBackend/include/{TMA.h => DMA.h} (96%) rename PyTorchSimBackend/src/{TMA.cc => DMA.cc} (68%) diff --git a/PyTorchSimBackend/include/Core.h b/PyTorchSimBackend/include/Core.h index a3d55fa2..4272bd52 100644 --- a/PyTorchSimBackend/include/Core.h +++ b/PyTorchSimBackend/include/Core.h @@ -9,7 +9,7 @@ #include "Dram.h" #include "Tile.h" #include "SimulationConfig.h" -#include "TMA.h" +#include "DMA.h" class Core { public: @@ -27,7 +27,7 @@ class Core { virtual void pop_memory_request(); virtual mem_fetch* top_memory_request() { return _request_queue.front(); } virtual void push_memory_response(mem_fetch* response); - void check_tag() { _tma.check_table(); } + void check_tag() { _dma.check_table(); } void inc_numa_hit() { _stat_numa_hit++; } void inc_numa_miss() { _stat_numa_miss++; } @@ -50,20 +50,18 @@ class Core { /* Core id & config file */ const uint32_t _id; const SimulationConfig _config; - size_t _sram_size; - size_t _used_sram_size; uint32_t _num_systolic_array_per_core; uint32_t _systolic_array_rr = 0; - /* TMA Unit */ - TMA _tma; + /* DMA Unit */ + DMA _dma; /* cycle 
*/ cycle_type _core_cycle; cycle_type _stat_tot_vu_compute_cycle = 0; std::vector _stat_tot_sa_compute_cycle; - cycle_type _stat_tot_tma_cycle = 0; - cycle_type _stat_tot_tma_idle_cycle = 0; + cycle_type _stat_tot_dma_cycle = 0; + cycle_type _stat_tot_dma_idle_cycle = 0; cycle_type _stat_tot_vu_compute_idle_cycle = 0; std::vector _stat_tot_sa_compute_idle_cycle; std::vector _stat_inst_count; @@ -76,8 +74,8 @@ class Core { cycle_type _stat_vu_compute_cycle = 0; std::vector _stat_sa_compute_cycle; - cycle_type _stat_tma_cycle = 0; - cycle_type _stat_tma_idle_cycle = 0; + cycle_type _stat_dma_cycle = 0; + cycle_type _stat_dma_idle_cycle = 0; cycle_type _stat_vu_compute_idle_cycle = 0; std::vector _stat_sa_compute_idle_cycle; uint64_t _stat_mem_response = 0; diff --git a/PyTorchSimBackend/include/TMA.h b/PyTorchSimBackend/include/DMA.h similarity index 96% rename from PyTorchSimBackend/include/TMA.h rename to PyTorchSimBackend/include/DMA.h index f8355470..f6da6c49 100644 --- a/PyTorchSimBackend/include/TMA.h +++ b/PyTorchSimBackend/include/DMA.h @@ -1,5 +1,5 @@ -#ifndef TMA_H -#define TMA_H +#ifndef DMA_H +#define DMA_H #include #include @@ -16,9 +16,9 @@ struct VectorCompare { } }; -class TMA { +class DMA { public: - TMA(uint32_t id, uint32_t dram_req_size); + DMA(uint32_t id, uint32_t dram_req_size); void issue_tile(std::shared_ptr inst); bool is_finished() { return _finished; } @@ -114,7 +114,7 @@ class TMA { } std::shared_ptr& get_current_inst() { return _current_inst; } - std::shared_ptr> get_memory_access(); + std::shared_ptr> get_memory_access(cycle_type core_cycle); uint32_t generate_mem_access_id(); const uint32_t get_max_dim() { return _max_dim; } diff --git a/PyTorchSimBackend/include/Dram.h b/PyTorchSimBackend/include/Dram.h index 5e51b96d..d28ac25f 100644 --- a/PyTorchSimBackend/include/Dram.h +++ b/PyTorchSimBackend/include/Dram.h @@ -6,7 +6,7 @@ #include #include "Common.h" -#include "TMA.h" +#include "DMA.h" #include "ramulator2.hh" #include 
"Hashing.h" #include "Cache.h" diff --git a/PyTorchSimBackend/include/Instruction.h b/PyTorchSimBackend/include/Instruction.h index 4c14dd81..9fad13f4 100644 --- a/PyTorchSimBackend/include/Instruction.h +++ b/PyTorchSimBackend/include/Instruction.h @@ -60,9 +60,7 @@ class Instruction : public std::enable_shared_from_this { std::vector get_trace_address() { return _trace_address; } bool load_indirect_index(const std::string& path, uint64_t*& indirect_index, const std::vector& tile_size); void set_trace_address(std::vector& trace_address) { _trace_address = trace_address; } - size_t get_free_sram_size() { return _free_sram_size; } addr_type get_base_dram_address() { return dram_addr; } - void set_free_sram_size(size_t sram_size) { _free_sram_size=sram_size; } void* get_owner() { return _owner; } void set_owner(void *owner) { _owner = owner;} void set_owner_ready_queue(std::list>* q) { _owner_ready_queue_ref = q; } @@ -103,7 +101,6 @@ class Instruction : public std::enable_shared_from_this { size_t _tile_numel; size_t _nr_waiting_request=0; size_t _precision=0; - size_t _free_sram_size=0; addr_type dram_addr; uint32_t _numa_id = 0; // For DMA instruction int _compute_type = 0; diff --git a/PyTorchSimBackend/include/Interconnect.h b/PyTorchSimBackend/include/Interconnect.h index 8467b7aa..59440fa3 100644 --- a/PyTorchSimBackend/include/Interconnect.h +++ b/PyTorchSimBackend/include/Interconnect.h @@ -1,6 +1,6 @@ #ifndef INTERCONNECT_H #define INTERCONNECT_H -#include "TMA.h" +#include "DMA.h" #include "booksim2/Interconnect.hpp" #include #include diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h index 8f011d00..6ff93506 100644 --- a/PyTorchSimBackend/include/SimulationConfig.h +++ b/PyTorchSimBackend/include/SimulationConfig.h @@ -19,7 +19,6 @@ struct SimulationConfig { std::string stonne_config_path; uint32_t num_cores; uint32_t core_freq; - uint32_t sram_size; uint32_t core_print_interval = 0; uint32_t 
num_systolic_array_per_core = 1; uint32_t num_stonne_per_core = 1; @@ -57,7 +56,6 @@ struct SimulationConfig { std::map partiton_map; /* Other configs */ - uint32_t precision; std::string layout; uint64_t align_address(uint64_t addr) { diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc index 4be41a70..cd641810 100644 --- a/PyTorchSimBackend/src/Core.cc +++ b/PyTorchSimBackend/src/Core.cc @@ -4,11 +4,9 @@ Core::Core(uint32_t id, SimulationConfig config) : _id(id), _config(config), _core_cycle(0), - _stat_tma_cycle(0), + _stat_dma_cycle(0), _num_systolic_array_per_core(config.num_systolic_array_per_core), - _tma(id, config.dram_req_size) { - _sram_size = _config.sram_size * 1024; - _used_sram_size = 0; + _dma(id, config.dram_req_size) { _sa_compute_pipeline.resize(_num_systolic_array_per_core); _stat_tot_sa_compute_cycle.resize(_num_systolic_array_per_core); _stat_sa_compute_cycle.resize(_num_systolic_array_per_core); @@ -25,14 +23,9 @@ bool Core::can_issue(const std::shared_ptr& op) { void Core::issue(std::shared_ptr op) { if (op->get_instructions().size()){ - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}, Free size: {}", - _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size(), - op->get_instructions().back()->get_free_sram_size()); - } else { - spdlog::trace("[Core {}][{}] New Tile is issued, remain sram: {} Required size: {}", - _id, _core_cycle, _sram_size-_used_sram_size, op->get_required_sram_size()); + spdlog::trace("[{}][Core {}] New Tile is issued", + _core_cycle, _id); } - //_used_sram_size += op->get_required_sram_size(); for (const auto& inst : op->get_instructions()) { if (inst->is_ready()) op->enqueue_ready(inst); @@ -125,39 +118,38 @@ void Core::dma_cycle() { /* Set tag table of async dma load */ if (instruction->is_dma_read() && instruction->is_async_dma()) { auto& key = instruction->get_tag_id(); - assert(!_tma.get_tag_finish(instruction->subgraph_id, key)); - 
_tma.set_tag_finish(instruction->subgraph_id, key); - spdlog::trace("[Core {}][{}] {} ASYNC FINISHED, Used sram: {}, Release sram: {}, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(instruction->get_opcode()), - _used_sram_size, instruction->get_free_sram_size(), + assert(!_dma.get_tag_finish(instruction->subgraph_id, key)); + _dma.set_tag_finish(instruction->subgraph_id, key); + spdlog::trace("[{}][Core {}] {} ASYNC FINISHED, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(instruction->get_opcode()), instruction->subgraph_id, instruction->get_addr_name(), fmt::format("[{}]", fmt::join(instruction->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", "))); - for (auto & wait_inst : _tma.get_tag_waiter(instruction->subgraph_id, key)) { - _tma.mark_tag_used(instruction->subgraph_id, key); + for (auto & wait_inst : _dma.get_tag_waiter(instruction->subgraph_id, key)) { + _dma.mark_tag_used(instruction->subgraph_id, key); finish_instruction(wait_inst); } } _dma_finished_queue.erase(_dma_finished_queue.begin()); } - if (_tma.is_finished()) { + if (_dma.is_finished()) { /* Finish instruction when it is DMA store */ - if (_tma.get_current_inst() != nullptr) { - std::shared_ptr finished_inst = std::move(_tma.get_current_inst()); + if (_dma.get_current_inst() != nullptr) { + std::shared_ptr finished_inst = std::move(_dma.get_current_inst()); if (finished_inst->is_dma_write()) { /* Only DMA write operation is finished! 
*/ finish_instruction(finished_inst); } else if (finished_inst->is_dma_read() && finished_inst->is_async_dma()) { /* Register tag table for async dma load */ - _tma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); + _dma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); finish_instruction(finished_inst); } else if(!finished_inst->is_dma_read()) { - spdlog::error("[Core {}][{}] TMA instruction in not valid", _id, _core_cycle); + spdlog::error("[{}][Core {}] DMA instruction in not valid", _core_cycle, _id); exit(EXIT_FAILURE); } else if (finished_inst->get_opcode() == Opcode::BAR) { - spdlog::trace("[Core {}][{}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(), fmt::format("[{}]", fmt::join(finished_inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(finished_inst->get_tag_idx_list(), ", ")), @@ -170,27 +162,27 @@ void Core::dma_cycle() { /* Issue new DMA operation */ if (!_ld_inst_queue.empty()) { std::shared_ptr inst = _ld_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _ld_inst_queue.pop(); } else if (!_st_inst_queue.empty()) { std::shared_ptr inst = _st_inst_queue.front(); - _tma.issue_tile(inst); + _dma.issue_tile(inst); _st_inst_queue.pop(); } else { - /* TMA is idle */ - _stat_tma_idle_cycle++; + /* DMA is idle */ + _stat_dma_idle_cycle++; return; } } /* Generate memfetch */ - auto access_vec = _tma.get_memory_access(); + auto access_vec = _dma.get_memory_access(_core_cycle); for (auto access : *access_vec) { access->set_start_cycle(_core_cycle); _request_queue.push(access); } - /* Increase tma stat cycle */ - _stat_tma_cycle++; + /* Increase dma stat cycle */ + _stat_dma_cycle++; } void Core::cycle() { @@ -218,20 +210,20 @@ void Core::cycle() 
{ /* Check another MOVIN with same tag is issued */ auto& key = inst->get_tag_id(); if (inst->is_sparse_inst()) { - _tma.register_tag(inst->subgraph_id, key); - _tma.set_tag_sparse(inst->subgraph_id, key); + _dma.register_tag(inst->subgraph_id, key); + _dma.set_tag_sparse(inst->subgraph_id, key); finish_instruction(inst); issued = true; _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; - } else if (inst->is_async_dma() && _tma.tag_key_exist(inst->subgraph_id, key)) { - bool finished = _tma.get_tag_finish(inst->subgraph_id, key); + } else if (inst->is_async_dma() && _dma.tag_key_exist(inst->subgraph_id, key)) { + bool finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished) finish_instruction(inst); else - _tma.register_tag_waiter(inst->subgraph_id, key, inst); - spdlog::trace("[Core {}][{}] {} SKIPPED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + _dma.register_tag_waiter(inst->subgraph_id, key, inst); + spdlog::trace("[{}][Core {}] {} SKIPPED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -240,8 +232,8 @@ void Core::cycle() { _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; } else { - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), + spdlog::trace("[{}][Core {}] {} ISSUED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", 
fmt::join(inst->get_tag_idx_list(), ", ")), @@ -252,8 +244,8 @@ void Core::cycle() { } } case Opcode::MOVOUT: - spdlog::trace("[Core {}][{}] {} ISSUED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}] {} ISSUED", _core_cycle, _id, + opcode_to_string(inst->get_opcode())); _st_inst_queue.push(inst); issued = true; break; @@ -275,7 +267,7 @@ void Core::cycle() { _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; instructions.erase(it); } else { - spdlog::trace("[Core {}][SA {}][{}] {}-{} ISSUED, finsh at {}", _id, _systolic_array_rr, _core_cycle, + spdlog::trace("[{}][Core {}][SA {}] {}-{} ISSUED, finsh at {}", _core_cycle, _id, _systolic_array_rr, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -288,7 +280,7 @@ void Core::cycle() { case Opcode::BAR: { auto& key = inst->get_tag_id(); - uint32_t finished = _tma.get_tag_finish(inst->subgraph_id, key); + uint32_t finished = _dma.get_tag_finish(inst->subgraph_id, key); if (finished == -1) { for (auto child_inst : inst->get_child_inst()) { if (child_inst->get_opcode() == Opcode::COMP && child_inst->get_compute_type() == MATMUL) { @@ -297,12 +289,12 @@ void Core::cycle() { } finish_instruction(inst); } else if (finished != 0) { - _tma.mark_tag_used(inst->subgraph_id, key); + _dma.mark_tag_used(inst->subgraph_id, key); finish_instruction(inst); } else { - _tma.register_tag_waiter(inst->subgraph_id, key, inst); + _dma.register_tag_waiter(inst->subgraph_id, key, inst); } - spdlog::trace("[Core {}][{}] {} ISSUED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _id, _core_cycle, + spdlog::trace("[{}][Core {}] {} ISSUED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), 
fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -344,31 +336,26 @@ void Core::cycle() { } void Core::finish_instruction(std::shared_ptr& inst) { - size_t free_sram_size = inst->get_free_sram_size(); if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already finished!!", _id, _core_cycle, + spdlog::error("[{}][Core {}] {} FINISHED, inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::trace("[Core {}][{}] {}-{} FINISHED, Used sram: {}, Release sram: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), - _used_sram_size, inst->get_free_sram_size()); + spdlog::trace("[{}][Core {}] {}-{} FINISHED", + _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_compute_type()); } else if (inst->get_opcode() != Opcode::BAR && inst->is_async_dma()){ - spdlog::trace("[Core {}][{}] {} ASYNC REGISTERED, Used sram: {}, Release sram: {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _id, _core_cycle, opcode_to_string(inst->get_opcode()), _used_sram_size, - inst->get_free_sram_size(), inst->subgraph_id, inst->get_addr_name(), + spdlog::trace("[{}][Core {}] {} ASYNC REGISTERED, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->subgraph_id, inst->get_addr_name(), inst->get_tag_id(), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); } else if ((inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) && !inst->is_async_dma()) { - spdlog::trace("[Core {}][{}] {} FINISHED, free_sram_size: {} addr_name: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size(), - 
inst->get_addr_name()); + spdlog::trace("[{}][Core {}] {} FINISHED, addr_name: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), inst->get_addr_name()); } - //_used_sram_size -= free_sram_size; } bool Core::running() { @@ -378,7 +365,7 @@ bool Core::running() { for (int i=0; i<_num_systolic_array_per_core;i++) running = running || !_sa_compute_pipeline.at(i).empty(); running = running || !_dma_waiting_queue.empty() || !_dma_finished_queue.empty(); - running = running || !_tma.empty(); + running = running || !_dma.empty(); running = running || !_ld_inst_queue.empty(); running = running || !_st_inst_queue.empty(); return running; @@ -432,10 +419,10 @@ void Core::print_stats() { spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), _stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i)); float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq / (_core_cycle * 1000); // B/cycle - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tot_tma_cycle, _stat_tot_tma_idle_cycle, dram_bw, _stat_tot_mem_response); - spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, + spdlog::info("Core [{}] : DMA active cycle {} DMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response); + spdlog::info("Core [{}] : Vector Unit utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, static_cast(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Numa hit count : {}, Numa miss count : {}", _id, _stat_numa_hit, _stat_numa_miss); + spdlog::info("Core [{}] : NUMA local access count : {}, NUMA remote access count : {}", _id, _stat_numa_hit, _stat_numa_miss); spdlog::info("Core [{}] : Total cycle {}", _id, 
_core_cycle); } @@ -452,7 +439,7 @@ void Core::print_current_stats() { for (int i=0; i<_num_systolic_array_per_core; i++) spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i), _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i)); - spdlog::info("Core [{}] : TMA active cycle {} TMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tma_cycle, _stat_tma_idle_cycle, dram_bw, _stat_mem_response); + spdlog::info("Core [{}] : DMA active cycle {} DMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response); spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, static_cast(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle); spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle); @@ -468,13 +455,13 @@ void Core::update_stats() { } _stat_tot_vu_compute_cycle += _stat_vu_compute_cycle; - _stat_tot_tma_cycle += _stat_tma_cycle; - _stat_tot_tma_idle_cycle += _stat_tma_idle_cycle; + _stat_tot_dma_cycle += _stat_dma_cycle; + _stat_tot_dma_idle_cycle += _stat_dma_idle_cycle; _stat_tot_mem_response += +_stat_mem_response; _stat_vu_compute_cycle = 0; - _stat_tma_cycle = 0; - _stat_tma_idle_cycle = 0; + _stat_dma_cycle = 0; + _stat_dma_idle_cycle = 0; _stat_vu_compute_idle_cycle = 0; _stat_mem_response = 0; } \ No newline at end of file diff --git a/PyTorchSimBackend/src/TMA.cc b/PyTorchSimBackend/src/DMA.cc similarity index 68% rename from PyTorchSimBackend/src/TMA.cc rename to PyTorchSimBackend/src/DMA.cc index 7744b0f5..7c8eb656 100644 --- a/PyTorchSimBackend/src/TMA.cc +++ b/PyTorchSimBackend/src/DMA.cc @@ -1,24 +1,24 @@ -#include "TMA.h" +#include "DMA.h" #include "TileGraph.h" -TMA::TMA(uint32_t id, uint32_t dram_req_size) { +DMA::DMA(uint32_t id, uint32_t dram_req_size) { _id = id; _dram_req_size = dram_req_size; 
_current_inst = nullptr; _finished = true; } -void TMA::issue_tile(std::shared_ptr inst) { +void DMA::issue_tile(std::shared_ptr inst) { _current_inst = std::move(inst); std::vector& tile_size = _current_inst->get_tile_size(); if (tile_size.size() <= 0 || tile_size.size() > get_max_dim()) { - spdlog::error("[TMA {}] issued tile is not supported format..", _id); + spdlog::error("[DMA {}] issued tile is not supported format..", _id); exit(EXIT_FAILURE); } _finished = false; } -std::shared_ptr> TMA::get_memory_access() { +std::shared_ptr> DMA::get_memory_access(cycle_type core_cycle) { auto addr_set = _current_inst->get_dram_address(_dram_req_size); auto access_vec = std::make_shared>(); Tile* owner = (Tile*)_current_inst->get_owner(); @@ -26,9 +26,9 @@ std::shared_ptr> TMA::get_memory_access() { unsigned long long base_daddr = _current_inst->get_base_dram_address(); // Todo. We use a ternsor level buffer allocation, so we don't need to check all memfetch bool is_cacheable = owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); - spdlog::trace("[SRAM Trace] Core-{}, Address: 0x{:016x}, Is_cacheable: {}", _id, base_daddr, is_cacheable); - spdlog::trace("[NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", - _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write()); + spdlog::trace("[{}][SRAM Trace] Core-{}, Address: 0x{:016x}, Is_cacheable: {}", core_cycle, _id, base_daddr, is_cacheable); + spdlog::trace("[{}][NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", + core_cycle, _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write()); for (auto addr: *addr_set) { mem_access_type acc_type = _current_inst->is_dma_write() ? 
mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R; @@ -42,7 +42,7 @@ std::shared_ptr> TMA::get_memory_access() { return access_vec; } -uint32_t TMA::generate_mem_access_id() { +uint32_t DMA::generate_mem_access_id() { static uint32_t id_counter{0}; return id_counter++; } \ No newline at end of file diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc index ab074bda..ca922fe4 100644 --- a/PyTorchSimBackend/src/Dram.cc +++ b/PyTorchSimBackend/src/Dram.cc @@ -20,7 +20,7 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _n_ch_per_partition = _n_ch / _n_partitions; _config = config; - spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}", config.max_dram_bandwidth(), config.dram_freq, _n_ch, _req_size); + spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq, _n_ch, _req_size); /* Initialize DRAM Channels */ for (int ch = 0; ch < _n_ch; ch++) { m_to_crossbar_queue.push_back(std::queue()); diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc index 6bc80286..1c1b379b 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/PyTorchSimBackend/src/Simulator.cc @@ -23,8 +23,8 @@ Simulator::Simulator(SimulationConfig config) _cores.resize(_n_cores); for (int core_index = 0; core_index < _n_cores; core_index++) { if (config.core_type[core_index] == CoreType::WS_MESH) { - spdlog::info("[Config/Core] Core {}: {} MHz, Spad size: {} KB, Systolic array per core: {}", - core_index, config.core_freq , config.sram_size, config.num_systolic_array_per_core); + spdlog::info("[Config/Core] Core {}: {} MHz, Systolic array per core: {}", + core_index, config.core_freq, config.num_systolic_array_per_core); _cores.at(core_index) = std::make_unique(core_index, _config); } else if(config.core_type[core_index] == CoreType::STONNE) { spdlog::info("[Config/Core] Core {}: {} MHz, Stonne 
Core selected", core_index, config.core_freq); diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc index 64d3da55..49aff8ed 100644 --- a/PyTorchSimBackend/src/SparseCore.cc +++ b/PyTorchSimBackend/src/SparseCore.cc @@ -100,8 +100,8 @@ void SparseCore::checkStatus(uint32_t subcore_id) { int new_status = stonneCore->getMCFSMStats(); int compute_cycle = stonneCore->getMSStats().n_multiplications; if (traceCoreStatus.at(subcore_id) != new_status) { - spdlog::trace("Stonne Core [{}][{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}", - _id, _core_cycle, traceCoreStatus.at(subcore_id), new_status, + spdlog::trace("[{}]Stonne Core [{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}", + _core_cycle, _id, traceCoreStatus.at(subcore_id), new_status, traceLoadTraffic.at(subcore_id).size(), traceStoreTraffic.at(subcore_id).size(), (compute_cycle - traceCoreCycle.at(subcore_id))/num_ms); if (traceLoadTraffic.at(subcore_id).size()) { TraceNode load_node = TraceNode(traceNodeList.at(subcore_id).size()+2, "load", TraceNode::StonneTraceLoad); @@ -158,7 +158,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { req->stonneId = subcore_id; std::tuple key = std::make_tuple(target_addr, acc_type, type, allocTrafficID()); registerMemfetch(key, [this, req, acc_type, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + spdlog::trace("[{}][SparseCore] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ _core_cycle, _core_cycle - req->request_time, req->getAddress(), int(req->getcmd()), _config.dram_req_size); req->setReply(); stonneCores.at(req->stonneId)->pushResponse(req); @@ -168,7 +168,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { /* Finish stonne core */ if (coreBusy.at(subcore_id) && stonneCore->isFinished()) { stonneCore->finish(); - spdlog::info("[SparseCore][{}] Operation finished at {}", _id, 
_core_cycle); + spdlog::info("[{}][SparseCore] Operation finished at {}", _core_cycle, _id); std::shared_ptr target_tile = percore_tiles.at(subcore_id).front(); SST_STONNE::StonneOpDesc *opDesc = static_cast(target_tile->get_custom_data()); if (opDesc->trace_path != "") @@ -239,7 +239,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_R; auto type = mf_type::READ_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}][{}] {} ISSUED", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -247,7 +247,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + spdlog::trace("[{}][SparseCore] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); @@ -260,7 +260,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_W; auto type = mf_type::WRITE_REQUEST; - spdlog::trace("[StonneCore {}][{}][{}] {} ISSUED", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][StonneCore {}][{}] {} ISSUED", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -268,7 +268,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, 
inst, addr, current_time, type]() { - spdlog::trace("[SparseCore][{}] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + spdlog::trace("[{}][SparseCore] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); @@ -285,7 +285,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { inst->finish_cycle = _core_cycle + inst->get_compute_cycle(); else inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle(); - spdlog::trace("[Core {}][{}][{}] {} ISSUED, finsh at {}", _id, subcore_id, _core_cycle, + spdlog::trace("[{}][Core {}][{}] {} ISSUED, finsh at {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode()), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -313,7 +313,7 @@ void SparseCore::cycle() { for (auto& req_pair : request_merge_table) { _request_queue.push(req_pair.second); request_merge_table.erase(req_pair.first); - spdlog::debug("[SparseCore][{}][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ + spdlog::debug("[{}][SparseCore][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ _core_cycle, _id, req_pair.second->get_addr(), int(req_pair.second->get_access_type()), int(req_pair.second->get_type()), _config.dram_req_size, nr_request); nr_request++; @@ -406,11 +406,11 @@ void SparseCore::finish_instruction(std::shared_ptr& inst) { inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::info("[StonneCore {}][{}] {} FINISHED", - _id, _core_cycle, opcode_to_string(inst->get_opcode())); + spdlog::info("[{}][StonneCore {}] {} FINISHED", + _core_cycle, _id, opcode_to_string(inst->get_opcode())); } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == 
Opcode::MOVOUT) { - spdlog::info("[StonneCore {}][{}] {} FINISHED, free_sram_size: {}", _id, _core_cycle, - opcode_to_string(inst->get_opcode()), inst->get_free_sram_size()); + spdlog::info("[{}][StonneCore {}] {} FINISHED", _core_cycle, _id, + opcode_to_string(inst->get_opcode())); } } diff --git a/PyTorchSimBackend/src/TileGraph.cc b/PyTorchSimBackend/src/TileGraph.cc index 33e995e9..120d49e2 100644 --- a/PyTorchSimBackend/src/TileGraph.cc +++ b/PyTorchSimBackend/src/TileGraph.cc @@ -111,7 +111,6 @@ void TileGraph::allocate_subgraph(int core_id, int slot_id) { for (auto it = _subgraph_vec.begin(); it != _subgraph_vec.end(); ++it) { if ((*it)->get_core_id() == -1 || (*it)->get_core_id() == core_id) { - spdlog::trace("[TileGraph] Core {} allocated new subgraph(affinity={}) (remains: {})", core_id, (*it)->get_core_id(), _subgraph_vec.size()-1); std::shared_ptr subgraph = *it; _cpu_graph_map[core_id][slot_id] = subgraph; _subgraph_vec.erase(it); diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc index 4a562724..42776a51 100644 --- a/PyTorchSimBackend/src/TileGraphParser.cc +++ b/PyTorchSimBackend/src/TileGraphParser.cc @@ -627,9 +627,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa } } } - /* Set last instruction's free sram size */ - if(parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); parent->append_child(child); /* Create new tile */ @@ -682,11 +679,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa tile_vec.back()->inc_required_sram_size(inst->get_tile_numel() * inst->get_precision()); } - /* Set last instruction's free sram size */ - std::shared_ptr parent = tile_vec.back(); - if (parent->get_instructions().size()) - parent->get_instructions().back()->set_free_sram_size(parent->get_required_sram_size()); - return tile_vec; } diff --git a/PyTorchSimBackend/src/main.cc b/PyTorchSimBackend/src/main.cc index 
214e7131..5c4a21e9 100644 --- a/PyTorchSimBackend/src/main.cc +++ b/PyTorchSimBackend/src/main.cc @@ -149,6 +149,6 @@ int main(int argc, char** argv) { /* Simulation time measurement */ auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; - spdlog::info("Simulation time: {:2f} seconds", duration.count()); + spdlog::info("Simulation wall clock time: {:2f} seconds", duration.count()); return 0; } diff --git a/README.md b/README.md index 0407fd0d..41bd01a1 100644 --- a/README.md +++ b/README.md @@ -156,9 +156,9 @@ Log contains memory & core stats. [info] ========= Core stat ========= [info] Core [0] : Systolic array [0] Utilization(%) 0.00, active cycle 0, idle cycle 1014 [info] Core [0] : Systolic array [1] Utilization(%) 12.62, active cycle 128, idle cycle 886 -[info] Core [0] : TMA active cycle 3 TMA idle cycle 1011 DRAM BW 182.000 GB/s (6144) +[info] Core [0] : DMA active cycle 3 DMA idle cycle 1011 DRAM BW 182.000 GB/s (6144) [info] Core [0] : Vector Unit Utilization(%) 4.34, active cycle 44, idle_cycle 0 -[info] Core [0] : Numa hit count : 0, Numa miss count : 0 +[info] Core [0] : NUMA local access count : 0, NUMA remote access count : 0 [info] Core [0] : Total cycle 1014 [info] Total execution cycle: 1014 [info] Simulation time: 0.039296 seconds From 764f1235d9e2d8eca19f6b449d42a358b3192eb3 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Sat, 29 Nov 2025 12:59:21 +0000 Subject: [PATCH 28/53] [refactor] device name --- Scheduler/scheduler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 10358321..18e44de4 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -171,7 +171,7 @@ def setup_device(): import torch.utils.cpp_extension module = torch.utils.cpp_extension.load( - name="extension_device", + name="npu", sources=[ str(source_file), ], @@ -179,7 +179,7 @@ def setup_device(): verbose=True, ) - 
torch.utils.rename_privateuse1_backend("extension_device") + torch.utils.rename_privateuse1_backend("npu") from torch._inductor.codegen.common import ( get_scheduling_for_device, get_wrapper_codegen_for_device, @@ -192,13 +192,13 @@ def setup_device(): MLIRScheduling ) register_backend_for_device( - "extension_device", MLIRScheduling, ExtensionWrapperCodegen + "npu", MLIRScheduling, ExtensionWrapperCodegen ) assert( - get_scheduling_for_device("extension_device") == MLIRScheduling + get_scheduling_for_device("npu") == MLIRScheduling ) assert( - get_wrapper_codegen_for_device("extension_device") + get_wrapper_codegen_for_device("npu") == ExtensionWrapperCodegen ) return module From 7ded98c3b4990625a9455fd788083e8da969d8e1 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Sun, 30 Nov 2025 02:00:28 +0000 Subject: [PATCH 29/53] [Refactor] Add missing core_freq -> core_freq_mhz --- PyTorchSimBackend/include/SimulationConfig.h | 2 +- PyTorchSimBackend/src/Common.cc | 2 +- PyTorchSimBackend/src/Core.cc | 4 ++-- PyTorchSimBackend/src/Simulator.cc | 6 +++--- PyTorchSimBackend/src/SparseCore.cc | 8 ++++---- README.md | 2 +- Simulator/simulator.py | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h index 6ff93506..a9825516 100644 --- a/PyTorchSimBackend/include/SimulationConfig.h +++ b/PyTorchSimBackend/include/SimulationConfig.h @@ -18,7 +18,7 @@ struct SimulationConfig { std::vector core_type; std::string stonne_config_path; uint32_t num_cores; - uint32_t core_freq; + uint32_t core_freq_mhz; uint32_t core_print_interval = 0; uint32_t num_systolic_array_per_core = 1; uint32_t num_stonne_per_core = 1; diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc index fd75a3e5..c8ea9abc 100644 --- a/PyTorchSimBackend/src/Common.cc +++ b/PyTorchSimBackend/src/Common.cc @@ -39,7 +39,7 @@ SimulationConfig initialize_config(json config) { for (int i=0; 
i(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle); @@ -430,7 +430,7 @@ void Core::print_current_stats() { std::vector sa_utilization; for (int i=0; i<_num_systolic_array_per_core; i++) sa_utilization.push_back(static_cast(_stat_sa_compute_cycle.at(i) * 100) / _config.core_print_interval); - float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq / (_config.core_print_interval * 1000); // B/cycle + float dram_bw = _config.dram_req_size * _stat_mem_response * _config.core_freq_mhz / (_config.core_print_interval * 1000); // B/cycle auto level = spdlog::level::info; if(_id != 0) level = spdlog::level::debug; diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc index 1c1b379b..b09e0388 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/PyTorchSimBackend/src/Simulator.cc @@ -3,7 +3,7 @@ Simulator::Simulator(SimulationConfig config) : _config(config), _core_cycles(0) { // Create dram object - _core_period = 1000000 / (config.core_freq); + _core_period = 1000000 / (config.core_freq_mhz); _icnt_period = 1000000 / (config.icnt_freq); _dram_period = 1000000 / (config.dram_freq); _core_time = 0; @@ -24,10 +24,10 @@ Simulator::Simulator(SimulationConfig config) for (int core_index = 0; core_index < _n_cores; core_index++) { if (config.core_type[core_index] == CoreType::WS_MESH) { spdlog::info("[Config/Core] Core {}: {} MHz, Systolic array per core: {}", - core_index, config.core_freq, config.num_systolic_array_per_core); + core_index, config.core_freq_mhz, config.num_systolic_array_per_core); _cores.at(core_index) = std::make_unique(core_index, _config); } else if(config.core_type[core_index] == CoreType::STONNE) { - spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq); + spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq_mhz); _cores.at(core_index) = 
std::make_unique(core_index, _config); } else { throw std::runtime_error(fmt::format("Not implemented Core type {} ", diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc index 49aff8ed..490a3feb 100644 --- a/PyTorchSimBackend/src/SparseCore.cc +++ b/PyTorchSimBackend/src/SparseCore.cc @@ -27,14 +27,14 @@ SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) } Config stonneConfig = stonneCores.at(0)->getStonneConfig(); - unsigned int core_freq = config.core_freq; // MHz; + unsigned int core_freq_mhz = config.core_freq_mhz; // MHz; num_ms = stonneConfig.m_MSNetworkCfg.ms_size; r_port_nr = config.num_stonne_port; w_port_nr = config.num_stonne_port; - double compute_throughput = static_cast(num_ms) * core_freq / 1e3; // FLOPs/sec - double dn_bandwidth = static_cast(r_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s - double rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq * 1e6 / 8.0 / 1e9; // GB/s + double compute_throughput = static_cast(num_ms) * core_freq_mhz / 1e3; // FLOPs/sec + double dn_bandwidth = static_cast(r_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s + double rn_bandwidth = static_cast(w_port_nr) * config.dram_req_size * core_freq_mhz * 1e6 / 8.0 / 1e9; // GB/s for (int i=0; i Date: Sun, 30 Nov 2025 02:08:48 +0000 Subject: [PATCH 30/53] Refactor] Add missing dram_freq -> dram_freq_mhz --- PyTorchSimBackend/include/SimulationConfig.h | 4 ++-- PyTorchSimBackend/src/Common.cc | 2 +- PyTorchSimBackend/src/Dram.cc | 2 +- PyTorchSimBackend/src/Simulator.cc | 2 +- README.md | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/PyTorchSimBackend/include/SimulationConfig.h index a9825516..573f90ba 100644 --- a/PyTorchSimBackend/include/SimulationConfig.h +++ b/PyTorchSimBackend/include/SimulationConfig.h @@ -27,7 +27,7 @@ struct SimulationConfig { /* DRAM config */ 
DramType dram_type; uint32_t dram_num_partitions = 1; - uint32_t dram_freq; + uint32_t dram_freq_mhz; uint32_t dram_channels; uint32_t dram_req_size; uint32_t dram_latency; @@ -63,6 +63,6 @@ struct SimulationConfig { } float max_dram_bandwidth() { - return dram_freq * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s + return dram_freq_mhz * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s } }; \ No newline at end of file diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc index c8ea9abc..c72dbe44 100644 --- a/PyTorchSimBackend/src/Common.cc +++ b/PyTorchSimBackend/src/Common.cc @@ -62,7 +62,7 @@ SimulationConfig initialize_config(json config) { else throw std::runtime_error(fmt::format("Not implemented dram type {} ", (std::string)config["dram_type"])); - parsed_config.dram_freq = config["dram_freq_mhz"]; + parsed_config.dram_freq_mhz = config["dram_freq_mhz"]; if (config.contains("dram_latency")) parsed_config.dram_latency = config["dram_latency"]; if (config.contains("ramulator_config_path")) diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc index ca922fe4..2664ee44 100644 --- a/PyTorchSimBackend/src/Dram.cc +++ b/PyTorchSimBackend/src/Dram.cc @@ -20,7 +20,7 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _n_ch_per_partition = _n_ch / _n_partitions; _config = config; - spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq, _n_ch, _req_size); + spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size); /* Initialize DRAM Channels */ for (int ch = 0; ch < _n_ch; ch++) { m_to_crossbar_queue.push_back(std::queue()); diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc index b09e0388..6cb3da76 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ 
b/PyTorchSimBackend/src/Simulator.cc @@ -5,7 +5,7 @@ Simulator::Simulator(SimulationConfig config) // Create dram object _core_period = 1000000 / (config.core_freq_mhz); _icnt_period = 1000000 / (config.icnt_freq); - _dram_period = 1000000 / (config.dram_freq); + _dram_period = 1000000 / (config.dram_freq_mhz); _core_time = 0; _dram_time = 0; _icnt_time = 0; diff --git a/README.md b/README.md index 0939dd11..9cbe6168 100644 --- a/README.md +++ b/README.md @@ -351,7 +351,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing "num_systolic_array_per_core" : 2, // Number of systolic array per core "dram_type" : "ramulator2", // DRAM type (ex. ramulator2, simple) - "dram_freq" : 940, // DRAM frequency (MHz) + "dram_freq_mhz" : 940, // DRAM frequency (MHz) "dram_channels": 32, // Number of DRAM channels "dram_req_size": 32, // DRAM request size (B) "dram_latency" : 10, // DRAM latency (cycle) From b9eaeff390b0488972f38589b3131ac9a5f0cac0 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Sun, 30 Nov 2025 02:13:37 +0000 Subject: [PATCH 31/53] [Refactor] Rename *_log_print_interval_cycle -> *_stats_print_period_cycles --- .../configs/heterogeneous_c2_simple_noc.json | 4 +-- .../configs/stonne_big_c1_simple_noc.json | 4 +-- .../configs/stonne_single_c1_simple_noc.json | 4 +-- .../stonne_validation_c1_simple_noc.json | 4 +-- .../systolic_ws_128x128_c1_booksim_tpuv2.json | 4 +-- ...stolic_ws_128x128_c1_simple_noc_tpuv2.json | 4 +-- ...stolic_ws_128x128_c1_simple_noc_tpuv3.json | 4 +-- ...c_ws_128x128_c1_simple_noc_tpuv3_half.json | 4 +-- ...stolic_ws_128x128_c1_simple_noc_tpuv4.json | 4 +-- .../systolic_ws_128x128_c2_booksim_tpuv3.json | 2 +- ...s_128x128_c2_booksim_tpuv3_bw_quarter.json | 31 +++++++++++++++++++ .../systolic_ws_128x128_c2_chiplet_tpuv3.json | 13 ++------ ...lic_ws_128x128_c2_chiplet_tpuv3_xnuma.json | 4 +-- ...stolic_ws_128x128_c2_simple_noc_tpuv2.json | 4 +-- ...stolic_ws_128x128_c2_simple_noc_tpuv3.json | 4 +-- 
...128x128_c2_simple_noc_tpuv3_partition.json | 4 +-- ...stolic_ws_128x128_c2_simple_noc_tpuv4.json | 4 +-- .../systolic_ws_8x8_c1_12G_simple_noc.json | 4 +-- .../systolic_ws_8x8_c1_24G_simple_noc.json | 4 +-- .../systolic_ws_8x8_c1_48G_simple_noc.json | 4 +-- .../configs/systolic_ws_8x8_c1_booksim.json | 4 +-- .../systolic_ws_8x8_c1_simple_noc.json | 4 +-- .../systolic_ws_8x8_c2_12G_simple_noc.json | 4 +-- .../systolic_ws_8x8_c2_24G_simple_noc.json | 4 +-- .../systolic_ws_8x8_c2_48G_simple_noc.json | 4 +-- PyTorchSimBackend/include/SimulationConfig.h | 2 +- PyTorchSimBackend/src/Common.cc | 8 ++--- PyTorchSimBackend/src/Simulator.cc | 2 +- 28 files changed, 85 insertions(+), 61 deletions(-) create mode 100644 PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json index b3254182..ea5a8f09 100644 --- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json +++ b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json @@ -3,7 +3,7 @@ "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 2, "core_freq_mhz" : 940, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_stonne_per_core" : 8, "num_stonne_port" : 64, "num_systolic_array_per_core" : 2, @@ -13,7 +13,7 @@ "dram_channels": 16, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json index f94aeffa..f14087c9 100644 --- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json +++ 
b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json @@ -3,7 +3,7 @@ "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 1, "core_freq_mhz" : 940, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_stonne_per_core" : 8, "num_stonne_port" : 64, @@ -12,7 +12,7 @@ "dram_channels": 8, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json index f777c8d3..55d64800 100644 --- a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json @@ -3,7 +3,7 @@ "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 1, "core_freq_mhz" : 700, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_stonne_per_core" : 1, "num_stonne_port" : 8, @@ -12,7 +12,7 @@ "dram_channels": 8, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json index 68d83d01..e1d98c36 100644 --- a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json @@ -3,7 +3,7 @@ "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" :
1, "core_freq_mhz" : 1000, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_stonne_per_core" : 1, "num_stonne_port" : 32, @@ -12,7 +12,7 @@ "dram_channels": 1, "dram_req_size_byte": 32, "dram_latency" : 100, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "l2d_type" : "datacache", "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json index c45ea65e..2a7c5ed1 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json @@ -1,7 +1,7 @@ { "num_cores" : 1, "core_freq_mhz" : 700, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "dram_type" : "ramulator2", "dram_freq_mhz" :700, @@ -10,7 +10,7 @@ "dram_size" : 16, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "icnt_type" : "booksim2", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json index 082a9010..9c237ff0 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json @@ -1,7 +1,7 @@ { "num_cores" : 1, "core_freq_mhz" : 700, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "dram_type" : "ramulator2", "dram_freq_mhz" : 700, @@ -9,7 +9,7 @@ "dram_req_size_byte": 32, "dram_size" : 16, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "icnt_type" : "simple",
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json index b7cf129d..398f5300 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json @@ -1,7 +1,7 @@ { "num_cores" : 1, "core_freq_mhz" : 940, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", @@ -9,7 +9,7 @@ "dram_channels": 16, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json index 7e53a9e4..e680bc90 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json @@ -1,7 +1,7 @@ { "num_cores" : 1, "core_freq_mhz" : 940, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", @@ -9,7 +9,7 @@ "dram_channels": 8, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json index 47d90bf2..85e3761d 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json +++ 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json @@ -1,7 +1,7 @@ { "num_cores" : 1, "core_freq_mhz" : 1050, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 4, "dram_type" : "ramulator2", @@ -9,7 +9,7 @@ "dram_channels": 16, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "l2d_type" : "datacache", "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json index 76f48f67..2988b9b4 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 940, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json new file mode 100644 index 00000000..2ca7dc6c --- /dev/null +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json @@ -0,0 +1,31 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 940, + "sram_size" : 65536, + "core_print_interval" : 10000, + "num_systolic_array_per_core" : 2, + + "dram_type" : "ramulator2", + "dram_freq" : 940, + "dram_channels": 8, + "dram_req_size": 32, + "dram_latency" : 10, + "dram_size" : 32, + "dram_nbl" : 2, + "dram_print_interval": 10000, + "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_latency" : 1, + "icnt_freq" : 2000, + "icnt_node_per_core" : 16, + 
"icnt_config_path" : "../configs/booksim2_configs/fly_c32_m8.icnt", + + "precision" : 4, + "scheduler" : "simple", + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":0 + } +} \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json index 9375fea1..36991226 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 940, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", @@ -9,7 +9,7 @@ "dram_channels": 32, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "dram_num_partitions" : 2, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", @@ -18,12 +18,5 @@ "icnt_freq_mhz" : 1000, "icnt_node_per_core" : 16, "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - "icnt_print_interval" : 10000, - - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } + "icnt_stats_print_period_cycles" : 10000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json index 662c1e00..2790498e 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 940, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", @@ -9,7 +9,7 @@ "dram_channels": 32, "dram_req_size_byte": 32, 
"dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "dram_num_partitions" : 1, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json index 712cb32f..77fb8b16 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 700, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "dram_type" : "ramulator2", "dram_freq_mhz" :700, @@ -9,7 +9,7 @@ "dram_req_size_byte": 32, "dram_size" : 16, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json index 4865b1ad..578da715 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 940, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", @@ -9,7 +9,7 @@ "dram_channels": 32, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json index 5aad699e..a320984f 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 940, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", @@ -9,7 +9,7 @@ "dram_channels": 32, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json index eb7d76e8..6d7737bd 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 1050, - "core_log_print_interval_cycle" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 4, "dram_type" : "ramulator2", @@ -9,7 +9,7 @@ "dram_channels": 32, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_log_print_interval_cycle": 10000, + "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "l2d_type" : "datacache", "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json index 9afe18d1..efdeaf06 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json @@ -1,7 +1,7 @@ {
"num_cores" : 1, "core_freq_mhz" : 1000, - "core_log_print_interval_cycle" : 100000, + "core_stats_print_period_cycles" : 100000, "dram_type" : "ramulator2", "dram_freq_mhz" :800, @@ -9,7 +9,7 @@ "dram_req_size_byte": 64, "dram_size" : 16, "dram_num_burst_length" : 4, - "dram_log_print_interval_cycle": 100000, + "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json index 3f1435e8..a9bd7d99 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json @@ -1,7 +1,7 @@ { "num_cores" : 1, "core_freq_mhz" : 1000, - "core_log_print_interval_cycle" : 100000, + "core_stats_print_period_cycles" : 100000, "dram_type" : "ramulator2", "dram_freq_mhz" :800, @@ -9,7 +9,7 @@ "dram_req_size_byte": 64, "dram_size" : 16, "dram_num_burst_length" : 4, - "dram_log_print_interval_cycle": 100000, + "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json index 7037a045..ae76358a 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json @@ -1,7 +1,7 @@ { "num_cores" : 1, "core_freq_mhz" : 1000, - "core_log_print_interval_cycle" : 100000, + "core_stats_print_period_cycles" : 100000, "dram_type" : "ramulator2", "dram_freq_mhz" :800, @@ -9,7 +9,7 @@ "dram_req_size_byte": 64, "dram_size" : 16, "dram_num_burst_length" : 4, - "dram_log_print_interval_cycle": 100000, + "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml",
"icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json index 4fab1fd7..61be8c57 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json @@ -1,7 +1,7 @@ { "num_cores" : 1, "core_freq_mhz" : 1000, - "core_log_print_interval_cycle" : 100000, + "core_stats_print_period_cycles" : 100000, "dram_type" : "ramulator2", "dram_freq_mhz" :800, @@ -9,7 +9,7 @@ "dram_req_size_byte": 64, "dram_size" : 16, "dram_num_burst_length" : 4, - "dram_log_print_interval_cycle": 100000, + "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "booksim2", diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json index 29955051..5c4824f7 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json @@ -1,7 +1,7 @@ { "num_cores" : 1, "core_freq_mhz" : 1000, - "core_log_print_interval_cycle" : 100000, + "core_stats_print_period_cycles" : 100000, "dram_type" : "ramulator2", "dram_freq_mhz" :800, @@ -10,7 +10,7 @@ "dram_size" : 16, "dram_num_burst_length" : 4, - "dram_log_print_interval_cycle": 100000, + "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json index 3e6844fd..d29a05bf 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json @@ -2,7 +2,7 @@ "core_type" : ["ws_mesh","ws_mesh"], "num_cores" : 2, "core_freq_mhz" : 1000, - "core_log_print_interval_cycle" : 100000, +
"core_stats_print_period_cycles" : 100000, "dram_type" : "ramulator2", "dram_freq_mhz" :800, @@ -10,7 +10,7 @@ "dram_req_size_byte": 64, "dram_size" : 16, "dram_num_burst_length" : 4, - "dram_log_print_interval_cycle": 100000, + "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json index d0927648..841313f2 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 1000, - "core_log_print_interval_cycle" : 100000, + "core_stats_print_period_cycles" : 100000, "dram_type" : "ramulator2", "dram_freq_mhz" :800, @@ -9,7 +9,7 @@ "dram_req_size_byte": 64, "dram_size" : 16, "dram_num_burst_length" : 4, - "dram_log_print_interval_cycle": 100000, + "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json index 095ea820..a13eb73a 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 1000, - "core_log_print_interval_cycle" : 100000, + "core_stats_print_period_cycles" : 100000, "dram_type" : "ramulator2", "dram_freq_mhz" :800, @@ -9,7 +9,7 @@ "dram_req_size_byte": 64, "dram_size" : 16, "dram_num_burst_length" : 4, - "dram_log_print_interval_cycle": 100000, + "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", diff --git a/PyTorchSimBackend/include/SimulationConfig.h
b/PyTorchSimBackend/include/SimulationConfig.h index 573f90ba..25b482ae 100644 --- a/PyTorchSimBackend/include/SimulationConfig.h +++ b/PyTorchSimBackend/include/SimulationConfig.h @@ -46,7 +46,7 @@ struct SimulationConfig { std::string icnt_config_path; uint32_t icnt_freq; uint32_t icnt_latency; - uint32_t icnt_print_interval=0; + uint32_t icnt_stats_print_period_cycles=0; /* Sheduler config */ uint32_t num_patition=1; diff --git a/PyTorchSimBackend/src/Common.cc b/PyTorchSimBackend/src/Common.cc index c72dbe44..7e88becb 100644 --- a/PyTorchSimBackend/src/Common.cc +++ b/PyTorchSimBackend/src/Common.cc @@ -46,7 +46,7 @@ SimulationConfig initialize_config(json config) { parsed_config.num_stonne_per_core = config["num_stonne_per_core"]; if (config.contains("num_stonne_port")) parsed_config.num_stonne_port = config["num_stonne_port"]; - parsed_config.core_print_interval = get_config_value(config, "core_log_print_interval_cycle"); + parsed_config.core_print_interval = get_config_value(config, "core_stats_print_period_cycles"); /* Stonne config */ if (config.contains("stonne_config_path")) @@ -70,8 +70,8 @@ SimulationConfig initialize_config(json config) { parsed_config.dram_channels = config["dram_channels"]; if (config.contains("dram_req_size_byte")) parsed_config.dram_req_size = config["dram_req_size_byte"]; - if (config.contains("dram_log_print_interval_cycle")) - parsed_config.dram_print_interval = config["dram_log_print_interval_cycle"]; + if (config.contains("dram_stats_print_period_cycles")) + parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"]; if(config.contains("dram_num_burst_length")) parsed_config.dram_nbl = config["dram_num_burst_length"]; if (config.contains("dram_num_partitions")) @@ -109,7 +109,7 @@ SimulationConfig initialize_config(json config) { if (config.contains("booksim_config_path")) parsed_config.icnt_config_path = config["booksim_config_path"]; if (config.contains("icnt_log_print_interval_cycle")) - 
parsed_config.icnt_print_interval = config["icnt_log_print_interval_cycle"]; + parsed_config.icnt_stats_print_period_cycles = config["icnt_log_print_interval_cycle"]; if (config.contains("icnt_node_per_core")) parsed_config.icnt_node_per_core = config["icnt_node_per_core"]; diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc index 6cb3da76..c8bc95ed 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/PyTorchSimBackend/src/Simulator.cc @@ -62,7 +62,7 @@ Simulator::Simulator(SimulationConfig config) spdlog::error("[Configuration] Invalid interconnect type...!"); exit(EXIT_FAILURE); } - _icnt_interval = config.icnt_print_interval; + _icnt_interval = config.icnt_stats_print_period_cycles; // Initialize Scheduler for (int i=0; i Date: Sun, 30 Nov 2025 02:27:46 +0000 Subject: [PATCH 32/53] [Refactor] Cleanup unused config options --- .../configs/heterogeneous_c2_simple_noc.json | 1 - .../configs/stonne_big_c1_simple_noc.json | 1 - .../configs/stonne_single_c1_simple_noc.json | 1 - .../configs/stonne_validation_c1_simple_noc.json | 1 - .../systolic_ws_128x128_c1_booksim_tpuv2.json | 10 +--------- .../systolic_ws_128x128_c1_simple_noc_tpuv2.json | 10 +--------- ...ystolic_ws_128x128_c1_simple_noc_tpuv3_half.json | 8 +------- .../systolic_ws_128x128_c1_simple_noc_tpuv4.json | 8 +------- .../systolic_ws_128x128_c2_booksim_tpuv3.json | 8 +------- .../systolic_ws_128x128_c2_chiplet_tpuv3.json | 1 - .../systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json | 9 +-------- .../systolic_ws_128x128_c2_simple_noc_tpuv2.json | 10 +--------- .../systolic_ws_128x128_c2_simple_noc_tpuv3.json | 13 +++---------- ...ic_ws_128x128_c2_simple_noc_tpuv3_partition.json | 1 - .../systolic_ws_128x128_c2_simple_noc_tpuv4.json | 9 +-------- .../configs/systolic_ws_8x8_c1_12G_simple_noc.json | 4 +--- .../configs/systolic_ws_8x8_c1_24G_simple_noc.json | 3 +-- .../configs/systolic_ws_8x8_c1_48G_simple_noc.json | 3 +-- .../configs/systolic_ws_8x8_c1_booksim.json | 9 
+-------- .../configs/systolic_ws_8x8_c1_simple_noc.json | 6 +----- .../configs/systolic_ws_8x8_c2_12G_simple_noc.json | 3 +-- .../configs/systolic_ws_8x8_c2_24G_simple_noc.json | 4 +--- .../configs/systolic_ws_8x8_c2_48G_simple_noc.json | 4 +--- 23 files changed, 19 insertions(+), 108 deletions(-) diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json index ea5a8f09..6d1ff722 100644 --- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json +++ b/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json @@ -19,7 +19,6 @@ "icnt_type" : "simple", "icnt_latency" : 7, "icnt_freq_mhz" : 15000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", "num_partition" : 2, "partition": { diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json index f14087c9..1bafbfa8 100644 --- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json @@ -18,7 +18,6 @@ "icnt_type" : "simple", "icnt_latency" : 7, "icnt_freq_mhz" : 15000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", "num_partition" : 1, "partition": { diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json index 55d64800..9ebbc593 100644 --- a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json @@ -18,7 +18,6 @@ "icnt_type" : "simple", "icnt_latency" : 7, "icnt_freq_mhz" : 7000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", "num_partition" : 1, "partition": { diff --git a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json index e1d98c36..0bd9af2c 100644 --- a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json 
+++ b/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json @@ -19,7 +19,6 @@ "icnt_type" : "simple", "icnt_latency" : 7, "icnt_freq_mhz" : 7000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m8.icnt", "num_partition" : 1, "partition": { diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json index 2a7c5ed1..838aa521 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json @@ -8,20 +8,12 @@ "dram_channels": 16, "dram_req_size_byte": 32, - "dram_size" : 16, "dram_num_burst_length" : 2, "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "icnt_type" : "booksim2", - "icnt_latency" : 1, "icnt_freq_mhz" : 8000, "icnt_node_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } + "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt" } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json index 9c237ff0..e1f7f3bb 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json @@ -7,19 +7,11 @@ "dram_freq_mhz" : 700, "dram_channels": 32, "dram_req_size_byte": 32, - "dram_size" : 16, "dram_num_burst_length" : 2, "dram_stats_print_period_cycless": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq_mhz" : 10000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } + "icnt_freq_mhz" : 
10000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json index e680bc90..87d18519 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json @@ -14,11 +14,5 @@ "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq_mhz" : 15000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "num_partition" : 1, - "partition": { - "core_0": 0 - } + "icnt_freq_mhz" : 15000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json index 85e3761d..53d24db5 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json @@ -16,11 +16,5 @@ "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq_mhz" : 19200, - "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "num_partition" : 1, - "partition": { - "core_0":0 - } + "icnt_freq_mhz" : 19200 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json index 2988b9b4..1479a7fb 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json @@ -15,11 +15,5 @@ "icnt_type" : "booksim2", "icnt_freq_mhz" : 28000, "icnt_node_per_core" : 1, - "booksim_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt", - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } + "booksim_config_path" : "../configs/booksim2_configs/fly_c2_m32.icnt" } diff --git 
a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json index 36991226..6f2e37f8 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json @@ -14,7 +14,6 @@ "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "booksim2", - "icnt_latency" : 1, "icnt_freq_mhz" : 1000, "icnt_node_per_core" : 16, "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json index 2790498e..1cb3e529 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json @@ -14,14 +14,7 @@ "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "booksim2", - "icnt_latency" : 1, "icnt_freq_mhz" : 1000, "icnt_node_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt" } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json index 77fb8b16..a5ceeeba 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json @@ -7,19 +7,11 @@ "dram_freq_mhz" :700, "dram_channels": 32, "dram_req_size_byte": 32, - "dram_size" : 16, "dram_num_burst_length" : 2, "dram_stats_print_period_cycles": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "icnt_type" : 
"simple", "icnt_latency" : 7, - "icnt_freq_mhz" : 20000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } + "icnt_freq_mhz" : 20000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json index 578da715..386ee598 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, + "core_log_print_interval_cycle" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", @@ -9,17 +9,10 @@ "dram_channels": 32, "dram_req_size_byte": 32, "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, + "dram_log_print_interval_cycle": 10000, "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq_mhz" : 28000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } + "icnt_freq_mhz" : 28000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json index a320984f..5b059dc8 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json @@ -15,7 +15,6 @@ "icnt_type" : "simple", "icnt_latency" : 7, "icnt_freq_mhz" : 28000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", "num_partition" : 2, "partition": { diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json 
b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json index 6d7737bd..04adc328 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json @@ -17,12 +17,5 @@ "icnt_type" : "simple", "icnt_latency" : 7, "icnt_freq_mhz" : 48000, - "icnt_node_per_core" : 1, - "booksim_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", - - "num_partition" : 1, - "partition": { - "core_0":0, - "core_1":0 - } + "icnt_node_per_core" : 1 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json index efdeaf06..045407b7 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json @@ -7,13 +7,11 @@ "dram_freq_mhz" :800, "dram_channels": 1, "dram_req_size_byte": 64, - "dram_size" : 16, "dram_num_burst_length" : 4, "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq_mhz" : 1000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt" + "icnt_freq_mhz" : 1000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json index a9bd7d99..4c15105e 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json @@ -14,6 +14,5 @@ "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq_mhz" : 8000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt" + "icnt_freq_mhz" : 8000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json 
b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json index ae76358a..e05eef3e 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json @@ -14,6 +14,5 @@ "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq_mhz" : 8000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt" + "icnt_freq_mhz" : 8000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json index 61be8c57..3c360d29 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json @@ -14,12 +14,5 @@ "icnt_type" : "booksim2", "icnt_latency" : 1, - "icnt_freq_mhz" : 8000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - } + "icnt_freq_mhz" : 8000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json index 5c4824f7..120894a3 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json @@ -15,9 +15,5 @@ "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq_mhz" : 8000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt", - - - + "icnt_freq_mhz" : 8000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json index d29a05bf..4db99419 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json @@ -15,6 +15,5 @@ "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq_mhz" : 8000, - "booksim_config_path" : 
"../configs/booksim2_configs/fly_c2_m4.icnt" + "icnt_freq_mhz" : 8000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json index 841313f2..0b447f92 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json @@ -7,13 +7,11 @@ "dram_freq_mhz" :800, "dram_channels": 2, "dram_req_size_byte": 64, - "dram_size" : 16, "dram_num_burst_length" : 4, "dram_stats_print_period_cycles": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq_mhz" : 8000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c2_m8.icnt" + "icnt_freq_mhz" : 8000 } \ No newline at end of file diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json index a13eb73a..1666591d 100644 --- a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json +++ b/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json @@ -7,13 +7,11 @@ "dram_freq_mhz" :800, "dram_channels": 4, "dram_req_size_byte": 64, - "dram_size" : 16, "dram_num_burst_length" : 4, "dram_stats_print_period_cycless": 100000, "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", "icnt_type" : "simple", "icnt_latency" : 1, - "icnt_freq_mhz" : 8000, - "booksim_config_path" : "../configs/booksim2_configs/fly_c1_m1.icnt" + "icnt_freq_mhz" : 8000 } \ No newline at end of file From 5bba4ae1cacba3930d38a58c1023743485533324 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Sun, 30 Nov 2025 03:39:56 +0000 Subject: [PATCH 33/53] [Refactor] Define trace log format: [cycle][core][event] --- PyTorchSimBackend/src/Core.cc | 27 +++++++++------- PyTorchSimBackend/src/DMA.cc | 4 +-- PyTorchSimBackend/src/Simulator.cc | 2 +- 
PyTorchSimBackend/src/SparseCore.cc | 46 +++++++++++++-------------- README.md | 2 +- Simulator/simulator.py | 6 ++-- scripts/sim_time.sh | 2 +- scripts/stonne_experiment2/tog_gen.py | 2 +- 8 files changed, 48 insertions(+), 43 deletions(-) diff --git a/PyTorchSimBackend/src/Core.cc b/PyTorchSimBackend/src/Core.cc index d18702f8..b033d500 100644 --- a/PyTorchSimBackend/src/Core.cc +++ b/PyTorchSimBackend/src/Core.cc @@ -23,7 +23,7 @@ bool Core::can_issue(const std::shared_ptr& op) { void Core::issue(std::shared_ptr op) { if (op->get_instructions().size()){ - spdlog::trace("[{}][Core {}] New Tile is issued", + spdlog::trace("[{}][Core {}][TILE_SCHEDULED]", _core_cycle, _id); } for (const auto& inst : op->get_instructions()) { @@ -222,7 +222,7 @@ void Core::cycle() { finish_instruction(inst); else _dma.register_tag_waiter(inst->subgraph_id, key, inst); - spdlog::trace("[{}][Core {}] {} SKIPPED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + spdlog::trace("[{}][Core {}][SIKIPPED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), @@ -232,7 +232,7 @@ void Core::cycle() { _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; } else { - spdlog::trace("[{}][Core {}] {} ISSUED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), @@ -244,8 +244,12 @@ void Core::cycle() { } } case Opcode::MOVOUT: - spdlog::trace("[{}][Core {}] {} ISSUED", _core_cycle, _id, - opcode_to_string(inst->get_opcode())); + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} 
tag_stride_list: {}", _core_cycle, _id, + opcode_to_string(inst->get_opcode()), + inst->get_addr_name(), + fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), + fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); _st_inst_queue.push(inst); issued = true; break; @@ -261,13 +265,14 @@ void Core::cycle() { inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle; inst->bubble_cycle = bubble_cycle; } + if (inst->get_compute_cycle() == 0) { inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; instructions.erase(it); } else { - spdlog::trace("[{}][Core {}][SA {}] {}-{} ISSUED, finsh at {}", _core_cycle, _id, _systolic_array_rr, + spdlog::trace("[{}][Core {}][INST_ISSUED][SA {}] {}-{}, finsh at {}", _core_cycle, _id, _systolic_array_rr, opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -294,7 +299,7 @@ void Core::cycle() { } else { _dma.register_tag_waiter(inst->subgraph_id, key, inst); } - spdlog::trace("[{}][Core {}] {} ISSUED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, + spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_addr_name(), fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), @@ -337,23 +342,23 @@ void Core::cycle() { void Core::finish_instruction(std::shared_ptr& inst) { if (inst->finished) { - spdlog::error("[{}][Core {}] {} FINISHED, inst already finished!!", _core_cycle, _id, + spdlog::error("[{}][Core {}][ERROR] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } 
inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::trace("[{}][Core {}] {}-{} FINISHED", + spdlog::trace("[{}][Core {}][INST_FINISHED] {}-{}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_compute_type()); } else if (inst->get_opcode() != Opcode::BAR && inst->is_async_dma()){ - spdlog::trace("[{}][Core {}] {} ASYNC REGISTERED, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", + spdlog::trace("[{}][Core {}][ASYNC] {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->subgraph_id, inst->get_addr_name(), inst->get_tag_id(), fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); } else if ((inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) && !inst->is_async_dma()) { - spdlog::trace("[{}][Core {}] {} FINISHED, addr_name: {}", _core_cycle, _id, + spdlog::trace("[{}][Core {}][INST_FINISHED] {} addr_name: {}", _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_addr_name()); } } diff --git a/PyTorchSimBackend/src/DMA.cc b/PyTorchSimBackend/src/DMA.cc index 7c8eb656..03509fa0 100644 --- a/PyTorchSimBackend/src/DMA.cc +++ b/PyTorchSimBackend/src/DMA.cc @@ -26,8 +26,8 @@ std::shared_ptr> DMA::get_memory_access(cycle_type core_ unsigned long long base_daddr = _current_inst->get_base_dram_address(); // Todo. 
We use a ternsor level buffer allocation, so we don't need to check all memfetch bool is_cacheable = owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); - spdlog::trace("[{}][SRAM Trace] Core-{}, Address: 0x{:016x}, Is_cacheable: {}", core_cycle, _id, base_daddr, is_cacheable); - spdlog::trace("[{}][NUMA Trace] Core-{}, Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", + spdlog::trace("[{}][Core {}][SRAM] Address: 0x{:016x}, Is_cacheable: {}", core_cycle, _id, base_daddr, is_cacheable); + spdlog::trace("[{}][Core {}][NUMA] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", core_cycle, _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write()); for (auto addr: *addr_set) { diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc index c8bc95ed..2b11f1bf 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/PyTorchSimBackend/src/Simulator.cc @@ -229,7 +229,7 @@ void Simulator::cycle() { if (IS_ICNT_CYCLE(_cycle_mask)) icnt_cycle(); } - spdlog::info("Simulation Finished"); + spdlog::info("Simulation finished"); for (auto &core: _cores) { core->check_tag(); } diff --git a/PyTorchSimBackend/src/SparseCore.cc b/PyTorchSimBackend/src/SparseCore.cc index 490a3feb..d5629b9c 100644 --- a/PyTorchSimBackend/src/SparseCore.cc +++ b/PyTorchSimBackend/src/SparseCore.cc @@ -68,7 +68,7 @@ void SparseCore::issue(std::shared_ptr tile) { } } if (selected_core_idx == -1) { - spdlog::error("[StonneCore {}] Faield to issue tile", _id); + spdlog::error("[StonneCore {}] Failed to issue tile", _id); exit(1); } stonneCores.at(selected_core_idx)->init(1); @@ -84,7 +84,7 @@ void SparseCore::issue(std::shared_ptr tile) { setTraceMode(selected_core_idx, is_trace_mode); percore_tiles.at(selected_core_idx).push_back(tile); coreBusy.at(selected_core_idx) = true; - spdlog::info("[StonneCore {}][{}] issued new tile (trace_mode: {})", _id, selected_core_idx, is_trace_mode); + 
spdlog::info("[{}][StonneCore {}/{}][Launch] New operation (trace_mode: {})", _core_cycle, _id, selected_core_idx, is_trace_mode); }; bool SparseCore::can_issue(const std::shared_ptr& op) { @@ -100,8 +100,8 @@ void SparseCore::checkStatus(uint32_t subcore_id) { int new_status = stonneCore->getMCFSMStats(); int compute_cycle = stonneCore->getMSStats().n_multiplications; if (traceCoreStatus.at(subcore_id) != new_status) { - spdlog::trace("[{}]Stonne Core [{}] status transition {} -> {}, Load/Store: {}/{}, compute_cycle: {}", - _core_cycle, _id, traceCoreStatus.at(subcore_id), new_status, + spdlog::trace("[{}][StonneCore {}/{}][Transition] status {} -> {}, Load/Store: {}/{}, compute_cycle: {}", + _core_cycle, _id, subcore_id, traceCoreStatus.at(subcore_id), new_status, traceLoadTraffic.at(subcore_id).size(), traceStoreTraffic.at(subcore_id).size(), (compute_cycle - traceCoreCycle.at(subcore_id))/num_ms); if (traceLoadTraffic.at(subcore_id).size()) { TraceNode load_node = TraceNode(traceNodeList.at(subcore_id).size()+2, "load", TraceNode::StonneTraceLoad); @@ -151,14 +151,14 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { traceStoreTraffic.at(subcore_id).insert(target_addr); break; default: - spdlog::error("[SparseCore] Invalid request type from core"); + spdlog::error("[StonneCore] Invalid request type from core"); return; } req->request_time = _core_cycle; req->stonneId = subcore_id; std::tuple key = std::make_tuple(target_addr, acc_type, type, allocTrafficID()); registerMemfetch(key, [this, req, acc_type, type]() { - spdlog::trace("[{}][SparseCore] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + spdlog::trace("[{}][StonneCore][DRAM Response] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ _core_cycle, _core_cycle - req->request_time, req->getAddress(), int(req->getcmd()), _config.dram_req_size); req->setReply(); stonneCores.at(req->stonneId)->pushResponse(req); @@ -168,7 +168,7 @@ void 
SparseCore::subCoreCycle(uint32_t subcore_id) { /* Finish stonne core */ if (coreBusy.at(subcore_id) && stonneCore->isFinished()) { stonneCore->finish(); - spdlog::info("[{}][SparseCore] Operation finished at {}", _core_cycle, _id); + spdlog::info("[{}][StonneCore {}/{}][Finish] Operation done", _core_cycle, _id, subcore_id); std::shared_ptr target_tile = percore_tiles.at(subcore_id).front(); SST_STONNE::StonneOpDesc *opDesc = static_cast(target_tile->get_custom_data()); if (opDesc->trace_path != "") @@ -239,7 +239,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_R; auto type = mf_type::READ_REQUEST; - spdlog::trace("[{}][StonneCore {}][{}] {} ISSUED", _core_cycle, _id, subcore_id, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -247,8 +247,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - spdlog::trace("[{}][SparseCore] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -260,7 +260,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_W; auto type = mf_type::WRITE_REQUEST; - spdlog::trace("[{}][StonneCore {}][{}] {} ISSUED", _core_cycle, _id, subcore_id, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, 
_id, subcore_id, opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -268,8 +268,8 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { std::tuple key = std::make_tuple(addr, acc_type, type, allocTrafficID()); uint64_t current_time = _core_cycle; registerMemfetch(key, [this, inst, addr, current_time, type]() { - spdlog::trace("[{}][SparseCore] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ - this->_core_cycle, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); + spdlog::trace("[{}][StonneCore {}][RESPONSE] Round Trip Cycle: {}, Address: {:#x}, Request Type: {}, DRAM Req Size: {}", \ + this->_core_cycle, _id, this->_core_cycle - current_time, addr, int(type), _config.dram_req_size); inst->dec_waiting_request(); }); } @@ -285,7 +285,7 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { inst->finish_cycle = _core_cycle + inst->get_compute_cycle(); else inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle(); - spdlog::trace("[{}][Core {}][{}] {} ISSUED, finsh at {}", _core_cycle, _id, subcore_id, + spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}, finsh at {}", _core_cycle, _id, subcore_id, opcode_to_string(inst->get_opcode()), inst->finish_cycle); target_pipeline.push(inst); issued = true; @@ -313,7 +313,7 @@ void SparseCore::cycle() { for (auto& req_pair : request_merge_table) { _request_queue.push(req_pair.second); request_merge_table.erase(req_pair.first); - spdlog::debug("[{}][SparseCore][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ + spdlog::debug("[{}][StonneCore][{}] Address: {:#x}, Access Type: {}, Request Type: {}, DRAM Req Size: {}, nr_request: {}", \ _core_cycle, _id, req_pair.second->get_addr(), int(req_pair.second->get_access_type()), int(req_pair.second->get_type()), _config.dram_req_size, nr_request); nr_request++; @@ -366,9 
+366,9 @@ void SparseCore::print_current_stats() { } cycle_type nr_mul = percore_stat.at(i).n_multiplications; percore_stat.at(i).reset(); - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } void SparseCore::print_stats() { @@ -383,9 +383,9 @@ void SparseCore::print_stats() { percore_total_stat.at(i) += percore_stat.at(i); } cycle_type nr_mul = percore_total_stat.at(i).n_multiplications; - spdlog::info("Stonne Core [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); + spdlog::info("StonneCore [{}][{}] : nr_multiplications: {}", _id, i, nr_mul); } - spdlog::info("Stonne Core [{}] : Total cycle {}", _id, _core_cycle); + spdlog::info("StonneCore [{}] : Total cycle {}", _id, _core_cycle); } std::shared_ptr SparseCore::pop_finished_tile() { @@ -399,17 +399,17 @@ std::shared_ptr SparseCore::pop_finished_tile() { void SparseCore::finish_instruction(std::shared_ptr& inst) { if (inst->finished) { - spdlog::error("[Core {}][{}] {} FINISHED, inst already finished!!", _id, _core_cycle, + spdlog::error("[{}][StonneCore {}][Error] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); if (inst->get_opcode() == Opcode::COMP) { - spdlog::info("[{}][StonneCore {}] {} FINISHED", + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", _core_cycle, _id, opcode_to_string(inst->get_opcode())); } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) { - spdlog::info("[{}][StonneCore {}] {} FINISHED", _core_cycle, _id, + spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", _core_cycle, _id, opcode_to_string(inst->get_opcode())); } } @@ -460,5 +460,5 @@ void 
SparseCore::dumpTrace(int stonne_core_id, const std::string& path) { outFile << traceNodeList.at(stonne_core_id)[i]; } outFile << "\n}" << std::endl; - spdlog::info("[StonneCore] Success to save trace dump file to \"{}\"", path); + spdlog::info("[{}][StonneCore] Success to save trace dump file to \"{}\"", _core_cycle, path); } diff --git a/README.md b/README.md index 9cbe6168..66d68e11 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ Log contains memory & core stats. [info] Core [0] : NUMA local access count : 0, NUMA remote access count : 0 [info] Core [0] : Total cycle 1014 [info] Total execution cycle: 1014 -[info] Simulation time: 0.039296 seconds +[info] Simulation wall clock time: 0.039296 seconds ``` The log is dumped in `TORCHSIM_DUMP_PATH` and you can set the path as below. ```bash diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 7d4dc821..73225e8e 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -190,7 +190,7 @@ def show_progress(): class BackendSimulator(): BACKEND_RESULT_PATH_KEY = "BACKEND_RESULT_PATH" - FINISH_STR = "Simulation Finished" + FINISH_STR = "Simulation finished" ALLOC_POOL = dict() # For eagermode buffer plan def __init__(self, backend_path, config_path, vectorlane_size=-1) -> None: self.base_dir = backend_path @@ -444,8 +444,8 @@ def get_result_from_file(result_path): total_cycle = int(re.search(r'Total execution cycle: (\d+)', line).group(1)) # Parse total simulation time - if 'Simulation time' in line: - simulation_time = float(re.search(r'Simulation time: (\d+\.?\d*) seconds', line).group(1)) + if 'Simulation wall clock time' in line: + simulation_time = float(re.search(r'Simulation wall clock time: (\d+\.?\d*) seconds', line).group(1)) return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": diff --git a/scripts/sim_time.sh b/scripts/sim_time.sh index 15c60736..494bf0e1 100755 --- a/scripts/sim_time.sh +++ b/scripts/sim_time.sh @@ -14,7 
+14,7 @@ for backend_folder in "${backend_folders[@]}"; do mapfile -t files < <(find "$backend_folder" -type f) for file in "${files[@]}"; do - sim_time=$(grep "Simulation time:" "$file" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+(\.[0-9]+)?).*/\1/') + sim_time=$(grep "Simulation wall clock time:" "$file" | tail -n 1 | sed -E 's/.*Simulation wall clock time: ([0-9]+(\.[0-9]+)?).*/\1/') echo "file: $file total_cycle: $sim_time" if [[ -n "$sim_time" ]]; then diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py index 2f184f4c..819390d9 100644 --- a/scripts/stonne_experiment2/tog_gen.py +++ b/scripts/stonne_experiment2/tog_gen.py @@ -21,7 +21,7 @@ def extract_simulation_stats(result_path): nr_multiplications = line.strip().split(":")[-1].strip() elif "Total execution cycle" in line: total_cycle = line.strip().split(":")[-1].strip() - elif "Simulation time" in line: + elif "Simulation wall clock time" in line: sim_time = line.strip().split(":")[-1].replace("seconds", "").strip() return nr_multiplications, total_cycle, sim_time From 902f13ab8befb1939af94d75a4e12c29c9c28b98 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Sun, 30 Nov 2025 13:07:24 +0900 Subject: [PATCH 34/53] [Fix] Change validation mode to functional mode --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index bd2f218a..6199e12c 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1656,7 +1656,7 @@ def run_bench(self, nodes, kernel_name, src_code): "spad_info": self.spad_info, "vlen" : self.vlen, "arg_attributes" : arg_attributes, - "validate" : extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, + "validate" : extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, "autotune" : True, }, source_code=src_code, From 
2a0c0d71729857116dba0ba17be94c9c2e29292c Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Sun, 30 Nov 2025 13:15:25 +0900 Subject: [PATCH 35/53] [Fix] typo in interconnect logging message --- PyTorchSimBackend/src/Simulator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyTorchSimBackend/src/Simulator.cc b/PyTorchSimBackend/src/Simulator.cc index 2054ade9..cb81611a 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/PyTorchSimBackend/src/Simulator.cc @@ -51,7 +51,7 @@ Simulator::Simulator(SimulationConfig config) } // Create interconnect object - spdlog::info("[Config/Interconnect] Inerconnect freq: {} MHz", config.icnt_freq); + spdlog::info("[Config/Interconnect] Interconnect freq: {} MHz", config.icnt_freq); if (config.icnt_type == IcntType::SIMPLE) { spdlog::info("[Config/Interconnect] SimpleInerconnect selected"); _icnt = std::make_unique(config); @@ -292,4 +292,4 @@ void Simulator::print_core_stat() _cores[core_id]->print_stats(); } spdlog::info("Total execution cycle: {}", _core_cycles); -} \ No newline at end of file +} From d62b12ecddac344b163064bb175c58e866cfe576 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Sun, 30 Nov 2025 16:07:06 +0900 Subject: [PATCH 36/53] Fix typo in core_stats_print_period_cycles --- .../systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json index 5b059dc8..5ead2742 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json @@ -1,7 +1,7 @@ { "num_cores" : 2, "core_freq_mhz" : 940, - "core_stats_print_period_cycless" : 10000, + "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, "dram_type" : "ramulator2", @@ -21,4 +21,4 @@ 
"core_0":0, "core_1":1 } -} \ No newline at end of file +} From 8bcceaf468b0f51fe136f8e870e1d6eccee0fd8f Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Sun, 30 Nov 2025 16:13:54 +0900 Subject: [PATCH 37/53] Fix path syntax for accuracy report artifact --- .github/workflows/pytorchsim_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index a8e49e63..32d6543c 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -694,5 +694,5 @@ jobs: uses: actions/upload-artifact@v4 with: name: accuracy-report - path: /tmp/torchsim-ci/${GITHUB_SHA}/summary_cycle.out - if-no-files-found: error \ No newline at end of file + path: /tmp/torchsim-ci/${{ github.sha }}/summary_cycle.out + if-no-files-found: error From 623b7be754651bbb251bfc5984cc7e387fbf2554 Mon Sep 17 00:00:00 2001 From: OkkyunWoo Date: Sun, 30 Nov 2025 12:14:52 +0000 Subject: [PATCH 38/53] Change PyTorchSimBackend to TOGSim --- .gitignore | 2 +- .gitmodules | 20 +++---- Dockerfile | 2 +- PyTorchSimFrontend/extension_codecache.py | 10 ++-- PyTorchSimFrontend/extension_config.py | 12 ++--- PyTorchSimFrontend/extension_op.py | 12 ++--- PyTorchSimFrontend/mlir/mlir_autotune.py | 6 +-- .../mlir/mlir_codegen_backend.py | 8 +-- PyTorchSimFrontend/mlir/mlir_conv_common.py | 4 +- .../mlir/mlir_conv_mt_template.py | 2 +- .../mlir/mlir_conv_sb_template.py | 2 +- .../mlir/mlir_conv_sbs_template.py | 2 +- PyTorchSimFrontend/mlir/mlir_conv_template.py | 2 +- PyTorchSimFrontend/mlir/mlir_scheduling.py | 4 +- README.md | 20 +++---- Scheduler/scheduler.py | 50 ++++++++--------- Simulator/simulator.py | 54 +++++++++---------- {PyTorchSimBackend => TOGSim}/CMakeLists.txt | 0 {PyTorchSimBackend => TOGSim}/conanfile.txt | 0 .../configs/booksim2_configs/anynet.icnt | 0 .../configs/booksim2_configs/anynet_file | 0 .../booksim2_configs/chiplet_32_32_2.icnt | 2 +- 
.../booksim2_configs/chiplet_32_32_2.net | 0 .../configs/booksim2_configs/fly_c16_m16.icnt | 0 .../configs/booksim2_configs/fly_c16_m32.icnt | 0 .../configs/booksim2_configs/fly_c16_m8.icnt | 0 .../configs/booksim2_configs/fly_c1_m1.icnt | 0 .../configs/booksim2_configs/fly_c1_m2.icnt | 0 .../configs/booksim2_configs/fly_c1_m8.icnt | 0 .../configs/booksim2_configs/fly_c2_m32.icnt | 0 .../configs/booksim2_configs/fly_c2_m8.icnt | 0 .../configs/booksim2_configs/fly_c32_m32.icnt | 0 .../configs/booksim2_configs/fly_c32_m4.icnt | 0 .../configs/booksim2_configs/fly_c32_m8.icnt | 0 .../configs/booksim2_configs/fly_c4_m2.icnt | 0 .../configs/booksim2_configs/fly_c4_m32.icnt | 0 .../configs/booksim2_configs/fly_c4_m8.icnt | 0 .../configs/booksim2_configs/fly_c64_m8.icnt | 0 .../booksim2_configs/fly_c64_m8_sif-age.icnt | 0 .../booksim2_configs/fly_c64_m8_sif-rr.icnt | 0 .../booksim2_configs/make_anynet_topology.py | 0 .../booksim2_configs/mesh_sif-age.icnt | 0 .../configs/booksim2_configs/mesh_sif-rr.icnt | 0 .../configs/heterogeneous_c2_simple_noc.json | 2 +- .../configs/ramulator2_configs/DDR4.yaml | 0 .../configs/ramulator2_configs/HBM2.yaml | 0 .../ramulator2_configs/HBM2_TPUv3.yaml | 0 .../ramulator_configs/ALDRAM-config.cfg | 0 .../configs/ramulator_configs/DDR3-config.cfg | 0 .../configs/ramulator_configs/DDR4-config.cfg | 0 .../ramulator_configs/DSARP-config.cfg | 0 .../ramulator_configs/GDDR5-config.cfg | 0 .../configs/ramulator_configs/HBM-config.cfg | 0 .../HBM-config_ChRaBaRoCo.cfg | 0 .../ramulator_configs/HBM-config_FCFS.cfg | 0 .../ramulator_configs/HBM-config_FRFCFS.cfg | 0 .../HBM-config_FRFCFS_Cap.cfg | 0 .../HBM-config_FRFCFS_PriorHit.cfg | 0 .../HBM-config_RoBaRaCoCh.cfg | 0 .../HBM-config_RoCoBaRaCh.cfg | 0 .../ramulator_configs/HBMx0.5ch-config.cfg | 0 .../ramulator_configs/HBMx2ch-config.cfg | 0 .../ramulator_configs/LPDDR3-config.cfg | 0 .../ramulator_configs/LPDDR4-config.cfg | 0 .../configs/ramulator_configs/PCM-config.cfg | 0 
.../configs/ramulator_configs/SALP-config.cfg | 0 .../ramulator_configs/STTMRAM-config.cfg | 0 .../ramulator_configs/TLDRAM-config.cfg | 0 .../ramulator_configs/WideIO-config.cfg | 0 .../ramulator_configs/WideIO2-config.cfg | 0 .../configs/stonne_big_c1_simple_noc.json | 2 +- .../configs/stonne_single_c1_simple_noc.json | 2 +- .../stonne_validation_c1_simple_noc.json | 2 +- .../systolic_ws_128x128_c1_booksim_tpuv2.json | 0 ...stolic_ws_128x128_c1_simple_noc_tpuv2.json | 0 ...stolic_ws_128x128_c1_simple_noc_tpuv3.json | 0 ...c_ws_128x128_c1_simple_noc_tpuv3_half.json | 0 ...stolic_ws_128x128_c1_simple_noc_tpuv4.json | 0 .../systolic_ws_128x128_c2_booksim_tpuv3.json | 0 ...s_128x128_c2_booksim_tpuv3_bw_quarter.json | 0 .../systolic_ws_128x128_c2_chiplet_tpuv3.json | 0 ...lic_ws_128x128_c2_chiplet_tpuv3_xnuma.json | 0 ...stolic_ws_128x128_c2_simple_noc_tpuv2.json | 0 ...stolic_ws_128x128_c2_simple_noc_tpuv3.json | 0 ...128x128_c2_simple_noc_tpuv3_partition.json | 0 ...stolic_ws_128x128_c2_simple_noc_tpuv4.json | 0 .../systolic_ws_8x8_c1_12G_simple_noc.json | 0 .../systolic_ws_8x8_c1_24G_simple_noc.json | 0 .../systolic_ws_8x8_c1_48G_simple_noc.json | 0 .../configs/systolic_ws_8x8_c1_booksim.json | 0 .../systolic_ws_8x8_c1_simple_noc.json | 0 .../systolic_ws_8x8_c2_12G_simple_noc.json | 0 .../systolic_ws_8x8_c2_24G_simple_noc.json | 0 .../systolic_ws_8x8_c2_48G_simple_noc.json | 0 {PyTorchSimBackend => TOGSim}/extern/booksim | 0 {PyTorchSimBackend => TOGSim}/extern/onnx | 0 {PyTorchSimBackend => TOGSim}/extern/protobuf | 0 .../extern/ramulator2 | 0 .../extern/ramulator_custom/.gitignore | 0 .../extern/ramulator_custom/CMakeLists.txt | 0 .../include/ramulator/Ramulator.hpp | 0 .../extern/ramulator_custom/src/Config.cpp | 0 .../extern/ramulator_custom/src/Config.h | 0 .../extern/ramulator_custom/src/Controller.h | 0 .../extern/ramulator_custom/src/DDR4.cpp | 0 .../extern/ramulator_custom/src/DDR4.h | 0 .../extern/ramulator_custom/src/DRAM.h | 0 
.../extern/ramulator_custom/src/HBM.cpp | 0 .../extern/ramulator_custom/src/HBM.h | 0 .../extern/ramulator_custom/src/Memory.h | 0 .../ramulator_custom/src/MemoryFactory.cpp | 0 .../ramulator_custom/src/MemoryFactory.h | 0 .../extern/ramulator_custom/src/Ramulator.cpp | 0 .../extern/ramulator_custom/src/Refresh.cpp | 0 .../extern/ramulator_custom/src/Refresh.h | 0 .../extern/ramulator_custom/src/Request.cpp | 0 .../extern/ramulator_custom/src/Request.h | 0 .../extern/ramulator_custom/src/Scheduler.h | 0 .../ramulator_custom/src/SpeedyController.h | 0 .../extern/ramulator_custom/src/StatType.cpp | 0 .../extern/ramulator_custom/src/StatType.h | 0 .../extern/ramulator_custom/src/Statistics.h | 0 .../extern/stonneCore | 0 {PyTorchSimBackend => TOGSim}/include/Cache.h | 0 .../include/Cache_defs.h | 0 .../include/Cache_stats.h | 0 .../include/Common.h | 0 {PyTorchSimBackend => TOGSim}/include/Core.h | 0 {PyTorchSimBackend => TOGSim}/include/DMA.h | 0 .../include/DelayQueue.h | 0 {PyTorchSimBackend => TOGSim}/include/Dram.h | 0 .../include/Hashing.h | 0 .../include/Instruction.h | 0 .../include/Interconnect.h | 0 .../include/IntervalTree.h | 0 .../include/L2Cache.h | 0 .../include/Memfetch.h | 0 {PyTorchSimBackend => TOGSim}/include/Model.h | 0 .../include/SimulationConfig.h | 0 .../include/Simulator.h | 0 .../include/SparseCore.h | 0 {PyTorchSimBackend => TOGSim}/include/Tile.h | 0 .../include/TileGraph.h | 0 .../include/TileGraphParser.h | 0 .../include/scheduler/Scheduler.h | 0 .../src/CMakeLists.txt | 0 {PyTorchSimBackend => TOGSim}/src/Cache.cc | 0 .../src/Cache_stats.cc | 0 {PyTorchSimBackend => TOGSim}/src/Common.cc | 0 {PyTorchSimBackend => TOGSim}/src/Core.cc | 0 {PyTorchSimBackend => TOGSim}/src/DMA.cc | 0 .../src/DelayQueue.cc | 0 {PyTorchSimBackend => TOGSim}/src/Dram.cc | 0 {PyTorchSimBackend => TOGSim}/src/Hashing.cc | 0 .../src/Instruction.cc | 0 .../src/Interconnect.cc | 2 +- {PyTorchSimBackend => TOGSim}/src/L2Cache.cc | 0 .../src/Simulator.cc | 2 +- 
.../src/SparseCore.cc | 0 {PyTorchSimBackend => TOGSim}/src/Tile.cc | 0 .../src/TileGraph.cc | 0 .../src/TileGraphParser.cc | 0 .../src/helper/CommandLineParser.cc | 0 .../src/helper/CommandLineParser.h | 0 {PyTorchSimBackend => TOGSim}/src/main.cc | 6 +-- .../src/scheduler/Scheduler.cc | 0 experiments/BERT.py | 8 +-- .../artifact/cycle_validation/run_cycle.sh | 2 +- experiments/artifact/speedup/run_speedup.sh | 4 +- .../speedup/scripts/run_speed_ils_bert.sh | 2 +- .../speedup/scripts/run_speed_ils_conv.sh | 2 +- .../speedup/scripts/run_speed_ils_matmul.sh | 2 +- .../speedup/scripts/run_speed_ils_resnet.sh | 2 +- experiments/attention.py | 8 +-- experiments/conv.py | 8 +-- experiments/gemm.py | 8 +-- experiments/layernorm.py | 8 +-- experiments/resnet18.py | 8 +-- experiments/resnet50.py | 8 +-- experiments/softmax.py | 8 +-- scripts/CompilerOpt_experiment/DMAopt.sh | 2 +- scripts/chiplet.sh | 12 ++--- scripts/end2end.sh | 14 ++--- scripts/get_tog_result.sh | 4 +- scripts/sim_time.sh | 10 ++-- scripts/sparsity_experiment/run.sh | 12 ++--- scripts/stonne_experiment2/tog_gen.py | 8 +-- tests/test_compile_overhead.py | 2 +- tests/test_hetro.py | 4 +- tests/test_scheduler.py | 4 +- tests/test_scheduler_batching.py | 2 +- tests/test_spmm_scheduler.py | 4 +- 192 files changed, 194 insertions(+), 194 deletions(-) rename {PyTorchSimBackend => TOGSim}/CMakeLists.txt (100%) rename {PyTorchSimBackend => TOGSim}/conanfile.txt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/anynet.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/anynet_file (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/chiplet_32_32_2.icnt (75%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/chiplet_32_32_2.net (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c16_m16.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c16_m32.icnt (100%) rename {PyTorchSimBackend => 
TOGSim}/configs/booksim2_configs/fly_c16_m8.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c1_m1.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c1_m2.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c1_m8.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c2_m32.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c2_m8.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c32_m32.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c32_m4.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c32_m8.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c4_m2.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c4_m32.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c4_m8.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c64_m8.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c64_m8_sif-age.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/make_anynet_topology.py (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/mesh_sif-age.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/booksim2_configs/mesh_sif-rr.icnt (100%) rename {PyTorchSimBackend => TOGSim}/configs/heterogeneous_c2_simple_noc.json (83%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator2_configs/DDR4.yaml (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator2_configs/HBM2.yaml (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator2_configs/HBM2_TPUv3.yaml (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/ALDRAM-config.cfg (100%) rename {PyTorchSimBackend => 
TOGSim}/configs/ramulator_configs/DDR3-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/DDR4-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/DSARP-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/GDDR5-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBM-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBM-config_FCFS.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBM-config_FRFCFS.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBMx0.5ch-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/HBMx2ch-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/LPDDR3-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/LPDDR4-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/PCM-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/SALP-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/STTMRAM-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/TLDRAM-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/WideIO-config.cfg (100%) rename {PyTorchSimBackend => TOGSim}/configs/ramulator_configs/WideIO2-config.cfg (100%) rename {PyTorchSimBackend => 
TOGSim}/configs/stonne_big_c1_simple_noc.json (81%) rename {PyTorchSimBackend => TOGSim}/configs/stonne_single_c1_simple_noc.json (81%) rename {PyTorchSimBackend => TOGSim}/configs/stonne_validation_c1_simple_noc.json (82%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c1_booksim_tpuv2.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c2_booksim_tpuv3.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_8x8_c1_12G_simple_noc.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_8x8_c1_24G_simple_noc.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_8x8_c1_48G_simple_noc.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_8x8_c1_booksim.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_8x8_c1_simple_noc.json (100%) rename {PyTorchSimBackend => 
TOGSim}/configs/systolic_ws_8x8_c2_12G_simple_noc.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_8x8_c2_24G_simple_noc.json (100%) rename {PyTorchSimBackend => TOGSim}/configs/systolic_ws_8x8_c2_48G_simple_noc.json (100%) rename {PyTorchSimBackend => TOGSim}/extern/booksim (100%) rename {PyTorchSimBackend => TOGSim}/extern/onnx (100%) rename {PyTorchSimBackend => TOGSim}/extern/protobuf (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator2 (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/.gitignore (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/CMakeLists.txt (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/include/ramulator/Ramulator.hpp (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Config.cpp (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Config.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Controller.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/DDR4.cpp (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/DDR4.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/DRAM.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/HBM.cpp (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/HBM.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Memory.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/MemoryFactory.cpp (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/MemoryFactory.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Ramulator.cpp (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Refresh.cpp (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Refresh.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Request.cpp (100%) rename 
{PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Request.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Scheduler.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/SpeedyController.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/StatType.cpp (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/StatType.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/ramulator_custom/src/Statistics.h (100%) rename {PyTorchSimBackend => TOGSim}/extern/stonneCore (100%) rename {PyTorchSimBackend => TOGSim}/include/Cache.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Cache_defs.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Cache_stats.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Common.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Core.h (100%) rename {PyTorchSimBackend => TOGSim}/include/DMA.h (100%) rename {PyTorchSimBackend => TOGSim}/include/DelayQueue.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Dram.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Hashing.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Instruction.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Interconnect.h (100%) rename {PyTorchSimBackend => TOGSim}/include/IntervalTree.h (100%) rename {PyTorchSimBackend => TOGSim}/include/L2Cache.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Memfetch.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Model.h (100%) rename {PyTorchSimBackend => TOGSim}/include/SimulationConfig.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Simulator.h (100%) rename {PyTorchSimBackend => TOGSim}/include/SparseCore.h (100%) rename {PyTorchSimBackend => TOGSim}/include/Tile.h (100%) rename {PyTorchSimBackend => TOGSim}/include/TileGraph.h (100%) rename {PyTorchSimBackend => TOGSim}/include/TileGraphParser.h (100%) rename {PyTorchSimBackend => TOGSim}/include/scheduler/Scheduler.h (100%) rename {PyTorchSimBackend 
=> TOGSim}/src/CMakeLists.txt (100%) rename {PyTorchSimBackend => TOGSim}/src/Cache.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/Cache_stats.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/Common.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/Core.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/DMA.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/DelayQueue.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/Dram.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/Hashing.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/Instruction.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/Interconnect.cc (98%) rename {PyTorchSimBackend => TOGSim}/src/L2Cache.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/Simulator.cc (99%) rename {PyTorchSimBackend => TOGSim}/src/SparseCore.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/Tile.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/TileGraph.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/TileGraphParser.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/helper/CommandLineParser.cc (100%) rename {PyTorchSimBackend => TOGSim}/src/helper/CommandLineParser.h (100%) rename {PyTorchSimBackend => TOGSim}/src/main.cc (96%) rename {PyTorchSimBackend => TOGSim}/src/scheduler/Scheduler.cc (100%) diff --git a/.gitignore b/.gitignore index 88eb2fb8..9decced5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ __pycache__/ -PyTorchSimBackend/build/ +TOGSim/build/ .vscode diff --git a/.gitmodules b/.gitmodules index 8edc7036..24f9ccaf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,15 +1,15 @@ -[submodule "PyTorchSimBackend/extern/onnx"] - path = PyTorchSimBackend/extern/onnx +[submodule "TOGSim/extern/onnx"] + path = TOGSim/extern/onnx url = https://github.com/onnx/onnx.git -[submodule "PyTorchSimBackend/extern/protobuf"] - path = PyTorchSimBackend/extern/protobuf +[submodule "TOGSim/extern/protobuf"] + path = TOGSim/extern/protobuf url = https://github.com/protocolbuffers/protobuf.git -[submodule 
"PyTorchSimBackend/extern/booksim"] - path = PyTorchSimBackend/extern/booksim +[submodule "TOGSim/extern/booksim"] + path = TOGSim/extern/booksim url = https://github.com/PSAL-POSTECH/booksim.git -[submodule "PyTorchSimBackend/extern/ramulator2"] - path = PyTorchSimBackend/extern/ramulator2 +[submodule "TOGSim/extern/ramulator2"] + path = TOGSim/extern/ramulator2 url = https://github.com/PSAL-POSTECH/ramulator2 -[submodule "PyTorchSimBackend/extern/stonneCore"] - path = PyTorchSimBackend/extern/stonneCore +[submodule "TOGSim/extern/stonneCore"] + path = TOGSim/extern/stonneCore url = https://github.com/PSAL-POSTECH/stonne_core.git diff --git a/Dockerfile b/Dockerfile index 293dcb60..37721940 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ FROM ghcr.io/psal-postech/torchsim_base:latest # Prepare PyTorchSim project COPY . /workspace/PyTorchSim -RUN cd PyTorchSim/PyTorchSimBackend && \ +RUN cd PyTorchSim/TOGSim && \ mkdir -p build && \ cd build && \ conan install .. --build=missing && \ diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index ab507e6a..577c45e9 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -7,7 +7,7 @@ from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen from PyTorchSimFrontend import extension_config -from Simulator.simulator import FunctionalSimulator, CycleSimulator, BackendSimulator +from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator LOCK_TIMEOUT = 600 @@ -295,12 +295,12 @@ def dummy_simulator(*args, **kwargs): onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) + togsim_path = 
os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + backsim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG) backsim.vectorlane_size = vectorlane_size attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size) result_path = backsim.simulation(onnx_path, attribute_path, silent_mode=silent_mode) - result = BackendSimulator.get_result_from_file(result_path) + result = TOGSimulator.get_result_from_file(result_path) return result def dryrun_simulator(*args, **kwargs): @@ -326,7 +326,7 @@ def dryrun_simulator(*args, **kwargs): cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) return result_path, runtime_path, None - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) and not autotune + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) and not autotune target_simulator = dryrun_simulator if is_dryrun else dummy_simulator target_simulator.arg_attributes = arg_attributes target_simulator.future = future diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 7aa594c1..5cda38c5 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -32,12 +32,12 @@ CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) -# Backendsim config -CONFIG_TORCHSIM_BACKEND_CONFIG = os.environ.get('TORCHSIM_CONFIG', - default=f'{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') -CONFIG_BACKENDSIM_EAGER_MODE = int(os.environ.get("BACKENDSIM_EAGER_MODE", default=False)) -CONFIG_BACKENDSIM_DRYRUN = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) -CONFIG_BACKENDSIM_DEBUG_LEVEL = os.environ.get("BACKENDSIM_DEBUG_LEVEL", "") +# TOGSim config +CONFIG_TOGSIM_CONFIG = os.environ.get('TORCHSIM_CONFIG', + 
default=f'{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') +CONFIG_TOGSIM_EAGER_MODE = int(os.environ.get("TOGSIM_EAGER_MODE", default=False)) +CONFIG_TOGSIM_DRYRUN = int(os.environ.get('TOGSIM_DRYRUN', default=False)) +CONFIG_TOGSIM_DEBUG_LEVEL = os.environ.get("TOGSIM_DEBUG_LEVEL", "") # GEM5 config CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py index 22a727c5..167544f2 100644 --- a/PyTorchSimFrontend/extension_op.py +++ b/PyTorchSimFrontend/extension_op.py @@ -13,7 +13,7 @@ from torch._inductor.codecache import write from PyTorchSimFrontend.extension_codecache import get_write_path from PyTorchSimFrontend import extension_config -from Simulator.simulator import BackendSimulator, TORCH_TO_NUMPY +from Simulator.simulator import TOGSimulator, TORCH_TO_NUMPY graph_template = { 0: { @@ -46,7 +46,7 @@ class MLIRExternKernelChoice(ExternKernelChoice): def call_name(self): - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) if is_dryrun: return f"yield from sparse_mm_dummy_stonne_outer" return f"torch.ops.extension_op.{self.name}" @@ -275,11 +275,11 @@ def prepare_outer_product_matrix(a, b, out): def sparse_mm_stonne_outer(a, b, out): onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out) - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json' - backsim = BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_single_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, 
stonne_config_path) result_path = backsim.simulation(onnx_path) - BackendSimulator.get_result_from_file(result_path) + TOGSimulator.get_result_from_file(result_path) # Load result data #with open(c_result_path, 'rb') as f: diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index 1027ccd9..e52d6cff 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -5,7 +5,7 @@ from torch._inductor.autotune_process import TensorMeta from torch._inductor.codecache import get_hash, write from PyTorchSimFrontend import extension_config -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from typing import ( Any, @@ -58,9 +58,9 @@ def make_run_fn( # Check already cached result. write_path = get_write_path(self.source_code) key, _ = write(self.source_code, "mlir", specified_dir=write_path) - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "backendsim_result/0") + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "togsim_result/0") if os.path.exists(result_path): - result = BackendSimulator.get_result_from_file(result_path) + result = TOGSimulator.get_result_from_file(result_path) def cached_run_fn(*args, **kwargs): return result return cached_run_fn diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 6199e12c..6a091851 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -98,8 +98,8 @@ def write_header(self): from torch import device, empty, empty_strided from {extension_codecache.__name__} import CustomAsyncCompile - from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_BACKENDSIM_EAGER_MODE - from Simulator.simulator import BackendSimulator + from PyTorchSimFrontend.extension_config import 
CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE + from Simulator.simulator import TOGSimulator from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer from torch._inductor.select_algorithm import extern_kernels @@ -121,7 +121,7 @@ def sram_plan_prefix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Alloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_alloc(buffer_name, [start, end]) + TOGSimulator.sram_alloc(buffer_name, [start, end]) def sram_plan_postfix(buffer_name, buffer): if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN): @@ -130,7 +130,7 @@ def sram_plan_postfix(buffer_name, buffer): start = buffer.data_ptr() end = start + buffer_size # print(f'Dealloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') - BackendSimulator.sram_dealloc(buffer_name, [start, end]) + TOGSimulator.sram_dealloc(buffer_name, [start, end]) def host2device_memcopy(buffer): pass diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index a4cd14c6..77826730 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -82,7 +82,7 @@ def outer_func_render(self, kernel_name, input_args): Y = self.output_node Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) options = dict( kernel=self.kernel, KERNEL_NAME=kernel_name, @@ -94,7 +94,7 @@ def outer_func_render(self, kernel_name, input_args): PADDING_H=self.padding[0], PADDING_W=self.padding[1], VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, - BACKENDSIM_EAGER_MODE=eager_mode, + TOGSIM_EAGER_MODE=eager_mode, input_reorder=self.input_reorder ) code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options) diff --git 
a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index f013af56..0bf01421 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -120,7 +120,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index a2959b4d..92b9a525 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -121,7 +121,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index afbe9289..ab124852 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -121,7 +121,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 777d0a7b..66aa0a27 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -125,7 +125,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if BACKENDSIM_EAGER_MODE %} + {%- if TOGSIM_EAGER_MODE %} yield ({{KERNEL_NAME}}, ) {%- endif %} """ diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 9af84446..9696269e 100644 --- 
a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -214,7 +214,7 @@ def codegen_nodes(self, nodes): ex_kernel.call_kernel(kernel_name) _, args, _, _ = ex_kernel.args.mlir_argdefs() args = ", ".join(args) - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): V.graph.wrapper_code.writeline( f"yield ({kernel_name}, ({args}))" @@ -285,7 +285,7 @@ def codegen_template(self, template_node, epilogue_nodes): kernel.call_kernel(kernel_name) V.graph.removed_buffers |= kernel.removed_buffers _, args, _, _ = self.kernel_group.args.mlir_argdefs() - eager_mode = int(os.environ.get('BACKENDSIM_EAGER_MODE', default=False)) + eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if (eager_mode): target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name + f"_{len(args)}" args = ", ".join(args) diff --git a/README.md b/README.md index 66d68e11..64de3fbe 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ The `tests` directory contains several AI workloads examples. ```bash python tests/test_matmul.py ``` -The result is stored to `TORCHSIM_DUMP_PATH/hash/backendsim_result/`. The log file contains detailed core, memory, and interconnect stats. +The result is stored to `TORCHSIM_DUMP_PATH/hash/togsim_result/`. The log file contains detailed core, memory, and interconnect stats. ### Run Your Own Model on PyTorchSim You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device. 
@@ -131,9 +131,9 @@ Wrapper Codegen Path = /tmp/torchinductor_root/yd/cyda7nhzv5mtakfhfcxtmmhtsv6kg7 [Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/fy6nnyudtno/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/fy6nnyudtno/cycle_bin --vlane 128 [Gem5Simulator] Simulation is still running... [SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=0000000000010400:10846 --base-path=/tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/fy6nnyudtno/validation_binary /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg0_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg1_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/buf0/0.raw -[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 -[BackendSimulator] Simulation is still running.. -[BackendSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/backendsim_result/0" +[TOGSimulator] cmd> /root/workspace/PyTorchSim/TOGSim/build/bin/Simulator --config /root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 +[TOGSimulator] Simulation is still running.. 
+[TOGSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0" ---------------------------- |Matmul Forward Test Passed| ---------------------------- @@ -143,7 +143,7 @@ Simulation consists of three steps 1. `Gem5Simulator` obatins compute latency for TOG. 2. `SpikeSimulator` verifies the output code. -3. `BackendSimulator` simulates a NPU architecture. +3. `TOGSimulator` simulates a NPU architecture. If you want to turn off the `SpikeSimulator` for fast simulation, you can set as below. ```bash @@ -186,7 +186,7 @@ Our load generator supports multi-tenancy experiments. You can run a simple exam python tests/test_scheduler.py ``` Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`. -In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSim config file(`.json`). The compiled PyTorch models are then registered with a unique model id. +In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.json`). The compiled PyTorch models are then registered with a unique model id. 
```python3 import os @@ -195,11 +195,11 @@ import torch from torchvision.models import resnet18 from test_transformer import EncoderBlock base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') -config = f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) # Register compiled model target_model0 = resnet18().eval() @@ -344,7 +344,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ## TOGSim Configuration ![NPU_Core](./docs/npu_core.jpg) -`PyTorchSimBackend/configs` directory contains example NPU configuration files in the JSON format. +`TOGSim/configs` directory contains example NPU configuration files in the JSON format. ``` "num_cores" : 2, // Number of NPU cores "core_freq_mhz" : 940, // Core's frequency (MHz) @@ -376,7 +376,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ``` You can set TOGSim config path as below. ```bash -export TORCHSIM_CONFIG=/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json ``` ## Future Works Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon. 
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 18e44de4..0b633fa9 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -5,7 +5,7 @@ from pathlib import Path import importlib.util from PyTorchSimFrontend.extension_codecache import hash_prefix -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def import_module_from_path(module_name, path): @@ -144,7 +144,7 @@ class PyTorchSimRunner: PARTITION_BUSY = 0 PARTITION_IDLE = 1 SELECT_NOTHING = 2 - def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: + def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: self.module = self.setup_device() self.num_partion = num_partion self.launch_model_dicts = [] @@ -156,11 +156,11 @@ def __init__(self, backend_simulator : BackendSimulator, num_partion=1) -> None: self.partition_state.append(self.PARTITION_IDLE) self.finish_req_dict = {} - self.backend_simulator = backend_simulator + self.tog_simulator = tog_simulator # Dry run for compile and create generator - os.environ["BACKENDSIM_DRYRUN"] = "1" - os.environ["BACKENDSIM_EAGER_MODE"] = "1" + os.environ["TOGSIM_DRYRUN"] = "1" + os.environ["TOGSIM_EAGER_MODE"] = "1" @staticmethod def setup_device(): @@ -222,7 +222,7 @@ def is_all_idle(self): return all([self.is_partition_idle(i) for i in range(self.num_partion)]) def prepare_model(self, req_model: SchedulerDNNModel): - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "backend_result", req_model.model_name) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "togsim_result", req_model.model_name) os.makedirs(result_path, exist_ok=True) index = str(len(os.listdir(result_path))) @@ -244,7 +244,7 @@ def prepare_launch_kernel(self, kernel, inputs): onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - attribute_path = 
self.backend_simulator.create_attribute_file(attribute_path, inputs) + attribute_path = self.tog_simulator.create_attribute_file(attribute_path, inputs) return onnx_path, attribute_path def launch_kernel(self, current_cycle, partion_idx=0): @@ -260,11 +260,11 @@ def launch_kernel(self, current_cycle, partion_idx=0): else: onnx_path, attribute_path = kernel, inputs self.partition_state[partion_idx] = self.PARTITION_BUSY - return self.backend_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) + return self.tog_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) class FIFORunner(PyTorchSimRunner): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) def select_kernel(self, partition_idx): while len(self.nested_launch_model_dicts[partition_idx]) or len(self.launch_model_dicts[partition_idx]): @@ -298,8 +298,8 @@ def select_kernel(self, partition_idx): return self.SELECT_NOTHING class RoundRobinRunner(PyTorchSimRunner): - def __init__(self, backend_simulator: BackendSimulator, num_partion=1) -> None: - super().__init__(backend_simulator, num_partion) + def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: + super().__init__(tog_simulator, num_partion) self.next_pointer = None def select_kernel(self, partition_idx): @@ -347,7 +347,7 @@ class Scheduler: FIFO_ENGINE = 0 RR_ENGINE = 1 - def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, backend_config=extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) -> None: + def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG) -> None: self.current_cycle = 0 self.max_batch = max_batch self.num_request_queue = num_request_queue @@ -356,13 +356,13 @@ def __init__(self, 
num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, self.request_queue.append([]) self.finish_queue : List[Request] = [] - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - self.backend_simulator = BackendSimulator(backend_path, backend_config) - self.backend_simulator.interactive_simulation() + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + self.tog_simulator = TOGSimulator(togsim_path, togsim_config) + self.tog_simulator.interactive_simulation() if engine_select == Scheduler.FIFO_ENGINE: - self.execution_engine = FIFORunner(self.backend_simulator, self.num_request_queue) + self.execution_engine = FIFORunner(self.tog_simulator, self.num_request_queue) elif engine_select == Scheduler.RR_ENGINE: - self.execution_engine = RoundRobinRunner(self.backend_simulator, self.num_request_queue) + self.execution_engine = RoundRobinRunner(self.tog_simulator, self.num_request_queue) else: print(f"Not supporetd engine type {engine_select}") exit(1) @@ -469,8 +469,8 @@ def schedule(self): # Need to forward the time until next_arrival_time if self.execution_engine.is_all_idle(): - reason = self.backend_simulator.until(self.msec_to_cycle(next_time)) - self.current_cycle = self.backend_simulator.cycle() + reason = self.tog_simulator.until(self.msec_to_cycle(next_time)) + self.current_cycle = self.tog_simulator.cycle() else: self.run(next_time) return @@ -490,8 +490,8 @@ def execute_cycle(): return [] # Schedule jobs and update the current time - result_list = self.backend_simulator.until(self.msec_to_cycle(until_time)) - self.current_cycle = self.backend_simulator.cycle() + result_list = self.tog_simulator.until(self.msec_to_cycle(until_time)) + self.current_cycle = self.tog_simulator.cycle() for core_idx in result_list: # Kernel is finished. 
So set idle state @@ -526,7 +526,7 @@ def is_request_queue_empty(self): def is_finished(self): if self.is_request_queue_empty() and self.execution_engine.is_all_idle(): - self.backend_simulator.wait() + self.tog_simulator.wait() return True return False @@ -534,7 +534,7 @@ def current_time(self): return self.cycle_to_msec(self.current_cycle) def cycle_to_msec(self, cycle): - freq = self.backend_simulator.get_core_freq() + freq = self.tog_simulator.get_core_freq() return cycle / (freq / 1000) def msec_to_cycle(self, msec): @@ -542,5 +542,5 @@ def msec_to_cycle(self, msec): if (msec == -1): return msec - freq = self.backend_simulator.get_core_freq() + freq = self.tog_simulator.get_core_freq() return int(msec * (freq / 1000)) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index f304d7e0..7c19f98e 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -163,7 +163,7 @@ def show_progress(): gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)] try: # Create progress thread - is_dryrun = int(os.environ.get('BACKENDSIM_DRYRUN', default=False)) or silent_mode + is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) or silent_mode if not is_dryrun: print("[Gem5Simulator] cmd> ", " ".join(gem5_cmd)) finished = False @@ -188,18 +188,18 @@ def show_progress(): cycle_list = cycle_list[:-1] return cycle_list -class BackendSimulator(): - BACKEND_RESULT_PATH_KEY = "BACKEND_RESULT_PATH" +class TOGSimulator(): + TOGSIM_RESULT_PATH_KEY = "TOGSIM_RESULT_PATH" FINISH_STR = "Simulation finished" ALLOC_POOL = dict() # For eagermode buffer plan - def __init__(self, backend_path, config_path, vectorlane_size=-1) -> None: - self.base_dir = backend_path + def __init__(self, togsim_path, config_path, vectorlane_size=-1) -> None: + self.base_dir = togsim_path self.config_path = config_path self.config_json = 
self.load_json(self.config_path) self.process = None self.vectorlane_size = vectorlane_size - def get_backend_command(self): + def get_togsim_command(self): bin = os.path.join(self.base_dir, "build/bin/Simulator") config = os.path.join(self.base_dir, self.config_path) cmd = f"{bin} --config {config}" @@ -211,16 +211,16 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[BackendSimulator] Simulation is still running." + tail) + sys.stdout.write("\r[TOGSimulator] Simulation is still running." + tail) time.sleep(1) print("") - cmd = f"{self.get_backend_command()} --models_list {model_path}" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --models_list {model_path}" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" if attribute_path: cmd = f"{cmd} --attributes_list {attribute_path}" if not silent_mode: - print("[BackendSimulator] cmd> ", cmd) + print("[TOGSimulator] cmd> ", cmd) # Create progress thread if not silent_mode: @@ -236,25 +236,25 @@ def show_progress(): if not silent_mode: finished = True progress_thread.join() - print("[BackendSimulator] Command failed with exit code", e.returncode) - print("[BackendSimulator] Error output:", e.output) + print("[TOGSimulator] Command failed with exit code", e.returncode) + print("[TOGSimulator] Error output:", e.output) assert 0 # Save result to result_path - result_path = os.path.join(os.path.dirname(model_path), "backendsim_result") + result_path = os.path.join(os.path.dirname(model_path), "togsim_result") os.makedirs(result_path, exist_ok=True) file_name = str(len(os.listdir(result_path))) result_path = os.path.join(result_path, file_name) with open(result_path, "w") as f: f.write(result.decode()) - print(f'[BackendSimulator] Simulation of "{model_path}" is stored to 
"{result_path}"') + print(f'[TOGSimulator] Simulation of "{model_path}" is stored to "{result_path}"') return result_path def interactive_simulation(self): - cmd = f"{self.get_backend_command()} --mode interactive" - if extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_BACKENDSIM_DEBUG_LEVEL}" + cmd = f"{self.get_togsim_command()} --mode interactive" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" - print("[BackendSimulator] cmd> ", cmd) + print("[TOGSimulator] cmd> ", cmd) if self.process is None: self.process = subprocess.Popen( shlex.split(cmd), @@ -263,27 +263,27 @@ def interactive_simulation(self): universal_newlines=True ) else: - print("[BackendSimulator] Simulator is already running.") + print("[TOGSimulator] Simulator is already running.") def stop(self): if self.process: self.process.terminate() self.process.wait() self.process = None - print("[BackendSimulator] Simulator stopped.") + print("[TOGSimulator] Simulator stopped.") def wait(self): if self.process: - print("[BackendSimulator] Waiting for simulation to complete...") + print("[TOGSimulator] Waiting for simulation to complete...") self.quit() self.process.wait() self.process = None - print("[BackendSimulator] Simulation completed.") + print("[TOGSimulator] Simulation completed.") def send_command(self, command): if self.process: try: - if not extension_config.CONFIG_BACKENDSIM_DRYRUN: + if not extension_config.CONFIG_TOGSIM_DRYRUN: print(command, flush=True) self.process.stdin.write(command + '\n') self.process.stdin.flush() @@ -403,13 +403,13 @@ def get_result_from_file(result_path): simulation_finished_idx = -1 simulation_finished = False for idx, line in enumerate(lines): - if BackendSimulator.FINISH_STR in line: + if TOGSimulator.FINISH_STR in line: simulation_finished = True simulation_finished_idx = idx break if simulation_finished_idx == -1: - print("[BackendSimulator] Tried 
to parsing wrong formated output file!") + print("[TOGSimulator] Tried to parsing wrong formated output file!") return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time total_stat_lines = lines[simulation_finished_idx:] @@ -449,6 +449,6 @@ def get_result_from_file(result_path): return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": - sim = BackendSimulator("/workspace/PyTorchSim/PyTorchSimBackend", "/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") + sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") sim.interactive_simulation() sim.until(4000) \ No newline at end of file diff --git a/PyTorchSimBackend/CMakeLists.txt b/TOGSim/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/CMakeLists.txt rename to TOGSim/CMakeLists.txt diff --git a/PyTorchSimBackend/conanfile.txt b/TOGSim/conanfile.txt similarity index 100% rename from PyTorchSimBackend/conanfile.txt rename to TOGSim/conanfile.txt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet.icnt b/TOGSim/configs/booksim2_configs/anynet.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet.icnt rename to TOGSim/configs/booksim2_configs/anynet.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/anynet_file b/TOGSim/configs/booksim2_configs/anynet_file similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/anynet_file rename to TOGSim/configs/booksim2_configs/anynet_file diff --git a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt similarity index 75% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt index d18ff6e7..3102fecc 100644 --- 
a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.icnt +++ b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt @@ -2,7 +2,7 @@ use_map = 0 flit_size = 32 topology = anynet -network_file = /workspace/PyTorchSim/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net +network_file = /workspace/PyTorchSim/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net routing_function = min subnets = 1 routing_delay = 4 diff --git a/PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net b/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/chiplet_32_32_2.net rename to TOGSim/configs/booksim2_configs/chiplet_32_32_2.net diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m16.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m16.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m16.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c16_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c16_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c16_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m1.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m1.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m1.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m2.icnt similarity index 100% rename from 
PyTorchSimBackend/configs/booksim2_configs/fly_c1_m2.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m2.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c1_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c1_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c1_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c2_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c2_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c2_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c2_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m4.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m4.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m4.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c32_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c32_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c32_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m2.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m2.icnt rename to 
TOGSim/configs/booksim2_configs/fly_c4_m2.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m32.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m32.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m32.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c4_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c4_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c4_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-age.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt b/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py b/TOGSim/configs/booksim2_configs/make_anynet_topology.py similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/make_anynet_topology.py rename to TOGSim/configs/booksim2_configs/make_anynet_topology.py diff --git a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-age.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-age.icnt 
rename to TOGSim/configs/booksim2_configs/mesh_sif-age.icnt diff --git a/PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt b/TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt similarity index 100% rename from PyTorchSimBackend/configs/booksim2_configs/mesh_sif-rr.icnt rename to TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt diff --git a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json b/TOGSim/configs/heterogeneous_c2_simple_noc.json similarity index 83% rename from PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json rename to TOGSim/configs/heterogeneous_c2_simple_noc.json index 6d1ff722..5c3c5971 100644 --- a/PyTorchSimBackend/configs/heterogeneous_c2_simple_noc.json +++ b/TOGSim/configs/heterogeneous_c2_simple_noc.json @@ -1,6 +1,6 @@ { "core_type" : ["stonne", "ws_mesh"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 2, "core_freq_mhz" : 940, "core_stats_print_period_cycles" : 10000, diff --git a/PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml b/TOGSim/configs/ramulator2_configs/DDR4.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/DDR4.yaml rename to TOGSim/configs/ramulator2_configs/DDR4.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml b/TOGSim/configs/ramulator2_configs/HBM2.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/HBM2.yaml rename to TOGSim/configs/ramulator2_configs/HBM2.yaml diff --git a/PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml b/TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml similarity index 100% rename from PyTorchSimBackend/configs/ramulator2_configs/HBM2_TPUv3.yaml rename to TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git 
a/PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg b/TOGSim/configs/ramulator_configs/ALDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/ALDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/ALDRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg b/TOGSim/configs/ramulator_configs/DDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR3-config.cfg rename to TOGSim/configs/ramulator_configs/DDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg b/TOGSim/configs/ramulator_configs/DDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DDR4-config.cfg rename to TOGSim/configs/ramulator_configs/DDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg b/TOGSim/configs/ramulator_configs/DSARP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/DSARP-config.cfg rename to TOGSim/configs/ramulator_configs/DSARP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg b/TOGSim/configs/ramulator_configs/GDDR5-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/GDDR5-config.cfg rename to TOGSim/configs/ramulator_configs/GDDR5-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg b/TOGSim/configs/ramulator_configs/HBM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config.cfg rename to TOGSim/configs/ramulator_configs/HBM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg b/TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg diff --git 
a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg b/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg b/TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg rename to TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg 
b/TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx0.5ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg b/TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/HBMx2ch-config.cfg rename to TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR3-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR3-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR3-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg b/TOGSim/configs/ramulator_configs/LPDDR4-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/LPDDR4-config.cfg rename to TOGSim/configs/ramulator_configs/LPDDR4-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg b/TOGSim/configs/ramulator_configs/PCM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/PCM-config.cfg rename to TOGSim/configs/ramulator_configs/PCM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg b/TOGSim/configs/ramulator_configs/SALP-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/SALP-config.cfg rename to TOGSim/configs/ramulator_configs/SALP-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg b/TOGSim/configs/ramulator_configs/STTMRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/STTMRAM-config.cfg rename to TOGSim/configs/ramulator_configs/STTMRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg 
b/TOGSim/configs/ramulator_configs/TLDRAM-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/TLDRAM-config.cfg rename to TOGSim/configs/ramulator_configs/TLDRAM-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg b/TOGSim/configs/ramulator_configs/WideIO-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO-config.cfg diff --git a/PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg b/TOGSim/configs/ramulator_configs/WideIO2-config.cfg similarity index 100% rename from PyTorchSimBackend/configs/ramulator_configs/WideIO2-config.cfg rename to TOGSim/configs/ramulator_configs/WideIO2-config.cfg diff --git a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json b/TOGSim/configs/stonne_big_c1_simple_noc.json similarity index 81% rename from PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json rename to TOGSim/configs/stonne_big_c1_simple_noc.json index 1bafbfa8..1294b3a9 100644 --- a/PyTorchSimBackend/configs/stonne_big_c1_simple_noc.json +++ b/TOGSim/configs/stonne_big_c1_simple_noc.json @@ -1,6 +1,6 @@ { "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 1, "core_freq_mhz" : 940, "core_stats_print_period_cycles" : 10000, diff --git a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json b/TOGSim/configs/stonne_single_c1_simple_noc.json similarity index 81% rename from PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json rename to TOGSim/configs/stonne_single_c1_simple_noc.json index 9ebbc593..42eac4bc 100644 --- a/PyTorchSimBackend/configs/stonne_single_c1_simple_noc.json +++ b/TOGSim/configs/stonne_single_c1_simple_noc.json @@ -1,6 +1,6 @@ { 
"core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 1, "core_freq_mhz" : 700, "core_stats_print_period_cycles" : 10000, diff --git a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json b/TOGSim/configs/stonne_validation_c1_simple_noc.json similarity index 82% rename from PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json rename to TOGSim/configs/stonne_validation_c1_simple_noc.json index 0bd9af2c..10d2c810 100644 --- a/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json +++ b/TOGSim/configs/stonne_validation_c1_simple_noc.json @@ -1,6 +1,6 @@ { "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/PyTorchSimBackend/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", + "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", "num_cores" : 1, "core_freq_mhz" : 1000, "core_stats_print_period_cycles" : 10000, diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c1_booksim_tpuv2.json rename to TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json rename to TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json similarity index 100% rename from 
PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json rename to TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json rename to TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json rename to TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json rename to TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json b/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json rename to TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json rename to TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json similarity index 100% rename from 
PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json rename to TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json rename to TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json rename to TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json rename to TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json rename to TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json rename to TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json similarity index 100% rename from 
PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json rename to TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json rename to TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_8x8_c1_booksim.json rename to TOGSim/configs/systolic_ws_8x8_c1_booksim.json diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_8x8_c1_simple_noc.json rename to TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json rename to TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json rename to TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json diff --git a/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json similarity index 100% rename from PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json rename to TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json diff --git a/PyTorchSimBackend/extern/booksim b/TOGSim/extern/booksim similarity index 100% rename from 
PyTorchSimBackend/extern/booksim rename to TOGSim/extern/booksim diff --git a/PyTorchSimBackend/extern/onnx b/TOGSim/extern/onnx similarity index 100% rename from PyTorchSimBackend/extern/onnx rename to TOGSim/extern/onnx diff --git a/PyTorchSimBackend/extern/protobuf b/TOGSim/extern/protobuf similarity index 100% rename from PyTorchSimBackend/extern/protobuf rename to TOGSim/extern/protobuf diff --git a/PyTorchSimBackend/extern/ramulator2 b/TOGSim/extern/ramulator2 similarity index 100% rename from PyTorchSimBackend/extern/ramulator2 rename to TOGSim/extern/ramulator2 diff --git a/PyTorchSimBackend/extern/ramulator_custom/.gitignore b/TOGSim/extern/ramulator_custom/.gitignore similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/.gitignore rename to TOGSim/extern/ramulator_custom/.gitignore diff --git a/PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt b/TOGSim/extern/ramulator_custom/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/CMakeLists.txt rename to TOGSim/extern/ramulator_custom/CMakeLists.txt diff --git a/PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp b/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/include/ramulator/Ramulator.hpp rename to TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp b/TOGSim/extern/ramulator_custom/src/Config.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.cpp rename to TOGSim/extern/ramulator_custom/src/Config.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Config.h b/TOGSim/extern/ramulator_custom/src/Config.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Config.h rename to TOGSim/extern/ramulator_custom/src/Config.h diff --git 
a/PyTorchSimBackend/extern/ramulator_custom/src/Controller.h b/TOGSim/extern/ramulator_custom/src/Controller.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Controller.h rename to TOGSim/extern/ramulator_custom/src/Controller.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp b/TOGSim/extern/ramulator_custom/src/DDR4.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.cpp rename to TOGSim/extern/ramulator_custom/src/DDR4.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h b/TOGSim/extern/ramulator_custom/src/DDR4.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DDR4.h rename to TOGSim/extern/ramulator_custom/src/DDR4.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h b/TOGSim/extern/ramulator_custom/src/DRAM.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/DRAM.h rename to TOGSim/extern/ramulator_custom/src/DRAM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp b/TOGSim/extern/ramulator_custom/src/HBM.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.cpp rename to TOGSim/extern/ramulator_custom/src/HBM.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/HBM.h b/TOGSim/extern/ramulator_custom/src/HBM.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/HBM.h rename to TOGSim/extern/ramulator_custom/src/HBM.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Memory.h b/TOGSim/extern/ramulator_custom/src/Memory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Memory.h rename to TOGSim/extern/ramulator_custom/src/Memory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp b/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp similarity index 100% rename from 
PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.cpp rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h b/TOGSim/extern/ramulator_custom/src/MemoryFactory.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/MemoryFactory.h rename to TOGSim/extern/ramulator_custom/src/MemoryFactory.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp b/TOGSim/extern/ramulator_custom/src/Ramulator.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Ramulator.cpp rename to TOGSim/extern/ramulator_custom/src/Ramulator.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp b/TOGSim/extern/ramulator_custom/src/Refresh.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.cpp rename to TOGSim/extern/ramulator_custom/src/Refresh.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h b/TOGSim/extern/ramulator_custom/src/Refresh.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Refresh.h rename to TOGSim/extern/ramulator_custom/src/Refresh.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp b/TOGSim/extern/ramulator_custom/src/Request.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.cpp rename to TOGSim/extern/ramulator_custom/src/Request.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Request.h b/TOGSim/extern/ramulator_custom/src/Request.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Request.h rename to TOGSim/extern/ramulator_custom/src/Request.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h b/TOGSim/extern/ramulator_custom/src/Scheduler.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Scheduler.h rename to 
TOGSim/extern/ramulator_custom/src/Scheduler.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h b/TOGSim/extern/ramulator_custom/src/SpeedyController.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/SpeedyController.h rename to TOGSim/extern/ramulator_custom/src/SpeedyController.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp b/TOGSim/extern/ramulator_custom/src/StatType.cpp similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.cpp rename to TOGSim/extern/ramulator_custom/src/StatType.cpp diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/StatType.h b/TOGSim/extern/ramulator_custom/src/StatType.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/StatType.h rename to TOGSim/extern/ramulator_custom/src/StatType.h diff --git a/PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h b/TOGSim/extern/ramulator_custom/src/Statistics.h similarity index 100% rename from PyTorchSimBackend/extern/ramulator_custom/src/Statistics.h rename to TOGSim/extern/ramulator_custom/src/Statistics.h diff --git a/PyTorchSimBackend/extern/stonneCore b/TOGSim/extern/stonneCore similarity index 100% rename from PyTorchSimBackend/extern/stonneCore rename to TOGSim/extern/stonneCore diff --git a/PyTorchSimBackend/include/Cache.h b/TOGSim/include/Cache.h similarity index 100% rename from PyTorchSimBackend/include/Cache.h rename to TOGSim/include/Cache.h diff --git a/PyTorchSimBackend/include/Cache_defs.h b/TOGSim/include/Cache_defs.h similarity index 100% rename from PyTorchSimBackend/include/Cache_defs.h rename to TOGSim/include/Cache_defs.h diff --git a/PyTorchSimBackend/include/Cache_stats.h b/TOGSim/include/Cache_stats.h similarity index 100% rename from PyTorchSimBackend/include/Cache_stats.h rename to TOGSim/include/Cache_stats.h diff --git a/PyTorchSimBackend/include/Common.h b/TOGSim/include/Common.h similarity index 100% 
rename from PyTorchSimBackend/include/Common.h rename to TOGSim/include/Common.h diff --git a/PyTorchSimBackend/include/Core.h b/TOGSim/include/Core.h similarity index 100% rename from PyTorchSimBackend/include/Core.h rename to TOGSim/include/Core.h diff --git a/PyTorchSimBackend/include/DMA.h b/TOGSim/include/DMA.h similarity index 100% rename from PyTorchSimBackend/include/DMA.h rename to TOGSim/include/DMA.h diff --git a/PyTorchSimBackend/include/DelayQueue.h b/TOGSim/include/DelayQueue.h similarity index 100% rename from PyTorchSimBackend/include/DelayQueue.h rename to TOGSim/include/DelayQueue.h diff --git a/PyTorchSimBackend/include/Dram.h b/TOGSim/include/Dram.h similarity index 100% rename from PyTorchSimBackend/include/Dram.h rename to TOGSim/include/Dram.h diff --git a/PyTorchSimBackend/include/Hashing.h b/TOGSim/include/Hashing.h similarity index 100% rename from PyTorchSimBackend/include/Hashing.h rename to TOGSim/include/Hashing.h diff --git a/PyTorchSimBackend/include/Instruction.h b/TOGSim/include/Instruction.h similarity index 100% rename from PyTorchSimBackend/include/Instruction.h rename to TOGSim/include/Instruction.h diff --git a/PyTorchSimBackend/include/Interconnect.h b/TOGSim/include/Interconnect.h similarity index 100% rename from PyTorchSimBackend/include/Interconnect.h rename to TOGSim/include/Interconnect.h diff --git a/PyTorchSimBackend/include/IntervalTree.h b/TOGSim/include/IntervalTree.h similarity index 100% rename from PyTorchSimBackend/include/IntervalTree.h rename to TOGSim/include/IntervalTree.h diff --git a/PyTorchSimBackend/include/L2Cache.h b/TOGSim/include/L2Cache.h similarity index 100% rename from PyTorchSimBackend/include/L2Cache.h rename to TOGSim/include/L2Cache.h diff --git a/PyTorchSimBackend/include/Memfetch.h b/TOGSim/include/Memfetch.h similarity index 100% rename from PyTorchSimBackend/include/Memfetch.h rename to TOGSim/include/Memfetch.h diff --git a/PyTorchSimBackend/include/Model.h b/TOGSim/include/Model.h 
similarity index 100% rename from PyTorchSimBackend/include/Model.h rename to TOGSim/include/Model.h diff --git a/PyTorchSimBackend/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h similarity index 100% rename from PyTorchSimBackend/include/SimulationConfig.h rename to TOGSim/include/SimulationConfig.h diff --git a/PyTorchSimBackend/include/Simulator.h b/TOGSim/include/Simulator.h similarity index 100% rename from PyTorchSimBackend/include/Simulator.h rename to TOGSim/include/Simulator.h diff --git a/PyTorchSimBackend/include/SparseCore.h b/TOGSim/include/SparseCore.h similarity index 100% rename from PyTorchSimBackend/include/SparseCore.h rename to TOGSim/include/SparseCore.h diff --git a/PyTorchSimBackend/include/Tile.h b/TOGSim/include/Tile.h similarity index 100% rename from PyTorchSimBackend/include/Tile.h rename to TOGSim/include/Tile.h diff --git a/PyTorchSimBackend/include/TileGraph.h b/TOGSim/include/TileGraph.h similarity index 100% rename from PyTorchSimBackend/include/TileGraph.h rename to TOGSim/include/TileGraph.h diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h similarity index 100% rename from PyTorchSimBackend/include/TileGraphParser.h rename to TOGSim/include/TileGraphParser.h diff --git a/PyTorchSimBackend/include/scheduler/Scheduler.h b/TOGSim/include/scheduler/Scheduler.h similarity index 100% rename from PyTorchSimBackend/include/scheduler/Scheduler.h rename to TOGSim/include/scheduler/Scheduler.h diff --git a/PyTorchSimBackend/src/CMakeLists.txt b/TOGSim/src/CMakeLists.txt similarity index 100% rename from PyTorchSimBackend/src/CMakeLists.txt rename to TOGSim/src/CMakeLists.txt diff --git a/PyTorchSimBackend/src/Cache.cc b/TOGSim/src/Cache.cc similarity index 100% rename from PyTorchSimBackend/src/Cache.cc rename to TOGSim/src/Cache.cc diff --git a/PyTorchSimBackend/src/Cache_stats.cc b/TOGSim/src/Cache_stats.cc similarity index 100% rename from PyTorchSimBackend/src/Cache_stats.cc 
rename to TOGSim/src/Cache_stats.cc diff --git a/PyTorchSimBackend/src/Common.cc b/TOGSim/src/Common.cc similarity index 100% rename from PyTorchSimBackend/src/Common.cc rename to TOGSim/src/Common.cc diff --git a/PyTorchSimBackend/src/Core.cc b/TOGSim/src/Core.cc similarity index 100% rename from PyTorchSimBackend/src/Core.cc rename to TOGSim/src/Core.cc diff --git a/PyTorchSimBackend/src/DMA.cc b/TOGSim/src/DMA.cc similarity index 100% rename from PyTorchSimBackend/src/DMA.cc rename to TOGSim/src/DMA.cc diff --git a/PyTorchSimBackend/src/DelayQueue.cc b/TOGSim/src/DelayQueue.cc similarity index 100% rename from PyTorchSimBackend/src/DelayQueue.cc rename to TOGSim/src/DelayQueue.cc diff --git a/PyTorchSimBackend/src/Dram.cc b/TOGSim/src/Dram.cc similarity index 100% rename from PyTorchSimBackend/src/Dram.cc rename to TOGSim/src/Dram.cc diff --git a/PyTorchSimBackend/src/Hashing.cc b/TOGSim/src/Hashing.cc similarity index 100% rename from PyTorchSimBackend/src/Hashing.cc rename to TOGSim/src/Hashing.cc diff --git a/PyTorchSimBackend/src/Instruction.cc b/TOGSim/src/Instruction.cc similarity index 100% rename from PyTorchSimBackend/src/Instruction.cc rename to TOGSim/src/Instruction.cc diff --git a/PyTorchSimBackend/src/Interconnect.cc b/TOGSim/src/Interconnect.cc similarity index 98% rename from PyTorchSimBackend/src/Interconnect.cc rename to TOGSim/src/Interconnect.cc index 8a684ff7..04505aca 100644 --- a/PyTorchSimBackend/src/Interconnect.cc +++ b/TOGSim/src/Interconnect.cc @@ -76,7 +76,7 @@ Booksim2Interconnect::Booksim2Interconnect(SimulationConfig config) { spdlog::info("Initialize Booksim2"); char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); std::string onnxim_path = onnxim_path_env != NULL? 
- std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); _config_path = fs::path(onnxim_path).append("configs").append((std::string)config.icnt_config_path).string(); spdlog::info("Config path : {}", _config_path); diff --git a/PyTorchSimBackend/src/L2Cache.cc b/TOGSim/src/L2Cache.cc similarity index 100% rename from PyTorchSimBackend/src/L2Cache.cc rename to TOGSim/src/L2Cache.cc diff --git a/PyTorchSimBackend/src/Simulator.cc b/TOGSim/src/Simulator.cc similarity index 99% rename from PyTorchSimBackend/src/Simulator.cc rename to TOGSim/src/Simulator.cc index cb81611a..1171a3bd 100644 --- a/PyTorchSimBackend/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -17,7 +17,7 @@ Simulator::Simulator(SimulationConfig config) _noc_node_per_core = config.icnt_node_per_core; char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); std::string onnxim_path = onnxim_path_env != NULL? - std::string(onnxim_path_env) + "/PyTorchSimBackend" : std::string("./"); + std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); // Create core objects _cores.resize(_n_cores); diff --git a/PyTorchSimBackend/src/SparseCore.cc b/TOGSim/src/SparseCore.cc similarity index 100% rename from PyTorchSimBackend/src/SparseCore.cc rename to TOGSim/src/SparseCore.cc diff --git a/PyTorchSimBackend/src/Tile.cc b/TOGSim/src/Tile.cc similarity index 100% rename from PyTorchSimBackend/src/Tile.cc rename to TOGSim/src/Tile.cc diff --git a/PyTorchSimBackend/src/TileGraph.cc b/TOGSim/src/TileGraph.cc similarity index 100% rename from PyTorchSimBackend/src/TileGraph.cc rename to TOGSim/src/TileGraph.cc diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc similarity index 100% rename from PyTorchSimBackend/src/TileGraphParser.cc rename to TOGSim/src/TileGraphParser.cc diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.cc b/TOGSim/src/helper/CommandLineParser.cc similarity index 100% rename 
from PyTorchSimBackend/src/helper/CommandLineParser.cc rename to TOGSim/src/helper/CommandLineParser.cc diff --git a/PyTorchSimBackend/src/helper/CommandLineParser.h b/TOGSim/src/helper/CommandLineParser.h similarity index 100% rename from PyTorchSimBackend/src/helper/CommandLineParser.h rename to TOGSim/src/helper/CommandLineParser.h diff --git a/PyTorchSimBackend/src/main.cc b/TOGSim/src/main.cc similarity index 96% rename from PyTorchSimBackend/src/main.cc rename to TOGSim/src/main.cc index 5c4a21e9..81ba1f49 100644 --- a/PyTorchSimBackend/src/main.cc +++ b/TOGSim/src/main.cc @@ -9,7 +9,7 @@ namespace fs = std::filesystem; namespace po = boost::program_options; -const char* env_value = std::getenv("BACKENDSIM_DRYRUN"); +const char* env_value = std::getenv("TOGSIM_DRYRUN"); bool isDryRun = (env_value != nullptr && std::string(env_value) == "1"); void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) { @@ -38,7 +38,7 @@ int until(Simulator *simulator, cycle_type until_cycle) { void interactive_mode(Simulator* simulator) { std::string command; - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; while (std::getline(std::cin, command)) { std::istringstream iss(command); @@ -79,7 +79,7 @@ void interactive_mode(Simulator* simulator) { spdlog::error("Error: unknown command {} Available commands are: launch, until, quit.", token); } if (isDryRun) - std::cout << "[" << simulator->get_core_cycle() << "] BackendSim> "; + std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; } simulator->cycle(); if (simulator->get_core_cycle()==0) diff --git a/PyTorchSimBackend/src/scheduler/Scheduler.cc b/TOGSim/src/scheduler/Scheduler.cc similarity index 100% rename from PyTorchSimBackend/src/scheduler/Scheduler.cc rename to TOGSim/src/scheduler/Scheduler.cc diff --git 
a/experiments/BERT.py b/experiments/BERT.py index 3534505d..c5bb454e 100644 --- a/experiments/BERT.py +++ b/experiments/BERT.py @@ -9,7 +9,7 @@ def run_BERT(size, input_seq, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request # from tests.test_transformer import EncoderBlock from tests.Fusion.test_transformer_fusion import EncoderBlock - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() hidden_dim = {'base': 768, 'large': 1024, 'xlarge': 2048} @@ -36,7 +36,7 @@ def run_BERT(size, input_seq, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def run_BERT(size, input_seq, config): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_BERT(size, input_seq, config) diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index a32cd0a6..28e6ad5e 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e 
-export TORCHSIM_CONFIG=$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh index 7d0c0da2..2b9625e9 100755 --- a/experiments/artifact/speedup/run_speedup.sh +++ b/experiments/artifact/speedup/run_speedup.sh @@ -1,7 +1,7 @@ #!/bin/bash LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs -CONFIG_DIR="$TORCHSIM_DIR/PyTorchSimBackend/configs" -SIMULATOR_BIN="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +CONFIG_DIR="$TORCHSIM_DIR/TOGSim/configs" +SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" configs=( "systolic_ws_128x128_c2_simple_noc_tpuv3.json" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh index 66829f02..4055b355 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh @@ -26,7 +26,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh index 2f9718f1..83b3798a 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh @@ -27,7 +27,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + 
config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh index 8ff7e2b6..f1467614 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh @@ -25,7 +25,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh index aa35735c..2ed3ca2a 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh @@ -33,7 +33,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/PyTorchSimBackend/configs/$i" + config_path="$TORCHSIM_DIR/TOGSim/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/attention.py b/experiments/attention.py index e8f89dac..5a8c5f45 100644 --- a/experiments/attention.py +++ b/experiments/attention.py @@ -14,7 +14,7 @@ def attention(query, key, value): p_attn = scores.softmax(dim=-2) return torch.matmul(value.transpose(-1, -2), p_attn) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = 
scheduler.execution_engine.module.custom_device() query = torch.randn(size).to(device=device) key = torch.randn(size).to(device=device) @@ -36,7 +36,7 @@ def attention(query, key, value): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -50,7 +50,7 @@ def attention(query, key, value): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_attention(size, config) diff --git a/experiments/conv.py b/experiments/conv.py index e8b97906..c8ca9a37 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -15,7 +15,7 @@ def custom_conv2d(a, b, bias): conv2d.weight = torch.nn.Parameter(b) # conv2d.bias = torch.nn.Parameter(bias) return conv2d(a) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() conv_input = torch.randn(batch_size, i_c, i_h, i_w).to(memory_format=torch.channels_last, device=device) conv_kernel = torch.randn(o_c, i_c, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device) @@ -37,7 +37,7 @@ def custom_conv2d(a, b, bias): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - 
config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def custom_conv2d(a, b, bias): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_conv2d(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], config) \ No newline at end of file diff --git a/experiments/gemm.py b/experiments/gemm.py index e7a639ad..67dc4f79 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -10,7 +10,7 @@ def run_matmul(input_size, hidden_size, output_size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request def custom_matmul(a, b): return torch.matmul(a, b) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() torch.manual_seed(0) input = torch.randn(input_size, hidden_size).to(device=device) @@ -31,7 +31,7 @@ def custom_matmul(a, b): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = 
config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -45,8 +45,8 @@ def custom_matmul(a, b): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] from Scheduler.scheduler import PyTorchSimRunner module = PyTorchSimRunner.setup_device() diff --git a/experiments/layernorm.py b/experiments/layernorm.py index f149394e..0beaac6c 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -8,7 +8,7 @@ def run_layernorm(size, config): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) opt_fn = torch.compile(dynamic=False)(torch.nn.LayerNorm(size[-1]).to(device=device)) @@ -27,7 +27,7 @@ def run_layernorm(size, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -42,7 +42,7 @@ def run_layernorm(size, config): os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 
'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_layernorm(size, config) diff --git a/experiments/resnet18.py b/experiments/resnet18.py index 5d9dcf86..23d62e40 100644 --- a/experiments/resnet18.py +++ b/experiments/resnet18.py @@ -8,7 +8,7 @@ def run_resnet(batch, config): from torchvision.models import resnet18 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet18().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/resnet50.py b/experiments/resnet50.py index bd52afc1..60a46071 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -8,7 +8,7 @@ def run_resnet(batch, 
config): from torchvision.models import resnet50 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() model = resnet50().eval() input = torch.randn(batch, 3, 224, 224).to(device=device) @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_resnet(batch, config) diff --git a/experiments/softmax.py b/experiments/softmax.py index 14d28fee..532ef091 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -8,7 +8,7 @@ def run_softmax(size, config, dim=1): from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) device = scheduler.execution_engine.module.custom_device() input = torch.randn(size).to(device=device) 
opt_fn = torch.compile(dynamic=False)(torch.nn.Softmax(dim=dim).to(device=device)) @@ -27,7 +27,7 @@ def run_softmax(size, config, dim=1): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -41,7 +41,7 @@ def run_softmax(size, config, dim=1): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'BACKENDSIM_SPIKE_ONLY' in os.environ: - del os.environ['BACKENDSIM_SPIKE_ONLY'] + if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: + del os.environ['TORCHSIM_FUNCTIONAL_MODE'] run_softmax(size, config) diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh index 469cf766..22118b1e 100644 --- a/scripts/CompilerOpt_experiment/DMAopt.sh +++ b/scripts/CompilerOpt_experiment/DMAopt.sh @@ -1,5 +1,5 @@ #!/bin/bash -export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" +export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" # None FG DMA export TORCHSIM_SUBTILE=0 diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh index 3dfba3d9..2989e4fd 100755 --- a/scripts/chiplet.sh +++ b/scripts/chiplet.sh @@ -14,16 +14,16 @@ fi GEMM_PATH="$1" INDEX_NAME="$2" -SIMULATOR_PATH="$TORCHSIM_DIR/PyTorchSimBackend/build/bin/Simulator" +SIMULATOR_PATH="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" GEMM_DIR_NAME=$(basename "$GEMM_PATH") echo "GEMM Directory Name: $GEMM_DIR_NAME" CONFIG_LIST=( - 
"$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" ) CONFIG_LIST2=( - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" - "$TORCHSIM_DIR/PyTorchSimBackend/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" + "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" ) shift shift @@ -51,7 +51,7 @@ for CONFIG in "${CONFIG_LIST[@]}"; do # Run Simulator echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done done @@ -65,6 +65,6 @@ for CONFIG in "${CONFIG_LIST2[@]}"; do # Run Simulator # echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & - echo "[BackendSimulator] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" + echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done wait \ No newline at end of file diff --git a/scripts/end2end.sh b/scripts/end2end.sh index 7ca5c93d..e3ad4fb7 100755 --- a/scripts/end2end.sh +++ b/scripts/end2end.sh @@ -7,15 +7,15 @@ BASE_PATH=$1 # Input as the first argument total_sum=0 total_core=0 total_vector=0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type 
d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - # echo "Processing folder: $backend_folder" +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + # echo "Processing folder: $togsim_folder" - # Find all files within the backendsim_result folder - mapfile -t files < <(find "$backend_folder" -type f) + # Find all files within the togsim_result folder + mapfile -t files < <(find "$togsim_folder" -type f) for file in "${files[@]}"; do # echo "Processing $file" diff --git a/scripts/get_tog_result.sh b/scripts/get_tog_result.sh index 9359e1e5..6fd399e0 100755 --- a/scripts/get_tog_result.sh +++ b/scripts/get_tog_result.sh @@ -3,8 +3,8 @@ total_cycles=0 # Read through input stream line by line while IFS= read -r line; do - # Check if the line contains both "[BackendSimulator]" and "stored" - if [[ "$line" == *"[BackendSimulator]"* && "$line" == *"stored"* ]]; then + # Check if the line contains both "[TOGSimulator]" and "stored" + if [[ "$line" == *"[TOGSimulator]"* && "$line" == *"stored"* ]]; then # Extract the file path from the line file_path=$(echo "$line" | sed -n 's/.*stored to "\(.*\)"$/\1/p') diff --git a/scripts/sim_time.sh b/scripts/sim_time.sh index 494bf0e1..6f9063b8 100755 --- a/scripts/sim_time.sh +++ b/scripts/sim_time.sh @@ -6,12 +6,12 @@ BASE_PATH=$1 # Input as the first argument # Initialize total_sum as string for awk processing total_sum=0.0 -# Find all backendsim_result folders -mapfile -t backend_folders < <(find "$BASE_PATH" -type d -name "backendsim_result") +# Find all togsim_result folders +mapfile -t togsim_folders < <(find "$BASE_PATH" -type d -name "togsim_result") -# Iterate over each backendsim_result folder -for backend_folder in "${backend_folders[@]}"; do - mapfile -t files < <(find "$backend_folder" -type f) +# Iterate over each togsim_result folder +for togsim_folder in "${togsim_folders[@]}"; do + mapfile -t files < <(find 
"$togsim_folder" -type f) for file in "${files[@]}"; do sim_time=$(grep "Simulation wall clock time:" "$file" | tail -n 1 | sed -E 's/.*Simulation wall clock time: ([0-9]+(\.[0-9]+)?).*/\1/') diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh index 0b7bc6f5..94e00527 100755 --- a/scripts/sparsity_experiment/run.sh +++ b/scripts/sparsity_experiment/run.sh @@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8 export TORCHSIM_FORCE_TIME_N=8 OUTPUT_DIR="12GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -13,7 +13,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -21,7 +21,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c1_48G_simple_noc.json" +export 
TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="12GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -45,7 +45,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB_2core" -export 
TORCHSIM_CONFIG="/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_8x8_c2_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py index 819390d9..76cb7f9f 100644 --- a/scripts/stonne_experiment2/tog_gen.py +++ b/scripts/stonne_experiment2/tog_gen.py @@ -5,7 +5,7 @@ from collections import defaultdict sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) from AsmParser.tog_generator import tog_generator -from Simulator.simulator import BackendSimulator +from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config def extract_simulation_stats(result_path): @@ -71,9 +71,9 @@ def extract_simulation_stats(result_path): if "outerPro" in path: continue tog_path = os.path.join(path, "tile_graph.onnx") - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/stonne_validation_c1_simple_noc.json' - backsim = BackendSimulator(backend_path, stonne_config_path) + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_validation_c1_simple_noc.json' + backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = backsim.simulation(tog_path) nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path) sim_time, total_cycle = float(sim_time), int(total_cycle) diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py index 
cf0dc1bb..c32b4364 100644 --- a/tests/test_compile_overhead.py +++ b/tests/test_compile_overhead.py @@ -21,7 +21,7 @@ # shutil.rmtree("/tmp/torchinductor") #except FileNotFoundError: # print("no cache") - scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_hetro.py b/tests/test_hetro.py index 5e36d730..557ea5d6 100644 --- a/tests/test_hetro.py +++ b/tests/test_hetro.py @@ -26,7 +26,7 @@ def custom_matmul(a, b): K = args.K sparsity = args.sparsity mode = args.mode - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" print("M: ", M) print("N: ", N) @@ -36,7 +36,7 @@ def custom_matmul(a, b): with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - backend_config=config_path) + togsim_config=config_path) # Register compiled model opt_model1 = torch.compile(custom_matmul) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index c64093a0..91bf0ad8 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -7,13 +7,13 @@ base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -config = f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' 
+config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' target_model1 = model1().eval() target_model2 = model2(768, 12).eval() # Init scheduler -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) +scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py index f3b54159..5a34d161 100644 --- a/tests/test_scheduler_batching.py +++ b/tests/test_scheduler_batching.py @@ -17,7 +17,7 @@ target_model1 = model1().eval() # Init scheduler - scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py index 1cf0d3b3..c7abf0ae 100644 --- a/tests/test_spmm_scheduler.py +++ b/tests/test_spmm_scheduler.py @@ -25,7 +25,7 @@ output_size = args.output_size w1_sparsity = args.w1_sparsity w2_sparsity = args.w2_sparsity - config_path = f"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" 
print("batch_size: ", batch_size) print("input_size: ", input_size) @@ -37,7 +37,7 @@ with torch.no_grad(): # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - backend_config=config_path) + togsim_config=config_path) target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval() target_model2 = model2(768, 12).eval() From e3cc1fbf796c9d65b9638fcb723ed8c971ed3d6b Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Sun, 30 Nov 2025 14:56:58 +0000 Subject: [PATCH 39/53] [refactor] compiler opt config --- PyTorchSimFrontend/extension_config.py | 11 ++++++++--- PyTorchSimFrontend/mlir/mlir_lowering.py | 8 ++++---- PyTorchSimFrontend/mlir/mlir_scheduling.py | 2 ++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 5cda38c5..1ae636e6 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -66,10 +66,15 @@ CONFIG_SUBTILE_N = int(os.environ.get('TORCHSIM_SUBTILE_N', default=CONFIG_VECTOR_LANE)) CONFIG_SUBTILE_K = int(os.environ.get('TORCHSIM_SUBTILE_K', default=CONFIG_VECTOR_LANE)) +# Compiler Optimization +CONFIG_COMPILER_OPTIMIZATION = os.environ.get('TORCHSIM_COMPILER_OPTIMIZATION', default="all") # options: all, none, custom # Advanced fusion options -CONFIG_FUSION_REDUCTION_EPILOGUE = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_EPILOGUE', default=True)) -CONFIG_FUSION_REDUCTION_REDUCTION = int(os.environ.get('TORCHSIM_FUSION_REDUCTION_REDUCTION', default=True)) -CONFIG_FUSION_PROLOGUE = int(os.environ.get('TORCHSIM_FUSION_PROLOGUE', default=True)) +CONFIG_FUSION = True if (CONFIG_COMPILER_OPTIMIZATION == "all" or "fusion" in CONFIG_COMPILER_OPTIMIZATION) else False +CONFIG_FUSION_REDUCTION_EPILOGUE = True if (CONFIG_COMPILER_OPTIMIZATION == "all" or "reduction_epliogue" in CONFIG_COMPILER_OPTIMIZATION) 
else False +CONFIG_FUSION_REDUCTION_REDUCTION = True if (CONFIG_COMPILER_OPTIMIZATION == "all" or "reduction_reduction" in CONFIG_COMPILER_OPTIMIZATION) else False +CONFIG_FUSION_PROLOGUE = True if ((CONFIG_COMPILER_OPTIMIZATION == "all") or ("prologue" in CONFIG_COMPILER_OPTIMIZATION)) else False +CONFIG_SINGLE_BATCH_CONV = True if (CONFIG_COMPILER_OPTIMIZATION == "all" or "single_batch_conv" in CONFIG_COMPILER_OPTIMIZATION) else False +CONFIG_MULTI_TILE_CONV = True if (CONFIG_COMPILER_OPTIMIZATION == "all" or "multi_tile_conv" in CONFIG_COMPILER_OPTIMIZATION) else False # SRAM Buffer allocation plan def load_plan_from_module(module_path): diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index 6508ea86..89c477de 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -15,7 +15,7 @@ from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate -from PyTorchSimFrontend.extension_config import CONFIG_VECTOR_LANE, CONFIG_USE_TIMING_POOLING +from PyTorchSimFrontend.extension_config import CONFIG_VECTOR_LANE, CONFIG_USE_TIMING_POOLING, CONFIG_SINGLE_BATCH_CONV, CONFIG_MULTI_TILE_CONV aten = torch.ops.aten aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm") @@ -106,11 +106,11 @@ def convolution( layout = conv_layout(x, weight, None, **kwargs) # Select conv kernel - if BATCH == 1 and stride[0] == 1: + if BATCH == 1 and stride[0] == 1 and CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchTemplate([x, weight, bias], layout, **kwargs) - elif BATCH == 1 and stride[0] != 1: + elif BATCH == 1 and stride[0] != 1 and CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchStridedTemplate([x, weight, bias], layout, **kwargs) - elif I_C < 
CONFIG_VECTOR_LANE // 8: # 8 is hard-coded for now. This should be changed to a better heuristic. + elif I_C < CONFIG_VECTOR_LANE // 8 and CONFIG_MULTI_TILE_CONV: # 8 is hard-coded for now. This should be changed to a better heuristic. mlir_template = MLIRConvMultiTileTemplate([x, weight, bias], layout, **kwargs) else: mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 9696269e..38603319 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -94,6 +94,8 @@ def can_fuse_vertical(self, node1, node2): return self.can_fuse_horizontal(node1, node2) def can_fuse_horizontal(self, node1, node2): + if not extension_config.CONFIG_FUSION: + return False if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size: return False _, (vars1, reduce1) = node1.group From 10364c8a9b39b7b8e392c253688503fe454b8e36 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Sun, 30 Nov 2025 14:58:39 +0000 Subject: [PATCH 40/53] [refactor] simulator log --- PyTorchSimFrontend/extension_config.py | 2 ++ Simulator/simulator.py | 41 ++++++++++++++------------ 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 1ae636e6..8b59c898 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -101,3 +101,5 @@ def load_plan_from_module(module_path): CONFIG_TLS_MODE = int(os.environ.get('TORCHSIM_TLS_MODE', default=1)) CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0)) + +CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0)) \ No newline at end of file diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 7c19f98e..785e11bb 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -101,8 +101,9 @@ def 
run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True) os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True) run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}' - if not silent_mode: - print("[SpikeSimulator] cmd> ", run) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[Spike] cmd> ", run) + print("[Spike] Running Spike simulator") run_cmd = shlex.split(run) try: stdout_setting = subprocess.DEVNULL if silent_mode else None @@ -110,7 +111,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting) except subprocess.CalledProcessError as e: if not silent_mode: - print("[SpikeSimulator] Command failed with exit code", e.returncode) + print("[Spike] Command failed with exit code", e.returncode) error_msg = "" if e.returncode == 200: error_msg = "INVALID_SPAD_ACCESS" @@ -155,7 +156,7 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[Gem5Simulator] Simulation is still running." + tail) + sys.stdout.write("\r[Gem5] Gem5 is running." 
+ tail) time.sleep(1) print("") @@ -165,7 +166,8 @@ def show_progress(): # Create progress thread is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) or silent_mode if not is_dryrun: - print("[Gem5Simulator] cmd> ", " ".join(gem5_cmd)) + if extension_config.CONFIG_DEBUG_MODE: + print("[Gem5] cmd> ", " ".join(gem5_cmd)) finished = False progress_thread = threading.Thread(target=show_progress) progress_thread.start() @@ -175,11 +177,11 @@ def show_progress(): else: output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) except subprocess.CalledProcessError as e: - print(f"[Gem5Simulator] Gem5 simulation failed with error: \"{e.output.decode()}\"") + print(f"[Gem5] Gem5 simulation failed with error: \"{e.output.decode()}\"") if not is_dryrun: finished = True progress_thread.join() - raise RuntimeError(f"GEM5 Simulation Failed: \"{e.output.decode()}\"") + raise RuntimeError(f"Gem5 Simulation Failed: \"{e.output.decode()}\"") with open(f"{dir_path}/stats.txt", "r") as stat_file: raw_list = stat_file.readlines() @@ -211,7 +213,7 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[TOGSimulator] Simulation is still running." + tail) + sys.stdout.write("\r[TOGSim] TOGSim is running." 
+ tail) time.sleep(1) print("") cmd = f"{self.get_togsim_command()} --models_list {model_path}" @@ -219,8 +221,8 @@ def show_progress(): cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" if attribute_path: cmd = f"{cmd} --attributes_list {attribute_path}" - if not silent_mode: - print("[TOGSimulator] cmd> ", cmd) + if not silent_mode and extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) # Create progress thread if not silent_mode: @@ -236,8 +238,8 @@ def show_progress(): if not silent_mode: finished = True progress_thread.join() - print("[TOGSimulator] Command failed with exit code", e.returncode) - print("[TOGSimulator] Error output:", e.output) + print("[TOGSim] Command failed with exit code", e.returncode) + print("[TOGSim] Error output:", e.output) assert 0 # Save result to result_path result_path = os.path.join(os.path.dirname(model_path), "togsim_result") @@ -246,7 +248,7 @@ def show_progress(): result_path = os.path.join(result_path, file_name) with open(result_path, "w") as f: f.write(result.decode()) - print(f'[TOGSimulator] Simulation of "{model_path}" is stored to "{result_path}"') + print(f'[TOGSim] Simulation of "{model_path}" is stored to "{result_path}"') return result_path def interactive_simulation(self): @@ -254,7 +256,8 @@ def interactive_simulation(self): if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" - print("[TOGSimulator] cmd> ", cmd) + if extension_config.CONFIG_DEBUG_MODE: + print("[TOGSim] cmd> ", cmd) if self.process is None: self.process = subprocess.Popen( shlex.split(cmd), @@ -263,22 +266,22 @@ def interactive_simulation(self): universal_newlines=True ) else: - print("[TOGSimulator] Simulator is already running.") + print("[TOGSim] Simulator is already running.") def stop(self): if self.process: self.process.terminate() self.process.wait() self.process = None - print("[TOGSimulator] Simulator stopped.") + print("[TOGSim] Simulator 
stopped.") def wait(self): if self.process: - print("[TOGSimulator] Waiting for simulation to complete...") + print("[TOGSim] Waiting for simulation to complete...") self.quit() self.process.wait() self.process = None - print("[TOGSimulator] Simulation completed.") + print("[TOGSim] Simulation completed.") def send_command(self, command): if self.process: @@ -409,7 +412,7 @@ def get_result_from_file(result_path): break if simulation_finished_idx == -1: - print("[TOGSimulator] Tried to parsing wrong formated output file!") + print("[TOGSim] Tried to parsing wrong formated output file!") return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time total_stat_lines = lines[simulation_finished_idx:] From b220a90b0fda33c63dc6c8dd8337c0b54ef1d681 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Sun, 30 Nov 2025 14:59:13 +0000 Subject: [PATCH 41/53] [Tutorial] HelloPyTorchSim --- HelloPyTorchSim.ipynb | 1308 + tutorial/HelloPyTorchSim.ipynb | 132796 ++++++++++++++++++++++++++++++ 2 files changed, 134104 insertions(+) create mode 100644 HelloPyTorchSim.ipynb create mode 100644 tutorial/HelloPyTorchSim.ipynb diff --git a/HelloPyTorchSim.ipynb b/HelloPyTorchSim.ipynb new file mode 100644 index 00000000..f894d28b --- /dev/null +++ b/HelloPyTorchSim.ipynb @@ -0,0 +1,1308 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hello, PyTorchSim!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One Touch Simulation\n", + "### Normal Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "cpu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Matmul Code" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/4k/c4kdekvfqmwtt4iw6tk4i74vsoaww5lsz7eg4wsjckcd2kssdj4p.py\n", + "[Gem5] Gem5 is running... \n", + "[TOGSim] TOGSim is running.. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/backendsim_result/4\"\n" + ] + } + ], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "torch.manual_seed(0)\n", + "input = torch.randn(128, 128).to(device)\n", + "weight = torch.randn(128, 128).to(device)\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", + " if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", + " message = f\"|{name} Test Passed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " else:\n", + " message = f\"|{name} Test Failed|\"\n", + " print(\"-\" * len(message))\n", + " print(message)\n", + " print(\"-\" * len(message))\n", + " print(\"npu out: \", npu_out.cpu())\n", + " print(\"cpu out: \", cpu_out)\n", + " exit(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------\n", + "|MatMul Test Passed|\n", + "--------------------\n" + ] + } + ], + "source": [ + "test_result(\"MatMul\", npu_out, cpu_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n", + "### Normal Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set 
a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n" + ] + } + ], + "source": [ + "import torch\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "npu_device = PyTorchSimRunner.setup_device().custom_device()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "torch.manual_seed(0)\n", + "cpu_input = torch.randn(128, 128).to(device)\n", + "cpu_weight = torch.randn(128, 128).to(device)\n", + "cpu_target = torch.randn(128, 128).to(device)\n", + "cpu_input.requires_grad = True\n", + "cpu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.matmul\n", + "cpu_out = opt_fn(cpu_input, cpu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "cpu_loss = loss_fn(cpu_out, cpu_target)\n", + "cpu_loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorchSim Backward Code" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/4n/c4nc22ifxgcu4mrbpkvqmhj7zjte5dlvp62kc3om2gddwv7c5fww.py\n", + "[Gem5] Gem5 is running... \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/backendsim_result/0\"\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/77/c77xg7rijruagidrzofkss2so4rw54wnja63tgzjm2gveekm64dn.py\n", + "[Gem5] Gem5 is running... \n", + "\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/vaymr3umaez/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/vaymr3umaez/backendsim_result/0\"\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/jspabiga5jh/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/jspabiga5jh/backendsim_result/0\"\n" + ] + } + ], + "source": [ + "torch.manual_seed(0)\n", + "npu_input = torch.randn(128, 128).to(npu_device)\n", + "npu_weight = torch.randn(128, 128).to(npu_device)\n", + "npu_target = torch.randn(128, 128).to(npu_device)\n", + "npu_input.requires_grad = True\n", + "npu_weight.requires_grad = True\n", + "\n", + "opt_fn = torch.compile(torch.matmul)\n", + "npu_out = opt_fn(npu_input, npu_weight)\n", + "\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "npu_loss = loss_fn(npu_out, npu_target)\n", + "npu_loss.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------------------\n", + "|MatMul Input Grad Test Passed|\n", + "-------------------------------\n", + "--------------------------------\n", + "|MatMul Weight Grad Test Passed|\n", + "--------------------------------\n" + ] + } + ], + "source": [ + "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n", + "test_result(\"MatMul Weight Grad\", npu_weight.grad, cpu_weight.grad)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mapping\n", + "\n", + "Default mapping is based on heuristic." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/ar/carykuyxk5ggb2ebsr37pvvynp4h5qygv3lxmocx3q3fxagmsivd.py\n", + "[Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/tjueq5rwe4f/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/tjueq5rwe4f/cycle_bin --vlane 128\n", + "[Gem5Simulator] Simulation is still running.. 
\n", + "[SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=000000000001040c:1111a --base-path=/tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0006 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/tjueq5rwe4f/validation_binary /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0006/arg0_1/0.raw /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0006/arg1_1/0.raw /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0006/buf0/0.raw\n", + "[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/tjueq5rwe4f/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0006/attribute/0\n", + "[BackendSimulator] Simulation is still running. 
\n", + "[BackendSimulator] Simulation of \"/tmp/torchinductor/tmp/tjueq5rwe4f/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/tjueq5rwe4f/backendsim_result/6\"\n" + ] + } + ], + "source": [ + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-25 05:31:40.391] [info] Total execution cycle: 47130\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/tjueq5rwe4f/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual Mapping\n", + "User can set tile size manually." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/kn/cknlxr55nmtksl5vgx47lbkdfgjopmhlvjcgn6kjjg2qkyj6ilde.py\n", + "[Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/56injjmtsqd/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/56injjmtsqd/cycle_bin --vlane 128\n", + "[Gem5Simulator] Simulation is still running. \n", + "[SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=000000000001040c:10cee --base-path=/tmp/torchinductor/tmp/56injjmtsqd/runtime_0002 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/56injjmtsqd/validation_binary /tmp/torchinductor/tmp/56injjmtsqd/runtime_0002/arg0_1/0.raw /tmp/torchinductor/tmp/56injjmtsqd/runtime_0002/arg1_1/0.raw /tmp/torchinductor/tmp/56injjmtsqd/runtime_0002/buf0/0.raw\n", + "[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/56injjmtsqd/tile_graph.onnx --log_level trace --attributes_list /tmp/torchinductor/tmp/56injjmtsqd/runtime_0002/attribute/0\n", + "[BackendSimulator] Simulation is still running... 
\n", + "[BackendSimulator] Simulation of \"/tmp/torchinductor/tmp/56injjmtsqd/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/56injjmtsqd/backendsim_result/2\"\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=str(1)\n", + "os.environ['TORCHSIM_TILE_M']=\"512\"\n", + "os.environ['TORCHSIM_TILE_N']=\"512\"\n", + "os.environ['TORCHSIM_TILE_K']=\"512\"\n", + "\n", + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-27 14:26:59.504] [info] Total execution cycle: 53651\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/56injjmtsqd/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Autotune" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "[Auto-tune] Trying tile size: [1024, 1024, 256, 128, 1024, 256]\n", + "[Auto-tune] Trying tile size: [256, 1024, 1024, 128, 1024, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 256, 1024, 128, 256, 1024]\n", + "[Auto-tune] Trying tile size: [1024, 1024, 128, 128, 1024, 128]\n", + "[BackendSimulator] Simulation of \"/tmp/torchinductor/tmp/tjueq5rwe4f/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/tjueq5rwe4f/backendsim_result/0\"\n", + "[BackendSimulator] Simulation of \"/tmp/torchinductor/tmp/35obw7tdlpc/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/35obw7tdlpc/backendsim_result/0\"\n", + "[BackendSimulator] Simulation of \"/tmp/torchinductor/tmp/jfhr7qyj4du/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/jfhr7qyj4du/backendsim_result/0\"\n", + "[BackendSimulator] Simulation of \"/tmp/torchinductor/tmp/2m7l3chv4pr/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/2m7l3chv4pr/backendsim_result/0\"\n", + "[Auto-tune] Optimal tile size: [1024, 1024, 128, 128, 1024, 128], cycles: 46347\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/4x/c4xznhca3huqoofcyxrhcphdarfspitqpwvyoizxh257b772cylf.py\n", + "[Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/35obw7tdlpc/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/35obw7tdlpc/cycle_bin --vlane 128\n", + "[Gem5Simulator] Simulation is still running.. 
\n", + "[SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=000000000001040c:10fb8 --base-path=/tmp/torchinductor/tmp/35obw7tdlpc/runtime_0001 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/35obw7tdlpc/validation_binary /tmp/torchinductor/tmp/35obw7tdlpc/runtime_0001/arg0_1/0.raw /tmp/torchinductor/tmp/35obw7tdlpc/runtime_0001/arg1_1/0.raw /tmp/torchinductor/tmp/35obw7tdlpc/runtime_0001/buf0/0.raw\n", + "[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/35obw7tdlpc/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/35obw7tdlpc/runtime_0001/attribute/0\n", + "[BackendSimulator] Simulation is still running. 
\n", + "[BackendSimulator] Simulation of \"/tmp/torchinductor/tmp/35obw7tdlpc/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/35obw7tdlpc/backendsim_result/1\"\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ['AUTOTUNE_TEMPLATE']=\"1\"\n", + "\n", + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-27 14:30:10.628] [info] Total execution cycle: 46392\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/35obw7tdlpc/backendsim_result/1 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim Configuration\n", + "### Single Core" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/ar/carykuyxk5ggb2ebsr37pvvynp4h5qygv3lxmocx3q3fxagmsivd.py\n", + "[Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/tjueq5rwe4f/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/tjueq5rwe4f/cycle_bin --vlane 128\n", + "[Gem5Simulator] Simulation is still running.. \n", + "[SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=000000000001040c:1111a --base-path=/tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0009 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/tjueq5rwe4f/validation_binary /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0009/arg0_1/0.raw /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0009/arg1_1/0.raw /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0009/buf0/0.raw\n", + "[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/tjueq5rwe4f/tile_graph.onnx --log_level trace --attributes_list /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0009/attribute/0\n", + "[BackendSimulator] Simulation is still running. 
\n", + "[BackendSimulator] Simulation of \"/tmp/torchinductor/tmp/tjueq5rwe4f/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/tjueq5rwe4f/backendsim_result/7\"\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json\"\n", + "\n", + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-28 07:30:43.223] [info] Total execution cycle: 47154\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/tjueq5rwe4f/backendsim_result/5 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multi-Core" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/ar/carykuyxk5ggb2ebsr37pvvynp4h5qygv3lxmocx3q3fxagmsivd.py\n", + "[Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/tjueq5rwe4f/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/tjueq5rwe4f/cycle_bin --vlane 128\n", + "[Gem5Simulator] Simulation is still running.. \n", + "[SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=000000000001040c:1111a --base-path=/tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0008 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/tjueq5rwe4f/validation_binary /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0008/arg0_1/0.raw /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0008/arg1_1/0.raw /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0008/buf0/0.raw\n", + "[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/tjueq5rwe4f/tile_graph.onnx --log_level trace --attributes_list /tmp/torchinductor/tmp/tjueq5rwe4f/runtime_0008/attribute/0\n", + "[BackendSimulator] Simulation is still running. 
\n", + "[BackendSimulator] Simulation of \"/tmp/torchinductor/tmp/tjueq5rwe4f/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/tjueq5rwe4f/backendsim_result/6\"\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ['TORCHSIM_CONFIG']=\"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json\"\n", + "\n", + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-28 07:32:35.543] [info] Total execution cycle: 40734\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/tjueq5rwe4f/backendsim_result/6 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim log level\n", + "### log level info" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/ar/carykuyxk5ggb2ebsr37pvvynp4h5qygv3lxmocx3q3fxagmsivd.py\n", + "[Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /root/workspace/PyTorchSim/tmp/tjueq5rwe4f/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /root/workspace/PyTorchSim/tmp/tjueq5rwe4f/cycle_bin --vlane 128\n", + "[Gem5Simulator] Simulation is still running.. \n", + "[SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=000000000001040c:1111a --base-path=/root/workspace/PyTorchSim/tmp/tjueq5rwe4f/runtime_0001 /workspace/riscv-pk/build/pk /root/workspace/PyTorchSim/tmp/tjueq5rwe4f/validation_binary /root/workspace/PyTorchSim/tmp/tjueq5rwe4f/runtime_0001/arg0_1/0.raw /root/workspace/PyTorchSim/tmp/tjueq5rwe4f/runtime_0001/arg1_1/0.raw /root/workspace/PyTorchSim/tmp/tjueq5rwe4f/runtime_0001/buf0/0.raw\n", + "[BackendSimulator] cmd> /root/workspace/PyTorchSim/PyTorchSimBackend/build/bin/Simulator --config /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /root/workspace/PyTorchSim/tmp/tjueq5rwe4f/tile_graph.onnx --log_level trace --attributes_list /root/workspace/PyTorchSim/tmp/tjueq5rwe4f/runtime_0001/attribute/0\n", + "[BackendSimulator] Simulation is still running. 
\n", + "[BackendSimulator] Simulation of \"/root/workspace/PyTorchSim/tmp/tjueq5rwe4f/tile_graph.onnx\" is stored to \"/root/workspace/PyTorchSim/tmp/tjueq5rwe4f/backendsim_result/2\"\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ['TORCHSIM_DUMP_PATH']=\"/root/workspace/PyTorchSim\"\n", + "\n", + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### log level trace" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/bu/cbu53ehqhygnup2oknudw7ojvagnky7sphj2el6knnulypmeh6uv.py\n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/backendsim_result/0\"\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ['TORCHSIM_DUMP_PATH']=\"/workspace/PyTorchSim\"\n", + "os.environ['BACKENDSIM_DEBUG_LEVEL']=\"trace\"\n", + "\n", + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Execution Mode\n", + "### Functional & Timing mode (Default)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/nu/cnurrki2j7djdoqnysfmzm44lx6xei736l4bhgtv7syzw7gz5avh.py\n", + "[Gem5] Gem5 is running.. \n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/0\"\n" + ] + } + ], + "source": [ + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Functional only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/bu/cbu53ehqhygnup2oknudw7ojvagnky7sphj2el6knnulypmeh6uv.py\n", + "[Spike] Running Spike simulator\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ['TORCHSIM_TIMING_MODE']=\"0\"\n", + "\n", + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Timing only mode" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/bu/cbu53ehqhygnup2oknudw7ojvagnky7sphj2el6knnulypmeh6uv.py\n", + "[Gem5] Gem5 is running.. \n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/1\"\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"0\"\n", + "\n", + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torchvision.models import resnet18\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n", + "from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_BACKEND_CONFIG\n", + "\n", + "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=CONFIG_TORCHSIM_BACKEND_CONFIG)\n", + "device = scheduler.execution_engine.module.custom_device()\n", + "\n", + "model = resnet18().eval()\n", + "input = torch.randn(1, 3, 224, 224).to(device=device)\n", + "opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))\n", + "\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_fn)\n", + "request = Request(\"resnet18\", [input], [], request_queue_idx=0)\n", + "scheduler.add_request(request, request_time=0)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " with torch.no_grad():\n", + " scheduler.schedule()\n", + "\n", + "print(\"ResNet18 Simulation Done\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 
3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 13:05:13.597] [info] [LoadConfig] Success to open \"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 0: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 1: Partition 0\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 0: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 1: 700 MHz, Systolic array per core: 1\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] Ramulator2 config: /root/workspace/PyTorchSim/PyTorchSimBackend/configs/../configs/ramulator2_configs/HBM2.yaml\n", + "[2025-11-30 13:05:13.597] [info] [Config/DRAM] DRAM Bandwidth 716 GB/s, Freq: 700 MHz, Channels: 32, Request_size: 32B\n", + "[2025-11-30 13:05:13.597] [info] [Config/L2Cache] No L2 cache\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] Interconnect freq: 20000 MHz\n", + "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] SimpleInerconnect selected\n", + "[0] BackendSim> [Reqest] Resnet18 request time: 0\n", + "[Request issue] partition: 0 batch size: 1\n", + "[Request-0 issue] partition: 0 arrival_time: 0 start_time: 0.0\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py\n", + "launch /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx /tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0 0 0\n", + "[2025-11-30 13:05:22.114] [info] 
[LoadConfig] Success to open \"/tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0\"\n", + "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open \"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg0 address: 0xa3056c0\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg1 address: 0xc4a3d40\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"systolic_size\": \"128\"\n", + "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"stonneGraph\": \"0\"\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Register graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 at 0\n", + "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Tile Graph FIFO Scheduled\n", + "until -1\n", + "[2025-11-30 13:05:22.117] [info] HBM2-CH_0: BW utilization 0% (0 reads, 0 writes)\n", + "[2025-11-30 13:05:22.319] [info] [Scheduler 0] Graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 finish at 2424\n", + "[2025-11-30 13:05:22.319] [info] Total compute time 2424\n", + "cycle\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 33\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# Run scheduler\u001b[39;00m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m scheduler\u001b[38;5;241m.\u001b[39mis_finished():\n\u001b[0;32m---> 33\u001b[0m \u001b[43mscheduler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:475\u001b[0m, in \u001b[0;36mScheduler.schedule\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 473\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcurrent_cycle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbackend_simulator\u001b[38;5;241m.\u001b[39mcycle()\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 475\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnext_time\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:507\u001b[0m, in \u001b[0;36mScheduler.run\u001b[0;34m(self, until_time)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m until_time \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mis_any_idle(req_empty_info):\n\u001b[0;32m--> 507\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mexecute_cycle\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 508\u001b[0m req_empty_info \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest_empty(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion)]\n\u001b[1;32m 509\u001b[0m \u001b[38;5;66;03m# if result is not -1, schedule new request\u001b[39;00m\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:484\u001b[0m, in \u001b[0;36mScheduler.run..execute_cycle\u001b[0;34m()\u001b[0m\n\u001b[1;32m 
482\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion):\n\u001b[1;32m 483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mpartition_state[i] \u001b[38;5;241m==\u001b[39m PyTorchSimRunner\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[0;32m--> 484\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_cycle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 485\u001b[0m launch_ret_info\u001b[38;5;241m.\u001b[39mappend(ret)\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcheck_finish_request()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:254\u001b[0m, in \u001b[0;36mPyTorchSimRunner.launch_kernel\u001b[0;34m(self, current_cycle, partion_idx)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[1;32m 253\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx]\n\u001b[0;32m--> 254\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartion_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 255\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING:\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING\n", + "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:290\u001b[0m, in \u001b[0;36mFIFORunner.select_kernel\u001b[0;34m(self, partition_idx)\u001b[0m\n\u001b[1;32m 287\u001b[0m nested_gen \u001b[38;5;241m=\u001b[39m kernel(\u001b[38;5;241m*\u001b[39minputs)\n\u001b[1;32m 288\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnested_launch_model_dicts[partition_idx] \u001b[38;5;241m=\u001b[39m {req : nested_gen}\n\u001b[1;32m 289\u001b[0m kernel, inputs \u001b[38;5;241m=\u001b[39m \\\n\u001b[0;32m--> 290\u001b[0m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnested_launch_model_dicts\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_idx\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m kernel, inputs\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 293\u001b[0m \u001b[38;5;66;03m# Retry\u001b[39;00m\n", + "File \u001b[0;32m/tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py:227\u001b[0m, in \u001b[0;36mConv2D_1_3_224_22464_3_7_7_2_2_3_3_1_1_3\u001b[0;34m(X, W, Y)\u001b[0m\n\u001b[1;32m 224\u001b[0m W \u001b[38;5;241m=\u001b[39m W\u001b[38;5;241m.\u001b[39mpermute(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mcontiguous() \u001b[38;5;66;03m# (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)\u001b[39;00m\n\u001b[1;32m 
226\u001b[0m \u001b[38;5;66;03m# Launch kernel\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[43mmlir_kernel_1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mW\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m (mlir_kernel_1, (X, W, Y))\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:307\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dryrun_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdryrun_simulator\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 307\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfilelock\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FileLock\n\u001b[1;32m 309\u001b[0m lock_dir \u001b[38;5;241m=\u001b[39m get_lock_dir()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/concurrent/futures/_base.py:453\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 453\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 455\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/threading.py:320\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 321\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "from torchvision.models import resnet18\n", + "\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator\n", + "CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "\n", + "lambda_requests = 10\n", + "max_time = 30\n", + "\n", + "target_model1 = resnet18().eval()\n", + "\n", + "# Init scheduler\n", + "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f\"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\")\n", + "# Register compiled model\n", + "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n", + "\n", + "# Generate time stamp\n", + "for 
request_time in poisson_request_generator(lambda_requests, max_time):\n", + " # Init input data\n", + " model_input1 = torch.randn(1, 3, 224, 224)\n", + "\n", + " # Init request\n", + " new_request1 = Request(\"resnet18\", [model_input1], [], request_queue_idx=0)\n", + "\n", + " # Add request to scheduler\n", + " print(\"[Reqest] Resnet18 request time: \", request_time, flush=True)\n", + " scheduler.add_request(new_request1, request_time=request_time)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " scheduler.schedule()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compiler Optimization\n", + "### GeMM + ReLU fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrapper Codegen Path = /tmp/torchinductor_root/tr/ctrncpsl4yc6zhwph2djjj2jkkm3uoam76ptc4bkldhcfxukpqmp.py\n", + "[Gem5] Gem5 is running. \n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/5o2xythi5z3/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0\"\n" + ] + } + ], + "source": [ + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 12:43:43.912] [info] Total execution cycle: 50348\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Disable fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", + "Building extension module npu...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "Loading extension module npu...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n", + "Wrapper Codegen Path = /tmp/torchinductor_root/4w/c4ws6azjjklukf2gdi24fx2tplonbddh5h3i2ti2iysn7euxt2wf.py\n", + "[Gem5] Gem5 is running... \n", + "[Gem5] Gem5 is running.. \n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running. 
\n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/2\"\n", + "[Spike] Running Spike simulator\n", + "[TOGSim] TOGSim is running.. \n", + "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/37dfo4nczcq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/37dfo4nczcq/backendsim_result/0\"\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ['TORCHSIM_COMPILER_OPTIMIZATION']=\"none\"\n", + "\n", + "import torch\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-11-30 12:52:49.376] [info] Total execution cycle: 47164\n", + "[2025-11-30 12:52:52.444] [info] Total execution cycle: 58510\n" + ] + } + ], + "source": [ + "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/2 | grep \"Total execution cycle\"\n", + "!cat /tmp/torchinductor/tmp/37dfo4nczcq/backendsim_result/0 | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Single kernel mode (TODO: remove it?)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "No modifications detected for re-loaded extension module npu, skipping build step...\n", + "Loading extension module npu...\n" + ] + }, + { + "ename": "KeyboardInterrupt", 
+ "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/sympy/core/assumptions.py:499\u001b[0m, in \u001b[0;36mmake_property..getit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_assumptions\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfact\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n", + "\u001b[0;31mKeyError\u001b[0m: 'extended_negative'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m model \u001b[38;5;241m=\u001b[39m resnet18()\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 9\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(model)\n\u001b[0;32m---> 10\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# 
type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:655\u001b[0m, in 
\u001b[0;36mcatch_errors_wrapper..catch_errors\u001b[0;34m(frame, cache_entry, frame_state)\u001b[0m\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m hijacked_callback(frame, cache_entry, hooks, frame_state)\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_lock, _disable_current_modes():\n\u001b[0;32m--> 655\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcallback\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:727\u001b[0m, in \u001b[0;36mconvert_frame.._convert_frame\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 725\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 726\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43minner_convert\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 729\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:383\u001b[0m, in \u001b[0;36mconvert_frame_assert.._convert_frame_assert\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 370\u001b[0m signpost_event(\n\u001b[1;32m 371\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 372\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_convert_frame_assert._compile\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 379\u001b[0m },\n\u001b[1;32m 380\u001b[0m )\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config\u001b[38;5;241m.\u001b[39mpatch(_patch_config_if_changed()):\n\u001b[0;32m--> 383\u001b[0m compiled_product \u001b[38;5;241m=\u001b[39m \u001b[43m_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_globals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_locals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_builtins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiler_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport_constraints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mframe_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcompile_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_product\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:646\u001b[0m, in \u001b[0;36m_compile\u001b[0;34m(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_context(CompileContext(compile_id)):\n\u001b[1;32m 645\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 646\u001b[0m guarded_code \u001b[38;5;241m=\u001b[39m \u001b[43mcompile_inner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m guarded_code\n\u001b[1;32m 648\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 649\u001b[0m Unsupported,\n\u001b[1;32m 650\u001b[0m TorchRuntimeError,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 657\u001b[0m BisectValidationException,\n\u001b[1;32m 658\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:562\u001b[0m, in \u001b[0;36m_compile..compile_inner\u001b[0;34m(code, one_graph, hooks, transform)\u001b[0m\n\u001b[1;32m 560\u001b[0m CompileContext\u001b[38;5;241m.\u001b[39mget()\u001b[38;5;241m.\u001b[39mattempt \u001b[38;5;241m=\u001b[39m attempt\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 562\u001b[0m out_code \u001b[38;5;241m=\u001b[39m \u001b[43mtransform_code_object\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m 
exc\u001b[38;5;241m.\u001b[39mRestartAnalysis \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py:1033\u001b[0m, in \u001b[0;36mtransform_code_object\u001b[0;34m(code, transformations, safe)\u001b[0m\n\u001b[1;32m 1030\u001b[0m instructions \u001b[38;5;241m=\u001b[39m cleaned_instructions(code, safe)\n\u001b[1;32m 1031\u001b[0m propagate_line_nums(instructions)\n\u001b[0;32m-> 1033\u001b[0m \u001b[43mtransformations\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstructions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcode_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1034\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m clean_and_assemble_instructions(instructions, keys, code_options)[\u001b[38;5;241m1\u001b[39m]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:151\u001b[0m, in \u001b[0;36mpreserve_global_state.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m cleanup \u001b[38;5;241m=\u001b[39m setup_compile_debug()\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 153\u001b[0m cleanup\u001b[38;5;241m.\u001b[39mclose()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:527\u001b[0m, in \u001b[0;36m_compile..transform\u001b[0;34m(instructions, code_options)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 526\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
tracing(tracer\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mtracing_context), tracer\u001b[38;5;241m.\u001b[39mset_current_tx():\n\u001b[0;32m--> 527\u001b[0m \u001b[43mtracer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mUnspecializeRestartAnalysis:\n\u001b[1;32m 529\u001b[0m speculation_log\u001b[38;5;241m.\u001b[39mclear()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2128\u001b[0m, in \u001b[0;36mInstructionTranslator.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 2127\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m-> 2128\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:818\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mpush_tx(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m (\n\u001b[1;32m 816\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minstruction_pointer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mshould_exit\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 819\u001b[0m ):\n\u001b[1;32m 820\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m BackendCompilerFailed:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:781\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.step\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 777\u001b[0m unimplemented(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmissing: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minst\u001b[38;5;241m.\u001b[39mopname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 778\u001b[0m TracingContext\u001b[38;5;241m.\u001b[39mset_current_loc(\n\u001b[1;32m 779\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_filename, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlineno, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\n\u001b[1;32m 780\u001b[0m )\n\u001b[0;32m--> 781\u001b[0m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minst\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopname\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minst\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 783\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inst\u001b[38;5;241m.\u001b[39mopname \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 784\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m Unsupported:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2243\u001b[0m, in \u001b[0;36mInstructionTranslator.RETURN_VALUE\u001b[0;34m(self, 
inst)\u001b[0m\n\u001b[1;32m 2238\u001b[0m _step_logger()(\n\u001b[1;32m 2239\u001b[0m logging\u001b[38;5;241m.\u001b[39mINFO,\n\u001b[1;32m 2240\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorchdynamo done tracing \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (RETURN_VALUE)\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 2241\u001b[0m )\n\u001b[1;32m 2242\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE triggered compile\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 2243\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_subgraph\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2245\u001b[0m \u001b[43m \u001b[49m\u001b[43mreason\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mGraphCompileReason\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2246\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreturn_value\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mframe_summary\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph_break\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 2247\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2248\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mcompile_return_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 2249\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2250\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39madd_output_instructions([create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m)])\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:919\u001b[0m, in \u001b[0;36mOutputGraph.compile_subgraph\u001b[0;34m(self, tx, partial_convert, reason, compile_return_value)\u001b[0m\n\u001b[1;32m 916\u001b[0m append_prefix_insts()\n\u001b[1;32m 917\u001b[0m \u001b[38;5;66;03m# optimization to generate better code in a common case\u001b[39;00m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_output_instructions(\n\u001b[0;32m--> 919\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_and_call_fx_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mreversed\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstack_values\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroot\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 920\u001b[0m \u001b[38;5;241m+\u001b[39m [create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUNPACK_SEQUENCE\u001b[39m\u001b[38;5;124m\"\u001b[39m, arg\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(stack_values))]\n\u001b[1;32m 921\u001b[0m )\n\u001b[1;32m 922\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 923\u001b[0m graph_output_var \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_var(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgraph_out\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1087\u001b[0m, in \u001b[0;36mOutputGraph.compile_and_call_fx_graph\u001b[0;34m(self, tx, rv, root)\u001b[0m\n\u001b[1;32m 1084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtracing_context\u001b[38;5;241m.\u001b[39mfake_mode \u001b[38;5;241m=\u001b[39m backend_fake_mode\n\u001b[1;32m 1086\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrestore_global_state():\n\u001b[0;32m-> 1087\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_user_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1088\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m disable(compiled_fn)\n\u001b[1;32m 1090\u001b[0m 
counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstats\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munique_graphs\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1140\u001b[0m, in \u001b[0;36mOutputGraph.call_user_compiler\u001b[0;34m(self, gm)\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mverify_correctness:\n\u001b[1;32m 1139\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m WrapperBackend(compiler_fn)\n\u001b[0;32m-> 1140\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m 
\u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1141\u001b[0m _step_logger()(logging\u001b[38;5;241m.\u001b[39mINFO, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdone compiler function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1142\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(compiled_fn), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompiler_fn did not return callable\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py:117\u001b[0m, in \u001b[0;36mwrap_backend_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 117\u001b[0m compiled_gm \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_gm\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/__init__.py:1662\u001b[0m, in \u001b[0;36m_TorchCompileInductorWrapper.__call__\u001b[0;34m(self, model_, inputs_)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_, inputs_):\n\u001b[1;32m 1660\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_inductor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompile_fx\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compile_fx\n\u001b[0;32m-> 1662\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompile_fx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig_patches\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1168\u001b[0m, in \u001b[0;36mcompile_fx\u001b[0;34m(model_, example_inputs_, inner_compile, config_patches, decompositions)\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inference_compiler(unlifted_gm, example_inputs_)\n\u001b[1;32m 1165\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_fake_mode(fake_mode), torch\u001b[38;5;241m.\u001b[39m_guards\u001b[38;5;241m.\u001b[39mtracing(\n\u001b[1;32m 1166\u001b[0m tracing_context\n\u001b[1;32m 1167\u001b[0m ), compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m-> 1168\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43maot_autograd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[43mfw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43mbw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1172\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mdecompositions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecompositions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1173\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_fn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1174\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeep_inference_input_mutations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1175\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs_\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/backends/common.py:55\u001b[0m, in \u001b[0;36maot_autograd..compiler_fn\u001b[0;34m(gm, example_inputs)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# NB: NOT cloned!\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m enable_aot_logging(), patch_config:\n\u001b[0;32m---> 55\u001b[0m cg \u001b[38;5;241m=\u001b[39m \u001b[43maot_module_simplified\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot_autograd\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m disable(cg)\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:887\u001b[0m, in \u001b[0;36maot_module_simplified\u001b[0;34m(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)\u001b[0m\n\u001b[1;32m 871\u001b[0m aot_config \u001b[38;5;241m=\u001b[39m AOTConfig(\n\u001b[1;32m 872\u001b[0m fw_compiler\u001b[38;5;241m=\u001b[39mfw_compiler,\n\u001b[1;32m 873\u001b[0m bw_compiler\u001b[38;5;241m=\u001b[39mbw_compiler,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 883\u001b[0m no_tangents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 884\u001b[0m )\n\u001b[1;32m 886\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m--> 887\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_aot_dispatcher_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 888\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunctional_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 889\u001b[0m \u001b[43m \u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 890\u001b[0m \u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 891\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;66;03m# TODO: There is something deeply wrong here; compiled_fn running with\u001b[39;00m\n\u001b[1;32m 894\u001b[0m \u001b[38;5;66;03m# the boxed calling convention, but aot_module_simplified somehow\u001b[39;00m\n\u001b[1;32m 895\u001b[0m \u001b[38;5;66;03m# historically returned a function that was not the boxed calling\u001b[39;00m\n\u001b[1;32m 896\u001b[0m \u001b[38;5;66;03m# convention. 
This should get fixed...\u001b[39;00m\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39mruntime_args):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:600\u001b[0m, in \u001b[0;36mcreate_aot_dispatcher_function\u001b[0;34m(flat_fn, flat_args, aot_config)\u001b[0m\n\u001b[1;32m 597\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m partial(aot_wrapper_dedupe, compiler_fn\u001b[38;5;241m=\u001b[39mcompiler_fn)\n\u001b[1;32m 598\u001b[0m \u001b[38;5;66;03m# You can put more passes here\u001b[39;00m\n\u001b[0;32m--> 600\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mfake_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m aot_config\u001b[38;5;241m.\u001b[39mis_export:\n\u001b[1;32m 602\u001b[0m mutated_user_inp_locs \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 603\u001b[0m idx \u001b[38;5;241m-\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 604\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m fw_metadata\u001b[38;5;241m.\u001b[39mmutated_inp_runtime_indices\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m idx \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 606\u001b[0m ]\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:425\u001b[0m, in \u001b[0;36maot_wrapper_dedupe\u001b[0;34m(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ok:\n\u001b[0;32m--> 425\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mleaf_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(leaf_flat_args, fw_metadata):\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 429\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\\\u001b[39;00m\n\u001b[1;32m 430\u001b[0m \u001b[38;5;124;03mEncountered duplicate inputs that are mutated in the graph, but at least one input/output\u001b[39;00m\n\u001b[1;32m 431\u001b[0m \u001b[38;5;124;03mto the graph is a tensor subclass. This is not supported today. You can try to\u001b[39;00m\n\u001b[1;32m 432\u001b[0m \u001b[38;5;124;03mremove the aliasing yourself as a workaround, or otherwise file an issue on github.\"\"\"\u001b[39;00m\n\u001b[1;32m 433\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:630\u001b[0m, in \u001b[0;36maot_wrapper_synthetic_base\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)\u001b[0m\n\u001b[1;32m 628\u001b[0m \u001b[38;5;66;03m# Happy path: we don't need synthetic bases\u001b[39;00m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m synthetic_base_info \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 630\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 632\u001b[0m \u001b[38;5;66;03m# export path: ban synthetic bases for now, add later if requested.\u001b[39;00m\n\u001b[1;32m 633\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(flat_args, fw_metadata):\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:295\u001b[0m, in 
\u001b[0;36maot_dispatch_autograd\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata)\u001b[0m\n\u001b[1;32m 292\u001b[0m tracing_context\u001b[38;5;241m.\u001b[39mfw_metadata \u001b[38;5;241m=\u001b[39m inner_meta\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m TracingContext\u001b[38;5;241m.\u001b[39mreport_output_strides() \u001b[38;5;28;01mas\u001b[39;00m fwd_output_strides:\n\u001b[0;32m--> 295\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m \u001b[43maot_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfw_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43madjusted_flat_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(compiled_fw_func, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 297\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m make_boxed_func(compiled_fw_func)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1100\u001b[0m, in \u001b[0;36mcompile_fx..fw_compiler_base\u001b[0;34m(model, example_inputs, is_inference)\u001b[0m\n\u001b[1;32m 1092\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m orig_output_end_idx \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m num_model_outputs\n\u001b[1;32m 1094\u001b[0m user_visible_outputs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 1095\u001b[0m n\u001b[38;5;241m.\u001b[39mname\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m n \u001b[38;5;129;01min\u001b[39;00m model_outputs[original_output_start_index:orig_output_end_idx]\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(n, torch\u001b[38;5;241m.\u001b[39mfx\u001b[38;5;241m.\u001b[39mNode)\n\u001b[1;32m 1098\u001b[0m }\n\u001b[0;32m-> 1100\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1101\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1102\u001b[0m \u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1103\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_fixed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfixed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1104\u001b[0m \u001b[43m \u001b[49m\u001b[43mcudagraphs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcudagraphs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1105\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mgraph_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgraph_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1106\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_inference\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_inference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1107\u001b[0m \u001b[43m \u001b[49m\u001b[43mboxed_forward_device_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforward_device\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1108\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_visible_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_visible_outputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1109\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py:83\u001b[0m, in \u001b[0;36mwrap_compiler_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 81\u001b[0m \u001b[38;5;66;03m# Call the compiler_fn - which is either aot_autograd or inductor\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;66;03m# with fake inputs\u001b[39;00m\n\u001b[0;32m---> 83\u001b[0m inner_compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 85\u001b[0m \u001b[38;5;66;03m# 
TODO: Failures here are troublesome because no real inputs,\u001b[39;00m\n\u001b[1;32m 86\u001b[0m \u001b[38;5;66;03m# need a different serialization strategy\u001b[39;00m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/debug.py:305\u001b[0m, in \u001b[0;36mDebugContext.wrap..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m DebugContext():\n\u001b[0;32m--> 305\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:320\u001b[0m, in \u001b[0;36mcompile_fx_inner\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 316\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m FxGraphCache\u001b[38;5;241m.\u001b[39mload(\n\u001b[1;32m 317\u001b[0m fx_codegen_and_compile, gm, example_inputs, graph_kwargs\n\u001b[1;32m 318\u001b[0m )\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m \u001b[43mfx_codegen_and_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 321\u001b[0m \u001b[43m \u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mgraph_kwargs\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[arg-type]\u001b[39;49;00m\n\u001b[1;32m 322\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 324\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFX codegen and compilation took \u001b[39m\u001b[38;5;132;01m%.3f\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m, time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start)\n\u001b[1;32m 326\u001b[0m \u001b[38;5;66;03m# Return the output strides to the caller via TracingContext\u001b[39;00m\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:535\u001b[0m, in \u001b[0;36mfx_codegen_and_compile\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 519\u001b[0m graph \u001b[38;5;241m=\u001b[39m GraphLowering(\n\u001b[1;32m 520\u001b[0m gm,\n\u001b[1;32m 521\u001b[0m \u001b[38;5;66;03m# example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 532\u001b[0m is_inference\u001b[38;5;241m=\u001b[39mis_inference,\n\u001b[1;32m 533\u001b[0m )\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_graph_handler(graph):\n\u001b[0;32m--> 535\u001b[0m \u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 536\u001b[0m output_strides: List[Optional[Tuple[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m]]] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m graph\u001b[38;5;241m.\u001b[39mgraph_outputs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;66;03m# We'll put the output strides in the compiled graph so we\u001b[39;00m\n\u001b[1;32m 539\u001b[0m \u001b[38;5;66;03m# can later return them to the caller via TracingContext\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:519\u001b[0m, in \u001b[0;36mGraphLowering.run\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;129m@dynamo_timed\u001b[39m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m--> 519\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/fx/interpreter.py:138\u001b[0m, in \u001b[0;36mInterpreter.run\u001b[0;34m(self, initial_env, enable_io_processing, *args)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 138\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv[node] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_node\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mextra_traceback:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:814\u001b[0m, in \u001b[0;36mGraphLowering.run_node\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 812\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayout_constraints\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 813\u001b[0m args, kwargs \u001b[38;5;241m=\u001b[39m layout_constraints[n\u001b[38;5;241m.\u001b[39mtarget](n, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 814\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_magic_method(n\u001b[38;5;241m.\u001b[39mtarget):\n\u001b[1;32m 816\u001b[0m \u001b[38;5;66;03m# TODO: this is sus, it probably should be handled in the\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;66;03m# lowerings themselves similarly to sym_size/sym-stride\u001b[39;00m\n\u001b[1;32m 818\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_magic_method\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:691\u001b[0m, in \u001b[0;36mGraphLowering.call_function\u001b[0;34m(self, target, args, kwargs)\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 690\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m via \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, lowerings[target])\n\u001b[0;32m--> 691\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mlowerings\u001b[49m\u001b[43m[\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 692\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 693\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_lowering.py:117\u001b[0m, in \u001b[0;36mconvolution\u001b[0;34m(x, weight, bias, stride, padding, dilation, transposed, output_padding, groups)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 116\u001b[0m mlir_template \u001b[38;5;241m=\u001b[39m MLIRConvTemplate([x, weight, bias], layout, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmlir_template\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39moutput_node()\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_template.py:1189\u001b[0m, in \u001b[0;36mMLIRTemplate.generate\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 1184\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m patch\u001b[38;5;241m.\u001b[39mobject(V\u001b[38;5;241m.\u001b[39mgraph, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mget_dtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fake_get_dtype(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node)):\n\u001b[1;32m 1185\u001b[0m kernel \u001b[38;5;241m=\u001b[39m MLIRTemplateKernel(kernel_name\u001b[38;5;241m=\u001b[39mkernel_name, input_nodes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minput_nodes, call_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayout\u001b[38;5;241m.\u001b[39msize, kernel_group\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1186\u001b[0m outer_func_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunction_name \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfunction_name\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1187\u001b[0m outer_func_render\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mouter_func_render \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mouter_func_render\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1188\u001b[0m kernel_arg_attributes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_arg_attributes() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mget_arg_attributes\u001b[39m\u001b[38;5;124m'\u001b[39m) 
\u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1189\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1191\u001b[0m kernel_hash_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmlir_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex_counter)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1192\u001b[0m extra_args \u001b[38;5;241m=\u001b[39m []\n", + "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py:238\u001b[0m, in \u001b[0;36mMLIRConvSingleBatchTemplate.render\u001b[0;34m(self, kernel, template_buffer_node, epilogue_nodes, tile_info, **kwargs)\u001b[0m\n\u001b[1;32m 229\u001b[0m kernel\u001b[38;5;241m.\u001b[39mepilogue_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\n\u001b[1;32m 230\u001b[0m output_node \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_node\u001b[38;5;241m.\u001b[39mname,\n\u001b[1;32m 231\u001b[0m sram_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput_buffer\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 235\u001b[0m dim_aliasing \u001b[38;5;241m=\u001b[39m 
{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex0\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mc0\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex1\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_n\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex2\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mo_h\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex3\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtile_m\u001b[39m\u001b[38;5;124m\"\u001b[39m}\n\u001b[1;32m 236\u001b[0m )\n\u001b[1;32m 237\u001b[0m kernel\u001b[38;5;241m.\u001b[39mexception_nodes[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumel\u001b[39m\u001b[38;5;124m\"\u001b[39m : (I_W\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_W)\u001b[38;5;241m*\u001b[39m(I_H\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m\u001b[38;5;241m*\u001b[39mPADDING_H)\u001b[38;5;241m*\u001b[39mI_C\u001b[38;5;241m*\u001b[39mBATCH}\n\u001b[0;32m--> 238\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_template_from_string\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconv_template\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 239\u001b[0m 
kernel\u001b[38;5;241m.\u001b[39madd_loop_info([kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mK_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_H\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_W\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBATCH\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO_C\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mI_C\u001b[39m\u001b[38;5;124m\"\u001b[39m]], [kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_M\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_N\u001b[39m\u001b[38;5;124m\"\u001b[39m], kernel\u001b[38;5;241m.\u001b[39mrender_options[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTILE_K\u001b[39m\u001b[38;5;124m\"\u001b[39m]])\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m code\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/jinja2/environment.py:1299\u001b[0m, in \u001b[0;36mTemplate.render\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1296\u001b[0m ctx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_context(\u001b[38;5;28mdict\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n\u001b[1;32m 1298\u001b[0m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1299\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menvironment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconcat\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroot_render_func\u001b[49m\u001b[43m(\u001b[49m\u001b[43mctx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 1300\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1301\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menvironment\u001b[38;5;241m.\u001b[39mhandle_exception()\n", + "File \u001b[0;32m