From 38ebeebe5d2c1b09fe0b7714c97e090bdcee1917 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Tue, 28 Oct 2025 12:40:36 +0000 Subject: [PATCH 1/2] [docs] multi-tenancy load generator --- README.md | 73 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 56b58b28..c86b0685 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ # PyTorchSim: A Comprehensive, Fast, and Accurate NPU Simulation Framework [![Docker Image CI](https://github.com/PSAL-POSTECH/PyTorchSim/actions/workflows/docker-image.yml/badge.svg)](https://github.com/PSAL-POSTECH/PyTorchSim/actions/workflows/docker-image.yml) -PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framework -- We define a RISC-V-based NPU architecture and implement PyTorch compiler backend to run inference & training for PyTorch models -- Achieved high speed and accuracy with our novel Tile-Level Simulation (TLS) with compiler-generated Tile-Operation Graph (TOG), exploiting deterministic tile compute latency -- A generic and extensible NPU architecture based on RISC-V vector extension -- The functional simulator supports code correctness validation and data-dependent timing simulation +PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framework. +- We define a RISC-V-based NPU architecture and implement PyTorch compiler backend to run inference & training for PyTorch models. +- Achieved high speed and accuracy with our novel Tile-Level Simulation (TLS) with compiler-generated Tile-Operation Graph (TOG), exploiting deterministic tile compute latency. +- A generic and extensible NPU architecture based on RISC-V vector extension. +- The functional simulator supports code correctness validation and data-dependent timing simulation. For more details, please refer to our [paper](https://doi.org/10.1145/3725843.3756045)! 
@@ -92,7 +92,7 @@ The `tests` directory contains several AI workloads examples. ```bash python tests/test_matmul.py ``` -The result is stored to `TORCHSIM_DUMP_PATH`/`hash`/backendsim_result/. The log file contains detailed core, memory, and interconnect stats. +The result is stored to `TORCHSIM_DUMP_PATH/hash/backendsim_result/`. The log file contains detailed core, memory, and interconnect stats. ### Run Your Own Model on PyTorchSim You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device. @@ -180,39 +180,41 @@ Load generator supports multi-tenancy experiments. You can simply run `tests/tes python tests/test_scheduler.py ``` Below is an example code of multi-tenancy -`target_model1` and `target_model2` is your own PyTorch model. -You can set the request arrival time and request queue index. Request queue is used for scheduling and you can set the number of queue to each core in [TOGSim configuration](#togsim-configuration) +`target_model0` and `target_model1` is your own PyTorch model. +You can set the request arrival time and request queue index. Request queue is used for scheduling and you can set the number of queue to each core in [TOGSim configuration](#togsim-configuration). +`poisson_request_generator` generates arrival time of requests in a Poisson distribution with `lamda` and `max_time`. 
```python # Init scheduler scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) # Register compiled model -opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) -opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) -SchedulerDNNModel.register_model("resnet18", opt_model1) -SchedulerDNNModel.register_model("bert", opt_model2) - -# Init input data -model_input1 = torch.randn(1, 3, 224, 224) -model_input2 = torch.randn(128, 768) - -# Init request -new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request2 = Request("bert", [model_input2], [], request_queue_idx=1) -new_request3 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request4 = Request("bert", [model_input2], [], request_queue_idx=1) - -# Add request to scheduler -scheduler.add_request(new_request1, request_time=0) -scheduler.add_request(new_request2, request_time=0) -scheduler.add_request(new_request3, request_time=0) -scheduler.add_request(new_request4, request_time=0) +opt_model0 = torch.compile(target_model0.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) +opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device())) +SchedulerDNNModel.register_model("model0", opt_model0) +SchedulerDNNModel.register_model("model1", opt_model1) + +# Load Generation +model0_lambda = 5.0 +model1_lambda = 3.0 +max_time = 1000.0 # [s] + +# Generate Possion distribution requests for model0 +for model0_request_time in poisson_request_generator(model0_lambda, total_time=max_time): + x = torch.randn(1, 3, 224, 224) + new_request = Request("model0", [x], [], request_queue_idx=0) + scheduler.add_request(new_request, request_time=model0_request_time) + +# Generate Possion distribution requests for model1 +for 
model1_request_time in poisson_request_generator(model1_lambda, total_time=max_time): + x = torch.randn(128, 768) + new_request = Request("model1", [x], [], request_queue_idx=0) + scheduler.add_request(new_request, request_time=model1_request_time) # Run scheduler while not scheduler.is_finished(): scheduler.schedule() ``` ## Compiler Optimizations -PyTorchSim compiler supports fusions +PyTorchSim compiler supports several fusion optimizations: - GEMM prologue fusion - GEMM epilogue fusion - GEMM reduction fusion @@ -223,7 +225,7 @@ Depending on tensor shape, use different convolution template - Multi-channel optimization ## Mapping -PyTorchSim provids three mapping strategies +PyTorchSim provides three mapping strategies. ### Heuristic-based mapping We adopt and modified heuristic-based mapping of [GEMMINI](https://github.com/ucb-bar/gemmini) by default, which maximizes the utilization of scratchpad memory. ### Auto-tuning @@ -265,7 +267,7 @@ export TORCHSIM_TILE_N=512 export TORCHSIM_TILE_K=512 ``` ## Compiler Configuration -`PyTorchSimFrontend/extension_config.py` contains target hardware configuration to compile +`PyTorchSimFrontend/extension_config.py` contains target hardware configuration to compile. You can configure these options using environment variables. ```bash @@ -346,11 +348,10 @@ If you use PyTorchSim for your research, please cite the following paper. 
@INPROCEEDINGS{yang2025pytorchsim, author={Yang, Wonhyuk and Shin, Yunseon and Woo, Okkyun and Park, Geonwoo and Ham, Hyungkyu and Kang, Jeehoon and Park, Jongse and Kim, Gwangsun}, title={PyTorchSim: A Comprehensive, Fast, and Accurate NPU Simulation Framework}, - booktitle={2025 58th IEEE/ACM International Symposium on Microarchitecture (MICRO)}, - volume={}, - number={}, - pages={}, + booktitle={Proceedings of the 58th IEEE/ACM International Symposium on Microarchitecture}, + pages={1363–1380}, year={2025}, - doi={10.1145/3725843.3756045} + doi={10.1145/3725843.3756045}, + series={MICRO '25} } ``` \ No newline at end of file From ed18bb6f0d44207ec65107df1e4679be3620bcec Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 28 Oct 2025 23:43:28 +0900 Subject: [PATCH 2/2] Improve README with multi-tenancy examples and details Updated README to enhance clarity and provide code examples for multi-tenancy. --- README.md | 54 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index c86b0685..80329b16 100644 --- a/README.md +++ b/README.md @@ -175,23 +175,51 @@ opt_step() `tests/test_mlp.py` provides an example of MLP training. ## Multi-tenancy -Load generator supports multi-tenancy experiments. You can simply run `tests/test_scheduler.py` +Our load generator supports multi-tenancy experiments. You can run a simple example by executing `tests/test_scheduler.py`. ```bash python tests/test_scheduler.py ``` -Below is an example code of multi-tenancy -`target_model0` and `target_model1` is your own PyTorch model. -You can set the request arrival time and request queue index. Request queue is used for scheduling and you can set the number of queue to each core in [TOGSim configuration](#togsim-configuration). -`poisson_request_generator` generates arrival time of requests in a Poisson distribution with `lamda` and `max_time`. 
-```python -# Init scheduler +Below is example code for multi-tenancy with `resnet18` and `EncoderBlock`. +In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSim config file (`.json`). Each compiled PyTorch model is then registered under a unique model ID. + +```python3 +import os +import sys +import torch +from torchvision.models import resnet18 +from test_transformer import EncoderBlock +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +config = f'{base_path}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' + +sys.path.append(base_path) +from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, backend_config=config) + # Register compiled model +target_model0 = resnet18().eval() +target_model1 = EncoderBlock(768, 12).eval() opt_model0 = torch.compile(target_model0.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device())) SchedulerDNNModel.register_model("model0", opt_model0) SchedulerDNNModel.register_model("model1", opt_model1) +``` +The config file (`.json`) specifies two key items: +- `num_partition`: The total number of independent request queues to create. +- `partition`: Defines the hardware mapping, assigning each queue (identified by its index) to a specific physical core. +For example, the configuration below creates two scheduling queues (`0` and `1`) and maps `core_0` to queue `0` and `core_1` to queue `1`: +``` + "num_partition" : 2, + "partition": { + "core_0":0, + "core_1":1 + } +``` + +Next, DNN model requests are generated and submitted. We provide a `poisson_request_generator` utility, which generates request arrival times. 
+Each `Request` is created with its model name, data, and a `request_queue_idx` to specify its target queue, then added via `scheduler.add_request`. +As shown in the code, `model0` requests are queued to `request_queue_idx=0`, while `model1` requests are queued to `request_queue_idx=1`. +```python3 # Load Generation model0_lambda = 5.0 model1_lambda = 3.0 @@ -202,17 +230,21 @@ for model0_request_time in poisson_request_generator(model0_lambda, total_time=m x = torch.randn(1, 3, 224, 224) new_request = Request("model0", [x], [], request_queue_idx=0) scheduler.add_request(new_request, request_time=model0_request_time) - + # Generate Possion distribution requests for model1 for model1_request_time in poisson_request_generator(model1_lambda, total_time=max_time): x = torch.randn(128, 768) - new_request = Request("model1", [x], [], request_queue_idx=0) + new_request = Request("model1", [x], [], request_queue_idx=1) scheduler.add_request(new_request, request_time=model1_request_time) +``` +Finally, `scheduler.schedule()` is called in a loop until all requests are processed. +```python3 # Run scheduler while not scheduler.is_finished(): scheduler.schedule() ``` + ## Compiler Optimizations PyTorchSim compiler supports several fusion optimizations: - GEMM prologue fusion @@ -220,7 +252,7 @@ PyTorchSim compiler supports several fusion optimizations: - GEMM reduction fusion - CONV epilogue fusion -Depending on tensor shape, use different convolution template +Depending on the tensor shape, PyTorchSim uses a different convolution template: - Single batch optimization - Multi-channel optimization @@ -354,4 +386,4 @@ If you use PyTorchSim for your research, please cite the following paper. doi={10.1145/3725843.3756045}, series={MICRO '25} } -``` \ No newline at end of file +```