PSAL-POSTECH · YWHyuk · Sep 16, 2025 · Aug 11, 2025 · Aug 18, 2025 · Sep 11, 2025
diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
@@ -15,11 +15,12 @@
   "dram_print_interval": 10000,
   "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
   "l2d_type" : "datacache",
-  "l2d_config" : "S:64:128:512,32,L:T:m:W:L,A:192:4,32:0,32",
+  "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32",
 
   "icnt_type" : "simple",
   "icnt_latency" : 7,
-  "icnt_freq" : 38400,
+  "icnt_freq" : 48000,
+  "icnt_node_per_core" : 1,
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",
 
   "precision" : 4,

diff --git a/PyTorchSimBackend/include/L2Cache.h b/PyTorchSimBackend/include/L2Cache.h
@@ -27,6 +27,10 @@ class L2CacheBase {
   CacheConfig l_cache_config; // L2 cache config
   uint32_t l_id;              // L2 partition id
   uint32_t l2d_hit_latency;
+  uint32_t n_read_port = 2; // Number of read ports port_free() grants per core between clear_port() calls (CMEM Read 2TB/s)
+  uint32_t n_write_port = 1; // Number of write ports port_free() grants per core between clear_port() calls (CMEM Write 1TB/s)
+  std::vector<uint32_t> read_port; // Read-port claims per core id since the last clear_port(); sized by L2DataCache's constructor
+  std::vector<uint32_t> write_port; // Write-port claims per core id since the last clear_port(); sized by L2DataCache's constructor
   std::queue<mem_fetch*> *l_to_xbar_queue;
   std::queue<mem_fetch*> *l_from_xbar_queue;
   std::queue<mem_fetch*> l_to_mem_queue;
@@ -47,9 +51,14 @@ class L2DataCache : public L2CacheBase {
 public:
   typedef IntervalTree<new_addr_type, int> CachePlan;
   L2DataCache(std::string name,  CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
-    uint32_t l2d_hit_latency, std::queue<mem_fetch*> *to_xbar_queue,
+    uint32_t l2d_hit_latency, uint32_t num_cores, std::queue<mem_fetch*> *to_xbar_queue,
     std::queue<mem_fetch*> *from_xbar_queue);
   void cycle() override;
   bool push(mem_fetch* req) override;  // Push memory response from DRAM
+  bool port_free(mem_fetch* req); // Claims a read or write port slot for req's core; false once that core's budget is exceeded
+  void clear_port() { std::fill(read_port.begin(), read_port.end(), 0); // Zero every core's read-port claim counter
+                      std::fill(write_port.begin(), write_port.end(), 0); } // ...and every write-port counter; cycle() calls this before its request loop
   virtual void print_stats() override;
+private:
+  uint32_t _n_cores;
 };
diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h
@@ -68,7 +68,7 @@ class TileNode {
 
 class TileGraphParser {
  public:
-  TileGraphParser(std::string onnx_path, std::string attribute_path);
+  TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path);
   std::shared_ptr<TileNode> get_top_loop();
   std::unique_ptr<TileGraph>& get_tile_graph() { return _tile_graph; }
   addr_type lookup(std::string key);
@@ -136,6 +136,7 @@ class TileGraphParser {
   int _loop_stack_pointer = 0;
 
   json _attribute_json;
+  json _config_json;
   std::string _tog_path;
   std::string _attribute_path;
   uint64_t indirect_counter = 0;

diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc
@@ -42,7 +42,7 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) {
             _m_cache_config.get_num_sets(), _m_cache_config.get_num_assoc(),
             _m_cache_config.get_line_size(), _m_cache_config.get_sector_size());
     for (int ch = 0; ch < _n_ch; ch++)
-      _m_caches[ch] = new L2DataCache(name, _m_cache_config, ch, _core_cycles, _config.l2d_hit_latency, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]);
+      _m_caches[ch] = new L2DataCache(name, _m_cache_config, ch, _core_cycles, _config.l2d_hit_latency, _config.num_cores, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]);
   } else {
     spdlog::error("[Config/L2D] Invalid L2 cache type...!");
     exit(EXIT_FAILURE);

diff --git a/PyTorchSimBackend/src/L2Cache.cc b/PyTorchSimBackend/src/L2Cache.cc
@@ -13,21 +13,28 @@ void NoL2Cache::cycle() {
 }
 
 L2DataCache::L2DataCache(std::string name,  CacheConfig &cache_config, uint32_t id,
-  cycle_type *core_cycle, uint32_t l2d_hit_latency,
+  cycle_type *core_cycle, uint32_t l2d_hit_latency, uint32_t num_cores, // num_cores: number of per-core port counters to allocate
   std::queue<mem_fetch*> *to_xbar_queue, std::queue<mem_fetch*> *from_xbar_queue) :
-  L2CacheBase(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue) {
+  L2CacheBase(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue), _n_cores(num_cores) { // _n_cores also scales cycle()'s loop bound
   l_cache = std::make_unique<DataCache>(name, cache_config, id, 0, &l_to_mem_queue);
   l_from_cache_queue = DelayQueue<mem_fetch*>(l_name + "_latency_queue", true, 0);
+  read_port.resize(num_cores, 0); // One read-port claim counter per core, starting at zero
+  write_port.resize(num_cores, 0); // One write-port claim counter per core, starting at zero
 }
 
 bool L2DataCache::push(mem_fetch* req) {
-  if (l_cache->waiting_for_fill(req)) {
-    if (!l_cache->fill_port_free())
-      return false;
-    l_cache->fill(req, *l_core_cycle);
+  bool is_cacheable = req->is_cacheable(); // Same flag cycle() uses to route a request straight to l_to_mem_queue
+  if (!is_cacheable) { // cycle() never handed this request to l_cache, so there is nothing to fill
+    l_to_xbar_queue->push(req); // NOTE(review): forwarded for any access type, while the cacheable path below only forwards GLOBAL_ACC_R/W - confirm intended
   } else {
-    if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
-      l_to_xbar_queue->push(req);
+    if (l_cache->waiting_for_fill(req)) { // The cache reports a pending fill for this request
+      if (!l_cache->fill_port_free())
+        return false; // Nothing has been done with req yet; false tells the caller it was not accepted
+      l_cache->fill(req, *l_core_cycle);
+    } else { // Cacheable, but the cache is not waiting on it
+      if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
+        l_to_xbar_queue->push(req);
+    }
   }
   return true;
 }
@@ -39,63 +46,83 @@ void L2DataCache::cycle() {
   // Mem to Cache
   uint32_t line_size = l_cache_config.get_line_size();
   uint32_t sector_size = l_cache_config.get_sector_size();
+  clear_port();
 
   /* Pass a request to cache */
-  if (!l_from_xbar_queue->empty()) {
-    mem_fetch* req = l_from_xbar_queue->front();
-    /* Check cache plan */
-    bool is_cacheable = req->is_cacheable();
+  for (int i = 0; i < (n_read_port + n_write_port) * _n_cores; i++) {
+    if (!l_from_xbar_queue->empty()) {
+      mem_fetch* req = l_from_xbar_queue->front();
+      /* Check cache plan */
+      bool is_cacheable = req->is_cacheable();
 
-    /* Go to l2 cache */
-    if (is_cacheable && l_cache->data_port_free()) {
-      req->set_access_sector_mask(line_size, sector_size);
-      std::deque<CacheEvent> events;
-      CacheRequestStatus status = l_cache->access(
-          req->get_addr(), *l_core_cycle, req, events);
-      bool write_sent = CacheEvent::was_write_sent(events);
-      bool read_sent = CacheEvent::was_read_sent(events);
-      if (status == HIT) {
-        if (!write_sent) {
-          req->set_reply();
-          req->current_state = "L2HIT";
-          l_from_cache_queue.push(req, l2d_hit_latency);
+      /* Go to l2 cache */
+      if (is_cacheable && l_cache->data_port_free()) {
+        if (!port_free(req)) continue;
+        req->set_access_sector_mask(line_size, sector_size);
+        std::deque<CacheEvent> events;
+        CacheRequestStatus status = l_cache->access(
+            req->get_addr(), *l_core_cycle, req, events);
+        bool write_sent = CacheEvent::was_write_sent(events);
+        bool read_sent = CacheEvent::was_read_sent(events);
+        if (status == HIT) {
+          if (!write_sent) {
+            req->set_reply();
+            req->current_state = "L2HIT";
+            l_from_cache_queue.push(req, l2d_hit_latency);
+          }
+          l_from_xbar_queue->pop();
+        } else if (status != RESERVATION_FAIL) {
+          req->current_state = "L2MISS";
+          if (req->is_write() &&
+              (l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
+                l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
+            req->set_reply();
+            req->current_state = "L2MISS-WRITE";
+            l_from_cache_queue.push(req, l2d_hit_latency);
+          }
+          l_from_xbar_queue->pop();
+        } else {
+          // Status Reservation fail, Retry it
+          assert(!write_sent);
+          assert(!read_sent);
         }
+      } else if (!is_cacheable) {
+        l_to_mem_queue.push(req);
         l_from_xbar_queue->pop();
-      } else if (status != RESERVATION_FAIL) {
-        req->current_state = "L2MISS";
-        if (req->is_write() &&
-            (l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
-              l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
-          req->set_reply();
-          req->current_state = "L2MISS-WRITE";
-          l_from_cache_queue.push(req, l2d_hit_latency);
-        }
-        l_from_xbar_queue->pop();
-      } else {
-        // Status Reservation fail, Retry it
-        assert(!write_sent);
-        assert(!read_sent);
       }
-    } else if (!is_cacheable) {
-      l_to_mem_queue.push(req);
-      l_from_xbar_queue->pop();
     }
-  }
 
-  if (l_cache->access_ready() &&
-      !l_from_cache_queue.full()) {
-    mem_fetch* req = l_cache->top_next_access();
-    if (req->is_request()) req->set_reply();
-    l_from_cache_queue.push(req, l2d_hit_latency);
-    l_cache->pop_next_access();
+    if (l_cache->access_ready() &&
+        !l_from_cache_queue.full()) {
+      mem_fetch* req = l_cache->top_next_access();
+      if (req->is_request()) req->set_reply();
+      l_from_cache_queue.push(req, l2d_hit_latency);
+      l_cache->pop_next_access();
+    }
+
+    if (l_from_cache_queue.arrived()) {
+      mem_fetch* req = l_from_cache_queue.top();
+      if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
+        l_to_xbar_queue->push(req);
+      l_from_cache_queue.pop();
+    }
   }
+}
 
-  if (l_from_cache_queue.arrived()) {
-    mem_fetch* req = l_from_cache_queue.top();
-    if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
-      l_to_xbar_queue->push(req);
-    l_from_cache_queue.pop();
+bool L2DataCache::port_free(mem_fetch* req) { // Not a pure query: every call claims a port slot for req's core
+  int core_id = req->get_core_id(); // NOTE(review): indexes vectors sized to num_cores with no bounds check - confirm the id is always in [0, num_cores)
+  if (req->is_write()) {
+    write_port[core_id]++; // Incremented even when this call goes on to return false; clear_port() resets it
+    if (write_port[core_id] > n_write_port) {
+      return false; // No more write port available. NOTE(review): cycle() reacts with 'continue', which also skips its two response-drain stages for that loop iteration - confirm intended
+    }
+  } else {
+    read_port[core_id]++; // Incremented even when this call goes on to return false; clear_port() resets it
+    if (read_port[core_id] > n_read_port) {
+      return false; // No more read port available
+    }
   }
+  return true; // Port is free, and this call has claimed it
 }
 
 void L2DataCache::print_stats() {

diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc
@@ -699,8 +699,9 @@ void TileLoopNode::print_node() {
   spdlog::debug("{} stride: {} ", spaces, _stride);
 }
 
-TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path) {
+TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path) {
   loadConfig(attribute_path, _attribute_json);
+  loadConfig(config_path, _config_json);
   _attribute_path = attribute_path;
 
   /* Note: this parsing algorithm assume that all node are sorted in topological-order */
@@ -727,7 +728,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
       spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", "));
     }
   }
-  if (_attribute_json.contains("sram_alloc") and _attribute_json.contains("l2d_type") and _attribute_json["l2d_type"] == "datacache") {
+  if (_attribute_json.contains("sram_alloc") and _config_json.contains("l2d_type") and _config_json["l2d_type"] == "datacache") {
     auto sram_alloc_list = _attribute_json["sram_alloc"];
     spdlog::info("[TOGParser/Attribute] ================= SRAM Alloc Plan ================");
     for (auto it = sram_alloc_list.begin(); it != sram_alloc_list.end(); ++it) {

diff --git a/PyTorchSimBackend/src/main.cc b/PyTorchSimBackend/src/main.cc
@@ -12,8 +12,8 @@ namespace po = boost::program_options;
 const char* env_value = std::getenv("BACKENDSIM_DRYRUN");
 bool isDryRun = (env_value != nullptr && std::string(env_value) == "1");
 
-void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, cycle_type request_time=0, int partiton_id=0) {
-  auto graph_praser = TileGraphParser(onnx_path, attribute_path);
+void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) {
+  auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_path);
   std::unique_ptr<TileGraph>& tile_graph = graph_praser.get_tile_graph();
   tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle());
   spdlog::info("[Scheduler {}] Register graph path: {} operation: {} at {}", partiton_id, onnx_path, tile_graph->get_name(), simulator->get_core_cycle());
@@ -46,16 +46,16 @@ void interactive_mode(Simulator* simulator) {
     // Parse the first part of the command (e.g., "launch", "until", "quit")
     iss >> token;
     if (token == "launch") {
-      std::string onnx_path, attribute_path;
+      std::string onnx_path, attribute_path, config_path;
       cycle_type request_time = 0;
       int partition_id = 0;
-      iss >> onnx_path >> attribute_path >> request_time >> partition_id;
+      iss >> config_path >> onnx_path >> attribute_path >> request_time >> partition_id;
 
       // Check if both paths were provided
       if (onnx_path.empty() || attribute_path.empty()) {
         spdlog::error("Error: Please provide both ONNX path and Attribute path in the format: launch onnx/path attribute/path");
       } else {
-        launchKernel(simulator, onnx_path, attribute_path, request_time, partition_id);
+        launchKernel(simulator, onnx_path, attribute_path, config_path, request_time, partition_id);
         std::cerr << "launch done" << std::endl;
       }
     } else if (token == "until") {
@@ -135,7 +135,7 @@ int main(int argc, char** argv) {
     cmd_parser.set_if_defined("attributes_list", &attribute_path);
 
     /* launch kernels */
-    launchKernel(simulator, onnx_path, attribute_path);
+    launchKernel(simulator, onnx_path, attribute_path, config_path);
     simulator->run_simulator();
     if (simulator->get_core_cycle()==0)
       simulator->until(1);

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -113,22 +113,18 @@ def write_header(self):
         self.header.splice(
             f"""
             def sram_plan_prefix(buffer_name, buffer):
-                #if CONFIG_SRAM_BUFFER_PLAN is None:
-                #    return
-                #elif buffer_name not in CONFIG_SRAM_BUFFER_PLAN:
-                #    return
-                buffer_size = buffer.element_size() * buffer.untyped_storage().size()
+                if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN):
+                    return
+                buffer_size = buffer.untyped_storage().size()
                 start = buffer.data_ptr()
                 end = start + buffer_size
                 # print(f'Alloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})')
                 BackendSimulator.sram_alloc(buffer_name, [start, end])
 
             def sram_plan_postfix(buffer_name, buffer):
-                #if CONFIG_SRAM_BUFFER_PLAN is None:
-                #    return
-                #elif buffer_name not in CONFIG_SRAM_BUFFER_PLAN:
-                #    return
-                buffer_size = buffer.element_size() * buffer.untyped_storage().size()
+                if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN):
+                    return
+                buffer_size = buffer.untyped_storage().size()
                 start = buffer.data_ptr()
                 end = start + buffer_size
                 # print(f'Dealloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})')

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -299,7 +299,7 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no
             TILE_K = tile_info["TILE_K"]
         else: # case 2: use gemm_combination_mapping
             min_tile = (n_extra_node + n_prologue_node) == 0
-            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=min_tile)
+            TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=True)
         # case 3: use manual tile size
         if extension_config.CONFIG_MANUAL_TILE_SIZE:
             TILE_M = extension_config.CONFIG_TILE_M

diff --git a/Simulator/simulator.py b/Simulator/simulator.py
@@ -302,7 +302,7 @@ def send_command(self, command):
             return None
 
     def launch(self, onnx_path, attribute_path, arrival_time=0, partion_id=0):
-        command = f"launch {onnx_path} {attribute_path} {arrival_time} {partion_id}"
+        command = f"launch {self.config_path} {onnx_path} {attribute_path} {arrival_time} {partion_id}"  # config path goes first: main.cc's interactive "launch" reads config, onnx, attribute, request time, partition id in that order
         ret = self.send_command(command)
         return 0
 

diff --git a/tpuv4/CONV0_plan.py b/tpuv4/CONV0_plan.py
@@ -0,0 +1,3 @@
+plan = {  # Set literal (not a dict) of names; presumably the buffers covered by the SRAM buffer plan for this workload - TODO confirm
+    "primals_2"
+}
diff --git a/tpuv4/CONV1_plan.py b/tpuv4/CONV1_plan.py
@@ -0,0 +1,3 @@
+plan = {  # Set literal (not a dict) of names; presumably the buffers covered by the SRAM buffer plan for this workload - TODO confirm
+    "primals_2"
+}
diff --git a/tpuv4/CONV2_plan.py b/tpuv4/CONV2_plan.py
@@ -0,0 +1,3 @@
+plan = {  # Set literal (not a dict) of names; presumably the buffers covered by the SRAM buffer plan for this workload - TODO confirm
+    "primals_2"
+}
diff --git a/tpuv4/CONV3_plan.py b/tpuv4/CONV3_plan.py
@@ -0,0 +1,3 @@
+plan = {  # Set literal (not a dict) of names; presumably the buffers covered by the SRAM buffer plan for this workload - TODO confirm
+    "primals_1"
+}
diff --git a/tpuv4/gemm_plan.py b/tpuv4/gemm_plan.py
@@ -0,0 +1,3 @@
+plan = {  # Set literal (not a dict) of names; presumably the buffers covered by the SRAM buffer plan for this workload - TODO confirm
+    "arg0_1"
+}
diff --git a/tpuv4/softmax_plan_2048x2048.py b/tpuv4/softmax_plan_2048x2048.py
@@ -0,0 +1,5 @@
+plan = {  # Set literal (not a dict) of names; presumably the buffers covered by the SRAM buffer plan for this workload - TODO confirm
+    "arg0_1",
+    "buf0",
+    "buf1"
+}
diff --git a/tpuv4/softmax_plan_512x512.py b/tpuv4/softmax_plan_512x512.py
@@ -0,0 +1,5 @@
+plan = {  # Set literal (not a dict) of names; presumably the buffers covered by the SRAM buffer plan for this workload - TODO confirm
+    "arg0_1",
+    "buf0",
+    "buf1"
+}
diff --git a/tpuv4/softmax_plan_8192x8192.py b/tpuv4/softmax_plan_8192x8192.py
@@ -0,0 +1,4 @@
+plan = {  # Set literal (not a dict) of names; presumably the buffers covered by the SRAM buffer plan for this workload - TODO confirm; unlike the smaller softmax plans it omits "arg0_1"
+    "buf0",
+    "buf1"
+}