diff --git a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json index 4b4df4e6..a93e8ae2 100644 --- a/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ b/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json @@ -15,11 +15,12 @@ "dram_print_interval": 10000, "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "l2d_type" : "datacache", - "l2d_config" : "S:64:128:512,32,L:T:m:W:L,A:192:4,32:0,32", + "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", "icnt_type" : "simple", "icnt_latency" : 7, - "icnt_freq" : 38400, + "icnt_freq" : 48000, + "icnt_node_per_core" : 1, "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", "precision" : 4, diff --git a/PyTorchSimBackend/include/L2Cache.h b/PyTorchSimBackend/include/L2Cache.h index e822e6be..ba851b0b 100644 --- a/PyTorchSimBackend/include/L2Cache.h +++ b/PyTorchSimBackend/include/L2Cache.h @@ -27,6 +27,10 @@ class L2CacheBase { CacheConfig l_cache_config; // L2 cache config uint32_t l_id; // L2 partition id uint32_t l2d_hit_latency; + uint32_t n_read_port = 2; // Number of read ports (CMEM Read 2TB/s) + uint32_t n_write_port = 1; // Number of write ports (CMEM Write 1TB/s) + std::vector read_port; // Current read port + std::vector write_port; // Current write port std::queue *l_to_xbar_queue; std::queue *l_from_xbar_queue; std::queue l_to_mem_queue; @@ -47,9 +51,14 @@ class L2DataCache : public L2CacheBase { public: typedef IntervalTree CachePlan; L2DataCache(std::string name, CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle, - uint32_t l2d_hit_latency, std::queue *to_xbar_queue, + uint32_t l2d_hit_latency, uint32_t num_cores, std::queue *to_xbar_queue, std::queue *from_xbar_queue); void cycle() override; bool push(mem_fetch* req) override; // Push memory response from DRAM + bool port_free(mem_fetch* req); + void clear_port() { std::fill(read_port.begin(), read_port.end(), 0); + std::fill(write_port.begin(), write_port.end(), 0); } virtual void print_stats() override; +private: + uint32_t _n_cores; }; \ No newline at end of file diff --git a/PyTorchSimBackend/include/TileGraphParser.h b/PyTorchSimBackend/include/TileGraphParser.h index 5b561127..9cc61d4a 100644 --- a/PyTorchSimBackend/include/TileGraphParser.h +++ b/PyTorchSimBackend/include/TileGraphParser.h @@ -68,7 +68,7 @@ class TileNode { class TileGraphParser { public: - TileGraphParser(std::string onnx_path, std::string attribute_path); + TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path); std::shared_ptr get_top_loop(); std::unique_ptr& get_tile_graph() { return _tile_graph; } addr_type lookup(std::string key); @@ -136,6 +136,7 @@ class TileGraphParser { int _loop_stack_pointer = 0; json _attribute_json; + json _config_json; std::string _tog_path; std::string _attribute_path; uint64_t indirect_counter = 0; diff --git a/PyTorchSimBackend/src/Dram.cc b/PyTorchSimBackend/src/Dram.cc index e604f73f..ab074bda 100644 --- a/PyTorchSimBackend/src/Dram.cc +++ b/PyTorchSimBackend/src/Dram.cc @@ -42,7 +42,7 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _m_cache_config.get_num_sets(), _m_cache_config.get_num_assoc(), _m_cache_config.get_line_size(), _m_cache_config.get_sector_size()); for (int ch = 0; ch < _n_ch; ch++) - _m_caches[ch] = new L2DataCache(name, _m_cache_config, ch, _core_cycles, _config.l2d_hit_latency, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]); + _m_caches[ch] = new L2DataCache(name, _m_cache_config, ch, _core_cycles, _config.l2d_hit_latency, _config.num_cores, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]); } else { spdlog::error("[Config/L2D] Invalid L2 cache type...!"); exit(EXIT_FAILURE); diff --git a/PyTorchSimBackend/src/L2Cache.cc b/PyTorchSimBackend/src/L2Cache.cc index 14c9b9da..a02f6e2b 100644 --- a/PyTorchSimBackend/src/L2Cache.cc +++ b/PyTorchSimBackend/src/L2Cache.cc @@ -13,21 +13,28 @@ void NoL2Cache::cycle() { } L2DataCache::L2DataCache(std::string name, CacheConfig &cache_config, uint32_t id, - cycle_type *core_cycle, uint32_t l2d_hit_latency, + cycle_type *core_cycle, uint32_t l2d_hit_latency, uint32_t num_cores, std::queue *to_xbar_queue, std::queue *from_xbar_queue) : - L2CacheBase(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue) { + L2CacheBase(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue), _n_cores(num_cores) { l_cache = std::make_unique(name, cache_config, id, 0, &l_to_mem_queue); l_from_cache_queue = DelayQueue(l_name + "_latency_queue", true, 0); + read_port.resize(num_cores, 0); + write_port.resize(num_cores, 0); } bool L2DataCache::push(mem_fetch* req) { - if (l_cache->waiting_for_fill(req)) { - if (!l_cache->fill_port_free()) - return false; - l_cache->fill(req, *l_core_cycle); + bool is_cacheable = req->is_cacheable(); + if (!is_cacheable) { + l_to_xbar_queue->push(req); } else { - if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W) - l_to_xbar_queue->push(req); + if (l_cache->waiting_for_fill(req)) { + if (!l_cache->fill_port_free()) + return false; + l_cache->fill(req, *l_core_cycle); + } else { + if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W) + l_to_xbar_queue->push(req); + } } return true; } @@ -39,63 +46,83 @@ void L2DataCache::cycle() { // Mem to Cache uint32_t line_size = l_cache_config.get_line_size(); uint32_t sector_size = l_cache_config.get_sector_size(); + clear_port(); /* Pass a request to cache */ - if (!l_from_xbar_queue->empty()) { - mem_fetch* req = l_from_xbar_queue->front(); - /* Check cache plan */ - bool is_cacheable = req->is_cacheable(); + for (int i = 0; i < (n_read_port + n_write_port) * _n_cores; i++) { + if (!l_from_xbar_queue->empty()) { + mem_fetch* req = l_from_xbar_queue->front(); + /* Check cache plan */ + bool is_cacheable = req->is_cacheable(); - /* Go to l2 cache */ - if (is_cacheable && l_cache->data_port_free()) { - req->set_access_sector_mask(line_size, sector_size); - std::deque events; - CacheRequestStatus status = l_cache->access( - req->get_addr(), *l_core_cycle, req, events); - bool write_sent = CacheEvent::was_write_sent(events); - bool read_sent = CacheEvent::was_read_sent(events); - if (status == HIT) { - if (!write_sent) { - req->set_reply(); - req->current_state = "L2HIT"; - l_from_cache_queue.push(req, l2d_hit_latency); + /* Go to l2 cache */ + if (is_cacheable && l_cache->data_port_free()) { + if (!port_free(req)) continue; + req->set_access_sector_mask(line_size, sector_size); + std::deque events; + CacheRequestStatus status = l_cache->access( + req->get_addr(), *l_core_cycle, req, events); + bool write_sent = CacheEvent::was_write_sent(events); + bool read_sent = CacheEvent::was_read_sent(events); + if (status == HIT) { + if (!write_sent) { + req->set_reply(); + req->current_state = "L2HIT"; + l_from_cache_queue.push(req, l2d_hit_latency); + } + l_from_xbar_queue->pop(); + } else if (status != RESERVATION_FAIL) { + req->current_state = "L2MISS"; + if (req->is_write() && + (l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE || + l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) { + req->set_reply(); + req->current_state = "L2MISS-WRITE"; + l_from_cache_queue.push(req, l2d_hit_latency); + } + l_from_xbar_queue->pop(); + } else { + // Status Reservation fail, Retry it + assert(!write_sent); + assert(!read_sent); } + } else if (!is_cacheable) { + l_to_mem_queue.push(req); l_from_xbar_queue->pop(); - } else if (status != RESERVATION_FAIL) { - req->current_state = "L2MISS"; - if (req->is_write() && - (l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE || - l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) { - req->set_reply(); - req->current_state = "L2MISS-WRITE"; - l_from_cache_queue.push(req, l2d_hit_latency); - } - l_from_xbar_queue->pop(); - } else { - // Status Reservation fail, Retry it - assert(!write_sent); - assert(!read_sent); } - } else if (!is_cacheable) { - l_to_mem_queue.push(req); - l_from_xbar_queue->pop(); } - } - if (l_cache->access_ready() && - !l_from_cache_queue.full()) { - mem_fetch* req = l_cache->top_next_access(); - if (req->is_request()) req->set_reply(); - l_from_cache_queue.push(req, l2d_hit_latency); - l_cache->pop_next_access(); + if (l_cache->access_ready() && + !l_from_cache_queue.full()) { + mem_fetch* req = l_cache->top_next_access(); + if (req->is_request()) req->set_reply(); + l_from_cache_queue.push(req, l2d_hit_latency); + l_cache->pop_next_access(); + } + + if (l_from_cache_queue.arrived()) { + mem_fetch* req = l_from_cache_queue.top(); + if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W) + l_to_xbar_queue->push(req); + l_from_cache_queue.pop(); + } } +} - if (l_from_cache_queue.arrived()) { - mem_fetch* req = l_from_cache_queue.top(); - if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W) - l_to_xbar_queue->push(req); - l_from_cache_queue.pop(); +bool L2DataCache::port_free(mem_fetch* req) { + int core_id = req->get_core_id(); + if (req->is_write()) { + write_port[core_id]++; + if (write_port[core_id] > n_write_port) { + return false; // No more write port available + } + } else { + read_port[core_id]++; + if (read_port[core_id] > n_read_port) { + return false; // No more read port available + } } + return true; // Port is free } void L2DataCache::print_stats() { diff --git a/PyTorchSimBackend/src/TileGraphParser.cc b/PyTorchSimBackend/src/TileGraphParser.cc index 9374dcb5..4a562724 100644 --- a/PyTorchSimBackend/src/TileGraphParser.cc +++ b/PyTorchSimBackend/src/TileGraphParser.cc @@ -699,8 +699,9 @@ void TileLoopNode::print_node() { spdlog::debug("{} stride: {} ", spaces, _stride); } -TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path) { +TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path) { loadConfig(attribute_path, _attribute_json); + loadConfig(config_path, _config_json); _attribute_path = attribute_path; /* Note: this parsing algorithm assume that all node are sorted in topological-order */ @@ -727,7 +728,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", ")); } } - if (_attribute_json.contains("sram_alloc") and _attribute_json.contains("l2d_type") and _attribute_json["l2d_type"] == "datacache") { + if (_attribute_json.contains("sram_alloc") and _config_json.contains("l2d_type") and _config_json["l2d_type"] == "datacache") { auto sram_alloc_list = _attribute_json["sram_alloc"]; spdlog::info("[TOGParser/Attribute] ================= SRAM Alloc Plan ================"); for (auto it = sram_alloc_list.begin(); it != sram_alloc_list.end(); ++it) { diff --git a/PyTorchSimBackend/src/main.cc b/PyTorchSimBackend/src/main.cc index ecdd85aa..214e7131 100644 --- a/PyTorchSimBackend/src/main.cc +++ b/PyTorchSimBackend/src/main.cc @@ -12,8 +12,8 @@ namespace po = boost::program_options; const char* env_value = std::getenv("BACKENDSIM_DRYRUN"); bool isDryRun = (env_value != nullptr && std::string(env_value) == "1"); -void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, cycle_type request_time=0, int partiton_id=0) { - auto graph_praser = TileGraphParser(onnx_path, attribute_path); +void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) { + auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_path); std::unique_ptr& tile_graph = graph_praser.get_tile_graph(); tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle()); spdlog::info("[Scheduler {}] Register graph path: {} operation: {} at {}", partiton_id, onnx_path, tile_graph->get_name(), simulator->get_core_cycle()); @@ -46,16 +46,16 @@ void interactive_mode(Simulator* simulator) { // Parse the first part of the command (e.g., "launch", "until", "quit") iss >> token; if (token == "launch") { - std::string onnx_path, attribute_path; + std::string onnx_path, attribute_path, config_path; cycle_type request_time = 0; int partition_id = 0; - iss >> onnx_path >> attribute_path >> request_time >> partition_id; + iss >> config_path >> onnx_path >> attribute_path >> request_time >> partition_id; // Check if both paths were provided if (onnx_path.empty() || attribute_path.empty()) { spdlog::error("Error: Please provide both ONNX path and Attribute path in the format: launch onnx/path attribute/path"); } else { - launchKernel(simulator, onnx_path, attribute_path, request_time, partition_id); + launchKernel(simulator, onnx_path, attribute_path, config_path, request_time, partition_id); std::cerr << "launch done" << std::endl; } } else if (token == "until") { @@ -135,7 +135,7 @@ int main(int argc, char** argv) { cmd_parser.set_if_defined("attributes_list", &attribute_path); /* launch kernels */ - launchKernel(simulator, onnx_path, attribute_path); + launchKernel(simulator, onnx_path, attribute_path, config_path); simulator->run_simulator(); if (simulator->get_core_cycle()==0) simulator->until(1); diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 79677a3d..21d2868e 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -113,22 +113,18 @@ def write_header(self): self.header.splice( f""" def sram_plan_prefix(buffer_name, buffer): - #if CONFIG_SRAM_BUFFER_PLAN is None: - # return - #elif buffer_name not in CONFIG_SRAM_BUFFER_PLAN: - # return - buffer_size = buffer.element_size() * buffer.untyped_storage().size() + if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN): + return + buffer_size = buffer.untyped_storage().size() start = buffer.data_ptr() end = start + buffer_size # print(f'Alloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') BackendSimulator.sram_alloc(buffer_name, [start, end]) def sram_plan_postfix(buffer_name, buffer): - #if CONFIG_SRAM_BUFFER_PLAN is None: - # return - #elif buffer_name not in CONFIG_SRAM_BUFFER_PLAN: - # return - buffer_size = buffer.element_size() * buffer.untyped_storage().size() + if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN): + return + buffer_size = buffer.untyped_storage().size() start = buffer.data_ptr() end = start + buffer_size # print(f'Dealloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})') diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index 5b339189..ae793c06 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -299,7 +299,7 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no TILE_K = tile_info["TILE_K"] else: # case 2: use gemm_combination_mapping min_tile = (n_extra_node + n_prologue_node) == 0 - TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=min_tile) + TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=True) # case 3: use manual tile size if extension_config.CONFIG_MANUAL_TILE_SIZE: TILE_M = extension_config.CONFIG_TILE_M diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 292c5a9b..81970cbe 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -302,7 +302,7 @@ def send_command(self, command): return None def launch(self, onnx_path, attribute_path, arrival_time=0, partion_id=0): - command = f"launch {onnx_path} {attribute_path} {arrival_time} {partion_id}" + command = f"launch {self.config_path} {onnx_path} {attribute_path} {arrival_time} {partion_id}" ret = self.send_command(command) return 0 diff --git a/tpuv4/CONV0_plan.py b/tpuv4/CONV0_plan.py new file mode 100644 index 00000000..02a8a2d9 --- /dev/null +++ b/tpuv4/CONV0_plan.py @@ -0,0 +1,3 @@ +plan = { + "primals_2" +} \ No newline at end of file diff --git a/tpuv4/CONV1_plan.py b/tpuv4/CONV1_plan.py new file mode 100644 index 00000000..02a8a2d9 --- /dev/null +++ b/tpuv4/CONV1_plan.py @@ -0,0 +1,3 @@ +plan = { + "primals_2" +} \ No newline at end of file diff --git a/tpuv4/CONV2_plan.py b/tpuv4/CONV2_plan.py new file mode 100644 index 00000000..02a8a2d9 --- /dev/null +++ b/tpuv4/CONV2_plan.py @@ -0,0 +1,3 @@ +plan = { + "primals_2" +} \ No newline at end of file diff --git a/tpuv4/CONV3_plan.py b/tpuv4/CONV3_plan.py new file mode 100644 index 00000000..2091c6ca --- /dev/null +++ b/tpuv4/CONV3_plan.py @@ -0,0 +1,3 @@ +plan = { + "primals_1" +} \ No newline at end of file diff --git a/tpuv4/gemm_plan.py b/tpuv4/gemm_plan.py new file mode 100644 index 00000000..b54d43cb --- /dev/null +++ b/tpuv4/gemm_plan.py @@ -0,0 +1,3 @@ +plan = { + "arg0_1" +} \ No newline at end of file diff --git a/tpuv4/softmax_plan_2048x2048.py b/tpuv4/softmax_plan_2048x2048.py new file mode 100644 index 00000000..4c36867f --- /dev/null +++ b/tpuv4/softmax_plan_2048x2048.py @@ -0,0 +1,5 @@ +plan = { + "arg0_1", + "buf0", + "buf1" +} \ No newline at end of file diff --git a/tpuv4/softmax_plan_512x512.py b/tpuv4/softmax_plan_512x512.py new file mode 100644 index 00000000..4c36867f --- /dev/null +++ b/tpuv4/softmax_plan_512x512.py @@ -0,0 +1,5 @@ +plan = { + "arg0_1", + "buf0", + "buf1" +} \ No newline at end of file diff --git a/tpuv4/softmax_plan_8192x8192.py b/tpuv4/softmax_plan_8192x8192.py new file mode 100644 index 00000000..95cb9575 --- /dev/null +++ b/tpuv4/softmax_plan_8192x8192.py @@ -0,0 +1,4 @@ +plan = { + "buf0", + "buf1" +} \ No newline at end of file