Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
"dram_print_interval": 10000,
"dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
"l2d_type" : "datacache",
"l2d_config" : "S:64:128:512,32,L:T:m:W:L,A:192:4,32:0,32",
"l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32",

"icnt_type" : "simple",
"icnt_latency" : 7,
"icnt_freq" : 38400,
"icnt_freq" : 48000,
"icnt_node_per_core" : 1,
"icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt",

"precision" : 4,
Expand Down
11 changes: 10 additions & 1 deletion PyTorchSimBackend/include/L2Cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ class L2CacheBase {
CacheConfig l_cache_config; // L2 cache config
uint32_t l_id; // L2 partition id
uint32_t l2d_hit_latency;
uint32_t n_read_port = 2; // Number of read ports (CMEM Read 2TB/s)
uint32_t n_write_port = 1; // Number of write ports (CMEM Write 1TB/s)
std::vector<uint32_t> read_port; // Current read port
std::vector<uint32_t> write_port; // Current write port
std::queue<mem_fetch*> *l_to_xbar_queue;
std::queue<mem_fetch*> *l_from_xbar_queue;
std::queue<mem_fetch*> l_to_mem_queue;
Expand All @@ -47,9 +51,14 @@ class L2DataCache : public L2CacheBase {
public:
typedef IntervalTree<new_addr_type, int> CachePlan;
L2DataCache(std::string name, CacheConfig &cache_config, uint32_t id, cycle_type *core_cycle,
uint32_t l2d_hit_latency, std::queue<mem_fetch*> *to_xbar_queue,
uint32_t l2d_hit_latency, uint32_t num_cores, std::queue<mem_fetch*> *to_xbar_queue,
std::queue<mem_fetch*> *from_xbar_queue);
void cycle() override;
bool push(mem_fetch* req) override; // Push memory response from DRAM
bool port_free(mem_fetch* req);
void clear_port() { std::fill(read_port.begin(), read_port.end(), 0);
std::fill(write_port.begin(), write_port.end(), 0); }
virtual void print_stats() override;
private:
uint32_t _n_cores;
};
3 changes: 2 additions & 1 deletion PyTorchSimBackend/include/TileGraphParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class TileNode {

class TileGraphParser {
public:
TileGraphParser(std::string onnx_path, std::string attribute_path);
TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path);
std::shared_ptr<TileNode> get_top_loop();
std::unique_ptr<TileGraph>& get_tile_graph() { return _tile_graph; }
addr_type lookup(std::string key);
Expand Down Expand Up @@ -136,6 +136,7 @@ class TileGraphParser {
int _loop_stack_pointer = 0;

json _attribute_json;
json _config_json;
std::string _tog_path;
std::string _attribute_path;
uint64_t indirect_counter = 0;
Expand Down
2 changes: 1 addition & 1 deletion PyTorchSimBackend/src/Dram.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) {
_m_cache_config.get_num_sets(), _m_cache_config.get_num_assoc(),
_m_cache_config.get_line_size(), _m_cache_config.get_sector_size());
for (int ch = 0; ch < _n_ch; ch++)
_m_caches[ch] = new L2DataCache(name, _m_cache_config, ch, _core_cycles, _config.l2d_hit_latency, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]);
_m_caches[ch] = new L2DataCache(name, _m_cache_config, ch, _core_cycles, _config.l2d_hit_latency, _config.num_cores, &m_to_crossbar_queue[ch], &m_from_crossbar_queue[ch]);
} else {
spdlog::error("[Config/L2D] Invalid L2 cache type...!");
exit(EXIT_FAILURE);
Expand Down
135 changes: 81 additions & 54 deletions PyTorchSimBackend/src/L2Cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,28 @@ void NoL2Cache::cycle() {
}

L2DataCache::L2DataCache(std::string name, CacheConfig &cache_config, uint32_t id,
cycle_type *core_cycle, uint32_t l2d_hit_latency,
cycle_type *core_cycle, uint32_t l2d_hit_latency, uint32_t num_cores,
std::queue<mem_fetch*> *to_xbar_queue, std::queue<mem_fetch*> *from_xbar_queue) :
L2CacheBase(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue) {
L2CacheBase(name, cache_config, id, core_cycle, l2d_hit_latency, to_xbar_queue, from_xbar_queue), _n_cores(num_cores) {
l_cache = std::make_unique<DataCache>(name, cache_config, id, 0, &l_to_mem_queue);
l_from_cache_queue = DelayQueue<mem_fetch*>(l_name + "_latency_queue", true, 0);
read_port.resize(num_cores, 0);
write_port.resize(num_cores, 0);
}

bool L2DataCache::push(mem_fetch* req) {
if (l_cache->waiting_for_fill(req)) {
if (!l_cache->fill_port_free())
return false;
l_cache->fill(req, *l_core_cycle);
bool is_cacheable = req->is_cacheable();
if (!is_cacheable) {
l_to_xbar_queue->push(req);
} else {
if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
l_to_xbar_queue->push(req);
if (l_cache->waiting_for_fill(req)) {
if (!l_cache->fill_port_free())
return false;
l_cache->fill(req, *l_core_cycle);
} else {
if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
l_to_xbar_queue->push(req);
}
}
return true;
}
Expand All @@ -39,63 +46,83 @@ void L2DataCache::cycle() {
// Mem to Cache
uint32_t line_size = l_cache_config.get_line_size();
uint32_t sector_size = l_cache_config.get_sector_size();
clear_port();

/* Pass a request to cache */
if (!l_from_xbar_queue->empty()) {
mem_fetch* req = l_from_xbar_queue->front();
/* Check cache plan */
bool is_cacheable = req->is_cacheable();
for (int i = 0; i < (n_read_port + n_write_port) * _n_cores; i++) {
if (!l_from_xbar_queue->empty()) {
mem_fetch* req = l_from_xbar_queue->front();
/* Check cache plan */
bool is_cacheable = req->is_cacheable();

/* Go to l2 cache */
if (is_cacheable && l_cache->data_port_free()) {
req->set_access_sector_mask(line_size, sector_size);
std::deque<CacheEvent> events;
CacheRequestStatus status = l_cache->access(
req->get_addr(), *l_core_cycle, req, events);
bool write_sent = CacheEvent::was_write_sent(events);
bool read_sent = CacheEvent::was_read_sent(events);
if (status == HIT) {
if (!write_sent) {
req->set_reply();
req->current_state = "L2HIT";
l_from_cache_queue.push(req, l2d_hit_latency);
/* Go to l2 cache */
if (is_cacheable && l_cache->data_port_free()) {
if (!port_free(req)) continue;
req->set_access_sector_mask(line_size, sector_size);
std::deque<CacheEvent> events;
CacheRequestStatus status = l_cache->access(
req->get_addr(), *l_core_cycle, req, events);
bool write_sent = CacheEvent::was_write_sent(events);
bool read_sent = CacheEvent::was_read_sent(events);
if (status == HIT) {
if (!write_sent) {
req->set_reply();
req->current_state = "L2HIT";
l_from_cache_queue.push(req, l2d_hit_latency);
}
l_from_xbar_queue->pop();
} else if (status != RESERVATION_FAIL) {
req->current_state = "L2MISS";
if (req->is_write() &&
(l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
req->set_reply();
req->current_state = "L2MISS-WRITE";
l_from_cache_queue.push(req, l2d_hit_latency);
}
l_from_xbar_queue->pop();
} else {
// Status Reservation fail, Retry it
assert(!write_sent);
assert(!read_sent);
}
} else if (!is_cacheable) {
l_to_mem_queue.push(req);
l_from_xbar_queue->pop();
} else if (status != RESERVATION_FAIL) {
req->current_state = "L2MISS";
if (req->is_write() &&
(l_cache_config.get_write_alloc_policy() == FETCH_ON_WRITE ||
l_cache_config.get_write_alloc_policy() == LAZY_FETCH_ON_READ)) {
req->set_reply();
req->current_state = "L2MISS-WRITE";
l_from_cache_queue.push(req, l2d_hit_latency);
}
l_from_xbar_queue->pop();
} else {
// Status Reservation fail, Retry it
assert(!write_sent);
assert(!read_sent);
}
} else if (!is_cacheable) {
l_to_mem_queue.push(req);
l_from_xbar_queue->pop();
}
}

if (l_cache->access_ready() &&
!l_from_cache_queue.full()) {
mem_fetch* req = l_cache->top_next_access();
if (req->is_request()) req->set_reply();
l_from_cache_queue.push(req, l2d_hit_latency);
l_cache->pop_next_access();
if (l_cache->access_ready() &&
!l_from_cache_queue.full()) {
mem_fetch* req = l_cache->top_next_access();
if (req->is_request()) req->set_reply();
l_from_cache_queue.push(req, l2d_hit_latency);
l_cache->pop_next_access();
}

if (l_from_cache_queue.arrived()) {
mem_fetch* req = l_from_cache_queue.top();
if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
l_to_xbar_queue->push(req);
l_from_cache_queue.pop();
}
}
}

if (l_from_cache_queue.arrived()) {
mem_fetch* req = l_from_cache_queue.top();
if (req->get_access_type() == GLOBAL_ACC_R || req->get_access_type() == GLOBAL_ACC_W)
l_to_xbar_queue->push(req);
l_from_cache_queue.pop();
/*
 * Try to claim one L2 port for `req`.
 *
 * Port usage is counted per requesting core in read_port / write_port; both
 * vectors are zeroed by clear_port(). Between two clear_port() calls a core
 * may claim at most n_write_port ports for writes and at most n_read_port
 * ports for every other kind of request.
 *
 * req     : request whose core id and read/write direction select the counter.
 * returns : true if a port was claimed for this request, false if the issuing
 *           core has already used every port of the required kind.
 */
bool L2DataCache::port_free(mem_fetch* req) {
  const int core_id = req->get_core_id();
  const bool is_write = req->is_write();

  std::vector<uint32_t>& used_ports = is_write ? write_port : read_port;
  const uint32_t port_limit = is_write ? n_write_port : n_read_port;

  // Bounds-checked access: a negative or too-large core id would otherwise
  // index past the per-core counters and silently corrupt memory. .at() turns
  // that into a well-defined std::out_of_range failure instead.
  uint32_t& used = used_ports.at(static_cast<std::size_t>(core_id));

  // Check before claiming, so a rejected request does not consume a slot.
  // The caller may retry the same request several times within one cycle;
  // keeping the counter saturated at the limit means it always reflects the
  // number of ports actually granted.
  if (used >= port_limit) {
    return false;  // No more ports of this kind available for this core
  }
  ++used;
  return true;  // Port claimed
}

void L2DataCache::print_stats() {
Expand Down
5 changes: 3 additions & 2 deletions PyTorchSimBackend/src/TileGraphParser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -699,8 +699,9 @@ void TileLoopNode::print_node() {
spdlog::debug("{} stride: {} ", spaces, _stride);
}

TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path) {
TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path) {
loadConfig(attribute_path, _attribute_json);
loadConfig(config_path, _config_json);
_attribute_path = attribute_path;

/* Note: this parsing algorithm assume that all node are sorted in topological-order */
Expand All @@ -727,7 +728,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", "));
}
}
if (_attribute_json.contains("sram_alloc") and _attribute_json.contains("l2d_type") and _attribute_json["l2d_type"] == "datacache") {
if (_attribute_json.contains("sram_alloc") and _config_json.contains("l2d_type") and _config_json["l2d_type"] == "datacache") {
auto sram_alloc_list = _attribute_json["sram_alloc"];
spdlog::info("[TOGParser/Attribute] ================= SRAM Alloc Plan ================");
for (auto it = sram_alloc_list.begin(); it != sram_alloc_list.end(); ++it) {
Expand Down
12 changes: 6 additions & 6 deletions PyTorchSimBackend/src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ namespace po = boost::program_options;
const char* env_value = std::getenv("BACKENDSIM_DRYRUN");
bool isDryRun = (env_value != nullptr && std::string(env_value) == "1");

void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, cycle_type request_time=0, int partiton_id=0) {
auto graph_praser = TileGraphParser(onnx_path, attribute_path);
void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) {
auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_path);
std::unique_ptr<TileGraph>& tile_graph = graph_praser.get_tile_graph();
tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle());
spdlog::info("[Scheduler {}] Register graph path: {} operation: {} at {}", partiton_id, onnx_path, tile_graph->get_name(), simulator->get_core_cycle());
Expand Down Expand Up @@ -46,16 +46,16 @@ void interactive_mode(Simulator* simulator) {
// Parse the first part of the command (e.g., "launch", "until", "quit")
iss >> token;
if (token == "launch") {
std::string onnx_path, attribute_path;
std::string onnx_path, attribute_path, config_path;
cycle_type request_time = 0;
int partition_id = 0;
iss >> onnx_path >> attribute_path >> request_time >> partition_id;
iss >> config_path >> onnx_path >> attribute_path >> request_time >> partition_id;

// Check if both paths were provided
if (onnx_path.empty() || attribute_path.empty()) {
spdlog::error("Error: Please provide both ONNX path and Attribute path in the format: launch onnx/path attribute/path");
} else {
launchKernel(simulator, onnx_path, attribute_path, request_time, partition_id);
launchKernel(simulator, onnx_path, attribute_path, config_path, request_time, partition_id);
std::cerr << "launch done" << std::endl;
}
} else if (token == "until") {
Expand Down Expand Up @@ -135,7 +135,7 @@ int main(int argc, char** argv) {
cmd_parser.set_if_defined("attributes_list", &attribute_path);

/* launch kernels */
launchKernel(simulator, onnx_path, attribute_path);
launchKernel(simulator, onnx_path, attribute_path, config_path);
simulator->run_simulator();
if (simulator->get_core_cycle()==0)
simulator->until(1);
Expand Down
16 changes: 6 additions & 10 deletions PyTorchSimFrontend/mlir/mlir_codegen_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,22 +113,18 @@ def write_header(self):
self.header.splice(
f"""
def sram_plan_prefix(buffer_name, buffer):
#if CONFIG_SRAM_BUFFER_PLAN is None:
# return
#elif buffer_name not in CONFIG_SRAM_BUFFER_PLAN:
# return
buffer_size = buffer.element_size() * buffer.untyped_storage().size()
if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN):
return
buffer_size = buffer.untyped_storage().size()
start = buffer.data_ptr()
end = start + buffer_size
# print(f'Alloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})')
BackendSimulator.sram_alloc(buffer_name, [start, end])

def sram_plan_postfix(buffer_name, buffer):
#if CONFIG_SRAM_BUFFER_PLAN is None:
# return
#elif buffer_name not in CONFIG_SRAM_BUFFER_PLAN:
# return
buffer_size = buffer.element_size() * buffer.untyped_storage().size()
if CONFIG_SRAM_BUFFER_PLAN and (buffer_name not in CONFIG_SRAM_BUFFER_PLAN):
return
buffer_size = buffer.untyped_storage().size()
start = buffer.data_ptr()
end = start + buffer_size
# print(f'Dealloc {{buffer_name}}(0x{{start:x}} ~ 0x{{end:x}})')
Expand Down
2 changes: 1 addition & 1 deletion PyTorchSimFrontend/mlir/mlir_gemm_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no
TILE_K = tile_info["TILE_K"]
else: # case 2: use gemm_combination_mapping
min_tile = (n_extra_node + n_prologue_node) == 0
TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=min_tile)
TILE_M, TILE_N, TILE_K = kernel.gemm_combination_mapping(M, N, K, max(len(n_extra_read)-2, 0), n_prologue_node, min_tile=True)
# case 3: use manual tile size
if extension_config.CONFIG_MANUAL_TILE_SIZE:
TILE_M = extension_config.CONFIG_TILE_M
Expand Down
2 changes: 1 addition & 1 deletion Simulator/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def send_command(self, command):
return None

def launch(self, onnx_path, attribute_path, arrival_time=0, partion_id=0):
command = f"launch {onnx_path} {attribute_path} {arrival_time} {partion_id}"
command = f"launch {self.config_path} {onnx_path} {attribute_path} {arrival_time} {partion_id}"
ret = self.send_command(command)
return 0

Expand Down
3 changes: 3 additions & 0 deletions tpuv4/CONV0_plan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
plan = {
"primals_2"
}
3 changes: 3 additions & 0 deletions tpuv4/CONV1_plan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
plan = {
"primals_2"
}
3 changes: 3 additions & 0 deletions tpuv4/CONV2_plan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
plan = {
"primals_2"
}
3 changes: 3 additions & 0 deletions tpuv4/CONV3_plan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
plan = {
"primals_1"
}
3 changes: 3 additions & 0 deletions tpuv4/gemm_plan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
plan = {
"arg0_1"
}
5 changes: 5 additions & 0 deletions tpuv4/softmax_plan_2048x2048.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
plan = {
"arg0_1",
"buf0",
"buf1"
}
5 changes: 5 additions & 0 deletions tpuv4/softmax_plan_512x512.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
plan = {
"arg0_1",
"buf0",
"buf1"
}
4 changes: 4 additions & 0 deletions tpuv4/softmax_plan_8192x8192.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
plan = {
"buf0",
"buf1"
}