Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 39 additions & 40 deletions examples/scripts/code_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,47 +530,26 @@ def run(self) -> None:
" export PTO_ISA_ROOT=$(pwd)/examples/scripts/_deps/pto-isa"
)

# Step 1: Build runtime
# Step 1: Build runtime, orchestration, and kernels in parallel
# (they are independent — all only need kernel_compiler which is ready)
logger.info(f"=== Building Runtime: {self.runtime_name} (platform: {self.platform}) ===")
builder = RuntimeBuilder(platform=self.platform)
kernel_compiler = builder.get_kernel_compiler()

try:
host_binary, aicpu_binary, aicore_binary = builder.build(
self.runtime_name,
)
except Exception as e:
raise RuntimeError(
f"Failed to build runtime '{self.runtime_name}' for platform '{self.platform}'.\n"
f"Error: {e}"
) from e

# Step 2: Load runtime and set device
logger.info(f"=== Loading Runtime ({len(host_binary)} bytes) ===")
Runtime = bind_host_binary(host_binary)

logger.info(f"=== Setting Device {self.device_id} ===")
set_device(self.device_id)

# Step 3: Compile orchestration
logger.info("=== Compiling Orchestration ===")

orch_so_binary = kernel_compiler.compile_orchestration(
self.runtime_name,
self.orchestration["source"],
)
from concurrent.futures import ThreadPoolExecutor, Future

# Step 4: Compile kernels (will be registered during runtime.initialize)
logger.info("=== Compiling Kernels ===")
runtime_include_dirs = [
os.path.join(self.project_root, "src", "runtime", self.runtime_name, "runtime")
]

# Build list of (func_id, binary) tuples for passing to runtime.initialize()
kernel_binaries = []
# Prepare runtime include directories for kernel compilation
runtime_include_dirs = []
runtime_dir = os.path.join(self.project_root, "src", "runtime", self.runtime_name, "runtime")
runtime_include_dirs.append(runtime_dir)
def _build_runtime():
return builder.build(self.runtime_name)

from concurrent.futures import ThreadPoolExecutor
def _compile_orchestration():
return kernel_compiler.compile_orchestration(
self.runtime_name,
self.orchestration["source"],
)

def _compile_one_kernel(kernel):
logger.info(f"Compiling kernel: {kernel['source']} (func_id={kernel['func_id']})")
Expand All @@ -580,19 +559,39 @@ def _compile_one_kernel(kernel):
pto_isa_root=pto_isa_root,
extra_include_dirs=runtime_include_dirs,
)
# For sim platform: keep complete .so for dlopen (supports external symbols like std::exp)
# For real hardware: extract .text section (ccec compiled kernels don't depend on external symbols)
if self.platform == "a2a3sim":
kernel_bin = incore_o # Complete .so for dlopen
kernel_bin = incore_o
else:
kernel_bin = extract_text_section(incore_o) # .text only for mmap
kernel_bin = extract_text_section(incore_o)
return (kernel["func_id"], kernel_bin)

with ThreadPoolExecutor(max_workers=len(self.kernels)) as executor:
kernel_binaries = list(executor.map(_compile_one_kernel, self.kernels))
# Launch all compilations concurrently
max_workers = 2 + len(self.kernels) # runtime + orchestration + kernels
with ThreadPoolExecutor(max_workers=max_workers) as executor:
fut_runtime = executor.submit(_build_runtime)
fut_orch = executor.submit(_compile_orchestration)
fut_kernels = [executor.submit(_compile_one_kernel, k) for k in self.kernels]

try:
host_binary, aicpu_binary, aicore_binary = fut_runtime.result()
except Exception as e:
raise RuntimeError(
f"Failed to build runtime '{self.runtime_name}' for platform '{self.platform}'.\n"
f"Error: {e}"
) from e

orch_so_binary = fut_orch.result()
kernel_binaries = [f.result() for f in fut_kernels]

logger.info(f"Compiled {len(kernel_binaries)} kernel(s)")

# Step 2: Load runtime and set device
logger.info(f"=== Loading Runtime ({len(host_binary)} bytes) ===")
Runtime = bind_host_binary(host_binary)

logger.info(f"=== Setting Device {self.device_id} ===")
set_device(self.device_id)

# Step 5: Run each parameter set
total_cases = len(self.params_list)
for case_idx, params in enumerate(self.params_list):
Expand Down
2 changes: 0 additions & 2 deletions src/platform/include/host/performance_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
#ifndef PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_
#define PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_

#include <chrono>
#include <optional>
#include <string>
#include <vector>

Expand Down
2 changes: 2 additions & 0 deletions src/platform/src/performance_collector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
#include "host/performance_collector.h"

#include <algorithm>
#include <chrono>
#include <fstream>
#include <iomanip>
#include <optional>
#include <sys/stat.h>
#include <sys/types.h>
#include <ctime>
Expand Down
153 changes: 153 additions & 0 deletions src/runtime/tensormap_and_ringbuffer/runtime/common.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#include "common.h"

#ifdef __linux__
#include <cxxabi.h>
#include <dlfcn.h>
#include <execinfo.h>
#include <unistd.h>

#include <array>
#include <cstring>
#include <vector>
#endif

/**
 * Translate an address into "function at file:line" text by running addr2line.
 *
 * addr2line is invoked with -f -C -p -i. The -i flag expands inlined calls, so
 * the first output line is the innermost (actual) code location and any
 * following lines are the enclosing inlined callers.
 *
 * @param executable   Path of the binary or shared object given to `addr2line -e`.
 * @param addr         Address to look up; passed to addr2line in hexadecimal.
 * @param inline_chain Optional output. It is written only when addr2line reports
 *                     more than one line, and then receives one
 *                     " [inlined by] ..." line per outer caller.
 * @return The first output line, or an empty string when the path is null or
 *         empty, addr2line produced no output, or the output contains "??".
 */
#ifdef __linux__
static std::string addr_to_line(const char* executable, void* addr,
                                std::string* inline_chain = nullptr) {
    if (executable == nullptr || *executable == '\0') {
        return "";
    }

    // The command goes through the shell (popen), so the path is wrapped in
    // single quotes and embedded quotes are escaped as '\''. This keeps paths
    // containing spaces or shell metacharacters intact and stops them from
    // being interpreted as extra commands. Building the command in a
    // std::string also avoids truncating long paths.
    std::string cmd = "addr2line -e '";
    for (const char* p = executable; *p != '\0'; ++p) {
        if (*p == '\'') {
            cmd += "'\\''";
        } else {
            cmd += *p;
        }
    }
    cmd += "' -f -C -p -i ";

    // Always print the address as 0x<hex>; "%p" renders a null pointer as
    // "(nil)", which the shell would reject as a syntax error.
    char addr_text[2 + 2 * sizeof(size_t) + 1];
    snprintf(addr_text, sizeof(addr_text), "0x%zx", reinterpret_cast<size_t>(addr));
    cmd += addr_text;
    cmd += " 2>/dev/null";

    std::string raw_output;
    FILE* pipe = popen(cmd.c_str(), "r");
    if (pipe != nullptr) {
        std::array<char, 256> buffer;
        while (fgets(buffer.data(), static_cast<int>(buffer.size()), pipe) != nullptr) {
            raw_output += buffer.data();
        }
        pclose(pipe);
    }

    // "??" is what addr2line prints for anything it could not resolve.
    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
        return "";
    }

    // Split the output into non-empty lines, dropping trailing '\r'.
    std::vector<std::string> lines;
    size_t pos = 0;
    while (pos < raw_output.size()) {
        size_t nl = raw_output.find('\n', pos);
        if (nl == std::string::npos) {
            nl = raw_output.size();
        }
        std::string line = raw_output.substr(pos, nl - pos);
        while (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        if (!line.empty()) {
            lines.push_back(line);
        }
        pos = nl + 1;
    }

    if (lines.empty()) {
        return "";
    }

    // The first line is the innermost code location; the remaining lines are
    // the outer inlined callers.
    if (inline_chain != nullptr && lines.size() > 1) {
        inline_chain->clear();
        for (size_t j = 1; j < lines.size(); j++) {
            *inline_chain += " [inlined by] " + lines[j] + "\n";
        }
    }

    return lines.front();
}
#endif

/**
 * Capture the current call stack as text, including file paths and line
 * numbers where they can be resolved.
 *
 * Each frame's containing shared object is located with dladdr, and addr2line
 * is run on the address relative to that object's load base. If that yields
 * nothing, the absolute address is tried. If addr2line cannot resolve the
 * frame at all, the backtrace_symbols() text is used instead, with the mangled
 * name demangled when possible.
 *
 * @param skip_frames Number of innermost frames to omit from the output.
 *                    Negative values are treated as 0.
 * @return The formatted stack trace. On Linux it is empty if
 *         backtrace_symbols() fails; on other platforms it is a fixed notice.
 */
std::string get_stacktrace(int skip_frames) {
    std::string result;
#ifdef __linux__
    // A negative skip count would index before the start of the frame arrays.
    if (skip_frames < 0) {
        skip_frames = 0;
    }

    const int max_frames = 64;
    void* buffer[max_frames];
    const int nframes = backtrace(buffer, max_frames);
    char** symbols = backtrace_symbols(buffer, nframes);
    if (symbols == nullptr) {
        return result;
    }

    result = "调用栈:\n";
    for (int i = skip_frames; i < nframes; i++) {
        std::string frame_info;
        std::string inline_chain;

        // Look up one byte before the recorded address.
        // NOTE(review): presumably so the lookup lands on the call instruction
        // rather than on the statement after it - confirm.
        void* addr = static_cast<char*>(buffer[i]) - 1;

        Dl_info dl_info;
        if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
            // First try the offset relative to the object's load base.
            void* rel_addr = reinterpret_cast<void*>(
                static_cast<char*>(addr) - static_cast<char*>(dl_info.dli_fbase));
            std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);

            // Fall back to the absolute address if the relative lookup found nothing.
            if (addr2line_result.empty()) {
                addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
            }

            if (!addr2line_result.empty()) {
                frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
            }
        }

        if (frame_info.empty()) {
            // No addr2line result: use the backtrace_symbols() text and try to
            // demangle the name found between '(' and '+'.
            std::string frame(symbols[i]);

            const size_t start = frame.find('(');
            const size_t end = frame.find('+', start);
            if (start != std::string::npos && end != std::string::npos) {
                const std::string mangled = frame.substr(start + 1, end - start - 1);
                int status = 0;
                char* demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
                if (status == 0 && demangled) {
                    frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
                }
                free(demangled);  // free(nullptr) is a no-op
            }
            frame_info = frame;
        }

        char buf[16];
        snprintf(buf, sizeof(buf), " #%d ", i - skip_frames);
        result += buf + frame_info + "\n";
        if (!inline_chain.empty()) {
            result += inline_chain;
        }
    }
    free(symbols);
#else
    (void)skip_frames;
    result = "(调用栈仅在 Linux 上可用)\n";
#endif
    return result;
}

// Helper for the AssertionError constructor: assembles the exception message
// from the failed condition text, its file:line location, and the current
// stack trace (requested with the three innermost frames skipped).
static std::string build_assert_message(const char* condition, const char* file, int line) {
    std::string text("断言失败: ");
    text.append(condition).append("\n");
    text.append(" 位置: ").append(file).append(":").append(std::to_string(line)).append("\n");
    text.append(get_stacktrace(3));
    return text;
}

// Constructs the exception for a failed assertion. The std::runtime_error
// message is produced by build_assert_message(), which combines the condition
// text, the file:line location and a stack trace. The raw arguments are also
// stored in the condition_, file_ and line_ members.
AssertionError::AssertionError(const char* condition, const char* file, int line)
: std::runtime_error(build_assert_message(condition, file, line)),
condition_(condition), file_(file), line_(line) {}

// Reports a failed assertion and never returns. It writes a banner to stderr
// containing the condition text, the file:line location and the current stack
// trace (two innermost frames skipped), flushes stderr, and then throws
// AssertionError built from the same arguments.
[[noreturn]] void assert_impl(const char* condition, const char* file, int line) {
    const char* const rule = "========================================";

    fprintf(stderr, "\n%s\n", rule);
    fprintf(stderr, "断言失败: %s\n", condition);
    fprintf(stderr, "位置: %s:%d\n", file, line);

    const std::string trace = get_stacktrace(2);
    fputs(trace.c_str(), stderr);

    fprintf(stderr, "%s\n\n", rule);
    fflush(stderr);

    throw AssertionError(condition, file, line);
}
Loading
Loading