From 254234040a87be053a7b9c289054ce591c9af5b0 Mon Sep 17 00:00:00 2001 From: poursoul Date: Sat, 14 Feb 2026 16:24:06 +0800 Subject: [PATCH] Refactor: move inline implementations to .cpp and parallelize full build pipeline - Move get_stacktrace, AssertionError ctor, and assert_impl from common.h inline definitions to common.cpp to reduce header bloat - Parallelize runtime build, orchestration compile, and kernel compile in code_runner using ThreadPoolExecutor - Move and includes from performance_collector.h to performance_collector.cpp (only needed in implementation) --- examples/scripts/code_runner.py | 79 +++++---- .../include/host/performance_collector.h | 2 - src/platform/src/performance_collector.cpp | 2 + .../runtime/common.cpp | 153 +++++++++++++++++ .../tensormap_and_ringbuffer/runtime/common.h | 158 +----------------- 5 files changed, 199 insertions(+), 195 deletions(-) create mode 100644 src/runtime/tensormap_and_ringbuffer/runtime/common.cpp diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py index 7646455b..bb501f99 100644 --- a/examples/scripts/code_runner.py +++ b/examples/scripts/code_runner.py @@ -530,47 +530,26 @@ def run(self) -> None: " export PTO_ISA_ROOT=$(pwd)/examples/scripts/_deps/pto-isa" ) - # Step 1: Build runtime + # Step 1: Build runtime, orchestration, and kernels in parallel + # (they are independent — all only need kernel_compiler which is ready) logger.info(f"=== Building Runtime: {self.runtime_name} (platform: {self.platform}) ===") builder = RuntimeBuilder(platform=self.platform) kernel_compiler = builder.get_kernel_compiler() - try: - host_binary, aicpu_binary, aicore_binary = builder.build( - self.runtime_name, - ) - except Exception as e: - raise RuntimeError( - f"Failed to build runtime '{self.runtime_name}' for platform '{self.platform}'.\n" - f"Error: {e}" - ) from e - - # Step 2: Load runtime and set device - logger.info(f"=== Loading Runtime ({len(host_binary)} bytes) ===") - Runtime = bind_host_binary(host_binary) - - logger.info(f"=== Setting Device {self.device_id} ===") - set_device(self.device_id) - - # Step 3: Compile orchestration - logger.info("=== Compiling Orchestration ===") - - orch_so_binary = kernel_compiler.compile_orchestration( - self.runtime_name, - self.orchestration["source"], - ) + from concurrent.futures import ThreadPoolExecutor, Future - # Step 4: Compile kernels (will be registered during runtime.initialize) - logger.info("=== Compiling Kernels ===") + runtime_include_dirs = [ + os.path.join(self.project_root, "src", "runtime", self.runtime_name, "runtime") + ] - # Build list of (func_id, binary) tuples for passing to runtime.initialize() - kernel_binaries = [] - # Prepare runtime include directories for kernel compilation - runtime_include_dirs = [] - runtime_dir = os.path.join(self.project_root, "src", "runtime", self.runtime_name, "runtime") - runtime_include_dirs.append(runtime_dir) + def _build_runtime(): + return builder.build(self.runtime_name) - from concurrent.futures import ThreadPoolExecutor + def _compile_orchestration(): + return kernel_compiler.compile_orchestration( + self.runtime_name, + self.orchestration["source"], + ) def _compile_one_kernel(kernel): logger.info(f"Compiling kernel: {kernel['source']} (func_id={kernel['func_id']})") @@ -580,19 +559,39 @@ def _compile_one_kernel(kernel): pto_isa_root=pto_isa_root, extra_include_dirs=runtime_include_dirs, ) - # For sim platform: keep complete .so for dlopen (supports external symbols like std::exp) - # For real hardware: extract .text section (ccec compiled kernels don't depend on external symbols) if self.platform == "a2a3sim": - kernel_bin = incore_o # Complete .so for dlopen + kernel_bin = incore_o else: - kernel_bin = extract_text_section(incore_o) # .text only for mmap + kernel_bin = extract_text_section(incore_o) return (kernel["func_id"], kernel_bin) - with ThreadPoolExecutor(max_workers=len(self.kernels)) as executor: - kernel_binaries = list(executor.map(_compile_one_kernel, self.kernels)) + # Launch all compilations concurrently + max_workers = 2 + len(self.kernels) # runtime + orchestration + kernels + with ThreadPoolExecutor(max_workers=max_workers) as executor: + fut_runtime = executor.submit(_build_runtime) + fut_orch = executor.submit(_compile_orchestration) + fut_kernels = [executor.submit(_compile_one_kernel, k) for k in self.kernels] + + try: + host_binary, aicpu_binary, aicore_binary = fut_runtime.result() + except Exception as e: + raise RuntimeError( + f"Failed to build runtime '{self.runtime_name}' for platform '{self.platform}'.\n" + f"Error: {e}" + ) from e + + orch_so_binary = fut_orch.result() + kernel_binaries = [f.result() for f in fut_kernels] logger.info(f"Compiled {len(kernel_binaries)} kernel(s)") + # Step 2: Load runtime and set device + logger.info(f"=== Loading Runtime ({len(host_binary)} bytes) ===") + Runtime = bind_host_binary(host_binary) + + logger.info(f"=== Setting Device {self.device_id} ===") + set_device(self.device_id) + # Step 5: Run each parameter set total_cases = len(self.params_list) for case_idx, params in enumerate(self.params_list): diff --git a/src/platform/include/host/performance_collector.h b/src/platform/include/host/performance_collector.h index 00d67f26..a255027c 100644 --- a/src/platform/include/host/performance_collector.h +++ b/src/platform/include/host/performance_collector.h @@ -16,8 +16,6 @@ #ifndef PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_ #define PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_ -#include -#include #include #include diff --git a/src/platform/src/performance_collector.cpp b/src/platform/src/performance_collector.cpp index 53a9145e..a018f19e 100644 --- a/src/platform/src/performance_collector.cpp +++ b/src/platform/src/performance_collector.cpp @@ -6,8 +6,10 @@ #include "host/performance_collector.h" #include +#include #include #include +#include #include #include #include diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/common.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/common.cpp new file mode 100644 index 00000000..2e56d8d4 --- /dev/null +++ b/src/runtime/tensormap_and_ringbuffer/runtime/common.cpp @@ -0,0 +1,153 @@ +#include "common.h" + +#ifdef __linux__ +#include +#include +#include +#include + +#include +#include +#include +#endif + +/** + * 使用 addr2line 将地址转换为 文件:行号 信息 + * 使用 -i 标志展开内联,返回第一行(最内层实际代码位置) + * 如果存在内联,同时通过 inline_chain 返回外层调用链 + */ +#ifdef __linux__ +static std::string addr_to_line(const char* executable, void* addr, + std::string* inline_chain = nullptr) { + char cmd[512]; + snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr); + + std::array buffer; + std::string raw_output; + + FILE* pipe = popen(cmd, "r"); + if (pipe) { + while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { + raw_output += buffer.data(); + } + pclose(pipe); + } + + if (raw_output.empty() || raw_output.find("??") != std::string::npos) { + return ""; + } + + // 按行分割 + std::vector lines; + size_t pos = 0; + while (pos < raw_output.size()) { + size_t nl = raw_output.find('\n', pos); + if (nl == std::string::npos) nl = raw_output.size(); + std::string line = raw_output.substr(pos, nl - pos); + while (!line.empty() && line.back() == '\r') line.pop_back(); + if (!line.empty()) lines.push_back(line); + pos = nl + 1; + } + + if (lines.empty()) return ""; + + // 第一行是最内层的实际代码位置,后续行是外层内联调用者 + if (inline_chain && lines.size() > 1) { + *inline_chain = ""; + for (size_t j = 1; j < lines.size(); j++) { + *inline_chain += " [inlined by] " + lines[j] + "\n"; + } + } + + return lines.front(); +} +#endif + +/** + * 获取当前调用栈信息(包含文件路径和行号) + * 通过 dladdr 定位每个栈帧所在的共享库,并用相对地址调用 addr2line + */ +std::string get_stacktrace(int skip_frames) { + std::string result; +#ifdef __linux__ + const int max_frames = 64; + void* buffer[max_frames]; + int nframes = backtrace(buffer, max_frames); + char** symbols = backtrace_symbols(buffer, nframes); + + if (symbols) { + result = "调用栈:\n"; + for (int i = skip_frames; i < nframes; i++) { + std::string frame_info; + + void* addr = (void*)((char*)buffer[i] - 1); + + Dl_info dl_info; + std::string inline_chain; + if (dladdr(addr, &dl_info) && dl_info.dli_fname) { + void* rel_addr = (void*)((char*)addr - (char*)dl_info.dli_fbase); + std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain); + + if (addr2line_result.empty()) { + addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain); + } + + if (!addr2line_result.empty()) { + frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result; + } + } + + if (frame_info.empty()) { + std::string frame(symbols[i]); + + size_t start = frame.find('('); + size_t end = frame.find('+', start); + if (start != std::string::npos && end != std::string::npos) { + std::string mangled = frame.substr(start + 1, end - start - 1); + int status; + char* demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status); + if (status == 0 && demangled) { + frame = frame.substr(0, start + 1) + demangled + frame.substr(end); + free(demangled); + } + } + frame_info = frame; + } + + char buf[16]; + snprintf(buf, sizeof(buf), " #%d ", i - skip_frames); + result += buf + frame_info + "\n"; + if (!inline_chain.empty()) { + result += inline_chain; + } + } + free(symbols); + } +#else + result = "(调用栈仅在 Linux 上可用)\n"; +#endif + return result; +} + +// AssertionError 构造函数 +static std::string build_assert_message(const char* condition, const char* file, int line) { + std::string msg = "断言失败: " + std::string(condition) + "\n"; + msg += " 位置: " + std::string(file) + ":" + std::to_string(line) + "\n"; + msg += get_stacktrace(3); + return msg; +} + +AssertionError::AssertionError(const char* condition, const char* file, int line) + : std::runtime_error(build_assert_message(condition, file, line)), + condition_(condition), file_(file), line_(line) {} + +[[noreturn]] void assert_impl(const char* condition, const char* file, int line) { + fprintf(stderr, "\n========================================\n"); + fprintf(stderr, "断言失败: %s\n", condition); + fprintf(stderr, "位置: %s:%d\n", file, line); + fprintf(stderr, "%s", get_stacktrace(2).c_str()); + fprintf(stderr, "========================================\n\n"); + fflush(stderr); + + throw AssertionError(condition, file, line); +} diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/common.h b/src/runtime/tensormap_and_ringbuffer/runtime/common.h index d1593ff1..b1da0708 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/common.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/common.h @@ -5,162 +5,24 @@ #include #include -#ifdef __linux__ -#include -#include -#include -#include - -#include -#include -#include -#include -#endif - -/** - * 使用 addr2line 将地址转换为 文件:行号 信息 - * 使用 -i 标志展开内联,返回第一行(最内层实际代码位置) - * 如果存在内联,同时通过 inline_chain 返回外层调用链 - */ -#ifdef __linux__ -inline std::string addr_to_line(const char* executable, void* addr, - std::string* inline_chain = nullptr) { - char cmd[512]; - snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr); - - std::array buffer; - std::string raw_output; - - FILE* pipe = popen(cmd, "r"); - if (pipe) { - while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { - raw_output += buffer.data(); - } - pclose(pipe); - } - - if (raw_output.empty() || raw_output.find("??") != std::string::npos) { - return ""; - } - - // 按行分割 - std::vector lines; - size_t pos = 0; - while (pos < raw_output.size()) { - size_t nl = raw_output.find('\n', pos); - if (nl == std::string::npos) nl = raw_output.size(); - std::string line = raw_output.substr(pos, nl - pos); - while (!line.empty() && line.back() == '\r') line.pop_back(); - if (!line.empty()) lines.push_back(line); - pos = nl + 1; - } - - if (lines.empty()) return ""; - - // 第一行是最内层的实际代码位置,后续行是外层内联调用者 - if (inline_chain && lines.size() > 1) { - *inline_chain = ""; - for (size_t j = 1; j < lines.size(); j++) { - *inline_chain += " [inlined by] " + lines[j] + "\n"; - } - } - - return lines.front(); -} -#endif - /** * 获取当前调用栈信息(包含文件路径和行号) - * 通过 dladdr 定位每个栈帧所在的共享库,并用相对地址调用 addr2line + * 实现在 common.cpp 中 */ -inline std::string get_stacktrace(int skip_frames = 1) { - std::string result; -#ifdef __linux__ - const int max_frames = 64; - void* buffer[max_frames]; - int nframes = backtrace(buffer, max_frames); - char** symbols = backtrace_symbols(buffer, nframes); - - if (symbols) { - result = "调用栈:\n"; - for (int i = skip_frames; i < nframes; i++) { - std::string frame_info; - - // backtrace() 返回的是返回地址(call 指令的下一条指令) - // 减 1 使地址落在 call 指令内部,避免解析到下一个函数 - void* addr = (void*)((char*)buffer[i] - 1); - - // 使用 dladdr 获取栈帧所在的共享库信息 - Dl_info dl_info; - std::string inline_chain; - if (dladdr(addr, &dl_info) && dl_info.dli_fname) { - // 计算相对于共享库基地址的偏移 - void* rel_addr = (void*)((char*)addr - (char*)dl_info.dli_fbase); - std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain); - - // 如果相对地址失败,尝试用绝对地址(适用于非 PIE 可执行文件) - if (addr2line_result.empty()) { - addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain); - } - - if (!addr2line_result.empty()) { - frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result; - } - } - - // 如果 addr2line 失败,使用 backtrace_symbols 的输出并 demangle - if (frame_info.empty()) { - std::string frame(symbols[i]); - - size_t start = frame.find('('); - size_t end = frame.find('+', start); - if (start != std::string::npos && end != std::string::npos) { - std::string mangled = frame.substr(start + 1, end - start - 1); - int status; - char* demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status); - if (status == 0 && demangled) { - frame = frame.substr(0, start + 1) + demangled + frame.substr(end); - free(demangled); - } - } - frame_info = frame; - } - - char buf[16]; - snprintf(buf, sizeof(buf), " #%d ", i - skip_frames); - result += buf + frame_info + "\n"; - if (!inline_chain.empty()) { - result += inline_chain; - } - } - free(symbols); - } -#else - result = "(调用栈仅在 Linux 上可用)\n"; -#endif - return result; -} +std::string get_stacktrace(int skip_frames = 1); /** * 断言失败异常,包含文件、行号、条件和调用栈信息 */ class AssertionError : public std::runtime_error { public: - AssertionError(const char* condition, const char* file, int line) - : std::runtime_error(build_message(condition, file, line)), condition_(condition), file_(file), line_(line) {} + AssertionError(const char* condition, const char* file, int line); const char* condition() const { return condition_; } const char* file() const { return file_; } int line() const { return line_; } private: - static std::string build_message(const char* condition, const char* file, int line) { - std::string msg = "断言失败: " + std::string(condition) + "\n"; - msg += " 位置: " + std::string(file) + ":" + std::to_string(line) + "\n"; - msg += get_stacktrace(3); // 跳过 build_message, 构造函数, debug_assert_impl - return msg; - } - const char* condition_; const char* file_; int line_; @@ -168,19 +30,9 @@ class AssertionError : public std::runtime_error { /** * 断言失败时的处理函数 + * 实现在 common.cpp 中 */ -[[noreturn]] inline void assert_impl(const char* condition, const char* file, int line) { - // 打印错误信息到 stderr - fprintf(stderr, "\n========================================\n"); - fprintf(stderr, "断言失败: %s\n", condition); - fprintf(stderr, "位置: %s:%d\n", file, line); - fprintf(stderr, "%s", get_stacktrace(2).c_str()); - fprintf(stderr, "========================================\n\n"); - fflush(stderr); - - // 抛出异常,允许测试框架捕获 - throw AssertionError(condition, file, line); -} +[[noreturn]] void assert_impl(const char* condition, const char* file, int line); /** * debug_assert 宏 - 在 debug 模式下检查条件,失败时抛出异常并打印调用栈