From 254234040a87be053a7b9c289054ce591c9af5b0 Mon Sep 17 00:00:00 2001
From: poursoul <poursoul@126.com>
Date: Sat, 14 Feb 2026 16:24:06 +0800
Subject: [PATCH] Refactor: move inline implementations to .cpp and parallelize
 full build pipeline

- Move get_stacktrace, AssertionError ctor, and assert_impl from
  common.h inline definitions to common.cpp to reduce header bloat
- Parallelize runtime build, orchestration compile, and kernel compile
  in code_runner using ThreadPoolExecutor
- Move <chrono> and <optional> includes from performance_collector.h
  to performance_collector.cpp (only needed in implementation)
---
 examples/scripts/code_runner.py               |  79 +++++----
 .../include/host/performance_collector.h      |   2 -
 src/platform/src/performance_collector.cpp    |   2 +
 .../runtime/common.cpp                        | 153 +++++++++++++++++
 .../tensormap_and_ringbuffer/runtime/common.h | 158 +-----------------
 5 files changed, 199 insertions(+), 195 deletions(-)
 create mode 100644 src/runtime/tensormap_and_ringbuffer/runtime/common.cpp
diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py
index 7646455b..bb501f99 100644
--- a/examples/scripts/code_runner.py
+++ b/examples/scripts/code_runner.py
@@ -530,47 +530,26 @@ def run(self) -> None:
                 "  export PTO_ISA_ROOT=$(pwd)/examples/scripts/_deps/pto-isa"
             )
 
-        # Step 1: Build runtime
+        # Step 1: Build runtime, orchestration, and kernels in parallel
+        # (they are independent: each needs only kernel_compiler, which is already available)
         logger.info(f"=== Building Runtime: {self.runtime_name} (platform: {self.platform}) ===")
         builder = RuntimeBuilder(platform=self.platform)
         kernel_compiler = builder.get_kernel_compiler()
 
-        try:
-            host_binary, aicpu_binary, aicore_binary = builder.build(
-                self.runtime_name,
-            )
-        except Exception as e:
-            raise RuntimeError(
-                f"Failed to build runtime '{self.runtime_name}' for platform '{self.platform}'.\n"
-                f"Error: {e}"
-            ) from e
-
-        # Step 2: Load runtime and set device
-        logger.info(f"=== Loading Runtime ({len(host_binary)} bytes) ===")
-        Runtime = bind_host_binary(host_binary)
-
-        logger.info(f"=== Setting Device {self.device_id} ===")
-        set_device(self.device_id)
-
-        # Step 3: Compile orchestration
-        logger.info("=== Compiling Orchestration ===")
-
-        orch_so_binary = kernel_compiler.compile_orchestration(
-            self.runtime_name,
-            self.orchestration["source"],
-        )
+        from concurrent.futures import ThreadPoolExecutor, Future
 
-        # Step 4: Compile kernels (will be registered during runtime.initialize)
-        logger.info("=== Compiling Kernels ===")
+        runtime_include_dirs = [
+            os.path.join(self.project_root, "src", "runtime", self.runtime_name, "runtime")
+        ]
 
-        # Build list of (func_id, binary) tuples for passing to runtime.initialize()
-        kernel_binaries = []
-        # Prepare runtime include directories for kernel compilation
-        runtime_include_dirs = []
-        runtime_dir = os.path.join(self.project_root, "src", "runtime", self.runtime_name, "runtime")
-        runtime_include_dirs.append(runtime_dir)
+        def _build_runtime():
+            return builder.build(self.runtime_name)
 
-        from concurrent.futures import ThreadPoolExecutor
+        def _compile_orchestration():
+            return kernel_compiler.compile_orchestration(
+                self.runtime_name,
+                self.orchestration["source"],
+            )
 
         def _compile_one_kernel(kernel):
             logger.info(f"Compiling kernel: {kernel['source']} (func_id={kernel['func_id']})")
@@ -580,19 +559,39 @@ def _compile_one_kernel(kernel):
                 pto_isa_root=pto_isa_root,
                 extra_include_dirs=runtime_include_dirs,
             )
-            # For sim platform: keep complete .so for dlopen (supports external symbols like std::exp)
-            # For real hardware: extract .text section (ccec compiled kernels don't depend on external symbols)
             if self.platform == "a2a3sim":
-                kernel_bin = incore_o  # Complete .so for dlopen
+                kernel_bin = incore_o
             else:
-                kernel_bin = extract_text_section(incore_o)  # .text only for mmap
+                kernel_bin = extract_text_section(incore_o)
             return (kernel["func_id"], kernel_bin)
 
-        with ThreadPoolExecutor(max_workers=len(self.kernels)) as executor:
-            kernel_binaries = list(executor.map(_compile_one_kernel, self.kernels))
+        # Launch all compilations concurrently
+        max_workers = 2 + len(self.kernels)  # runtime + orchestration + kernels
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            fut_runtime = executor.submit(_build_runtime)
+            fut_orch = executor.submit(_compile_orchestration)
+            fut_kernels = [executor.submit(_compile_one_kernel, k) for k in self.kernels]
+
+            try:
+                host_binary, aicpu_binary, aicore_binary = fut_runtime.result()
+            except Exception as e:
+                raise RuntimeError(
+                    f"Failed to build runtime '{self.runtime_name}' for platform '{self.platform}'.\n"
+                    f"Error: {e}"
+                ) from e
+
+            orch_so_binary = fut_orch.result()
+            kernel_binaries = [f.result() for f in fut_kernels]
 
         logger.info(f"Compiled {len(kernel_binaries)} kernel(s)")
 
+        # Step 2: Load runtime and set device
+        logger.info(f"=== Loading Runtime ({len(host_binary)} bytes) ===")
+        Runtime = bind_host_binary(host_binary)
+
+        logger.info(f"=== Setting Device {self.device_id} ===")
+        set_device(self.device_id)
+
         # Step 5: Run each parameter set
         total_cases = len(self.params_list)
         for case_idx, params in enumerate(self.params_list):
diff --git a/src/platform/include/host/performance_collector.h b/src/platform/include/host/performance_collector.h
index 00d67f26..a255027c 100644
--- a/src/platform/include/host/performance_collector.h
+++ b/src/platform/include/host/performance_collector.h
@@ -16,8 +16,6 @@
 #ifndef PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_
 #define PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_
 
-#include <chrono>
-#include <optional>
 #include <string>
 #include <vector>
 
diff --git a/src/platform/src/performance_collector.cpp b/src/platform/src/performance_collector.cpp
index 53a9145e..a018f19e 100644
--- a/src/platform/src/performance_collector.cpp
+++ b/src/platform/src/performance_collector.cpp
@@ -6,8 +6,10 @@
 #include "host/performance_collector.h"
 
 #include <algorithm>
+#include <chrono>
 #include <fstream>
 #include <iomanip>
+#include <optional>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <ctime>
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/common.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/common.cpp
new file mode 100644
index 00000000..2e56d8d4
--- /dev/null
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/common.cpp
@@ -0,0 +1,153 @@
+#include "common.h"
+
+#ifdef __linux__
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <unistd.h>
+
+#include <array>
+#include <cstring>
+#include <vector>
+#endif
+
+/**
+ * Resolves an address to source-location text by running addr2line through popen.
+ * The -i flag expands inlining; the first output line (innermost code location) is returned.
+ * Any further output lines are written to inline_chain, one "[inlined by]" entry per line.
+ */
+#ifdef __linux__
+static std::string addr_to_line(const char* executable, void* addr,
+                                std::string* inline_chain = nullptr) {
+    char cmd[512];
+    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);  // NOTE(review): executable is inserted unquoted, and a command longer than the 512-byte buffer is silently truncated
+
+    std::array<char, 256> buffer;
+    std::string raw_output;
+
+    FILE* pipe = popen(cmd, "r");
+    if (pipe) {
+        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {  // accumulate everything the command prints
+            raw_output += buffer.data();
+        }
+        pclose(pipe);
+    }
+
+    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {  // nothing printed, or "??" anywhere in the output: treat as unresolved
+        return "";
+    }
+
+    // Split the output into lines, trimming trailing '\r' and dropping empty lines
+    std::vector<std::string> lines;
+    size_t pos = 0;
+    while (pos < raw_output.size()) {
+        size_t nl = raw_output.find('\n', pos);
+        if (nl == std::string::npos) nl = raw_output.size();  // last line has no terminating newline
+        std::string line = raw_output.substr(pos, nl - pos);
+        while (!line.empty() && line.back() == '\r') line.pop_back();
+        if (!line.empty()) lines.push_back(line);
+        pos = nl + 1;
+    }
+
+    if (lines.empty()) return "";
+
+    // The first line is the innermost code location; the remaining lines are the enclosing inlined callers
+    if (inline_chain && lines.size() > 1) {
+        *inline_chain = "";
+        for (size_t j = 1; j < lines.size(); j++) {
+            *inline_chain += "    [inlined by] " + lines[j] + "\n";
+        }
+    }
+
+    return lines.front();
+}
+#endif
+
+/**
+ * Returns the current call stack as text, omitting the first skip_frames frames; file and line are included where resolvable.
+ * dladdr locates the object containing each frame, and addr2line is run on the object-relative address first.
+ */
+std::string get_stacktrace(int skip_frames) {
+    std::string result;
+#ifdef __linux__
+    const int max_frames = 64;  // at most 64 frames are captured
+    void* buffer[max_frames];
+    int nframes = backtrace(buffer, max_frames);
+    char** symbols = backtrace_symbols(buffer, nframes);
+
+    if (symbols) {
+        result = "调用栈:\n";  // header literal reads "Call stack:"
+        for (int i = skip_frames; i < nframes; i++) {
+            std::string frame_info;
+
+            void* addr = (void*)((char*)buffer[i] - 1);  // backtrace() yields return addresses; back up one byte so the address falls inside the call instruction
+
+            Dl_info dl_info;
+            std::string inline_chain;
+            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {  // look up the object file that contains this frame
+                void* rel_addr = (void*)((char*)addr - (char*)dl_info.dli_fbase);  // offset from that object's base address
+                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
+
+                if (addr2line_result.empty()) {  // relative lookup failed: retry with the absolute address (for non-PIE executables)
+                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
+                }
+
+                if (!addr2line_result.empty()) {
+                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
+                }
+            }
+
+            if (frame_info.empty()) {  // addr2line gave nothing: fall back to the backtrace_symbols text, demangled when possible
+                std::string frame(symbols[i]);
+
+                size_t start = frame.find('(');
+                size_t end = frame.find('+', start);
+                if (start != std::string::npos && end != std::string::npos) {  // the text between '(' and '+' is taken as the mangled name
+                    std::string mangled = frame.substr(start + 1, end - start - 1);
+                    int status;
+                    char* demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
+                    if (status == 0 && demangled) {
+                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
+                        free(demangled);
+                    }
+                }
+                frame_info = frame;
+            }
+
+            char buf[16];
+            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);  // frame numbers restart at 0 after the skipped frames
+            result += buf + frame_info + "\n";
+            if (!inline_chain.empty()) {
+                result += inline_chain;
+            }
+        }
+        free(symbols);
+    }
+#else
+    result = "(调用栈仅在 Linux 上可用)\n";  // literal reads "(stack trace is only available on Linux)"
+#endif
+    return result;
+}
+
+// Builds the exception message for AssertionError: failed condition, file:line location, then a stack trace
+static std::string build_assert_message(const char* condition, const char* file, int line) {
+    std::string msg = "断言失败: " + std::string(condition) + "\n";  // literal reads "Assertion failed: "
+    msg += "  位置: " + std::string(file) + ":" + std::to_string(line) + "\n";  // literal reads "Location: "
+    msg += get_stacktrace(3);  // NOTE(review): 3 was chosen for the former inline header version; confirm the skipped frames are still right now that this is out-of-line
+    return msg;
+}
+
+AssertionError::AssertionError(const char* condition, const char* file, int line)  // the message is built once, at construction
+    : std::runtime_error(build_assert_message(condition, file, line)),
+      condition_(condition), file_(file), line_(line) {}  // condition and file are stored as raw pointers, not copied
+
+[[noreturn]] void assert_impl(const char* condition, const char* file, int line) {  // report to stderr, then throw
+    fprintf(stderr, "\n========================================\n");
+    fprintf(stderr, "断言失败: %s\n", condition);  // literal reads "Assertion failed: "
+    fprintf(stderr, "位置: %s:%d\n", file, line);  // literal reads "Location: "
+    fprintf(stderr, "%s", get_stacktrace(2).c_str());  // skips the first 2 frames of the trace
+    fprintf(stderr, "========================================\n\n");
+    fflush(stderr);
+
+    throw AssertionError(condition, file, line);  // thrown (rather than aborting) so a test framework can catch it; the constructor builds its own stack trace again
+}
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/common.h b/src/runtime/tensormap_and_ringbuffer/runtime/common.h
index d1593ff1..b1da0708 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/common.h
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/common.h
@@ -5,162 +5,24 @@
 #include <stdexcept>
 #include <string>
 
-#ifdef __linux__
-#include <cxxabi.h>
-#include <dlfcn.h>
-#include <execinfo.h>
-#include <unistd.h>
-
-#include <array>
-#include <cstring>
-#include <memory>
-#include <vector>
-#endif
-
-/**
- * 使用 addr2line 将地址转换为 文件:行号 信息
- * 使用 -i 标志展开内联，返回第一行（最内层实际代码位置）
- * 如果存在内联，同时通过 inline_chain 返回外层调用链
- */
-#ifdef __linux__
-inline std::string addr_to_line(const char* executable, void* addr,
-                                std::string* inline_chain = nullptr) {
-    char cmd[512];
-    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);
-
-    std::array<char, 256> buffer;
-    std::string raw_output;
-
-    FILE* pipe = popen(cmd, "r");
-    if (pipe) {
-        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
-            raw_output += buffer.data();
-        }
-        pclose(pipe);
-    }
-
-    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
-        return "";
-    }
-
-    // 按行分割
-    std::vector<std::string> lines;
-    size_t pos = 0;
-    while (pos < raw_output.size()) {
-        size_t nl = raw_output.find('\n', pos);
-        if (nl == std::string::npos) nl = raw_output.size();
-        std::string line = raw_output.substr(pos, nl - pos);
-        while (!line.empty() && line.back() == '\r') line.pop_back();
-        if (!line.empty()) lines.push_back(line);
-        pos = nl + 1;
-    }
-
-    if (lines.empty()) return "";
-
-    // 第一行是最内层的实际代码位置，后续行是外层内联调用者
-    if (inline_chain && lines.size() > 1) {
-        *inline_chain = "";
-        for (size_t j = 1; j < lines.size(); j++) {
-            *inline_chain += "    [inlined by] " + lines[j] + "\n";
-        }
-    }
-
-    return lines.front();
-}
-#endif
-
 /**
  * 获取当前调用栈信息（包含文件路径和行号）
- * 通过 dladdr 定位每个栈帧所在的共享库，并用相对地址调用 addr2line
+ * Implemented in common.cpp
  */
-inline std::string get_stacktrace(int skip_frames = 1) {
-    std::string result;
-#ifdef __linux__
-    const int max_frames = 64;
-    void* buffer[max_frames];
-    int nframes = backtrace(buffer, max_frames);
-    char** symbols = backtrace_symbols(buffer, nframes);
-
-    if (symbols) {
-        result = "调用栈:\n";
-        for (int i = skip_frames; i < nframes; i++) {
-            std::string frame_info;
-
-            // backtrace() 返回的是返回地址（call 指令的下一条指令）
-            // 减 1 使地址落在 call 指令内部，避免解析到下一个函数
-            void* addr = (void*)((char*)buffer[i] - 1);
-
-            // 使用 dladdr 获取栈帧所在的共享库信息
-            Dl_info dl_info;
-            std::string inline_chain;
-            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
-                // 计算相对于共享库基地址的偏移
-                void* rel_addr = (void*)((char*)addr - (char*)dl_info.dli_fbase);
-                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
-
-                // 如果相对地址失败，尝试用绝对地址（适用于非 PIE 可执行文件）
-                if (addr2line_result.empty()) {
-                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
-                }
-
-                if (!addr2line_result.empty()) {
-                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
-                }
-            }
-
-            // 如果 addr2line 失败，使用 backtrace_symbols 的输出并 demangle
-            if (frame_info.empty()) {
-                std::string frame(symbols[i]);
-
-                size_t start = frame.find('(');
-                size_t end = frame.find('+', start);
-                if (start != std::string::npos && end != std::string::npos) {
-                    std::string mangled = frame.substr(start + 1, end - start - 1);
-                    int status;
-                    char* demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
-                    if (status == 0 && demangled) {
-                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
-                        free(demangled);
-                    }
-                }
-                frame_info = frame;
-            }
-
-            char buf[16];
-            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
-            result += buf + frame_info + "\n";
-            if (!inline_chain.empty()) {
-                result += inline_chain;
-            }
-        }
-        free(symbols);
-    }
-#else
-    result = "(调用栈仅在 Linux 上可用)\n";
-#endif
-    return result;
-}
+std::string get_stacktrace(int skip_frames = 1);
 
 /**
  * 断言失败异常，包含文件、行号、条件和调用栈信息
  */
 class AssertionError : public std::runtime_error {
 public:
-    AssertionError(const char* condition, const char* file, int line)
-        : std::runtime_error(build_message(condition, file, line)), condition_(condition), file_(file), line_(line) {}
+    AssertionError(const char* condition, const char* file, int line);
 
     const char* condition() const { return condition_; }
     const char* file() const { return file_; }
     int line() const { return line_; }
 
 private:
-    static std::string build_message(const char* condition, const char* file, int line) {
-        std::string msg = "断言失败: " + std::string(condition) + "\n";
-        msg += "  位置: " + std::string(file) + ":" + std::to_string(line) + "\n";
-        msg += get_stacktrace(3);  // 跳过 build_message, 构造函数, debug_assert_impl
-        return msg;
-    }
-
     const char* condition_;
     const char* file_;
     int line_;
@@ -168,19 +30,9 @@ class AssertionError : public std::runtime_error {
 
 /**
  * 断言失败时的处理函数
+ * Implemented in common.cpp
  */
-[[noreturn]] inline void assert_impl(const char* condition, const char* file, int line) {
-    // 打印错误信息到 stderr
-    fprintf(stderr, "\n========================================\n");
-    fprintf(stderr, "断言失败: %s\n", condition);
-    fprintf(stderr, "位置: %s:%d\n", file, line);
-    fprintf(stderr, "%s", get_stacktrace(2).c_str());
-    fprintf(stderr, "========================================\n\n");
-    fflush(stderr);
-
-    // 抛出异常，允许测试框架捕获
-    throw AssertionError(condition, file, line);
-}
+[[noreturn]] void assert_impl(const char* condition, const char* file, int line);
 
 /**
  * debug_assert 宏 - 在 debug 模式下检查条件，失败时抛出异常并打印调用栈