ChaoWao · ChaoWao · Feb 14, 2026 · Feb 14, 2026
diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py
@@ -530,47 +530,26 @@ def run(self) -> None:
                 "  export PTO_ISA_ROOT=$(pwd)/examples/scripts/_deps/pto-isa"
             )
 
-        # Step 1: Build runtime
+        # Step 1: Build runtime, orchestration, and kernels in parallel
+        # (they are independent — all only need kernel_compiler which is ready)
         logger.info(f"=== Building Runtime: {self.runtime_name} (platform: {self.platform}) ===")
         builder = RuntimeBuilder(platform=self.platform)
         kernel_compiler = builder.get_kernel_compiler()
 
-        try:
-            host_binary, aicpu_binary, aicore_binary = builder.build(
-                self.runtime_name,
-            )
-        except Exception as e:
-            raise RuntimeError(
-                f"Failed to build runtime '{self.runtime_name}' for platform '{self.platform}'.\n"
-                f"Error: {e}"
-            ) from e
-
-        # Step 2: Load runtime and set device
-        logger.info(f"=== Loading Runtime ({len(host_binary)} bytes) ===")
-        Runtime = bind_host_binary(host_binary)
-
-        logger.info(f"=== Setting Device {self.device_id} ===")
-        set_device(self.device_id)
-
-        # Step 3: Compile orchestration
-        logger.info("=== Compiling Orchestration ===")
-
-        orch_so_binary = kernel_compiler.compile_orchestration(
-            self.runtime_name,
-            self.orchestration["source"],
-        )
+        from concurrent.futures import ThreadPoolExecutor, Future
 
-        # Step 4: Compile kernels (will be registered during runtime.initialize)
-        logger.info("=== Compiling Kernels ===")
+        runtime_include_dirs = [
+            os.path.join(self.project_root, "src", "runtime", self.runtime_name, "runtime")
+        ]
 
-        # Build list of (func_id, binary) tuples for passing to runtime.initialize()
-        kernel_binaries = []
-        # Prepare runtime include directories for kernel compilation
-        runtime_include_dirs = []
-        runtime_dir = os.path.join(self.project_root, "src", "runtime", self.runtime_name, "runtime")
-        runtime_include_dirs.append(runtime_dir)
+        def _build_runtime():  # pool task: returns builder.build()'s result, later unpacked as (host, aicpu, aicore) binaries
+            return builder.build(self.runtime_name)
 
-        from concurrent.futures import ThreadPoolExecutor
+        def _compile_orchestration():  # pool task: returns compile_orchestration()'s result, later bound to orch_so_binary
+            return kernel_compiler.compile_orchestration(
+                self.runtime_name,
+                self.orchestration["source"],
+            )
 
         def _compile_one_kernel(kernel):
             logger.info(f"Compiling kernel: {kernel['source']} (func_id={kernel['func_id']})")
@@ -580,19 +559,39 @@ def _compile_one_kernel(kernel):
                 pto_isa_root=pto_isa_root,
                 extra_include_dirs=runtime_include_dirs,
             )
-            # For sim platform: keep complete .so for dlopen (supports external symbols like std::exp)
-            # For real hardware: extract .text section (ccec compiled kernels don't depend on external symbols)
             if self.platform == "a2a3sim":
-                kernel_bin = incore_o  # Complete .so for dlopen
+                kernel_bin = incore_o
             else:
-                kernel_bin = extract_text_section(incore_o)  # .text only for mmap
+                kernel_bin = extract_text_section(incore_o)
             return (kernel["func_id"], kernel_bin)
 
-        with ThreadPoolExecutor(max_workers=len(self.kernels)) as executor:
-            kernel_binaries = list(executor.map(_compile_one_kernel, self.kernels))
+        # Launch all compilations concurrently
+        max_workers = 2 + len(self.kernels)  # runtime + orchestration + kernels
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            fut_runtime = executor.submit(_build_runtime)
+            fut_orch = executor.submit(_compile_orchestration)
+            fut_kernels = [executor.submit(_compile_one_kernel, k) for k in self.kernels]
+
+            try:
+                host_binary, aicpu_binary, aicore_binary = fut_runtime.result()
+            except Exception as e:
+                raise RuntimeError(
+                    f"Failed to build runtime '{self.runtime_name}' for platform '{self.platform}'.\n"
+                    f"Error: {e}"
+                ) from e
+
+            orch_so_binary = fut_orch.result()
+            kernel_binaries = [f.result() for f in fut_kernels]
 
         logger.info(f"Compiled {len(kernel_binaries)} kernel(s)")
 
+        # Step 2: Load runtime and set device
+        logger.info(f"=== Loading Runtime ({len(host_binary)} bytes) ===")
+        Runtime = bind_host_binary(host_binary)
+
+        logger.info(f"=== Setting Device {self.device_id} ===")
+        set_device(self.device_id)
+
         # Step 5: Run each parameter set
         total_cases = len(self.params_list)
         for case_idx, params in enumerate(self.params_list):

diff --git a/src/platform/include/host/performance_collector.h b/src/platform/include/host/performance_collector.h
@@ -16,8 +16,6 @@
 #ifndef PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_
 #define PLATFORM_HOST_PERFORMANCE_COLLECTOR_H_
 
-#include <chrono>
-#include <optional>
 #include <string>
 #include <vector>
 

diff --git a/src/platform/src/performance_collector.cpp b/src/platform/src/performance_collector.cpp
@@ -6,8 +6,10 @@
 #include "host/performance_collector.h"
 
 #include <algorithm>
+#include <chrono>
 #include <fstream>
 #include <iomanip>
+#include <optional>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <ctime>

diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/common.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/common.cpp
@@ -0,0 +1,153 @@
+#include "common.h"
+
+#ifdef __linux__
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <unistd.h>
+
+#include <array>
+#include <cstring>
+#include <vector>
+#endif
+
+/**
+ * Run addr2line on addr within executable and return the first line of its output, or "" when the
+ * command does not fit in the buffer or the output is empty or contains "??". -i expands inlined frames:
+ * the first line is the innermost code location; further lines (outer inlined callers) go to *inline_chain if given.
+ */
+#ifdef __linux__
+static std::string addr_to_line(const char* executable, void* addr,
+                                std::string* inline_chain = nullptr) {
+    // The path is single-quoted so that spaces in it survive the shell that popen() starts.
+    char cmd[512];
+    int cmd_len = snprintf(cmd, sizeof(cmd), "addr2line -e '%s' -f -C -p -i %p 2>/dev/null", executable, addr);
+    if (cmd_len < 0 || static_cast<size_t>(cmd_len) >= sizeof(cmd)) return "";  // never run a truncated command
+    std::array<char, 256> buffer;
+    std::string raw_output;
+    FILE* pipe = popen(cmd, "r");
+    if (pipe) {
+        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
+            raw_output += buffer.data();
+        }
+        pclose(pipe);
+    }
+
+    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
+        return "";
+    }
+
+    // Split the output into non-empty lines (trailing '\r' removed)
+    std::vector<std::string> lines;
+    size_t pos = 0;
+    while (pos < raw_output.size()) {
+        size_t nl = raw_output.find('\n', pos);
+        if (nl == std::string::npos) nl = raw_output.size();
+        std::string line = raw_output.substr(pos, nl - pos);
+        while (!line.empty() && line.back() == '\r') line.pop_back();
+        if (!line.empty()) lines.push_back(line);
+        pos = nl + 1;
+    }
+
+    if (lines.empty()) return "";
+
+    // The first line is the innermost actual code location; the following lines are the outer inlined callers
+    if (inline_chain && lines.size() > 1) {
+        *inline_chain = "";
+        for (size_t j = 1; j < lines.size(); j++) {
+            *inline_chain += "    [inlined by] " + lines[j] + "\n";
+        }
+    }
+
+    return lines.front();
+}
+#endif
+
+/**
+ * Return the current call stack as text (file path and line number per frame where resolvable), omitting frames below index skip_frames.
+ * dladdr locates the shared object containing each frame, and addr2line is called with the address relative to that object.
+ */
+std::string get_stacktrace(int skip_frames) {
+    std::string result;
+#ifdef __linux__
+    const int max_frames = 64;  // at most 64 frames are captured
+    void* buffer[max_frames];
+    int nframes = backtrace(buffer, max_frames);
+    char** symbols = backtrace_symbols(buffer, nframes);
+    // NOTE(review): a negative skip_frames would make the loop below read before buffer[0]; callers in this file pass 2 or 3.
+    if (symbols) {
+        result = "调用栈:\n";
+        for (int i = skip_frames; i < nframes; i++) {
+            std::string frame_info;
+            // Step back one byte (presumably so the lookup hits the call instruction rather than the return address — confirm).
+            void* addr = (void*)((char*)buffer[i] - 1);
+            // Preferred path: find the containing object, then query addr2line with the offset from its load base.
+            Dl_info dl_info;
+            std::string inline_chain;
+            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
+                void* rel_addr = (void*)((char*)addr - (char*)dl_info.dli_fbase);
+                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
+                // Retry with the absolute address when the relative lookup produced nothing.
+                if (addr2line_result.empty()) {
+                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
+                }
+
+                if (!addr2line_result.empty()) {
+                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
+                }
+            }
+            // No addr2line result: fall back to the backtrace_symbols text, demangling the name between '(' and '+' when possible.
+            if (frame_info.empty()) {
+                std::string frame(symbols[i]);
+
+                size_t start = frame.find('(');
+                size_t end = frame.find('+', start);
+                if (start != std::string::npos && end != std::string::npos) {
+                    std::string mangled = frame.substr(start + 1, end - start - 1);
+                    int status;
+                    char* demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
+                    if (status == 0 && demangled) {
+                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
+                        free(demangled);
+                    }
+                }
+                frame_info = frame;
+            }
+            // Emit the frame prefixed with "#<n>", numbered from 0 after the skipped frames, then any inlined-caller lines.
+            char buf[16];
+            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
+            result += buf + frame_info + "\n";
+            if (!inline_chain.empty()) {
+                result += inline_chain;
+            }
+        }
+        free(symbols);  // backtrace_symbols returns a single malloc'd block
+    }
+#else
+    result = "(调用栈仅在 Linux 上可用)\n";  // fixed notice: the stack trace is only produced on Linux
+#endif
+    return result;
+}
+
+// Build the AssertionError message: the failed condition, its file:line location, then a stack trace.
+static std::string build_assert_message(const char* condition, const char* file, int line) {
+    std::string msg = "断言失败: " + std::string(condition) + "\n";
+    msg += "  位置: " + std::string(file) + ":" + std::to_string(line) + "\n";
+    msg += get_stacktrace(3);  // presumably skips get_stacktrace, this helper and the constructor — confirm
+    return msg;
+}
+
+AssertionError::AssertionError(const char* condition, const char* file, int line)
+    : std::runtime_error(build_assert_message(condition, file, line)),  // what() returns the full formatted message
+      condition_(condition), file_(file), line_(line) {}  // the raw arguments are also kept in members
+
+[[noreturn]] void assert_impl(const char* condition, const char* file, int line) {  // report to stderr, then throw
+    fprintf(stderr, "\n========================================\n");
+    fprintf(stderr, "断言失败: %s\n", condition);
+    fprintf(stderr, "位置: %s:%d\n", file, line);
+    fprintf(stderr, "%s", get_stacktrace(2).c_str());  // presumably skips get_stacktrace and assert_impl — confirm
+    fprintf(stderr, "========================================\n\n");
+    fflush(stderr);
+    // The report is on stderr; AssertionError's constructor then builds the message (and a stack trace) again.
+    throw AssertionError(condition, file, line);
+}