diff --git a/src/platform/a2a3/aicore/inner_kernel.h b/src/platform/a2a3/aicore/inner_kernel.h index fa31ee3f..3da25ea2 100644 --- a/src/platform/a2a3/aicore/inner_kernel.h +++ b/src/platform/a2a3/aicore/inner_kernel.h @@ -9,6 +9,9 @@ #ifndef PLATFORM_A2A3_AICORE_INNER_KERNEL_H_ #define PLATFORM_A2A3_AICORE_INNER_KERNEL_H_ +#include +#include "aicpu/aicpu_regs.h" + // AICore function attribute for CANN compiler #ifndef __aicore__ #define __aicore__ [aicore] @@ -17,4 +20,30 @@ // dcci (Data Cache Clean and Invalidate) is provided by CANN headers // No need to define it here - it's a hardware instruction +/** + * Read task_id from DATA_MAIN_BASE register + * + * Return values: 0=idle, AICORE_EXIT_SIGNAL=shutdown, other=task_id+1 + */ + __aicore__ inline uint32_t read_task_id_by_reg() { + uint32_t task_id; + // MOV reads from AICore SPR, bypasses cache + __asm__ volatile("MOV %0, DATA_MAIN_BASE\n" : "=l"(task_id)); + return task_id; +} + +/** + * Set AICore status to BUSY via COND register + */ + __aicore__ inline void set_aicore_busy() { + set_cond(AICoreStatus::BUSY); +} + +/** + * Set AICore status to IDLE via COND register + */ + __aicore__ inline void set_aicore_idle() { + set_cond(AICoreStatus::IDLE); +} + #endif // PLATFORM_A2A3_AICORE_INNER_KERNEL_H_ diff --git a/src/platform/a2a3/aicore/kernel.cpp b/src/platform/a2a3/aicore/kernel.cpp index b17615e9..8e1f641c 100644 --- a/src/platform/a2a3/aicore/kernel.cpp +++ b/src/platform/a2a3/aicore/kernel.cpp @@ -23,7 +23,7 @@ class Runtime; [[block_local]] int block_idx; [[block_local]] CoreType core_type; -extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type); +extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id); /** * Kernel entry point with control loop @@ -49,5 +49,6 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ Runtime block_idx = get_block_idx(); core_type = 
CoreType::AIC;
 #endif
-    aicore_execute(runtime, block_idx, core_type);
+    uint32_t physical_core_id = static_cast<uint32_t>(get_coreid()) & AICORE_COREID_MASK;
+    aicore_execute(runtime, block_idx, core_type, physical_core_id);
 }
diff --git a/src/platform/a2a3/aicpu/aicpu_regs.cpp b/src/platform/a2a3/aicpu/aicpu_regs.cpp
new file mode 100644
index 00000000..950d1ee0
--- /dev/null
+++ b/src/platform/a2a3/aicpu/aicpu_regs.cpp
@@ -0,0 +1,40 @@
+/**
+ * @file aicpu_regs.cpp
+ * @brief AICPU-side AICore register access implementation
+ */
+
+#include "aicpu/aicpu_regs.h"
+
+void enable_aicore_register(uint64_t reg_base_addr) {
+    volatile uint32_t* reg_enable =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_FAST_PATH_ENABLE_OFFSET);
+    *reg_enable = REG_SPR_FAST_PATH_OPEN;
+    __sync_synchronize();
+
+    // Clear stale task_id from previous kernel runs
+    volatile uint32_t* reg_task =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_DATA_MAIN_BASE_OFFSET);
+    *reg_task = 0;
+}
+
+void disable_aicore_register(uint64_t reg_base_addr) {
+    volatile uint32_t* reg_enable =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_FAST_PATH_ENABLE_OFFSET);
+    *reg_enable = REG_SPR_FAST_PATH_CLOSE;
+    __sync_synchronize();
+}
+
+void write_task_id_to_aicore(uint64_t reg_base_addr, uint32_t task_id) {
+    volatile uint32_t* reg_task =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_DATA_MAIN_BASE_OFFSET);
+    *reg_task = task_id;
+    __sync_synchronize();
+}
+
+AICoreStatus read_aicore_status(uint64_t reg_base_addr) {
+    volatile uint32_t* reg_cond =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_COND_OFFSET);
+    __sync_synchronize();
+    uint32_t status_val = *reg_cond;
+    return static_cast<AICoreStatus>(status_val);
+}
diff --git a/src/platform/a2a3/host/CMakeLists.txt b/src/platform/a2a3/host/CMakeLists.txt
index 85e28645..7dfabdb1 100644
--- a/src/platform/a2a3/host/CMakeLists.txt
+++ b/src/platform/a2a3/host/CMakeLists.txt
@@ -24,6 +24,7 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/performance_collector.cpp" diff --git a/src/platform/a2a3/host/device_runner.cpp b/src/platform/a2a3/host/device_runner.cpp index a5ece59e..ec92aac1 100644 --- a/src/platform/a2a3/host/device_runner.cpp +++ b/src/platform/a2a3/host/device_runner.cpp @@ -11,6 +11,7 @@ // Include HAL constants from CANN (header only, library loaded dynamically) #include "ascend_hal.h" +#include "host/host_regs.h" // Register address retrieval // ============================================================================= // Lazy-loaded HAL (ascend_hal) for profiling host-register only @@ -324,6 +325,13 @@ int DeviceRunner::run(Runtime& runtime, worker_count_ = num_aicore; // Store for print_handshake_results in destructor runtime.sche_cpu_num = launch_aicpu_num; + // Get AICore register addresses for register-based task dispatch + rc = init_aicore_register_addresses(&runtime.regs, static_cast(device_id), mem_alloc_); + if (rc != 0) { + LOG_ERROR("init_aicore_register_addresses failed: %d", rc); + return rc; + } + // Calculate number of AIC cores (1/3 of total) int num_aic = block_dim; // Round up for 1/3 diff --git a/src/platform/a2a3/host/host_regs.cpp b/src/platform/a2a3/host/host_regs.cpp new file mode 100644 index 00000000..da614fec --- /dev/null +++ b/src/platform/a2a3/host/host_regs.cpp @@ -0,0 +1,178 @@ +/** + * @file host_regs.cpp + * @brief Host-side AICore register address retrieval implementation + */ + +#include "host/host_regs.h" +#include "host/memory_allocator.h" +#include "common/unified_log.h" +#include "common/platform_config.h" +#include "runtime/rt.h" +#include "ascend_hal.h" // CANN HAL API definitions (MODULE_TYPE_AICORE, INFO_TYPE_OCCUPY, etc.) 
+#include +#include + +/** + * Query valid AICore cores via HAL API + */ +static bool get_pg_mask(uint64_t& valid, int64_t device_id) { + uint64_t aicore_bitmap[PLATFORM_AICORE_BITMAP_LEN] = {0}; + int32_t size_n = static_cast(sizeof(uint64_t)) * PLATFORM_AICORE_BITMAP_LEN; + + auto halFuncDevInfo = + (int (*)(uint64_t deviceId, int32_t moduleType, int32_t infoType, void* buf, int32_t* size))dlsym( + nullptr, "halGetDeviceInfoByBuff"); + + if (halFuncDevInfo == nullptr) { + LOG_WARN("halGetDeviceInfoByBuff not found, assuming all cores valid"); + return false; + } + + auto ret = halFuncDevInfo(static_cast(device_id), + MODULE_TYPE_AICORE, + INFO_TYPE_OCCUPY, + reinterpret_cast(&aicore_bitmap[0]), + &size_n); + + if (ret != 0) { + LOG_ERROR("halGetDeviceInfoByBuff failed with rc=%d", ret); + return false; + } + + valid = aicore_bitmap[0]; + return true; +} + +/** + * Retrieve AICore register base addresses via HAL API + */ +static int get_aicore_reg_info(std::vector& aic, std::vector& aiv, + const int& addr_type, int64_t device_id) { + uint64_t valid = 0; + if (!get_pg_mask(valid, device_id)) { + // If can't get mask, assume all cores valid + valid = 0xFFFFFFFF; + LOG_WARN("Using default valid mask 0xFFFFFFFF"); + } + + uint64_t core_stride = 8 * 1024 * 1024; // 8M + uint64_t sub_core_stride = 0x100000ULL; + + auto is_valid = [&valid](int id) { + const uint64_t mask = (1ULL << 25) - 1; + return ((static_cast(valid) ^ mask) & (1ULL << id)) == 0; + }; + + auto halFunc = + (int (*)(int type, void* paramValue, size_t paramValueSize, void* outValue, size_t* outSizeRet))dlsym( + nullptr, "halMemCtl"); + + if (halFunc == nullptr) { + LOG_ERROR("halMemCtl not found in symbol table"); + return -1; + } + + struct AddrMapInPara in_map_para; + struct AddrMapOutPara out_map_para; + in_map_para.devid = device_id; + in_map_para.addr_type = addr_type; + + auto ret = halFunc(0, + reinterpret_cast(&in_map_para), + sizeof(struct AddrMapInPara), + reinterpret_cast(&out_map_para), + 
nullptr); + + if (ret != 0) { + LOG_ERROR("halMemCtl failed with rc=%d", ret); + return ret; + } + + LOG_INFO("Register base: ptr=0x%llx, len=0x%llx", out_map_para.ptr, out_map_para.len); + + // Iterate over all cores and subcores + for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) { + for (uint32_t j = 0; j < PLATFORM_SUB_CORES_PER_AICORE; j++) { + uint64_t vaddr = 0UL; + if (is_valid(i)) { + vaddr = out_map_para.ptr + (i * core_stride + j * sub_core_stride); + } + if (j == 0) { + aic.push_back(vaddr); + } else { + aiv.push_back(vaddr); + } + } + } + + return 0; +} + +void get_aicore_regs(std::vector& regs, uint64_t device_id) { + std::vector aiv; + std::vector aic; + + int rt = get_aicore_reg_info(aic, aiv, ADDR_MAP_TYPE_REG_AIC_CTRL, device_id); + + if (rt != 0) { + LOG_ERROR("get_aicore_reg_info failed, using placeholder addresses"); + // Fallback: generate placeholder addresses + for (int i = 0; i < 25; i++) { + aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000)); // 8M stride + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000); + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000); + } + } + + // AIC cores first, then AIV cores + regs.insert(regs.end(), aic.begin(), aic.end()); + regs.insert(regs.end(), aiv.begin(), aiv.end()); + + LOG_INFO("get_aicore_regs: Retrieved %zu AIC and %zu AIV register addresses", + aic.size(), aiv.size()); +} + +int init_aicore_register_addresses( + uint64_t* runtime_regs_ptr, + uint64_t device_id, + MemoryAllocator& allocator) { + + if (runtime_regs_ptr == nullptr) { + LOG_ERROR("init_aicore_register_addresses: Invalid parameters"); + return -1; + } + + LOG_INFO("Retrieving and allocating AICore register addresses..."); + + // Step 1: Get register addresses from HAL + std::vector host_regs; + get_aicore_regs(host_regs, device_id); + + if (host_regs.empty()) { + LOG_ERROR("Failed to get AICore register addresses"); + return -1; + } + + // Step 2: Allocate device memory for register 
address array + size_t regs_size = host_regs.size() * sizeof(int64_t); + void* reg_ptr = allocator.alloc(regs_size); + if (reg_ptr == nullptr) { + LOG_ERROR("Failed to allocate device memory for register addresses"); + return -1; + } + + // Step 3: Copy register addresses to device memory + int ret = rtMemcpy(reg_ptr, regs_size, host_regs.data(), regs_size, RT_MEMCPY_HOST_TO_DEVICE); + if (ret != 0) { + LOG_ERROR("Failed to copy register addresses to device (rc=%d)", ret); + return -1; + } + + // Step 4: Store device pointer in runtime.regs + *runtime_regs_ptr = reinterpret_cast(reg_ptr); + + LOG_INFO("Successfully initialized register addresses: %zu addresses at device 0x%llx", + host_regs.size(), *runtime_regs_ptr); + + return 0; +} diff --git a/src/platform/a2a3sim/aicore/inner_kernel.h b/src/platform/a2a3sim/aicore/inner_kernel.h index 3a74156e..0f40ce21 100644 --- a/src/platform/a2a3sim/aicore/inner_kernel.h +++ b/src/platform/a2a3sim/aicore/inner_kernel.h @@ -24,6 +24,7 @@ // Cache coherency constants (no-op in simulation) #define ENTIRE_DATA_CACHE 0 +#define SINGLE_CACHE_LINE 0 #define CACHELINE_OUT 0 // ============================================================================= diff --git a/src/platform/include/aicpu/aicpu_regs.h b/src/platform/include/aicpu/aicpu_regs.h new file mode 100644 index 00000000..ca7d86a7 --- /dev/null +++ b/src/platform/include/aicpu/aicpu_regs.h @@ -0,0 +1,45 @@ +/** + * @file aicpu_regs.h + * @brief AICPU-side AICore register access via MMIO + * + * Provides MMIO-based register writes/reads for task dispatch and status polling. + * Memory barriers ensure cross-core visibility. 
+ */ + +#ifndef PLATFORM_A2A3_AICPU_AICPU_REGS_H_ +#define PLATFORM_A2A3_AICPU_AICPU_REGS_H_ + +#include +#include "common/platform_config.h" + +/** + * AICore execution status (read from COND register) + */ +enum class AICoreStatus : uint32_t { + IDLE = 0, + BUSY = 1, +}; + +/** + * Enable register-based communication (call during handshake) + */ +void enable_aicore_register(uint64_t reg_base_addr); + +/** + * Disable register-based communication (call during shutdown) + */ +void disable_aicore_register(uint64_t reg_base_addr); + +/** + * Write task_id to AICore's DATA_MAIN_BASE register + * + * Special values: 0=idle, AICORE_EXIT_SIGNAL=shutdown + */ +void write_task_id_to_aicore(uint64_t reg_base_addr, uint32_t task_id); + +/** + * Read AICore status from COND register + */ +AICoreStatus read_aicore_status(uint64_t reg_base_addr); + +#endif // PLATFORM_A2A3_AICPU_AICPU_REGS_H_ diff --git a/src/platform/include/common/perf_profiling.h b/src/platform/include/common/perf_profiling.h index e3fe4a47..fb99199d 100644 --- a/src/platform/include/common/perf_profiling.h +++ b/src/platform/include/common/perf_profiling.h @@ -38,11 +38,6 @@ #define RUNTIME_MAX_FANOUT 512 #endif -// Maximum cores that can be profiled simultaneously -#ifndef PLATFORM_MAX_CORES -#define PLATFORM_MAX_CORES 72 // 24 blocks × 3 cores/block -#endif - // ============================================================================= // Buffer Status Enumeration // ============================================================================= diff --git a/src/platform/include/common/platform_config.h b/src/platform/include/common/platform_config.h index 5992f563..cb1f4d0c 100644 --- a/src/platform/include/common/platform_config.h +++ b/src/platform/include/common/platform_config.h @@ -100,4 +100,47 @@ constexpr int PLATFORM_PROF_TIMEOUT_SECONDS = 2; * Number of empty polling iterations before checking timeout */ constexpr int PLATFORM_PROF_EMPTY_POLLS_CHECK_NUM = 1000; + +// 
============================================================================= +// Register Communication Configuration +// ============================================================================= + +// Register offsets for AICore SPR access +constexpr uint32_t REG_SPR_DATA_MAIN_BASE_OFFSET = 0xA0; // Task dispatch (AICPU→AICore) +constexpr uint32_t REG_SPR_COND_OFFSET = 0x4C8; // Status (AICore→AICPU): 0=IDLE, 1=BUSY +constexpr uint32_t REG_SPR_FAST_PATH_ENABLE_OFFSET = 0x18; + +// Fast path control values +constexpr uint32_t REG_SPR_FAST_PATH_OPEN = 0xE; +constexpr uint32_t REG_SPR_FAST_PATH_CLOSE = 0xF; + +// Exit signal for AICore shutdown +constexpr uint32_t AICORE_EXIT_SIGNAL = 0x7FFFFFF0; + +// Physical core ID mask for get_coreid() +constexpr uint32_t AICORE_COREID_MASK = 0x0FFF; + +// ============================================================================= +// Hardware Configuration Constants +// ============================================================================= + +/** + * AICore register bitmap buffer length + * Used for querying valid AICore cores via HAL API + */ +constexpr uint8_t PLATFORM_AICORE_BITMAP_LEN = 2; + +/** + * Number of sub-cores per AICore + * Hardware architecture: 1 AICore = 1 AIC + 2 AIV sub-cores + */ +constexpr uint32_t PLATFORM_SUB_CORES_PER_AICORE = PLATFORM_CORES_PER_BLOCKDIM; + +/** + * Maximum physical AICore count for DAV 2201 chip + */ +namespace DAV_2201 { +constexpr uint32_t PLATFORM_MAX_PHYSICAL_CORES = 25; +} + #endif // PLATFORM_COMMON_PLATFORM_CONFIG_H_ diff --git a/src/platform/include/host/host_regs.h b/src/platform/include/host/host_regs.h new file mode 100644 index 00000000..655f19f3 --- /dev/null +++ b/src/platform/include/host/host_regs.h @@ -0,0 +1,42 @@ +/** + * @file host_regs.h + * @brief AICore register address retrieval via CANN HAL APIs + * + * Provides register base addresses for AICPU to perform MMIO-based + * task dispatch to AICore cores. 
+ */ + +#ifndef PLATFORM_HOST_HOST_REGS_H_ +#define PLATFORM_HOST_HOST_REGS_H_ + +#include +#include + +// Forward declaration +class MemoryAllocator; + +/** + * Get AICore register base addresses for all cores + * + * @param regs Output vector (AIC cores followed by AIV cores) + * @param device_id Device ID + */ +void get_aicore_regs(std::vector& regs, uint64_t device_id); + +/** + * Initialize AICore register addresses for runtime + * + * Retrieves register addresses from HAL, allocates device memory, + * copies addresses to device, and stores the device pointer in runtime. + * + * @param runtime_regs_ptr Pointer to runtime.regs field + * @param device_id Device ID + * @param allocator Memory allocator for device memory + * @return 0 on success, negative on failure + */ +int init_aicore_register_addresses( + uint64_t* runtime_regs_ptr, + uint64_t device_id, + MemoryAllocator& allocator); + +#endif // PLATFORM_HOST_HOST_REGS_H_ diff --git a/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp b/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp index f002616b..b4c33b22 100644 --- a/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp +++ b/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp @@ -46,7 +46,7 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* kernel(reinterpret_cast<__gm__ int64_t*>(task->args)); } -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]); // Phase 1: Wait for AICPU initialization signal diff --git a/src/runtime/aicpu_build_graph/runtime/runtime.h b/src/runtime/aicpu_build_graph/runtime/runtime.h index 9ccee95d..7f4b7fb9 100644 --- a/src/runtime/aicpu_build_graph/runtime/runtime.h +++ 
b/src/runtime/aicpu_build_graph/runtime/runtime.h @@ -245,6 +245,9 @@ class Runtime { // Execution parameters for AICPU scheduling int sche_cpu_num; // Number of AICPU threads for scheduling + // Register-based task dispatch (unused in aicpu_build_graph, but required by platform) + uint64_t regs{0}; // Device memory pointer to register address array + /** * Orchestration payload (auto-populated by init_runtime_impl, consumed by AICPU orchestration). * diff --git a/src/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/runtime/host_build_graph/aicore/aicore_executor.cpp index 9781bec4..c41c88cc 100644 --- a/src/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -1,7 +1,7 @@ #include "aicore/aicore.h" #include "runtime.h" #include "common/perf_profiling.h" -#include "common/memory_barrier.h" +#include "common/platform_config.h" // Platform configuration (C/C++ compatible) typedef void (*KernelFunc)(__gm__ int64_t*); @@ -28,23 +28,19 @@ __aicore__ __attribute__((always_inline)) static void record_task_performance( CoreType core_type, uint64_t kernel_ready_time) { - // Check if buffer is available for writing + // dcci() for handshake visibility during profiling + dcci((__gm__ uint32_t*)&my_hank->perf_buffer_status, SINGLE_CACHE_LINE, CACHELINE_OUT); + if (my_hank->perf_buffer_status != 0) { - return; // Buffer full, skip recording + return; } - // Get current performance buffer pointer __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr; - - // Get current count uint32_t idx = perf_buf->count; - // Check if buffer has space if (idx < PLATFORM_PROF_BUFFER_SIZE) { - // Get pointer to the record slot __gm__ PerfRecord* record = (__gm__ PerfRecord*)&perf_buf->records[idx]; - // Write record data (only essential fields, fanout filled by AICPU) record->start_time = start_time; record->end_time = end_time; record->kernel_ready_time = kernel_ready_time; @@ -53,20 +49,16 @@ 
__aicore__ __attribute__((always_inline)) static void record_task_performance( record->core_id = block_idx; record->core_type = core_type; - // Increment count after writing record perf_buf->count = idx + 1; - - // Write memory barrier: ensure performance data is visible to Host - wmb(); - - // Check if buffer is full after this write + dcci(record, ENTIRE_DATA_CACHE, CACHELINE_OUT); if (perf_buf->count >= PLATFORM_PROF_BUFFER_SIZE) { - my_hank->perf_buffer_status = 1; // Notify AICPU: buffer full + my_hank->perf_buffer_status = 1; } } else { - // Buffer is already full my_hank->perf_buffer_status = 1; } + // dcci() for handshake visibility during profiling + dcci((__gm__ uint32_t*)&my_hank->perf_buffer_status, SINGLE_CACHE_LINE, CACHELINE_OUT); } __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* task) { @@ -75,9 +67,12 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* } KernelFunc kernel = (KernelFunc)task->function_bin_addr; kernel(reinterpret_cast<__gm__ int64_t*>(task->args)); + + // Ensure all memory writes are visible to other cores + pipe_barrier(PIPE_ALL); } -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]); // Phase 1: Wait for AICPU initialization signal @@ -85,51 +80,45 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); } - // Phase 2: Signal AICore is ready and report core type - my_hank->core_type = core_type; // Report core type to AICPU - my_hank->aicore_done = block_idx + 1; // Signal ready (use block_idx + 1 to avoid 0) + // Report physical core ID and core type for AICPU + my_hank->physical_core_id = physical_core_id; + 
my_hank->core_type = core_type; + my_hank->aicore_done = block_idx + 1; + + dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); + + // Report initial idle status for task dispatch + set_aicore_idle(); - // Check if profiling is enabled bool profiling_enabled = runtime->enable_profiling; + uint64_t kernel_ready_time = get_sys_cnt(); - // Record kernel ready time (before entering main loop) - // This timestamp represents when the AICore is ready to execute tasks - // but hasn't started executing any task yet. - // Used for: 1) Startup overhead analysis, 2) Cross-core time alignment - uint64_t kernel_ready_time = 0; - if (profiling_enabled) { - kernel_ready_time = get_sys_cnt(); - } + // Main loop: poll DATA_MAIN_BASE for task_id + volatile uint32_t task_id = 0; + volatile uint32_t last_task_id = 0; - // Phase 3: Main execution loop - poll for tasks until quit signal while (true) { - dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); - - // Check for quit command from AICPU - if (my_hank->control == 1) { - break; // Exit kernel + task_id = read_task_id_by_reg(); + if (task_id == AICORE_EXIT_SIGNAL) { + break; } - // Execute task if assigned (task != 0 means valid task pointer) - if (my_hank->task_status == 1 && my_hank->task != 0) { - __gm__ Task* task_ptr = reinterpret_cast<__gm__ Task*>(my_hank->task); - - // Performance profiling: record start time - uint64_t start_time = 0; - start_time = get_sys_cnt(); + // Execute task if new (task_id encoding: 0=idle, task_id+1=task) + if (task_id != 0 && task_id != last_task_id) { + set_aicore_busy(); + __gm__ Task* task_ptr = &(runtime->tasks[task_id - 1]); + uint64_t start_time = get_sys_cnt(); - - // Execute the task execute_task(task_ptr); - // Performance profiling: record task execution if (profiling_enabled) { uint64_t end_time = get_sys_cnt(); - record_task_performance(my_hank, task_ptr, start_time, end_time, block_idx, core_type, kernel_ready_time); + record_task_performance(my_hank, task_ptr, start_time, end_time, + 
block_idx, core_type, kernel_ready_time); } - // Mark task as complete (task_status: 0=idle, 1=busy) - my_hank->task_status = 0; + last_task_id = task_id; + set_aicore_idle(); } } } diff --git a/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 4b378fa0..9b76d42a 100644 --- a/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -9,15 +9,19 @@ #include "runtime.h" #include "aicpu/device_log.h" #include "inner_aicpu.h" +#include "aicpu/aicpu_regs.h" // Register-based communication constexpr int MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS; constexpr int MAX_AIC_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD; constexpr int MAX_AIV_PER_THREAD = PLATFORM_MAX_AIV_PER_THREAD; constexpr int MAX_CORES_PER_THREAD = PLATFORM_MAX_CORES_PER_THREAD; +constexpr int MAX_CORES = PLATFORM_MAX_CORES; // Core information for discovery struct CoreInfo { - int worker_id; // Index in runtime.workers[] + int worker_id; // Index in runtime.workers[] + uint32_t physical_core_id; // Hardware physical core ID (from AICore) + uint64_t reg_addr; // Cached register address for fast access CoreType core_type; }; @@ -40,6 +44,12 @@ struct AicpuExecutor { int aic_count_{0}; int aiv_count_{0}; + // Fast lookup: core_id -> reg_addr + uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD]; + + // Track executing task_id per core (-1 = idle) + int executing_task_ids_[MAX_CORES]; + // ===== Task queue state ===== std::mutex ready_queue_aic_mutex_; int ready_queue_aic_[RUNTIME_MAX_TASKS]; @@ -122,6 +132,11 @@ int AicpuExecutor::init(Runtime* runtime) { LOG_INFO("Config: threads=%d, cores=%d, cores_per_thread=%d", thread_num_, cores_total_num_, thread_cores_num_); + // Initialize executing_task_ids_ to -1 (idle) + for (int i = 0; i < MAX_CORES; i++) { + executing_task_ids_[i] = -1; + } + // Assign discovered cores to threads assign_cores_to_threads(); @@ -228,24 +243,38 @@ int 
AicpuExecutor::handshake_all_cores(Runtime* runtime) { // Busy wait for core response } - // Read core type (written by AICore during handshake) CoreType type = hank->core_type; + uint32_t physical_core_id = hank->physical_core_id; + + // Get register address using physical_core_id + uint64_t* regs = reinterpret_cast(runtime->regs); + uint64_t reg_addr = regs[physical_core_id]; - // Classify and store core information if (type == CoreType::AIC) { aic_cores_[aic_count_].worker_id = i; + aic_cores_[aic_count_].physical_core_id = physical_core_id; + aic_cores_[aic_count_].reg_addr = reg_addr; aic_cores_[aic_count_].core_type = type; aic_count_++; } else if (type == CoreType::AIV) { aiv_cores_[aiv_count_].worker_id = i; + aiv_cores_[aiv_count_].physical_core_id = physical_core_id; + aiv_cores_[aiv_count_].reg_addr = reg_addr; aiv_cores_[aiv_count_].core_type = type; aiv_count_++; } else { - LOG_ERROR("Unknown core type %d for core %d", static_cast(type), i); + LOG_ERROR("Unknown core type from core %d", i); return -1; } - LOG_INFO(" Core %d: type=%s", i, core_type_to_string(type)); + core_id_to_reg_addr_[i] = reg_addr; + + LOG_INFO(" Core %d: type=%s, physical_id=%u, reg_addr=0x%lx", + i, core_type_to_string(type), physical_core_id, reg_addr); + + if (reg_addr != 0) { + enable_aicore_register(reg_addr); + } } LOG_INFO("Discovery complete: AIC=%d, AIV=%d, Total=%d", aic_count_, aiv_count_, cores_total_num_); @@ -340,7 +369,14 @@ int AicpuExecutor::shutdown_aicore(Runtime* runtime, int thread_idx, const int* int core_id = cur_thread_cores[i]; Handshake* hank = &all_hanks[core_id]; LOG_INFO("Thread %d: AICPU hank addr = 0x%lx", thread_idx, (uint64_t)hank); - hank->control = 1; + + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + if (reg_addr != 0) { + write_task_id_to_aicore(reg_addr, AICORE_EXIT_SIGNAL); + disable_aicore_register(reg_addr); + } else { + LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id); + } } LOG_INFO("Thread %d: 
Shutdown complete", thread_idx); return 0; @@ -377,15 +413,17 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - Handshake* h = &hank[core_id]; - if (h->task_status != 0 || h->task != 0) { + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = read_aicore_status(reg_addr); + + if (status != AICoreStatus::IDLE || executing_task_ids_[core_id] >= 0) { all_cores_idle = false; if (verification_warning_count == 0) { - LOG_WARN("Thread %d: Counter reached %d/%d but core %d still has work (status=%d, task=%p)", + LOG_WARN("Thread %d: Counter reached %d/%d but core %d still has work (COND=%d, task_id=%d)", thread_idx, completed_tasks_.load(std::memory_order_acquire), task_count, - core_id, h->task_status, (void*)h->task); + core_id, static_cast(status), executing_task_ids_[core_id]); } break; } @@ -417,15 +455,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const // Phase 1: Process completed tasks on my managed cores for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - Handshake* h = &hank[core_id]; - - // Core finished a task (idle + task not null) - if (h->task_status == 0 && h->task != 0) { - // Get completed task pointer before any buffer operations - Task* task = reinterpret_cast(h->task); - int completed_task_id = task->task_id; - - // Write AICPU dispatch/finish timestamps into the PerfRecord + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = read_aicore_status(reg_addr); + if (status == AICoreStatus::IDLE && executing_task_ids_[core_id] >= 0) { + int task_id = executing_task_ids_[core_id]; + int completed_task_id = task_id; + Handshake* h = &hank[core_id]; if (profiling_enabled) { uint64_t finish_ts = get_sys_cnt_aicpu(); PerfBuffer* perf_buf = (PerfBuffer*)h->perf_records_addr; @@ -445,10 +480,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int 
thread_idx, const if (profiling_enabled && h->perf_buffer_status == 1) { switch_perf_buffer(&runtime, core_id, thread_idx); } - // Clear the task pointer - h->task = 0; - int task_id = completed_task_id; + Task* task = runtime.get_task(task_id); LOG_INFO("Thread %d: Core %d completed task %d", thread_idx, core_id, task_id); @@ -484,10 +517,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const } } - // Update counters + // Update counters and clear task tracking cur_thread_tasks_in_flight--; cur_thread_completed++; made_progress = true; + // Clear task_id + executing_task_ids_[core_id] = -1; completed_tasks_.fetch_add(1, std::memory_order_release); } } @@ -497,10 +532,11 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const // Phase 2: Dispatch new tasks from matching ready queue to idle cores for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - Handshake* h = &hank[core_id]; + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = read_aicore_status(reg_addr); - // Core is idle and available (idle + task is null) - if (h->task_status == 0 && h->task == 0) { + if (status == AICoreStatus::IDLE && executing_task_ids_[core_id] == -1) { + Handshake* h = &hank[core_id]; // Dispatch from matching queue based on core type if (h->core_type == CoreType::AIC) { // AIC core if (ready_count_aic_.load(std::memory_order_acquire) > 0) { @@ -511,16 +547,19 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const int task_id = ready_queue_aic_[ready_queue_aic_head_]; ready_queue_aic_head_ = (ready_queue_aic_head_ + 1) % RUNTIME_MAX_TASKS; ready_count_aic_.fetch_sub(1, std::memory_order_release); - Task* task = runtime.get_task(task_id); LOG_INFO("Thread %d: Dispatching AIC task %d to core %d (head=%d)", thread_idx, task_id, core_id, ready_queue_aic_head_); - h->task = reinterpret_cast(task); + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + + // Write 
task_id+1 to register + write_task_id_to_aicore(reg_addr, static_cast(task_id + 1)); + if (runtime.enable_profiling) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } - h->task_status = 1; // Mark as busy + executing_task_ids_[core_id] = task_id; cur_thread_tasks_in_flight++; made_progress = true; } @@ -534,16 +573,19 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const int task_id = ready_queue_aiv_[ready_queue_aiv_head_]; ready_queue_aiv_head_ = (ready_queue_aiv_head_ + 1) % RUNTIME_MAX_TASKS; ready_count_aiv_.fetch_sub(1, std::memory_order_release); - Task* task = runtime.get_task(task_id); LOG_INFO("Thread %d: Dispatching AIV task %d to core %d (head=%d)", thread_idx, task_id, core_id, ready_queue_aiv_head_); - h->task = reinterpret_cast(task); + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + + // Write task_id+1 to register + write_task_id_to_aicore(reg_addr, static_cast(task_id + 1)); + if (runtime.enable_profiling) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } - h->task_status = 1; // Mark as busy + executing_task_ids_[core_id] = task_id; cur_thread_tasks_in_flight++; made_progress = true; } @@ -656,7 +698,6 @@ void AicpuExecutor::diagnose_stuck_state(Runtime& runtime, int thread_idx, int busy_cores = 0; int idle_cores = 0; - int anomaly_cores = 0; LOG_ERROR("Core Status:"); for (int i = 0; i < core_num; i++) { @@ -665,24 +706,30 @@ void AicpuExecutor::diagnose_stuck_state(Runtime& runtime, int thread_idx, const char* core_type_str = core_type_to_string(h->core_type); - if (h->task != 0) { - Task* task = reinterpret_cast(h->task); + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = read_aicore_status(reg_addr); + + if (status != AICoreStatus::IDLE) { busy_cores++; - LOG_ERROR(" Core %d [%s, BUSY]: task_id=%d, func_id=%d, fanin=%d, fanout=%d", - core_id, core_type_str, - task->task_id, task->func_id, - task->fanin.load(std::memory_order_acquire), - task->fanout_count); - } else if 
(h->task_status != 0) { - anomaly_cores++; - LOG_ERROR(" Core %d [%s, ANOMALY]: status=BUSY but task=NULL", core_id, core_type_str); + int task_id = executing_task_ids_[core_id]; + if (task_id >= 0) { + Task* task = runtime.get_task(task_id); + LOG_ERROR(" Core %d [%s, BUSY]: COND=%d, task_id=%d, func_id=%d, fanin=%d, fanout=%d", + core_id, core_type_str, static_cast(status), + task->task_id, task->func_id, + task->fanin.load(std::memory_order_acquire), + task->fanout_count); + } else { + LOG_ERROR(" Core %d [%s, BUSY]: COND=%d but task_id not tracked", + core_id, core_type_str, static_cast(status)); + } } else { idle_cores++; } } - LOG_ERROR("Summary: %d busy, %d idle, %d anomaly", busy_cores, idle_cores, anomaly_cores); + LOG_ERROR("Summary: %d busy, %d idle", busy_cores, idle_cores); // Diagnose deadlock vs livelock if (busy_cores == 0 && aic_ready == 0 && aiv_ready == 0 && completed < total) { diff --git a/src/runtime/host_build_graph/runtime/runtime.h b/src/runtime/host_build_graph/runtime/runtime.h index 919cada9..cfefabd8 100644 --- a/src/runtime/host_build_graph/runtime/runtime.h +++ b/src/runtime/host_build_graph/runtime/runtime.h @@ -95,16 +95,20 @@ * - task_status: Written by both (AICPU=1 on dispatch, AICore=0 on completion) * - control: Written by AICPU, read by AICore (0 = continue, 1 = quit) * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV) + * - perf_records_addr: Written by AICPU, read by AICore (performance records address) + * - perf_buffer_status: Written by both (AICPU=1 on buffer full, AICore=0 on buffer empty) + * - physical_core_id: Written by AICPU, read by AICore (physical core ID) */ struct Handshake { - volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready - volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready - volatile uint64_t task; // Task pointer: 0=no task, non-zero=Task* address - volatile int32_t task_status; // Task execution status: 0=idle, 
1=busy - volatile int32_t control; // Control signal: 0=execute, 1=quit - volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV - volatile uint64_t perf_records_addr; // Performance records address - volatile uint32_t perf_buffer_status; // 0 = not full, 1 == full + volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready + volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready + volatile uint64_t task; // Task pointer: 0=no task, non-zero=Task* address + volatile int32_t task_status; // Task execution status: 0=idle, 1=busy + volatile int32_t control; // Control signal: 0=execute, 1=quit + volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV + volatile uint64_t perf_records_addr; // Performance records address + volatile uint32_t perf_buffer_status; // 0 = not full, 1 = full + volatile uint32_t physical_core_id; // Physical core ID } __attribute__((aligned(64))); /** @@ -182,12 +186,17 @@ class Runtime { // Execution parameters for AICPU scheduling int sche_cpu_num; // Number of AICPU threads for scheduling + // Register-based task dispatch + uint64_t regs{0}; // Device memory pointer to register address array + // Profiling support bool enable_profiling; // Enable profiling flag uint64_t perf_data_base; // Performance data shared memory base address (device-side) -private: + // Task storage Task tasks[RUNTIME_MAX_TASKS]; // Fixed-size task array + +private: int next_task_id; // Next available task ID // Initial ready tasks (computed once, read-only after) diff --git a/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 91f0e393..09a0c599 100644 --- a/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -111,8 +111,9 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ void* * @param 
runtime Pointer to Runtime in global memory * @param block_idx Block index (core ID) * @param core_type Core type (AIC or AIV) + * @param physical_core_id Physical core ID from hardware */ -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]); // Phase 1: Wait for AICPU initialization signal diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 20e8183b..72f90198 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -136,6 +136,9 @@ class Runtime { // Execution parameters for AICPU scheduling int sche_cpu_num; // Number of AICPU threads for scheduling + // Register-based task dispatch (unused in tensormap_and_ringbuffer, but required by platform) + uint64_t regs{0}; // Device memory pointer to register address array + // PTO2 integration: kernel_id -> GM function_bin_addr mapping // NOTE: Made public for direct access from aicore code uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];