diff --git a/src/platform/a2a3/aicore/inner_kernel.h b/src/platform/a2a3/aicore/inner_kernel.h
index fa31ee3f..e7f0e517 100644
--- a/src/platform/a2a3/aicore/inner_kernel.h
+++ b/src/platform/a2a3/aicore/inner_kernel.h
@@ -9,6 +9,9 @@
 #ifndef PLATFORM_A2A3_AICORE_INNER_KERNEL_H_
 #define PLATFORM_A2A3_AICORE_INNER_KERNEL_H_
 
+#include <cstdint>
+#include "common/platform_config.h"
+
 // AICore function attribute for CANN compiler
 #ifndef __aicore__
 #define __aicore__ [aicore]
@@ -17,4 +20,43 @@
 // dcci (Data Cache Clean and Invalidate) is provided by CANN headers
 // No need to define it here - it's a hardware instruction
 
+// AICoreStatus is defined in aicpu/aicpu_regs.h for AICPU side.
+// Redeclare here for AICore side (avoids pulling in AICPU headers).
+enum class AICoreStatus : uint32_t {
+    IDLE = 0,
+    BUSY = 1,
+};
+
+/**
+ * Read an AICore register via SPR access
+ *
+ * @param reg  Register identifier
+ * @return Register value (zero-extended to uint64_t)
+ */
+__aicore__ inline uint64_t read_reg(RegId reg) {
+    switch (reg) {
+        case RegId::DATA_MAIN_BASE: {
+            uint32_t val;
+            __asm__ volatile("MOV %0, DATA_MAIN_BASE\n" : "=l"(val));
+            return static_cast<uint64_t>(val);
+        }
+        default: return 0;
+    }
+}
+
+/**
+ * Write to an AICore register
+ *
+ * @param reg    Register identifier
+ * @param value  Value to write
+ */
+__aicore__ inline void write_reg(RegId reg, uint64_t value) {
+    switch (reg) {
+        case RegId::COND:
+            set_cond(static_cast<AICoreStatus>(static_cast<uint32_t>(value)));
+            break;
+        default: break;
+    }
+}
+
 #endif  // PLATFORM_A2A3_AICORE_INNER_KERNEL_H_
diff --git a/src/platform/a2a3/aicore/kernel.cpp b/src/platform/a2a3/aicore/kernel.cpp
index b17615e9..8e1f641c 100644
--- a/src/platform/a2a3/aicore/kernel.cpp
+++ b/src/platform/a2a3/aicore/kernel.cpp
@@ -23,7 +23,7 @@ class Runtime;
 [[block_local]] int block_idx;
 [[block_local]] CoreType core_type;
 
-extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type);
+extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id);
 
 /**
  * Kernel entry point with control loop
@@ -49,5 +49,6 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ Runtime
     block_idx = get_block_idx();
     core_type = CoreType::AIC;
 #endif
-    aicore_execute(runtime, block_idx, core_type);
+    uint32_t physical_core_id = static_cast<uint32_t>(get_coreid()) & AICORE_COREID_MASK;
+    aicore_execute(runtime, block_idx, core_type, physical_core_id);
 }
diff --git a/src/platform/a2a3/aicpu/aicpu_regs.cpp b/src/platform/a2a3/aicpu/aicpu_regs.cpp
new file mode 100644
index 00000000..846153e9
--- /dev/null
+++ b/src/platform/a2a3/aicpu/aicpu_regs.cpp
@@ -0,0 +1,23 @@
+/**
+ * @file aicpu_regs.cpp
+ * @brief AICPU-side register access implementation (a2a3 real hardware)
+ *
+ * Uses volatile MMIO pointer access with memory barriers for
+ * cross-core register communication.
+ */
+
+#include "aicpu/aicpu_regs.h"
+
+uint64_t read_reg(uint64_t reg_base_addr, RegId reg) {
+    volatile uint32_t* ptr = reinterpret_cast<volatile uint32_t*>(
+        reg_base_addr + reg_offset(reg));
+    __sync_synchronize();
+    return static_cast<uint64_t>(*ptr);
+}
+
+void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) {
+    volatile uint32_t* ptr = reinterpret_cast<volatile uint32_t*>(
+        reg_base_addr + reg_offset(reg));
+    *ptr = static_cast<uint32_t>(value);
+    __sync_synchronize();
+}
diff --git a/src/platform/a2a3/host/CMakeLists.txt b/src/platform/a2a3/host/CMakeLists.txt
index 85e28645..7dfabdb1 100644
--- a/src/platform/a2a3/host/CMakeLists.txt
+++ b/src/platform/a2a3/host/CMakeLists.txt
@@ -24,6 +24,7 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/../../src/performance_collector.cpp"
diff --git a/src/platform/a2a3/host/device_runner.cpp b/src/platform/a2a3/host/device_runner.cpp
index a5ece59e..ec92aac1 100644
--- a/src/platform/a2a3/host/device_runner.cpp
+++ b/src/platform/a2a3/host/device_runner.cpp
@@ -11,6 +11,7 @@
 
 // Include HAL constants from CANN (header only, library loaded dynamically)
 #include "ascend_hal.h"
+#include "host/host_regs.h"  // Register address retrieval
 
 // =============================================================================
 // Lazy-loaded HAL (ascend_hal) for profiling host-register only
@@ -324,6 +325,13 @@ int DeviceRunner::run(Runtime& runtime,
     worker_count_ = num_aicore;  // Store for print_handshake_results in destructor
     runtime.sche_cpu_num = launch_aicpu_num;
 
+    // Get AICore register addresses for register-based task dispatch
+    rc = init_aicore_register_addresses(&runtime.regs, static_cast<uint64_t>(device_id), mem_alloc_);
+    if (rc != 0) {
+        LOG_ERROR("init_aicore_register_addresses failed: %d", rc);
+        return rc;
+    }
+
     // Calculate number of AIC cores (1/3 of total)
     int num_aic = block_dim;  // Round up for 1/3
 
diff --git a/src/platform/a2a3/host/host_regs.cpp b/src/platform/a2a3/host/host_regs.cpp
new file mode 100644
index 00000000..da614fec
--- /dev/null
+++ b/src/platform/a2a3/host/host_regs.cpp
@@ -0,0 +1,178 @@
+/**
+ * @file host_regs.cpp
+ * @brief Host-side AICore register address retrieval implementation
+ */
+
+#include "host/host_regs.h"
+#include "host/memory_allocator.h"
+#include "common/unified_log.h"
+#include "common/platform_config.h"
+#include "runtime/rt.h"
+#include "ascend_hal.h"  // CANN HAL API definitions (MODULE_TYPE_AICORE, INFO_TYPE_OCCUPY, etc.)
+#include <dlfcn.h>
+#include <iostream>
+
+/**
+ * Query valid AICore cores via HAL API
+ */
+static bool get_pg_mask(uint64_t& valid, int64_t device_id) {
+    uint64_t aicore_bitmap[PLATFORM_AICORE_BITMAP_LEN] = {0};
+    int32_t size_n = static_cast<int32_t>(sizeof(uint64_t)) * PLATFORM_AICORE_BITMAP_LEN;
+
+    auto halFuncDevInfo =
+        (int (*)(uint64_t deviceId, int32_t moduleType, int32_t infoType, void* buf, int32_t* size))dlsym(
+            nullptr, "halGetDeviceInfoByBuff");
+
+    if (halFuncDevInfo == nullptr) {
+        LOG_WARN("halGetDeviceInfoByBuff not found, assuming all cores valid");
+        return false;
+    }
+
+    auto ret = halFuncDevInfo(static_cast<uint32_t>(device_id),
+        MODULE_TYPE_AICORE,
+        INFO_TYPE_OCCUPY,
+        reinterpret_cast<void*>(&aicore_bitmap[0]),
+        &size_n);
+
+    if (ret != 0) {
+        LOG_ERROR("halGetDeviceInfoByBuff failed with rc=%d", ret);
+        return false;
+    }
+
+    valid = aicore_bitmap[0];
+    return true;
+}
+
+/**
+ * Retrieve AICore register base addresses via HAL API
+ */
+static int get_aicore_reg_info(std::vector<int64_t>& aic, std::vector<int64_t>& aiv,
+                           const int& addr_type, int64_t device_id) {
+    uint64_t valid = 0;
+    if (!get_pg_mask(valid, device_id)) {
+        // If can't get mask, assume all cores valid
+        valid = 0xFFFFFFFF;
+        LOG_WARN("Using default valid mask 0xFFFFFFFF");
+    }
+
+    uint64_t core_stride = 8 * 1024 * 1024;  // 8M
+    uint64_t sub_core_stride = 0x100000ULL;
+
+    auto is_valid = [&valid](int id) {
+        const uint64_t mask = (1ULL << 25) - 1;
+        return ((static_cast<uint64_t>(valid) ^ mask) & (1ULL << id)) == 0;
+    };
+
+    auto halFunc =
+        (int (*)(int type, void* paramValue, size_t paramValueSize, void* outValue, size_t* outSizeRet))dlsym(
+            nullptr, "halMemCtl");
+
+    if (halFunc == nullptr) {
+        LOG_ERROR("halMemCtl not found in symbol table");
+        return -1;
+    }
+
+    struct AddrMapInPara in_map_para;
+    struct AddrMapOutPara out_map_para;
+    in_map_para.devid = device_id;
+    in_map_para.addr_type = addr_type;
+
+    auto ret = halFunc(0,
+        reinterpret_cast<void*>(&in_map_para),
+        sizeof(struct AddrMapInPara),
+        reinterpret_cast<void*>(&out_map_para),
+        nullptr);
+
+    if (ret != 0) {
+        LOG_ERROR("halMemCtl failed with rc=%d", ret);
+        return ret;
+    }
+
+    LOG_INFO("Register base: ptr=0x%llx, len=0x%llx", out_map_para.ptr, out_map_para.len);
+
+    // Iterate over all cores and subcores
+    for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) {
+        for (uint32_t j = 0; j < PLATFORM_SUB_CORES_PER_AICORE; j++) {
+            uint64_t vaddr = 0UL;
+            if (is_valid(i)) {
+                vaddr = out_map_para.ptr + (i * core_stride + j * sub_core_stride);
+            }
+            if (j == 0) {
+                aic.push_back(vaddr);
+            } else {
+                aiv.push_back(vaddr);
+            }
+        }
+    }
+
+    return 0;
+}
+
+void get_aicore_regs(std::vector<int64_t>& regs, uint64_t device_id) {
+    std::vector<int64_t> aiv;
+    std::vector<int64_t> aic;
+
+    int rt = get_aicore_reg_info(aic, aiv, ADDR_MAP_TYPE_REG_AIC_CTRL, device_id);
+
+    if (rt != 0) {
+        LOG_ERROR("get_aicore_reg_info failed, using placeholder addresses");
+        // Fallback: generate placeholder addresses
+        for (int i = 0; i < 25; i++) {
+            aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000));  // 8M stride
+            aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000);
+            aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000);
+        }
+    }
+
+    // AIC cores first, then AIV cores
+    regs.insert(regs.end(), aic.begin(), aic.end());
+    regs.insert(regs.end(), aiv.begin(), aiv.end());
+
+    LOG_INFO("get_aicore_regs: Retrieved %zu AIC and %zu AIV register addresses",
+             aic.size(), aiv.size());
+}
+
+int init_aicore_register_addresses(
+    uint64_t* runtime_regs_ptr,
+    uint64_t device_id,
+    MemoryAllocator& allocator) {
+
+    if (runtime_regs_ptr == nullptr) {
+        LOG_ERROR("init_aicore_register_addresses: Invalid parameters");
+        return -1;
+    }
+
+    LOG_INFO("Retrieving and allocating AICore register addresses...");
+
+    // Step 1: Get register addresses from HAL
+    std::vector<int64_t> host_regs;
+    get_aicore_regs(host_regs, device_id);
+
+    if (host_regs.empty()) {
+        LOG_ERROR("Failed to get AICore register addresses");
+        return -1;
+    }
+
+    // Step 2: Allocate device memory for register address array
+    size_t regs_size = host_regs.size() * sizeof(int64_t);
+    void* reg_ptr = allocator.alloc(regs_size);
+    if (reg_ptr == nullptr) {
+        LOG_ERROR("Failed to allocate device memory for register addresses");
+        return -1;
+    }
+
+    // Step 3: Copy register addresses to device memory
+    int ret = rtMemcpy(reg_ptr, regs_size, host_regs.data(), regs_size, RT_MEMCPY_HOST_TO_DEVICE);
+    if (ret != 0) {
+        LOG_ERROR("Failed to copy register addresses to device (rc=%d)", ret);
+        return -1;
+    }
+
+    // Step 4: Store device pointer in runtime.regs
+    *runtime_regs_ptr = reinterpret_cast<uint64_t>(reg_ptr);
+
+    LOG_INFO("Successfully initialized register addresses: %zu addresses at device 0x%llx",
+             host_regs.size(), *runtime_regs_ptr);
+
+    return 0;
+}
diff --git a/src/platform/a2a3sim/aicore/inner_kernel.h b/src/platform/a2a3sim/aicore/inner_kernel.h
index 3a74156e..25cb0762 100644
--- a/src/platform/a2a3sim/aicore/inner_kernel.h
+++ b/src/platform/a2a3sim/aicore/inner_kernel.h
@@ -24,8 +24,13 @@
 
 // Cache coherency constants (no-op in simulation)
 #define ENTIRE_DATA_CACHE 0
+#define SINGLE_CACHE_LINE 0
 #define CACHELINE_OUT 0
 
+// pipe_barrier - memory barrier in simulation (hardware pipeline synchronization)
+#define PIPE_ALL 0
+#define pipe_barrier(pipe) __sync_synchronize()
+
 // =============================================================================
 // System Counter Simulation
 // =============================================================================
@@ -56,4 +61,49 @@ inline uint64_t get_sys_cnt() {
     return ticks;
 }
 
+// =============================================================================
+// Register Access Simulation
+// =============================================================================
+
+/**
+ * Per-thread simulated register base address.
+ * Set by the kernel wrapper before calling aicore_execute().
+ * Points to a SIM_REG_BLOCK_SIZE-byte block allocated by DeviceRunner.
+ */
+extern thread_local volatile uint8_t* g_sim_reg_base;
+
+/**
+ * AICore execution status (matches aicpu/aicpu_regs.h definition)
+ */
+enum class AICoreStatus : uint32_t {
+    IDLE = 0,
+    BUSY = 1,
+};
+
+/**
+ * Read an AICore register from simulated register memory
+ *
+ * @param reg  Register identifier
+ * @return Register value (zero-extended to uint64_t)
+ */
+inline uint64_t read_reg(RegId reg) {
+    uint32_t offset = reg_offset(reg);
+    __sync_synchronize();
+    return static_cast<uint64_t>(
+        *reinterpret_cast<volatile uint32_t*>(g_sim_reg_base + offset));
+}
+
+/**
+ * Write to an AICore register in simulated register memory
+ *
+ * @param reg    Register identifier
+ * @param value  Value to write
+ */
+inline void write_reg(RegId reg, uint64_t value) {
+    uint32_t offset = reg_offset(reg);
+    *reinterpret_cast<volatile uint32_t*>(g_sim_reg_base + offset) =
+        static_cast<uint32_t>(value);
+    __sync_synchronize();
+}
+
 #endif  // PLATFORM_A2A3SIM_AICORE_INNER_KERNEL_H_
diff --git a/src/platform/a2a3sim/aicore/kernel.cpp b/src/platform/a2a3sim/aicore/kernel.cpp
index 46058f2c..8dcbcb9d 100644
--- a/src/platform/a2a3sim/aicore/kernel.cpp
+++ b/src/platform/a2a3sim/aicore/kernel.cpp
@@ -2,18 +2,30 @@
  * AICore Kernel Wrapper for Simulation
  *
  * Provides a wrapper around aicore_execute for dlsym lookup.
- * This allows adding pre/post processing around kernel execution.
+ * Sets up per-thread simulated register base before calling the executor.
  */
 
+#include <cstdint>
 #include "aicore/aicore.h"
 #include "common/core_type.h"
+#include "common/platform_config.h"
+#include "runtime.h"
 
-class Runtime;
+// Thread-local simulated register base (declared in inner_kernel.h)
+thread_local volatile uint8_t* g_sim_reg_base = nullptr;
 
 // Declare the original function (defined in aicore_executor.cpp with weak linkage)
-void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type);
+void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id);
 
 // Wrapper with extern "C" for dlsym lookup
-extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, CoreType core_type) {
-    aicore_execute(runtime, block_idx, core_type);
+extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) {
+    // Set up simulated register base for this thread.
+    // runtime->regs points to an array of uint64_t base addresses (one per core).
+    // physical_core_id indexes into it to get this core's register block.
+    if (runtime->regs != 0) {
+        uint64_t* regs_array = reinterpret_cast<uint64_t*>(runtime->regs);
+        g_sim_reg_base = reinterpret_cast<volatile uint8_t*>(regs_array[physical_core_id]);
+    }
+
+    aicore_execute(runtime, block_idx, core_type, physical_core_id);
 }
diff --git a/src/platform/a2a3sim/aicpu/aicpu_regs.cpp b/src/platform/a2a3sim/aicpu/aicpu_regs.cpp
new file mode 100644
index 00000000..1f391689
--- /dev/null
+++ b/src/platform/a2a3sim/aicpu/aicpu_regs.cpp
@@ -0,0 +1,23 @@
+/**
+ * @file aicpu_regs.cpp
+ * @brief AICPU-side register access implementation (a2a3sim simulation)
+ *
+ * Uses volatile pointer access to host-allocated memory blocks
+ * that simulate AICore hardware registers.
+ */
+
+#include "aicpu/aicpu_regs.h"
+
+uint64_t read_reg(uint64_t reg_base_addr, RegId reg) {
+    volatile uint32_t* ptr = reinterpret_cast<volatile uint32_t*>(
+        reg_base_addr + reg_offset(reg));
+    __sync_synchronize();
+    return static_cast<uint64_t>(*ptr);
+}
+
+void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) {
+    volatile uint32_t* ptr = reinterpret_cast<volatile uint32_t*>(
+        reg_base_addr + reg_offset(reg));
+    *ptr = static_cast<uint32_t>(value);
+    __sync_synchronize();
+}
diff --git a/src/platform/a2a3sim/host/device_runner.cpp b/src/platform/a2a3sim/host/device_runner.cpp
index 491b33c8..c9a96550 100644
--- a/src/platform/a2a3sim/host/device_runner.cpp
+++ b/src/platform/a2a3sim/host/device_runner.cpp
@@ -18,7 +18,7 @@
 
 // Function pointer types for dynamically loaded executors
 typedef int (*aicpu_execute_func_t)(Runtime* runtime);
-typedef void (*aicore_execute_func_t)(Runtime* runtime, int block_idx, CoreType core_type);
+typedef void (*aicore_execute_func_t)(Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id);
 
 // =============================================================================
 // DeviceRunner Implementation
@@ -89,7 +89,7 @@ int DeviceRunner::ensure_binaries_loaded(const std::vector<uint8_t>& aicpu_so_bi
             return -1;
         }
 
-        aicore_execute_func_ = reinterpret_cast<void(*)(Runtime*, int, CoreType)>(dlsym(aicore_so_handle_, "aicore_execute_wrapper"));
+        aicore_execute_func_ = reinterpret_cast<void(*)(Runtime*, int, CoreType, uint32_t)>(dlsym(aicore_so_handle_, "aicore_execute_wrapper"));
         if (aicore_execute_func_ == nullptr) {
             LOG_ERROR("dlsym failed for aicore_execute_wrapper: %s", dlerror());
             return -1;
@@ -212,6 +212,30 @@ int DeviceRunner::run(Runtime& runtime,
         }
     }
 
+    // Allocate simulated register blocks for all AICore cores
+    size_t total_reg_size = num_aicore * SIM_REG_BLOCK_SIZE;
+    void* reg_blocks = mem_alloc_.alloc(total_reg_size);
+    if (reg_blocks == nullptr) {
+        LOG_ERROR("Failed to allocate simulated register memory (%zu bytes)", total_reg_size);
+        return -1;
+    }
+    std::memset(reg_blocks, 0, total_reg_size);
+
+    // Build array of per-core register base addresses
+    size_t regs_array_size = num_aicore * sizeof(uint64_t);
+    uint64_t* regs_array = reinterpret_cast<uint64_t*>(mem_alloc_.alloc(regs_array_size));
+    if (regs_array == nullptr) {
+        LOG_ERROR("Failed to allocate register address array");
+        return -1;
+    }
+    for (int i = 0; i < num_aicore; i++) {
+        regs_array[i] = reinterpret_cast<uint64_t>(
+            static_cast<uint8_t*>(reg_blocks) + i * SIM_REG_BLOCK_SIZE);
+    }
+    runtime.regs = reinterpret_cast<uint64_t>(regs_array);
+
+    LOG_INFO("Allocated simulated registers: %d cores x 0x%x bytes", num_aicore, SIM_REG_BLOCK_SIZE);
+
     // Check if executors are loaded
     if (aicpu_execute_func_ == nullptr || aicore_execute_func_ == nullptr) {
         LOG_ERROR("Executor functions not loaded. Call ensure_binaries_loaded first.");
@@ -232,8 +256,9 @@ int DeviceRunner::run(Runtime& runtime,
     std::vector<std::thread> aicore_threads;
     for (int i = 0; i < num_aicore; i++) {
         CoreType core_type = runtime.workers[i].core_type;
-        aicore_threads.emplace_back([this, &runtime, i, core_type]() {
-            aicore_execute_func_(&runtime, i, core_type);
+        uint32_t physical_core_id = static_cast<uint32_t>(i);
+        aicore_threads.emplace_back([this, &runtime, i, core_type, physical_core_id]() {
+            aicore_execute_func_(&runtime, i, core_type, physical_core_id);
         });
     }
 
diff --git a/src/platform/a2a3sim/host/device_runner.h b/src/platform/a2a3sim/host/device_runner.h
index 3ae8ef34..f169b8f1 100644
--- a/src/platform/a2a3sim/host/device_runner.h
+++ b/src/platform/a2a3sim/host/device_runner.h
@@ -207,7 +207,7 @@ class DeviceRunner {
     void* aicpu_so_handle_{nullptr};
     void* aicore_so_handle_{nullptr};
     int (*aicpu_execute_func_)(Runtime*){nullptr};
-    void (*aicore_execute_func_)(Runtime*, int, CoreType){nullptr};
+    void (*aicore_execute_func_)(Runtime*, int, CoreType, uint32_t){nullptr};
     std::string aicpu_so_path_;
     std::string aicore_so_path_;
 
diff --git a/src/platform/include/aicpu/aicpu_regs.h b/src/platform/include/aicpu/aicpu_regs.h
new file mode 100644
index 00000000..33e5b1ce
--- /dev/null
+++ b/src/platform/include/aicpu/aicpu_regs.h
@@ -0,0 +1,42 @@
+/**
+ * @file aicpu_regs.h
+ * @brief AICPU-side register access interface
+ *
+ * Provides unified read_reg/write_reg for AICPU to access AICore registers.
+ * On real hardware (a2a3): MMIO volatile pointer access with memory barriers.
+ * In simulation (a2a3sim): volatile pointer access to host-allocated memory.
+ */
+
+#ifndef PLATFORM_AICPU_AICPU_REGS_H_
+#define PLATFORM_AICPU_AICPU_REGS_H_
+
+#include <cstdint>
+#include "common/platform_config.h"
+
+/**
+ * AICore execution status (read from COND register)
+ */
+enum class AICoreStatus : uint32_t {
+    IDLE = 0,
+    BUSY = 1,
+};
+
+/**
+ * Read a register value from an AICore's register block
+ *
+ * @param reg_base_addr  Base address of the AICore's register block
+ * @param reg            Register identifier
+ * @return Register value (zero-extended to uint64_t)
+ */
+uint64_t read_reg(uint64_t reg_base_addr, RegId reg);
+
+/**
+ * Write a value to an AICore's register
+ *
+ * @param reg_base_addr  Base address of the AICore's register block
+ * @param reg            Register identifier
+ * @param value          Value to write (truncated to register width)
+ */
+void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value);
+
+#endif  // PLATFORM_AICPU_AICPU_REGS_H_
diff --git a/src/platform/include/common/perf_profiling.h b/src/platform/include/common/perf_profiling.h
index e3fe4a47..fb99199d 100644
--- a/src/platform/include/common/perf_profiling.h
+++ b/src/platform/include/common/perf_profiling.h
@@ -38,11 +38,6 @@
 #define RUNTIME_MAX_FANOUT 512
 #endif
 
-// Maximum cores that can be profiled simultaneously
-#ifndef PLATFORM_MAX_CORES
-#define PLATFORM_MAX_CORES 72  // 24 blocks × 3 cores/block
-#endif
-
 // =============================================================================
 // Buffer Status Enumeration
 // =============================================================================
diff --git a/src/platform/include/common/platform_config.h b/src/platform/include/common/platform_config.h
index 5992f563..bcba2965 100644
--- a/src/platform/include/common/platform_config.h
+++ b/src/platform/include/common/platform_config.h
@@ -100,4 +100,71 @@ constexpr int PLATFORM_PROF_TIMEOUT_SECONDS = 2;
  * Number of empty polling iterations before checking timeout
  */
 constexpr int PLATFORM_PROF_EMPTY_POLLS_CHECK_NUM = 1000;
+
+// =============================================================================
+// Register Communication Configuration
+// =============================================================================
+
+// Register offsets for AICore SPR access
+constexpr uint32_t REG_SPR_DATA_MAIN_BASE_OFFSET = 0xA0;  // Task dispatch (AICPU→AICore)
+constexpr uint32_t REG_SPR_COND_OFFSET = 0x4C8;           // Status (AICore→AICPU): 0=IDLE, 1=BUSY
+constexpr uint32_t REG_SPR_FAST_PATH_ENABLE_OFFSET = 0x18;
+
+// Fast path control values
+constexpr uint32_t REG_SPR_FAST_PATH_OPEN = 0xE;
+constexpr uint32_t REG_SPR_FAST_PATH_CLOSE = 0xF;
+
+// Exit signal for AICore shutdown
+constexpr uint32_t AICORE_EXIT_SIGNAL = 0x7FFFFFF0;
+
+// Physical core ID mask for get_coreid()
+constexpr uint32_t AICORE_COREID_MASK = 0x0FFF;
+
+/**
+ * Register identifier for unified read_reg/write_reg interface
+ */
+enum class RegId : uint32_t {
+    DATA_MAIN_BASE = 0,    // Task dispatch (AICPU→AICore)
+    COND = 1,              // Status (AICore→AICPU)
+    FAST_PATH_ENABLE = 2,  // Fast path control
+};
+
+/**
+ * Map RegId to hardware register offset
+ */
+constexpr uint32_t reg_offset(RegId reg) {
+    switch (reg) {
+        case RegId::DATA_MAIN_BASE:  return REG_SPR_DATA_MAIN_BASE_OFFSET;
+        case RegId::COND:            return REG_SPR_COND_OFFSET;
+        case RegId::FAST_PATH_ENABLE: return REG_SPR_FAST_PATH_ENABLE_OFFSET;
+        default: return 0;
+    }
+}
+
+// Size of simulated register block per core (covers largest offset + 4 bytes)
+constexpr uint32_t SIM_REG_BLOCK_SIZE = 0x500;
+
+// =============================================================================
+// Hardware Configuration Constants
+// =============================================================================
+
+/**
+ * AICore register bitmap buffer length
+ * Used for querying valid AICore cores via HAL API
+ */
+constexpr uint8_t PLATFORM_AICORE_BITMAP_LEN = 2;
+
+/**
+ * Number of sub-cores per AICore
+ * Hardware architecture: 1 AICore = 1 AIC + 2 AIV sub-cores
+ */
+constexpr uint32_t PLATFORM_SUB_CORES_PER_AICORE = PLATFORM_CORES_PER_BLOCKDIM;
+
+/**
+ * Maximum physical AICore count for DAV 2201 chip
+ */
+namespace DAV_2201 {
+constexpr uint32_t PLATFORM_MAX_PHYSICAL_CORES = 25;
+}
+
 #endif  // PLATFORM_COMMON_PLATFORM_CONFIG_H_
diff --git a/src/platform/include/host/host_regs.h b/src/platform/include/host/host_regs.h
new file mode 100644
index 00000000..655f19f3
--- /dev/null
+++ b/src/platform/include/host/host_regs.h
@@ -0,0 +1,42 @@
+/**
+ * @file host_regs.h
+ * @brief AICore register address retrieval via CANN HAL APIs
+ *
+ * Provides register base addresses for AICPU to perform MMIO-based
+ * task dispatch to AICore cores.
+ */
+
+#ifndef PLATFORM_HOST_HOST_REGS_H_
+#define PLATFORM_HOST_HOST_REGS_H_
+
+#include <cstdint>
+#include <vector>
+
+// Forward declaration
+class MemoryAllocator;
+
+/**
+ * Get AICore register base addresses for all cores
+ *
+ * @param regs Output vector (AIC cores followed by AIV cores)
+ * @param device_id Device ID
+ */
+void get_aicore_regs(std::vector<int64_t>& regs, uint64_t device_id);
+
+/**
+ * Initialize AICore register addresses for runtime
+ *
+ * Retrieves register addresses from HAL, allocates device memory,
+ * copies addresses to device, and stores the device pointer in runtime.
+ *
+ * @param runtime_regs_ptr Pointer to runtime.regs field
+ * @param device_id Device ID
+ * @param allocator Memory allocator for device memory
+ * @return 0 on success, negative on failure
+ */
+int init_aicore_register_addresses(
+    uint64_t* runtime_regs_ptr,
+    uint64_t device_id,
+    MemoryAllocator& allocator);
+
+#endif  // PLATFORM_HOST_HOST_REGS_H_
diff --git a/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp b/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp
index f002616b..b4c33b22 100644
--- a/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp
+++ b/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp
@@ -46,7 +46,7 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task*
     kernel(reinterpret_cast<__gm__ int64_t*>(task->args));
 }
 
-__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) {
+__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) {
     __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]);
 
     // Phase 1: Wait for AICPU initialization signal
diff --git a/src/runtime/aicpu_build_graph/runtime/runtime.h b/src/runtime/aicpu_build_graph/runtime/runtime.h
index 9ccee95d..18812e89 100644
--- a/src/runtime/aicpu_build_graph/runtime/runtime.h
+++ b/src/runtime/aicpu_build_graph/runtime/runtime.h
@@ -115,6 +115,7 @@ struct Handshake {
     volatile CoreType core_type;    // Core type: CoreType::AIC or CoreType::AIV
     volatile uint64_t perf_records_addr; // Performance records address
     volatile uint32_t perf_buffer_status; // 0 = not full, 1 == full
+    volatile uint32_t physical_core_id;     // Physical core ID
 } __attribute__((aligned(64)));
 
 /**
@@ -245,6 +246,9 @@ class Runtime {
     // Execution parameters for AICPU scheduling
     int sche_cpu_num;  // Number of AICPU threads for scheduling
 
+    // Register-based task dispatch (unused in aicpu_build_graph, but required by platform)
+    uint64_t regs{0};  // Device memory pointer to register address array
+
     /**
      * Orchestration payload (auto-populated by init_runtime_impl, consumed by AICPU orchestration).
      *
diff --git a/src/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/runtime/host_build_graph/aicore/aicore_executor.cpp
index 9781bec4..87e4f798 100644
--- a/src/runtime/host_build_graph/aicore/aicore_executor.cpp
+++ b/src/runtime/host_build_graph/aicore/aicore_executor.cpp
@@ -1,7 +1,7 @@
 #include "aicore/aicore.h"
 #include "runtime.h"
 #include "common/perf_profiling.h"
-#include "common/memory_barrier.h"
+#include "common/platform_config.h"  // Platform configuration (C/C++ compatible)
 
 typedef void (*KernelFunc)(__gm__ int64_t*);
 
@@ -28,23 +28,19 @@ __aicore__ __attribute__((always_inline)) static void record_task_performance(
     CoreType core_type,
     uint64_t kernel_ready_time) {
 
-    // Check if buffer is available for writing
+    // dcci() for handshake visibility during profiling
+    dcci((__gm__ uint32_t*)&my_hank->perf_buffer_status, SINGLE_CACHE_LINE, CACHELINE_OUT);
+
     if (my_hank->perf_buffer_status != 0) {
-        return;  // Buffer full, skip recording
+        return;
     }
 
-    // Get current performance buffer pointer
     __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr;
-
-    // Get current count
     uint32_t idx = perf_buf->count;
 
-    // Check if buffer has space
     if (idx < PLATFORM_PROF_BUFFER_SIZE) {
-        // Get pointer to the record slot
         __gm__ PerfRecord* record = (__gm__ PerfRecord*)&perf_buf->records[idx];
 
-        // Write record data (only essential fields, fanout filled by AICPU)
         record->start_time = start_time;
         record->end_time = end_time;
         record->kernel_ready_time = kernel_ready_time;
@@ -53,20 +49,16 @@ __aicore__ __attribute__((always_inline)) static void record_task_performance(
         record->core_id = block_idx;
         record->core_type = core_type;
 
-        // Increment count after writing record
         perf_buf->count = idx + 1;
-
-        // Write memory barrier: ensure performance data is visible to Host
-        wmb();
-
-        // Check if buffer is full after this write
+        dcci(record, ENTIRE_DATA_CACHE, CACHELINE_OUT);
         if (perf_buf->count >= PLATFORM_PROF_BUFFER_SIZE) {
-            my_hank->perf_buffer_status = 1;  // Notify AICPU: buffer full
+            my_hank->perf_buffer_status = 1;
         }
     } else {
-        // Buffer is already full
         my_hank->perf_buffer_status = 1;
     }
+    // dcci() for handshake visibility during profiling
+    dcci((__gm__ uint32_t*)&my_hank->perf_buffer_status, SINGLE_CACHE_LINE, CACHELINE_OUT);
 }
 
 __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* task) {
@@ -75,9 +67,12 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task*
     }
     KernelFunc kernel = (KernelFunc)task->function_bin_addr;
     kernel(reinterpret_cast<__gm__ int64_t*>(task->args));
+
+    // Ensure all memory writes are visible to other cores
+    pipe_barrier(PIPE_ALL);
 }
 
-__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) {
+__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) {
     __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]);
 
     // Phase 1: Wait for AICPU initialization signal
@@ -85,51 +80,45 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in
         dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT);
     }
 
-    // Phase 2: Signal AICore is ready and report core type
-    my_hank->core_type = core_type;        // Report core type to AICPU
-    my_hank->aicore_done = block_idx + 1;  // Signal ready (use block_idx + 1 to avoid 0)
+    // Report physical core ID and core type for AICPU
+    my_hank->physical_core_id = physical_core_id;
+    my_hank->core_type = core_type;
+    my_hank->aicore_done = block_idx + 1;
+
+    dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT);
+
+    // Report initial idle status for task dispatch
+    write_reg(RegId::COND, static_cast<uint64_t>(AICoreStatus::IDLE));
 
-    // Check if profiling is enabled
     bool profiling_enabled = runtime->enable_profiling;
+    uint64_t kernel_ready_time = get_sys_cnt();
 
-    // Record kernel ready time (before entering main loop)
-    // This timestamp represents when the AICore is ready to execute tasks
-    // but hasn't started executing any task yet.
-    // Used for: 1) Startup overhead analysis, 2) Cross-core time alignment
-    uint64_t kernel_ready_time = 0;
-    if (profiling_enabled) {
-        kernel_ready_time = get_sys_cnt();
-    }
+    // Main loop: poll DATA_MAIN_BASE for task_id
+    volatile uint32_t task_id = 0;
+    volatile uint32_t last_task_id = 0;
 
-    // Phase 3: Main execution loop - poll for tasks until quit signal
     while (true) {
-        dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT);
-
-        // Check for quit command from AICPU
-        if (my_hank->control == 1) {
-            break;  // Exit kernel
+        task_id = static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE));
+        if (task_id == AICORE_EXIT_SIGNAL) {
+            break;
         }
 
-        // Execute task if assigned (task != 0 means valid task pointer)
-        if (my_hank->task_status == 1 && my_hank->task != 0) {
-            __gm__ Task* task_ptr = reinterpret_cast<__gm__ Task*>(my_hank->task);
-
-            // Performance profiling: record start time
-            uint64_t start_time = 0;
-            start_time = get_sys_cnt();
+        // Execute task if new (task_id encoding: 0=idle, task_id+1=task)
+        if (task_id != 0 && task_id != last_task_id) {
+            write_reg(RegId::COND, static_cast<uint64_t>(AICoreStatus::BUSY));
+            __gm__ Task* task_ptr = &(runtime->tasks[task_id - 1]);
+            uint64_t start_time = get_sys_cnt();
             
-
-            // Execute the task
             execute_task(task_ptr);
 
-            // Performance profiling: record task execution
             if (profiling_enabled) {
                 uint64_t end_time = get_sys_cnt();
-                record_task_performance(my_hank, task_ptr, start_time, end_time, block_idx, core_type, kernel_ready_time);
+                record_task_performance(my_hank, task_ptr, start_time, end_time,
+                                      block_idx, core_type, kernel_ready_time);
             }
 
-            // Mark task as complete (task_status: 0=idle, 1=busy)
-            my_hank->task_status = 0;
+            last_task_id = task_id;
+            write_reg(RegId::COND, static_cast<uint64_t>(AICoreStatus::IDLE));
         }
     }
 }
diff --git a/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp
index 4b378fa0..daf4f62c 100644
--- a/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp
+++ b/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp
@@ -9,15 +9,19 @@
 #include "runtime.h"
 #include "aicpu/device_log.h"
 #include "inner_aicpu.h"
+#include "aicpu/aicpu_regs.h"  // Register-based communication
 
 constexpr int MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;
 constexpr int MAX_AIC_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD;
 constexpr int MAX_AIV_PER_THREAD = PLATFORM_MAX_AIV_PER_THREAD;
 constexpr int MAX_CORES_PER_THREAD = PLATFORM_MAX_CORES_PER_THREAD;
+constexpr int MAX_CORES = PLATFORM_MAX_CORES;
 
 // Core information for discovery
 struct CoreInfo {
-    int worker_id;     // Index in runtime.workers[]
+    int worker_id;              // Index in runtime.workers[]
+    uint32_t physical_core_id;  // Hardware physical core ID (from AICore)
+    uint64_t reg_addr;          // Cached register address for fast access
     CoreType core_type;
 };
 
@@ -40,6 +44,12 @@ struct AicpuExecutor {
     int aic_count_{0};
     int aiv_count_{0};
 
+    // Fast lookup: core_id -> reg_addr
+    uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD];
+
+    // Track executing task_id per core (-1 = idle)
+    int executing_task_ids_[MAX_CORES];
+
     // ===== Task queue state =====
     std::mutex ready_queue_aic_mutex_;
     int ready_queue_aic_[RUNTIME_MAX_TASKS];
@@ -122,6 +132,11 @@ int AicpuExecutor::init(Runtime* runtime) {
 
     LOG_INFO("Config: threads=%d, cores=%d, cores_per_thread=%d", thread_num_, cores_total_num_, thread_cores_num_);
 
+    // Initialize executing_task_ids_ to -1 (idle)
+    for (int i = 0; i < MAX_CORES; i++) {
+        executing_task_ids_[i] = -1;
+    }
+
     // Assign discovered cores to threads
     assign_cores_to_threads();
 
@@ -228,24 +243,39 @@ int AicpuExecutor::handshake_all_cores(Runtime* runtime) {
             // Busy wait for core response
         }
 
-        // Read core type (written by AICore during handshake)
         CoreType type = hank->core_type;
+        uint32_t physical_core_id = hank->physical_core_id;
+
+        // Get register address using physical_core_id
+        uint64_t* regs = reinterpret_cast<uint64_t*>(runtime->regs);
+        uint64_t reg_addr = regs[physical_core_id];
 
-        // Classify and store core information
         if (type == CoreType::AIC) {
             aic_cores_[aic_count_].worker_id = i;
+            aic_cores_[aic_count_].physical_core_id = physical_core_id;
+            aic_cores_[aic_count_].reg_addr = reg_addr;
             aic_cores_[aic_count_].core_type = type;
             aic_count_++;
         } else if (type == CoreType::AIV) {
             aiv_cores_[aiv_count_].worker_id = i;
+            aiv_cores_[aiv_count_].physical_core_id = physical_core_id;
+            aiv_cores_[aiv_count_].reg_addr = reg_addr;
             aiv_cores_[aiv_count_].core_type = type;
             aiv_count_++;
         } else {
-            LOG_ERROR("Unknown core type %d for core %d", static_cast<int>(type), i);
+            LOG_ERROR("Unknown core type from core %d", i);
             return -1;
         }
 
-        LOG_INFO("  Core %d: type=%s", i, core_type_to_string(type));
+        core_id_to_reg_addr_[i] = reg_addr;
+
+        LOG_INFO("  Core %d: type=%s, physical_id=%u, reg_addr=0x%lx",
+                 i, core_type_to_string(type), physical_core_id, reg_addr);
+
+        if (reg_addr != 0) {
+            write_reg(reg_addr, RegId::FAST_PATH_ENABLE, REG_SPR_FAST_PATH_OPEN);
+            write_reg(reg_addr, RegId::DATA_MAIN_BASE, 0);
+        }
     }
 
     LOG_INFO("Discovery complete: AIC=%d, AIV=%d, Total=%d", aic_count_, aiv_count_, cores_total_num_);
@@ -340,7 +370,14 @@ int AicpuExecutor::shutdown_aicore(Runtime* runtime, int thread_idx, const int*
         int core_id = cur_thread_cores[i];
         Handshake* hank = &all_hanks[core_id];
         LOG_INFO("Thread %d: AICPU hank addr = 0x%lx", thread_idx, (uint64_t)hank);
-        hank->control = 1;
+
+        uint64_t reg_addr = core_id_to_reg_addr_[core_id];
+        if (reg_addr != 0) {
+            write_reg(reg_addr, RegId::DATA_MAIN_BASE, AICORE_EXIT_SIGNAL);
+            write_reg(reg_addr, RegId::FAST_PATH_ENABLE, REG_SPR_FAST_PATH_CLOSE);
+        } else {
+            LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id);
+        }
     }
     LOG_INFO("Thread %d: Shutdown complete", thread_idx);
     return 0;
@@ -377,15 +414,17 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
 
             for (int i = 0; i < core_num; i++) {
                 int core_id = cur_thread_cores[i];
-                Handshake* h = &hank[core_id];
 
-                if (h->task_status != 0 || h->task != 0) {
+                uint64_t reg_addr = core_id_to_reg_addr_[core_id];
+                AICoreStatus status = static_cast<AICoreStatus>(read_reg(reg_addr, RegId::COND));
+
+                if (status != AICoreStatus::IDLE || executing_task_ids_[core_id] >= 0) {
                     all_cores_idle = false;
 
                     if (verification_warning_count == 0) {
-                        LOG_WARN("Thread %d: Counter reached %d/%d but core %d still has work (status=%d, task=%p)",
+                        LOG_WARN("Thread %d: Counter reached %d/%d but core %d still has work (COND=%d, task_id=%d)",
                                 thread_idx, completed_tasks_.load(std::memory_order_acquire), task_count,
-                                core_id, h->task_status, (void*)h->task);
+                                core_id, static_cast<uint32_t>(status), executing_task_ids_[core_id]);
                     }
                     break;
                 }
@@ -417,15 +456,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
         // Phase 1: Process completed tasks on my managed cores
         for (int i = 0; i < core_num; i++) {
             int core_id = cur_thread_cores[i];
-            Handshake* h = &hank[core_id];
-
-            // Core finished a task (idle + task not null)
-            if (h->task_status == 0 && h->task != 0) {
-                // Get completed task pointer before any buffer operations
-                Task* task = reinterpret_cast<Task*>(h->task);
-                int completed_task_id = task->task_id;
-
-                // Write AICPU dispatch/finish timestamps into the PerfRecord
+            uint64_t reg_addr = core_id_to_reg_addr_[core_id];
+            AICoreStatus status = static_cast<AICoreStatus>(read_reg(reg_addr, RegId::COND));
+            if (status == AICoreStatus::IDLE && executing_task_ids_[core_id] >= 0) {
+                int task_id = executing_task_ids_[core_id];
+                int completed_task_id = task_id;
+                Handshake* h = &hank[core_id];
                 if (profiling_enabled) {
                     uint64_t finish_ts = get_sys_cnt_aicpu();
                     PerfBuffer* perf_buf = (PerfBuffer*)h->perf_records_addr;
@@ -445,10 +481,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
                 if (profiling_enabled && h->perf_buffer_status == 1) {
                     switch_perf_buffer(&runtime, core_id, thread_idx);
                 }
-                // Clear the task pointer
-                h->task = 0;
 
-                int task_id = completed_task_id;
+                Task* task = runtime.get_task(task_id);
 
                 LOG_INFO("Thread %d: Core %d completed task %d", thread_idx, core_id, task_id);
 
@@ -484,10 +518,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
                     }
                 }
 
-                // Update counters
+                // Update counters and clear task tracking
                 cur_thread_tasks_in_flight--;
                 cur_thread_completed++;
                 made_progress = true;
+                // Clear task_id
+                executing_task_ids_[core_id] = -1;
                 completed_tasks_.fetch_add(1, std::memory_order_release);
             }
         }
@@ -497,10 +533,11 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
             // Phase 2: Dispatch new tasks from matching ready queue to idle cores
             for (int i = 0; i < core_num; i++) {
                 int core_id = cur_thread_cores[i];
-                Handshake* h = &hank[core_id];
+                uint64_t reg_addr = core_id_to_reg_addr_[core_id];
+                AICoreStatus status = static_cast<AICoreStatus>(read_reg(reg_addr, RegId::COND));
 
-                // Core is idle and available (idle + task is null)
-                if (h->task_status == 0 && h->task == 0) {
+                if (status == AICoreStatus::IDLE && executing_task_ids_[core_id] == -1) {
+                    Handshake* h = &hank[core_id];
                     // Dispatch from matching queue based on core type
                     if (h->core_type == CoreType::AIC) {  // AIC core
                         if (ready_count_aic_.load(std::memory_order_acquire) > 0) {
@@ -511,16 +548,24 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
                                 int task_id = ready_queue_aic_[ready_queue_aic_head_];
                                 ready_queue_aic_head_ = (ready_queue_aic_head_ + 1) % RUNTIME_MAX_TASKS;
                                 ready_count_aic_.fetch_sub(1, std::memory_order_release);
-                                Task* task = runtime.get_task(task_id);
 
                                 LOG_INFO("Thread %d: Dispatching AIC task %d to core %d (head=%d)",
                                          thread_idx, task_id, core_id, ready_queue_aic_head_);
 
-                                h->task = reinterpret_cast<uint64_t>(task);
+                                uint64_t reg_addr = core_id_to_reg_addr_[core_id];
+
+                                // Pre-set COND=BUSY before writing task_id to prevent
+                                // false completion detection (AICPU seeing stale IDLE
+                                // before AICore has started the task)
+                                write_reg(reg_addr, RegId::COND, static_cast<uint64_t>(AICoreStatus::BUSY));
+
+                                // Write task_id+1 to register
+                                write_reg(reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(task_id + 1));
+
                                 if (runtime.enable_profiling) {
                                     dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                                 }
-                                h->task_status = 1;  // Mark as busy
+                                executing_task_ids_[core_id] = task_id;
                                 cur_thread_tasks_in_flight++;
                                 made_progress = true;
                             }
@@ -534,16 +579,24 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
                                 int task_id = ready_queue_aiv_[ready_queue_aiv_head_];
                                 ready_queue_aiv_head_ = (ready_queue_aiv_head_ + 1) % RUNTIME_MAX_TASKS;
                                 ready_count_aiv_.fetch_sub(1, std::memory_order_release);
-                                Task* task = runtime.get_task(task_id);
 
                                 LOG_INFO("Thread %d: Dispatching AIV task %d to core %d (head=%d)",
                                          thread_idx, task_id, core_id, ready_queue_aiv_head_);
 
-                                h->task = reinterpret_cast<uint64_t>(task);
+                                uint64_t reg_addr = core_id_to_reg_addr_[core_id];
+
+                                // Pre-set COND=BUSY before writing task_id to prevent
+                                // false completion detection (AICPU seeing stale IDLE
+                                // before AICore has started the task)
+                                write_reg(reg_addr, RegId::COND, static_cast<uint64_t>(AICoreStatus::BUSY));
+
+                                // Write task_id+1 to register
+                                write_reg(reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(task_id + 1));
+
                                 if (runtime.enable_profiling) {
                                     dispatch_timestamps_[core_id] = get_sys_cnt_aicpu();
                                 }
-                                h->task_status = 1;  // Mark as busy
+                                executing_task_ids_[core_id] = task_id;
                                 cur_thread_tasks_in_flight++;
                                 made_progress = true;
                             }
@@ -656,7 +709,6 @@ void AicpuExecutor::diagnose_stuck_state(Runtime& runtime, int thread_idx,
 
     int busy_cores = 0;
     int idle_cores = 0;
-    int anomaly_cores = 0;
 
     LOG_ERROR("Core Status:");
     for (int i = 0; i < core_num; i++) {
@@ -665,24 +717,30 @@ void AicpuExecutor::diagnose_stuck_state(Runtime& runtime, int thread_idx,
 
         const char* core_type_str = core_type_to_string(h->core_type);
 
-        if (h->task != 0) {
-            Task* task = reinterpret_cast<Task*>(h->task);
+        uint64_t reg_addr = core_id_to_reg_addr_[core_id];
+        AICoreStatus status = static_cast<AICoreStatus>(read_reg(reg_addr, RegId::COND));
+
+        if (status != AICoreStatus::IDLE) {
             busy_cores++;
 
-            LOG_ERROR("  Core %d [%s, BUSY]: task_id=%d, func_id=%d, fanin=%d, fanout=%d",
-                     core_id, core_type_str,
-                     task->task_id, task->func_id,
-                     task->fanin.load(std::memory_order_acquire),
-                     task->fanout_count);
-        } else if (h->task_status != 0) {
-            anomaly_cores++;
-            LOG_ERROR("  Core %d [%s, ANOMALY]: status=BUSY but task=NULL", core_id, core_type_str);
+            int task_id = executing_task_ids_[core_id];
+            if (task_id >= 0) {
+                Task* task = runtime.get_task(task_id);
+                LOG_ERROR("  Core %d [%s, BUSY]: COND=%d, task_id=%d, func_id=%d, fanin=%d, fanout=%d",
+                         core_id, core_type_str, static_cast<uint32_t>(status),
+                         task->task_id, task->func_id,
+                         task->fanin.load(std::memory_order_acquire),
+                         task->fanout_count);
+            } else {
+                LOG_ERROR("  Core %d [%s, BUSY]: COND=%d but task_id not tracked",
+                         core_id, core_type_str, static_cast<uint32_t>(status));
+            }
         } else {
             idle_cores++;
         }
     }
 
-    LOG_ERROR("Summary: %d busy, %d idle, %d anomaly", busy_cores, idle_cores, anomaly_cores);
+    LOG_ERROR("Summary: %d busy, %d idle", busy_cores, idle_cores);
 
     // Diagnose deadlock vs livelock
     if (busy_cores == 0 && aic_ready == 0 && aiv_ready == 0 && completed < total) {
diff --git a/src/runtime/host_build_graph/runtime/runtime.h b/src/runtime/host_build_graph/runtime/runtime.h
index 919cada9..cfefabd8 100644
--- a/src/runtime/host_build_graph/runtime/runtime.h
+++ b/src/runtime/host_build_graph/runtime/runtime.h
@@ -95,16 +95,20 @@
  * - task_status: Written by both (AICPU=1 on dispatch, AICore=0 on completion)
  * - control: Written by AICPU, read by AICore (0 = continue, 1 = quit)
  * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV)
+ * - perf_records_addr: Written by AICPU, read by AICore (performance records address)
+ * - perf_buffer_status: Written by both (AICPU=1 on buffer full, AICore=0 on buffer empty)
+ * - physical_core_id: Written by AICPU, read by AICore (physical core ID)
  */
 struct Handshake {
-    volatile uint32_t aicpu_ready;  // AICPU ready signal: 0=not ready, 1=ready
-    volatile uint32_t aicore_done;  // AICore ready signal: 0=not ready, core_id+1=ready
-    volatile uint64_t task;         // Task pointer: 0=no task, non-zero=Task* address
-    volatile int32_t task_status;   // Task execution status: 0=idle, 1=busy
-    volatile int32_t control;       // Control signal: 0=execute, 1=quit
-    volatile CoreType core_type;    // Core type: CoreType::AIC or CoreType::AIV
-    volatile uint64_t perf_records_addr; // Performance records address
-    volatile uint32_t perf_buffer_status; // 0 = not full, 1 == full
+    volatile uint32_t aicpu_ready;          // AICPU ready signal: 0=not ready, 1=ready
+    volatile uint32_t aicore_done;          // AICore ready signal: 0=not ready, core_id+1=ready
+    volatile uint64_t task;                 // Task pointer: 0=no task, non-zero=Task* address
+    volatile int32_t task_status;           // Task execution status: 0=idle, 1=busy
+    volatile int32_t control;               // Control signal: 0=execute, 1=quit
+    volatile CoreType core_type;            // Core type: CoreType::AIC or CoreType::AIV
+    volatile uint64_t perf_records_addr;    // Performance records address
+    volatile uint32_t perf_buffer_status;   // 0 = not full, 1 = full
+    volatile uint32_t physical_core_id;     // Physical core ID
 } __attribute__((aligned(64)));
 
 /**
@@ -182,12 +186,17 @@ class Runtime {
     // Execution parameters for AICPU scheduling
     int sche_cpu_num;  // Number of AICPU threads for scheduling
 
+    // Register-based task dispatch
+    uint64_t regs{0};  // Device memory pointer to register address array
+
     // Profiling support
     bool enable_profiling;                  // Enable profiling flag
     uint64_t perf_data_base;                // Performance data shared memory base address (device-side)
-private:
+
     // Task storage
     Task tasks[RUNTIME_MAX_TASKS];  // Fixed-size task array
+
+private:
     int next_task_id;               // Next available task ID
 
     // Initial ready tasks (computed once, read-only after)
diff --git a/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
index 91f0e393..09a0c599 100644
--- a/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
@@ -111,8 +111,9 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ void*
  * @param runtime Pointer to Runtime in global memory
  * @param block_idx Block index (core ID)
  * @param core_type Core type (AIC or AIV)
+ * @param physical_core_id Physical core ID from hardware
  */
-__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) {
+__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) {
     __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]);
 
     // Phase 1: Wait for AICPU initialization signal
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 20e8183b..8b552aff 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -80,6 +80,7 @@ struct Handshake {
     volatile CoreType core_type;           // Core type: CoreType::AIC or CoreType::AIV
     volatile uint64_t perf_records_addr;   // Performance records address
     volatile uint32_t perf_buffer_status;  // 0 = not full, 1 == full
+    volatile uint32_t physical_core_id;     // Physical core ID
 } __attribute__((aligned(64)));
 
 /**
@@ -136,6 +137,9 @@ class Runtime {
     // Execution parameters for AICPU scheduling
     int sche_cpu_num;  // Number of AICPU threads for scheduling
 
+    // Register-based task dispatch (unused in tensormap_and_ringbuffer, but required by platform)
+    uint64_t regs{0};  // Device memory pointer to register address array
+
     // PTO2 integration: kernel_id -> GM function_bin_addr mapping
     // NOTE: Made public for direct access from aicore code
     uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];