diff --git a/src/platform/a2a3/aicore/inner_kernel.h b/src/platform/a2a3/aicore/inner_kernel.h index fa31ee3f..e7f0e517 100644 --- a/src/platform/a2a3/aicore/inner_kernel.h +++ b/src/platform/a2a3/aicore/inner_kernel.h @@ -9,6 +9,9 @@ #ifndef PLATFORM_A2A3_AICORE_INNER_KERNEL_H_ #define PLATFORM_A2A3_AICORE_INNER_KERNEL_H_ +#include +#include "common/platform_config.h" + // AICore function attribute for CANN compiler #ifndef __aicore__ #define __aicore__ [aicore] @@ -17,4 +20,43 @@ // dcci (Data Cache Clean and Invalidate) is provided by CANN headers // No need to define it here - it's a hardware instruction +// AICoreStatus is defined in aicpu/aicpu_regs.h for AICPU side. +// Redeclare here for AICore side (avoids pulling in AICPU headers). +enum class AICoreStatus : uint32_t { + IDLE = 0, + BUSY = 1, +}; + +/** + * Read an AICore register via SPR access + * + * @param reg Register identifier + * @return Register value (zero-extended to uint64_t) + */ +__aicore__ inline uint64_t read_reg(RegId reg) { + switch (reg) { + case RegId::DATA_MAIN_BASE: { + uint32_t val; + __asm__ volatile("MOV %0, DATA_MAIN_BASE\n" : "=l"(val)); + return static_cast(val); + } + default: return 0; + } +} + +/** + * Write to an AICore register + * + * @param reg Register identifier + * @param value Value to write + */ +__aicore__ inline void write_reg(RegId reg, uint64_t value) { + switch (reg) { + case RegId::COND: + set_cond(static_cast(static_cast(value))); + break; + default: break; + } +} + #endif // PLATFORM_A2A3_AICORE_INNER_KERNEL_H_ diff --git a/src/platform/a2a3/aicore/kernel.cpp b/src/platform/a2a3/aicore/kernel.cpp index b17615e9..8e1f641c 100644 --- a/src/platform/a2a3/aicore/kernel.cpp +++ b/src/platform/a2a3/aicore/kernel.cpp @@ -23,7 +23,7 @@ class Runtime; [[block_local]] int block_idx; [[block_local]] CoreType core_type; -extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type); +extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id); /** * Kernel entry point with control loop @@ -49,5 +49,6 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ Runtime block_idx = get_block_idx(); core_type = CoreType::AIC; #endif - aicore_execute(runtime, block_idx, core_type); + uint32_t physical_core_id = static_cast(get_coreid()) & AICORE_COREID_MASK; + aicore_execute(runtime, block_idx, core_type, physical_core_id); } diff --git a/src/platform/a2a3/aicpu/aicpu_regs.cpp b/src/platform/a2a3/aicpu/aicpu_regs.cpp new file mode 100644 index 00000000..846153e9 --- /dev/null +++ b/src/platform/a2a3/aicpu/aicpu_regs.cpp @@ -0,0 +1,23 @@ +/** + * @file aicpu_regs.cpp + * @brief AICPU-side register access implementation (a2a3 real hardware) + * + * Uses volatile MMIO pointer access with memory barriers for + * cross-core register communication. + */ + +#include "aicpu/aicpu_regs.h" + +uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { + volatile uint32_t* ptr = reinterpret_cast( + reg_base_addr + reg_offset(reg)); + __sync_synchronize(); + return static_cast(*ptr); +} + +void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { + volatile uint32_t* ptr = reinterpret_cast( + reg_base_addr + reg_offset(reg)); + *ptr = static_cast(value); + __sync_synchronize(); +} diff --git a/src/platform/a2a3/host/CMakeLists.txt b/src/platform/a2a3/host/CMakeLists.txt index 85e28645..7dfabdb1 100644 --- a/src/platform/a2a3/host/CMakeLists.txt +++ b/src/platform/a2a3/host/CMakeLists.txt @@ -24,6 +24,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/performance_collector.cpp" diff --git a/src/platform/a2a3/host/device_runner.cpp b/src/platform/a2a3/host/device_runner.cpp index a5ece59e..ec92aac1 100644 --- a/src/platform/a2a3/host/device_runner.cpp +++ b/src/platform/a2a3/host/device_runner.cpp @@ -11,6 +11,7 @@ // Include HAL constants from CANN (header only, library loaded dynamically) #include "ascend_hal.h" +#include "host/host_regs.h" // Register address retrieval // ============================================================================= // Lazy-loaded HAL (ascend_hal) for profiling host-register only @@ -324,6 +325,13 @@ int DeviceRunner::run(Runtime& runtime, worker_count_ = num_aicore; // Store for print_handshake_results in destructor runtime.sche_cpu_num = launch_aicpu_num; + // Get AICore register addresses for register-based task dispatch + rc = init_aicore_register_addresses(&runtime.regs, static_cast(device_id), mem_alloc_); + if (rc != 0) { + LOG_ERROR("init_aicore_register_addresses failed: %d", rc); + return rc; + } + // Calculate number of AIC cores (1/3 of total) int num_aic = block_dim; // Round up for 1/3 diff --git a/src/platform/a2a3/host/host_regs.cpp b/src/platform/a2a3/host/host_regs.cpp new file mode 100644 index 00000000..da614fec --- /dev/null +++ b/src/platform/a2a3/host/host_regs.cpp @@ -0,0 +1,178 @@ +/** + * @file host_regs.cpp + * @brief Host-side AICore register address retrieval implementation + */ + +#include "host/host_regs.h" +#include "host/memory_allocator.h" +#include "common/unified_log.h" +#include "common/platform_config.h" +#include "runtime/rt.h" +#include "ascend_hal.h" // CANN HAL API definitions (MODULE_TYPE_AICORE, INFO_TYPE_OCCUPY, etc.) +#include +#include + +/** + * Query valid AICore cores via HAL API + */ +static bool get_pg_mask(uint64_t& valid, int64_t device_id) { + uint64_t aicore_bitmap[PLATFORM_AICORE_BITMAP_LEN] = {0}; + int32_t size_n = static_cast(sizeof(uint64_t)) * PLATFORM_AICORE_BITMAP_LEN; + + auto halFuncDevInfo = + (int (*)(uint64_t deviceId, int32_t moduleType, int32_t infoType, void* buf, int32_t* size))dlsym( + nullptr, "halGetDeviceInfoByBuff"); + + if (halFuncDevInfo == nullptr) { + LOG_WARN("halGetDeviceInfoByBuff not found, assuming all cores valid"); + return false; + } + + auto ret = halFuncDevInfo(static_cast(device_id), + MODULE_TYPE_AICORE, + INFO_TYPE_OCCUPY, + reinterpret_cast(&aicore_bitmap[0]), + &size_n); + + if (ret != 0) { + LOG_ERROR("halGetDeviceInfoByBuff failed with rc=%d", ret); + return false; + } + + valid = aicore_bitmap[0]; + return true; +} + +/** + * Retrieve AICore register base addresses via HAL API + */ +static int get_aicore_reg_info(std::vector& aic, std::vector& aiv, + const int& addr_type, int64_t device_id) { + uint64_t valid = 0; + if (!get_pg_mask(valid, device_id)) { + // If can't get mask, assume all cores valid + valid = 0xFFFFFFFF; + LOG_WARN("Using default valid mask 0xFFFFFFFF"); + } + + uint64_t core_stride = 8 * 1024 * 1024; // 8M + uint64_t sub_core_stride = 0x100000ULL; + + auto is_valid = [&valid](int id) { + const uint64_t mask = (1ULL << 25) - 1; + return ((static_cast(valid) ^ mask) & (1ULL << id)) == 0; + }; + + auto halFunc = + (int (*)(int type, void* paramValue, size_t paramValueSize, void* outValue, size_t* outSizeRet))dlsym( + nullptr, "halMemCtl"); + + if (halFunc == nullptr) { + LOG_ERROR("halMemCtl not found in symbol table"); + return -1; + } + + struct AddrMapInPara in_map_para; + struct AddrMapOutPara out_map_para; + in_map_para.devid = device_id; + in_map_para.addr_type = addr_type; + + auto ret = halFunc(0, + reinterpret_cast(&in_map_para), + sizeof(struct AddrMapInPara), + reinterpret_cast(&out_map_para), + nullptr); + + if (ret != 0) { + LOG_ERROR("halMemCtl failed with rc=%d", ret); + return ret; + } + + LOG_INFO("Register base: ptr=0x%llx, len=0x%llx", out_map_para.ptr, out_map_para.len); + + // Iterate over all cores and subcores + for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) { + for (uint32_t j = 0; j < PLATFORM_SUB_CORES_PER_AICORE; j++) { + uint64_t vaddr = 0UL; + if (is_valid(i)) { + vaddr = out_map_para.ptr + (i * core_stride + j * sub_core_stride); + } + if (j == 0) { + aic.push_back(vaddr); + } else { + aiv.push_back(vaddr); + } + } + } + + return 0; +} + +void get_aicore_regs(std::vector& regs, uint64_t device_id) { + std::vector aiv; + std::vector aic; + + int rt = get_aicore_reg_info(aic, aiv, ADDR_MAP_TYPE_REG_AIC_CTRL, device_id); + + if (rt != 0) { + LOG_ERROR("get_aicore_reg_info failed, using placeholder addresses"); + // Fallback: generate placeholder addresses + for (int i = 0; i < 25; i++) { + aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000)); // 8M stride + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000); + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000); + } + } + + // AIC cores first, then AIV cores + regs.insert(regs.end(), aic.begin(), aic.end()); + regs.insert(regs.end(), aiv.begin(), aiv.end()); + + LOG_INFO("get_aicore_regs: Retrieved %zu AIC and %zu AIV register addresses", + aic.size(), aiv.size()); +} + +int init_aicore_register_addresses( + uint64_t* runtime_regs_ptr, + uint64_t device_id, + MemoryAllocator& allocator) { + + if (runtime_regs_ptr == nullptr) { + LOG_ERROR("init_aicore_register_addresses: Invalid parameters"); + return -1; + } + + LOG_INFO("Retrieving and allocating AICore register addresses..."); + + // Step 1: Get register addresses from HAL + std::vector host_regs; + get_aicore_regs(host_regs, device_id); + + if (host_regs.empty()) { + LOG_ERROR("Failed to get AICore register addresses"); + return -1; + } + + // Step 2: Allocate device memory for register address array + size_t regs_size = host_regs.size() * sizeof(int64_t); + void* reg_ptr = allocator.alloc(regs_size); + if (reg_ptr == nullptr) { + LOG_ERROR("Failed to allocate device memory for register addresses"); + return -1; + } + + // Step 3: Copy register addresses to device memory + int ret = rtMemcpy(reg_ptr, regs_size, host_regs.data(), regs_size, RT_MEMCPY_HOST_TO_DEVICE); + if (ret != 0) { + LOG_ERROR("Failed to copy register addresses to device (rc=%d)", ret); + return -1; + } + + // Step 4: Store device pointer in runtime.regs + *runtime_regs_ptr = reinterpret_cast(reg_ptr); + + LOG_INFO("Successfully initialized register addresses: %zu addresses at device 0x%llx", + host_regs.size(), *runtime_regs_ptr); + + return 0; +} diff --git a/src/platform/a2a3sim/aicore/inner_kernel.h b/src/platform/a2a3sim/aicore/inner_kernel.h index 3a74156e..25cb0762 100644 --- a/src/platform/a2a3sim/aicore/inner_kernel.h +++ b/src/platform/a2a3sim/aicore/inner_kernel.h @@ -24,8 +24,13 @@ // Cache coherency constants (no-op in simulation) #define ENTIRE_DATA_CACHE 0 +#define SINGLE_CACHE_LINE 0 #define CACHELINE_OUT 0 +// pipe_barrier - memory barrier in simulation (hardware pipeline synchronization) +#define PIPE_ALL 0 +#define pipe_barrier(pipe) __sync_synchronize() + // ============================================================================= // System Counter Simulation // ============================================================================= @@ -56,4 +61,49 @@ inline uint64_t get_sys_cnt() { return ticks; } +// ============================================================================= +// Register Access Simulation +// ============================================================================= + +/** + * Per-thread simulated register base address. + * Set by the kernel wrapper before calling aicore_execute(). + * Points to a SIM_REG_BLOCK_SIZE-byte block allocated by DeviceRunner. + */ +extern thread_local volatile uint8_t* g_sim_reg_base; + +/** + * AICore execution status (matches aicpu/aicpu_regs.h definition) + */ +enum class AICoreStatus : uint32_t { + IDLE = 0, + BUSY = 1, +}; + +/** + * Read an AICore register from simulated register memory + * + * @param reg Register identifier + * @return Register value (zero-extended to uint64_t) + */ +inline uint64_t read_reg(RegId reg) { + uint32_t offset = reg_offset(reg); + __sync_synchronize(); + return static_cast( + *reinterpret_cast(g_sim_reg_base + offset)); +} + +/** + * Write to an AICore register in simulated register memory + * + * @param reg Register identifier + * @param value Value to write + */ +inline void write_reg(RegId reg, uint64_t value) { + uint32_t offset = reg_offset(reg); + *reinterpret_cast(g_sim_reg_base + offset) = + static_cast(value); + __sync_synchronize(); +} + #endif // PLATFORM_A2A3SIM_AICORE_INNER_KERNEL_H_ diff --git a/src/platform/a2a3sim/aicore/kernel.cpp b/src/platform/a2a3sim/aicore/kernel.cpp index 46058f2c..8dcbcb9d 100644 --- a/src/platform/a2a3sim/aicore/kernel.cpp +++ b/src/platform/a2a3sim/aicore/kernel.cpp @@ -2,18 +2,30 @@ * AICore Kernel Wrapper for Simulation * * Provides a wrapper around aicore_execute for dlsym lookup. - * This allows adding pre/post processing around kernel execution. + * Sets up per-thread simulated register base before calling the executor. */ +#include #include "aicore/aicore.h" #include "common/core_type.h" +#include "common/platform_config.h" +#include "runtime.h" -class Runtime; +// Thread-local simulated register base (declared in inner_kernel.h) +thread_local volatile uint8_t* g_sim_reg_base = nullptr; // Declare the original function (defined in aicore_executor.cpp with weak linkage) -void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type); +void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id); // Wrapper with extern "C" for dlsym lookup -extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { - aicore_execute(runtime, block_idx, core_type); +extern "C" void aicore_execute_wrapper(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { + // Set up simulated register base for this thread. + // runtime->regs points to an array of uint64_t base addresses (one per core). + // physical_core_id indexes into it to get this core's register block. + if (runtime->regs != 0) { + uint64_t* regs_array = reinterpret_cast(runtime->regs); + g_sim_reg_base = reinterpret_cast(regs_array[physical_core_id]); + } + + aicore_execute(runtime, block_idx, core_type, physical_core_id); } diff --git a/src/platform/a2a3sim/aicpu/aicpu_regs.cpp b/src/platform/a2a3sim/aicpu/aicpu_regs.cpp new file mode 100644 index 00000000..1f391689 --- /dev/null +++ b/src/platform/a2a3sim/aicpu/aicpu_regs.cpp @@ -0,0 +1,23 @@ +/** + * @file aicpu_regs.cpp + * @brief AICPU-side register access implementation (a2a3sim simulation) + * + * Uses volatile pointer access to host-allocated memory blocks + * that simulate AICore hardware registers. + */ + +#include "aicpu/aicpu_regs.h" + +uint64_t read_reg(uint64_t reg_base_addr, RegId reg) { + volatile uint32_t* ptr = reinterpret_cast( + reg_base_addr + reg_offset(reg)); + __sync_synchronize(); + return static_cast(*ptr); +} + +void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) { + volatile uint32_t* ptr = reinterpret_cast( + reg_base_addr + reg_offset(reg)); + *ptr = static_cast(value); + __sync_synchronize(); +} diff --git a/src/platform/a2a3sim/host/device_runner.cpp b/src/platform/a2a3sim/host/device_runner.cpp index 491b33c8..c9a96550 100644 --- a/src/platform/a2a3sim/host/device_runner.cpp +++ b/src/platform/a2a3sim/host/device_runner.cpp @@ -18,7 +18,7 @@ // Function pointer types for dynamically loaded executors typedef int (*aicpu_execute_func_t)(Runtime* runtime); -typedef void (*aicore_execute_func_t)(Runtime* runtime, int block_idx, CoreType core_type); +typedef void (*aicore_execute_func_t)(Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id); // ============================================================================= // DeviceRunner Implementation @@ -89,7 +89,7 @@ int DeviceRunner::ensure_binaries_loaded(const std::vector& aicpu_so_bi return -1; } - aicore_execute_func_ = reinterpret_cast(dlsym(aicore_so_handle_, "aicore_execute_wrapper")); + aicore_execute_func_ = reinterpret_cast(dlsym(aicore_so_handle_, "aicore_execute_wrapper")); if (aicore_execute_func_ == nullptr) { LOG_ERROR("dlsym failed for aicore_execute_wrapper: %s", dlerror()); return -1; @@ -212,6 +212,30 @@ int DeviceRunner::run(Runtime& runtime, } } + // Allocate simulated register blocks for all AICore cores + size_t total_reg_size = num_aicore * SIM_REG_BLOCK_SIZE; + void* reg_blocks = mem_alloc_.alloc(total_reg_size); + if (reg_blocks == nullptr) { + LOG_ERROR("Failed to allocate simulated register memory (%zu bytes)", total_reg_size); + return -1; + } + std::memset(reg_blocks, 0, total_reg_size); + + // Build array of per-core register base addresses + size_t regs_array_size = num_aicore * sizeof(uint64_t); + uint64_t* regs_array = reinterpret_cast(mem_alloc_.alloc(regs_array_size)); + if (regs_array == nullptr) { + LOG_ERROR("Failed to allocate register address array"); + return -1; + } + for (int i = 0; i < num_aicore; i++) { + regs_array[i] = reinterpret_cast( + static_cast(reg_blocks) + i * SIM_REG_BLOCK_SIZE); + } + runtime.regs = reinterpret_cast(regs_array); + + LOG_INFO("Allocated simulated registers: %d cores x 0x%x bytes", num_aicore, SIM_REG_BLOCK_SIZE); + // Check if executors are loaded if (aicpu_execute_func_ == nullptr || aicore_execute_func_ == nullptr) { LOG_ERROR("Executor functions not loaded. Call ensure_binaries_loaded first."); @@ -232,8 +256,9 @@ int DeviceRunner::run(Runtime& runtime, std::vector aicore_threads; for (int i = 0; i < num_aicore; i++) { CoreType core_type = runtime.workers[i].core_type; - aicore_threads.emplace_back([this, &runtime, i, core_type]() { - aicore_execute_func_(&runtime, i, core_type); + uint32_t physical_core_id = static_cast(i); + aicore_threads.emplace_back([this, &runtime, i, core_type, physical_core_id]() { + aicore_execute_func_(&runtime, i, core_type, physical_core_id); }); } diff --git a/src/platform/a2a3sim/host/device_runner.h b/src/platform/a2a3sim/host/device_runner.h index 3ae8ef34..f169b8f1 100644 --- a/src/platform/a2a3sim/host/device_runner.h +++ b/src/platform/a2a3sim/host/device_runner.h @@ -207,7 +207,7 @@ class DeviceRunner { void* aicpu_so_handle_{nullptr}; void* aicore_so_handle_{nullptr}; int (*aicpu_execute_func_)(Runtime*){nullptr}; - void (*aicore_execute_func_)(Runtime*, int, CoreType){nullptr}; + void (*aicore_execute_func_)(Runtime*, int, CoreType, uint32_t){nullptr}; std::string aicpu_so_path_; std::string aicore_so_path_; diff --git a/src/platform/include/aicpu/aicpu_regs.h b/src/platform/include/aicpu/aicpu_regs.h new file mode 100644 index 00000000..33e5b1ce --- /dev/null +++ b/src/platform/include/aicpu/aicpu_regs.h @@ -0,0 +1,42 @@ +/** + * @file aicpu_regs.h + * @brief AICPU-side register access interface + * + * Provides unified read_reg/write_reg for AICPU to access AICore registers. + * On real hardware (a2a3): MMIO volatile pointer access with memory barriers. + * In simulation (a2a3sim): volatile pointer access to host-allocated memory. + */ + +#ifndef PLATFORM_AICPU_AICPU_REGS_H_ +#define PLATFORM_AICPU_AICPU_REGS_H_ + +#include +#include "common/platform_config.h" + +/** + * AICore execution status (read from COND register) + */ +enum class AICoreStatus : uint32_t { + IDLE = 0, + BUSY = 1, +}; + +/** + * Read a register value from an AICore's register block + * + * @param reg_base_addr Base address of the AICore's register block + * @param reg Register identifier + * @return Register value (zero-extended to uint64_t) + */ +uint64_t read_reg(uint64_t reg_base_addr, RegId reg); + +/** + * Write a value to an AICore's register + * + * @param reg_base_addr Base address of the AICore's register block + * @param reg Register identifier + * @param value Value to write (truncated to register width) + */ +void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value); + +#endif // PLATFORM_AICPU_AICPU_REGS_H_ diff --git a/src/platform/include/common/perf_profiling.h b/src/platform/include/common/perf_profiling.h index e3fe4a47..fb99199d 100644 --- a/src/platform/include/common/perf_profiling.h +++ b/src/platform/include/common/perf_profiling.h @@ -38,11 +38,6 @@ #define RUNTIME_MAX_FANOUT 512 #endif -// Maximum cores that can be profiled simultaneously -#ifndef PLATFORM_MAX_CORES -#define PLATFORM_MAX_CORES 72 // 24 blocks × 3 cores/block -#endif - // ============================================================================= // Buffer Status Enumeration // ============================================================================= diff --git a/src/platform/include/common/platform_config.h b/src/platform/include/common/platform_config.h index 5992f563..bcba2965 100644 --- a/src/platform/include/common/platform_config.h +++ b/src/platform/include/common/platform_config.h @@ -100,4 +100,71 @@ constexpr int PLATFORM_PROF_TIMEOUT_SECONDS = 2; * Number of empty polling iterations before checking timeout */ constexpr int PLATFORM_PROF_EMPTY_POLLS_CHECK_NUM = 1000; + +// ============================================================================= +// Register Communication Configuration +// ============================================================================= + +// Register offsets for AICore SPR access +constexpr uint32_t REG_SPR_DATA_MAIN_BASE_OFFSET = 0xA0; // Task dispatch (AICPU→AICore) +constexpr uint32_t REG_SPR_COND_OFFSET = 0x4C8; // Status (AICore→AICPU): 0=IDLE, 1=BUSY +constexpr uint32_t REG_SPR_FAST_PATH_ENABLE_OFFSET = 0x18; + +// Fast path control values +constexpr uint32_t REG_SPR_FAST_PATH_OPEN = 0xE; +constexpr uint32_t REG_SPR_FAST_PATH_CLOSE = 0xF; + +// Exit signal for AICore shutdown +constexpr uint32_t AICORE_EXIT_SIGNAL = 0x7FFFFFF0; + +// Physical core ID mask for get_coreid() +constexpr uint32_t AICORE_COREID_MASK = 0x0FFF; + +/** + * Register identifier for unified read_reg/write_reg interface + */ +enum class RegId : uint32_t { + DATA_MAIN_BASE = 0, // Task dispatch (AICPU→AICore) + COND = 1, // Status (AICore→AICPU) + FAST_PATH_ENABLE = 2, // Fast path control +}; + +/** + * Map RegId to hardware register offset + */ +constexpr uint32_t reg_offset(RegId reg) { + switch (reg) { + case RegId::DATA_MAIN_BASE: return REG_SPR_DATA_MAIN_BASE_OFFSET; + case RegId::COND: return REG_SPR_COND_OFFSET; + case RegId::FAST_PATH_ENABLE: return REG_SPR_FAST_PATH_ENABLE_OFFSET; + default: return 0; + } +} + +// Size of simulated register block per core (covers largest offset + 4 bytes) +constexpr uint32_t SIM_REG_BLOCK_SIZE = 0x500; + +// ============================================================================= +// Hardware Configuration Constants +// ============================================================================= + +/** + * AICore register bitmap buffer length + * Used for querying valid AICore cores via HAL API + */ +constexpr uint8_t PLATFORM_AICORE_BITMAP_LEN = 2; + +/** + * Number of sub-cores per AICore + * Hardware architecture: 1 AICore = 1 AIC + 2 AIV sub-cores + */ +constexpr uint32_t PLATFORM_SUB_CORES_PER_AICORE = PLATFORM_CORES_PER_BLOCKDIM; + +/** + * Maximum physical AICore count for DAV 2201 chip + */ +namespace DAV_2201 { +constexpr uint32_t PLATFORM_MAX_PHYSICAL_CORES = 25; +} + #endif // PLATFORM_COMMON_PLATFORM_CONFIG_H_ diff --git a/src/platform/include/host/host_regs.h b/src/platform/include/host/host_regs.h new file mode 100644 index 00000000..655f19f3 --- /dev/null +++ b/src/platform/include/host/host_regs.h @@ -0,0 +1,42 @@ +/** + * @file host_regs.h + * @brief AICore register address retrieval via CANN HAL APIs + * + * Provides register base addresses for AICPU to perform MMIO-based + * task dispatch to AICore cores. + */ + +#ifndef PLATFORM_HOST_HOST_REGS_H_ +#define PLATFORM_HOST_HOST_REGS_H_ + +#include +#include + +// Forward declaration +class MemoryAllocator; + +/** + * Get AICore register base addresses for all cores + * + * @param regs Output vector (AIC cores followed by AIV cores) + * @param device_id Device ID + */ +void get_aicore_regs(std::vector& regs, uint64_t device_id); + +/** + * Initialize AICore register addresses for runtime + * + * Retrieves register addresses from HAL, allocates device memory, + * copies addresses to device, and stores the device pointer in runtime. + * + * @param runtime_regs_ptr Pointer to runtime.regs field + * @param device_id Device ID + * @param allocator Memory allocator for device memory + * @return 0 on success, negative on failure + */ +int init_aicore_register_addresses( + uint64_t* runtime_regs_ptr, + uint64_t device_id, + MemoryAllocator& allocator); + +#endif // PLATFORM_HOST_HOST_REGS_H_ diff --git a/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp b/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp index f002616b..b4c33b22 100644 --- a/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp +++ b/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp @@ -46,7 +46,7 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* kernel(reinterpret_cast<__gm__ int64_t*>(task->args)); } -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]); // Phase 1: Wait for AICPU initialization signal diff --git a/src/runtime/aicpu_build_graph/runtime/runtime.h b/src/runtime/aicpu_build_graph/runtime/runtime.h index 9ccee95d..18812e89 100644 --- a/src/runtime/aicpu_build_graph/runtime/runtime.h +++ b/src/runtime/aicpu_build_graph/runtime/runtime.h @@ -115,6 +115,7 @@ struct Handshake { volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV volatile uint64_t perf_records_addr; // Performance records address volatile uint32_t perf_buffer_status; // 0 = not full, 1 == full + volatile uint32_t physical_core_id; // Physical core ID } __attribute__((aligned(64))); /** @@ -245,6 +246,9 @@ class Runtime { // Execution parameters for AICPU scheduling int sche_cpu_num; // Number of AICPU threads for scheduling + // Register-based task dispatch (unused in aicpu_build_graph, but required by platform) + uint64_t regs{0}; // Device memory pointer to register address array + /** * Orchestration payload (auto-populated by init_runtime_impl, consumed by AICPU orchestration). * diff --git a/src/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/runtime/host_build_graph/aicore/aicore_executor.cpp index 9781bec4..87e4f798 100644 --- a/src/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -1,7 +1,7 @@ #include "aicore/aicore.h" #include "runtime.h" #include "common/perf_profiling.h" -#include "common/memory_barrier.h" +#include "common/platform_config.h" // Platform configuration (C/C++ compatible) typedef void (*KernelFunc)(__gm__ int64_t*); @@ -28,23 +28,19 @@ __aicore__ __attribute__((always_inline)) static void record_task_performance( CoreType core_type, uint64_t kernel_ready_time) { - // Check if buffer is available for writing + // dcci() for handshake visibility during profiling + dcci((__gm__ uint32_t*)&my_hank->perf_buffer_status, SINGLE_CACHE_LINE, CACHELINE_OUT); + if (my_hank->perf_buffer_status != 0) { - return; // Buffer full, skip recording + return; } - // Get current performance buffer pointer __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr; - - // Get current count uint32_t idx = perf_buf->count; - // Check if buffer has space if (idx < PLATFORM_PROF_BUFFER_SIZE) { - // Get pointer to the record slot __gm__ PerfRecord* record = (__gm__ PerfRecord*)&perf_buf->records[idx]; - // Write record data (only essential fields, fanout filled by AICPU) record->start_time = start_time; record->end_time = end_time; record->kernel_ready_time = kernel_ready_time; @@ -53,20 +49,16 @@ __aicore__ __attribute__((always_inline)) static void record_task_performance( record->core_id = block_idx; record->core_type = core_type; - // Increment count after writing record perf_buf->count = idx + 1; - - // Write memory barrier: ensure performance data is visible to Host - wmb(); - - // Check if buffer is full after this write + dcci(record, ENTIRE_DATA_CACHE, CACHELINE_OUT); if (perf_buf->count >= PLATFORM_PROF_BUFFER_SIZE) { - my_hank->perf_buffer_status = 1; // Notify AICPU: buffer full + my_hank->perf_buffer_status = 1; } } else { - // Buffer is already full my_hank->perf_buffer_status = 1; } + // dcci() for handshake visibility during profiling + dcci((__gm__ uint32_t*)&my_hank->perf_buffer_status, SINGLE_CACHE_LINE, CACHELINE_OUT); } __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* task) { @@ -75,9 +67,12 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* } KernelFunc kernel = (KernelFunc)task->function_bin_addr; kernel(reinterpret_cast<__gm__ int64_t*>(task->args)); + + // Ensure all memory writes are visible to other cores + pipe_barrier(PIPE_ALL); } -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]); // Phase 1: Wait for AICPU initialization signal @@ -85,51 +80,45 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); } - // Phase 2: Signal AICore is ready and report core type - my_hank->core_type = core_type; // Report core type to AICPU - my_hank->aicore_done = block_idx + 1; // Signal ready (use block_idx + 1 to avoid 0) + // Report physical core ID and core type for AICPU + my_hank->physical_core_id = physical_core_id; + my_hank->core_type = core_type; + my_hank->aicore_done = block_idx + 1; + + dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); + + // Report initial idle status for task dispatch + write_reg(RegId::COND, static_cast(AICoreStatus::IDLE)); - // Check if profiling is enabled bool profiling_enabled = runtime->enable_profiling; + uint64_t kernel_ready_time = get_sys_cnt(); - // Record kernel ready time (before entering main loop) - // This timestamp represents when the AICore is ready to execute tasks - // but hasn't started executing any task yet. - // Used for: 1) Startup overhead analysis, 2) Cross-core time alignment - uint64_t kernel_ready_time = 0; - if (profiling_enabled) { - kernel_ready_time = get_sys_cnt(); - } + // Main loop: poll DATA_MAIN_BASE for task_id + volatile uint32_t task_id = 0; + volatile uint32_t last_task_id = 0; - // Phase 3: Main execution loop - poll for tasks until quit signal while (true) { - dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); - - // Check for quit command from AICPU - if (my_hank->control == 1) { - break; // Exit kernel + task_id = static_cast(read_reg(RegId::DATA_MAIN_BASE)); + if (task_id == AICORE_EXIT_SIGNAL) { + break; } - // Execute task if assigned (task != 0 means valid task pointer) - if (my_hank->task_status == 1 && my_hank->task != 0) { - __gm__ Task* task_ptr = reinterpret_cast<__gm__ Task*>(my_hank->task); - - // Performance profiling: record start time - uint64_t start_time = 0; - start_time = get_sys_cnt(); + // Execute task if new (task_id encoding: 0=idle, task_id+1=task) + if (task_id != 0 && task_id != last_task_id) { + write_reg(RegId::COND, static_cast(AICoreStatus::BUSY)); + __gm__ Task* task_ptr = &(runtime->tasks[task_id - 1]); + uint64_t start_time = get_sys_cnt(); - - // Execute the task execute_task(task_ptr); - // Performance profiling: record task execution if (profiling_enabled) { uint64_t end_time = get_sys_cnt(); - record_task_performance(my_hank, task_ptr, start_time, end_time, block_idx, core_type, kernel_ready_time); + record_task_performance(my_hank, task_ptr, start_time, end_time, + block_idx, core_type, kernel_ready_time); } - // Mark task as complete (task_status: 0=idle, 1=busy) - my_hank->task_status = 0; + last_task_id = task_id; + write_reg(RegId::COND, static_cast(AICoreStatus::IDLE)); } } } diff --git a/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 4b378fa0..daf4f62c 100644 --- a/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -9,15 +9,19 @@ #include "runtime.h" #include "aicpu/device_log.h" #include "inner_aicpu.h" +#include "aicpu/aicpu_regs.h" // Register-based communication constexpr int MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS; constexpr int MAX_AIC_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD; constexpr int MAX_AIV_PER_THREAD = PLATFORM_MAX_AIV_PER_THREAD; constexpr int MAX_CORES_PER_THREAD = PLATFORM_MAX_CORES_PER_THREAD; +constexpr int MAX_CORES = PLATFORM_MAX_CORES; // Core information for discovery struct CoreInfo { - int worker_id; // Index in runtime.workers[] + int worker_id; // Index in runtime.workers[] + uint32_t physical_core_id; // Hardware physical core ID (from AICore) + uint64_t reg_addr; // Cached register address for fast access CoreType core_type; }; @@ -40,6 +44,12 @@ struct AicpuExecutor { int aic_count_{0}; int aiv_count_{0}; + // Fast lookup: core_id -> reg_addr + uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD]; + + // Track executing task_id per core (-1 = idle) + int executing_task_ids_[MAX_CORES]; + // ===== Task queue state ===== std::mutex ready_queue_aic_mutex_; int ready_queue_aic_[RUNTIME_MAX_TASKS]; @@ -122,6 +132,11 @@ int AicpuExecutor::init(Runtime* runtime) { LOG_INFO("Config: threads=%d, cores=%d, cores_per_thread=%d", thread_num_, cores_total_num_, thread_cores_num_); + // Initialize executing_task_ids_ to -1 (idle) + for (int i = 0; i < MAX_CORES; i++) { + executing_task_ids_[i] = -1; + } + // Assign discovered cores to threads assign_cores_to_threads(); @@ -228,24 +243,39 @@ int AicpuExecutor::handshake_all_cores(Runtime* runtime) { // Busy wait for core response } - // Read core type (written by AICore during handshake) CoreType type = hank->core_type; + uint32_t physical_core_id = hank->physical_core_id; + + // Get register address using physical_core_id + uint64_t* regs = reinterpret_cast(runtime->regs); + uint64_t reg_addr = regs[physical_core_id]; - // Classify and store core information if (type == CoreType::AIC) { aic_cores_[aic_count_].worker_id = i; + aic_cores_[aic_count_].physical_core_id = physical_core_id; + aic_cores_[aic_count_].reg_addr = reg_addr; aic_cores_[aic_count_].core_type = type; aic_count_++; } else if (type == CoreType::AIV) { aiv_cores_[aiv_count_].worker_id = i; + aiv_cores_[aiv_count_].physical_core_id = physical_core_id; + aiv_cores_[aiv_count_].reg_addr = reg_addr; aiv_cores_[aiv_count_].core_type = type; aiv_count_++; } else { - LOG_ERROR("Unknown core type %d for core %d", static_cast(type), i); + LOG_ERROR("Unknown core type from core %d", i); return -1; } - LOG_INFO(" Core %d: type=%s", i, core_type_to_string(type)); + core_id_to_reg_addr_[i] = reg_addr; + + LOG_INFO(" Core %d: type=%s, physical_id=%u, reg_addr=0x%lx", + i, core_type_to_string(type), physical_core_id, reg_addr); + + if (reg_addr != 0) { + write_reg(reg_addr, RegId::FAST_PATH_ENABLE, REG_SPR_FAST_PATH_OPEN); + write_reg(reg_addr, RegId::DATA_MAIN_BASE, 0); + } } LOG_INFO("Discovery complete: AIC=%d, AIV=%d, Total=%d", aic_count_, aiv_count_, cores_total_num_); @@ -340,7 +370,14 @@ int AicpuExecutor::shutdown_aicore(Runtime* runtime, int thread_idx, const int* int core_id = cur_thread_cores[i]; Handshake* hank = &all_hanks[core_id]; LOG_INFO("Thread %d: AICPU hank addr = 0x%lx", thread_idx, (uint64_t)hank); - hank->control = 1; + + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + if (reg_addr != 0) { + write_reg(reg_addr, RegId::DATA_MAIN_BASE, AICORE_EXIT_SIGNAL); + write_reg(reg_addr, RegId::FAST_PATH_ENABLE, REG_SPR_FAST_PATH_CLOSE); + } else { + LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id); + } } LOG_INFO("Thread %d: Shutdown complete", thread_idx); return 0; @@ -377,15 +414,17 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - Handshake* h = &hank[core_id]; - if (h->task_status != 0 || h->task != 0) { + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = static_cast(read_reg(reg_addr, RegId::COND)); + + if (status != AICoreStatus::IDLE || executing_task_ids_[core_id] >= 0) { all_cores_idle = false; if (verification_warning_count == 0) { - LOG_WARN("Thread %d: Counter reached %d/%d but core %d still has work (status=%d, task=%p)", + LOG_WARN("Thread %d: Counter reached %d/%d but core %d still has work (COND=%d, task_id=%d)", thread_idx, completed_tasks_.load(std::memory_order_acquire), task_count, - core_id, h->task_status, (void*)h->task); + core_id, static_cast(status), executing_task_ids_[core_id]); } break; } @@ -417,15 +456,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const // Phase 1: Process completed tasks on my managed cores for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - Handshake* h = &hank[core_id]; - - // Core finished a task (idle + task not null) - if (h->task_status == 0 && h->task != 0) { - // Get completed task pointer before any buffer operations - Task* task = reinterpret_cast(h->task); - int completed_task_id = task->task_id; - - // Write AICPU dispatch/finish timestamps into the PerfRecord + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = static_cast(read_reg(reg_addr, RegId::COND)); + if (status == AICoreStatus::IDLE && executing_task_ids_[core_id] >= 0) { + int task_id = executing_task_ids_[core_id]; + int completed_task_id = task_id; + Handshake* h = &hank[core_id]; if (profiling_enabled) { uint64_t finish_ts = get_sys_cnt_aicpu(); PerfBuffer* perf_buf = (PerfBuffer*)h->perf_records_addr; @@ -445,10 +481,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const if (profiling_enabled && h->perf_buffer_status == 1) { switch_perf_buffer(&runtime, core_id, thread_idx); } - // Clear the task pointer - h->task = 0; - int task_id = completed_task_id; + Task* task = runtime.get_task(task_id); LOG_INFO("Thread %d: Core %d completed task %d", thread_idx, core_id, task_id); @@ -484,10 +518,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const } } - // Update counters + // Update counters and clear task tracking cur_thread_tasks_in_flight--; cur_thread_completed++; made_progress = true; + // Clear task_id + executing_task_ids_[core_id] = -1; completed_tasks_.fetch_add(1, std::memory_order_release); } } @@ -497,10 +533,11 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const // Phase 2: Dispatch new tasks from matching ready queue to idle cores for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - Handshake* h = &hank[core_id]; + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = static_cast(read_reg(reg_addr, RegId::COND)); - // Core is idle and available (idle + task is null) - if (h->task_status == 0 && h->task == 0) { + if (status == AICoreStatus::IDLE && executing_task_ids_[core_id] == -1) { + Handshake* h = &hank[core_id]; // Dispatch from matching queue based on core type if (h->core_type == CoreType::AIC) { // AIC core if (ready_count_aic_.load(std::memory_order_acquire) > 0) { @@ -511,16 +548,24 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const int task_id = ready_queue_aic_[ready_queue_aic_head_]; ready_queue_aic_head_ = (ready_queue_aic_head_ + 1) % RUNTIME_MAX_TASKS; ready_count_aic_.fetch_sub(1, std::memory_order_release); - Task* task = runtime.get_task(task_id); LOG_INFO("Thread %d: Dispatching AIC task %d to core %d (head=%d)", thread_idx, task_id, core_id, ready_queue_aic_head_); - h->task = reinterpret_cast(task); + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + + // Pre-set COND=BUSY before writing task_id to prevent + // false completion detection (AICPU seeing stale IDLE + // before AICore has started the task) + write_reg(reg_addr, RegId::COND, static_cast(AICoreStatus::BUSY)); + + // Write task_id+1 to register + write_reg(reg_addr, RegId::DATA_MAIN_BASE, static_cast(task_id + 1)); + if (runtime.enable_profiling) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } - h->task_status = 1; // Mark as busy + executing_task_ids_[core_id] = task_id; cur_thread_tasks_in_flight++; made_progress = true; } @@ -534,16 +579,24 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const int task_id = ready_queue_aiv_[ready_queue_aiv_head_]; ready_queue_aiv_head_ = (ready_queue_aiv_head_ + 1) % RUNTIME_MAX_TASKS; ready_count_aiv_.fetch_sub(1, std::memory_order_release); - Task* task = runtime.get_task(task_id); LOG_INFO("Thread %d: Dispatching AIV task %d to core %d (head=%d)", thread_idx, task_id, core_id, ready_queue_aiv_head_); - h->task = reinterpret_cast(task); + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + + // Pre-set COND=BUSY before writing task_id to prevent + // false completion detection (AICPU seeing stale IDLE + // before AICore has started the task) + write_reg(reg_addr, RegId::COND, static_cast(AICoreStatus::BUSY)); + + // Write task_id+1 to register + write_reg(reg_addr, RegId::DATA_MAIN_BASE, static_cast(task_id + 1)); + if (runtime.enable_profiling) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } - h->task_status = 1; // Mark as busy + executing_task_ids_[core_id] = task_id; cur_thread_tasks_in_flight++; made_progress = true; } @@ -656,7 +709,6 @@ void AicpuExecutor::diagnose_stuck_state(Runtime& runtime, int thread_idx, int busy_cores = 0; int idle_cores = 0; - int anomaly_cores = 0; LOG_ERROR("Core Status:"); for (int i = 0; i < core_num; i++) { @@ -665,24 +717,30 @@ void AicpuExecutor::diagnose_stuck_state(Runtime& runtime, int thread_idx, const char* core_type_str = core_type_to_string(h->core_type); - if (h->task != 0) { - Task* task = reinterpret_cast(h->task); + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = static_cast(read_reg(reg_addr, RegId::COND)); + + if (status != AICoreStatus::IDLE) { busy_cores++; - LOG_ERROR(" Core %d [%s, BUSY]: task_id=%d, func_id=%d, fanin=%d, fanout=%d", - core_id, core_type_str, - task->task_id, task->func_id, - task->fanin.load(std::memory_order_acquire), - task->fanout_count); - } else if (h->task_status != 0) { - anomaly_cores++; - LOG_ERROR(" Core %d [%s, ANOMALY]: status=BUSY but task=NULL", core_id, core_type_str); + int task_id = executing_task_ids_[core_id]; + if (task_id >= 0) { + Task* task = runtime.get_task(task_id); + LOG_ERROR(" Core %d [%s, BUSY]: COND=%d, task_id=%d, func_id=%d, fanin=%d, fanout=%d", + core_id, core_type_str, static_cast(status), + task->task_id, task->func_id, + task->fanin.load(std::memory_order_acquire), + task->fanout_count); + } else { + LOG_ERROR(" Core %d [%s, BUSY]: COND=%d but task_id not tracked", + core_id, core_type_str, static_cast(status)); + } } else { idle_cores++; } } - LOG_ERROR("Summary: %d busy, %d idle, %d anomaly", busy_cores, idle_cores, anomaly_cores); + LOG_ERROR("Summary: %d busy, %d idle", busy_cores, idle_cores); // Diagnose deadlock vs livelock if (busy_cores == 0 && aic_ready == 0 && aiv_ready == 0 && completed < total) { diff --git a/src/runtime/host_build_graph/runtime/runtime.h b/src/runtime/host_build_graph/runtime/runtime.h index 919cada9..cfefabd8 100644 --- a/src/runtime/host_build_graph/runtime/runtime.h +++ b/src/runtime/host_build_graph/runtime/runtime.h @@ -95,16 +95,20 @@ * - task_status: Written by both (AICPU=1 on dispatch, AICore=0 on completion) * - control: Written by AICPU, read by AICore (0 = continue, 1 = quit) * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV) + * - perf_records_addr: Written by AICPU, read by AICore (performance records address) + * - perf_buffer_status: Written by both (AICPU=1 on buffer full, AICore=0 on buffer empty) + * - physical_core_id: Written by AICPU, read by AICore (physical core ID) */ struct Handshake { - volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready - volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready - volatile uint64_t task; // Task pointer: 0=no task, non-zero=Task* address - volatile int32_t task_status; // Task execution status: 0=idle, 1=busy - volatile int32_t control; // Control signal: 0=execute, 1=quit - volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV - volatile uint64_t perf_records_addr; // Performance records address - volatile uint32_t perf_buffer_status; // 0 = not full, 1 == full + volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready + volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready + volatile uint64_t task; // Task pointer: 0=no task, non-zero=Task* address + volatile int32_t task_status; // Task execution status: 0=idle, 1=busy + volatile int32_t control; // Control signal: 0=execute, 1=quit + volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV + volatile uint64_t perf_records_addr; // Performance records address + volatile uint32_t perf_buffer_status; // 0 = not full, 1 = full + volatile uint32_t physical_core_id; // Physical core ID } __attribute__((aligned(64))); /** @@ -182,12 +186,17 @@ class Runtime { // Execution parameters for AICPU scheduling int sche_cpu_num; // Number of AICPU threads for scheduling + // Register-based task dispatch + uint64_t regs{0}; // Device memory pointer to register address array + // Profiling support bool enable_profiling; // Enable profiling flag uint64_t perf_data_base; // Performance data shared memory base address (device-side) -private: + // Task storage Task tasks[RUNTIME_MAX_TASKS]; // Fixed-size task array + +private: int next_task_id; // Next available task ID // Initial ready tasks (computed once, read-only after) diff --git a/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 91f0e393..09a0c599 100644 --- a/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -111,8 +111,9 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ void* * @param runtime Pointer to Runtime in global memory * @param block_idx Block index (core ID) * @param core_type Core type (AIC or AIV) + * @param physical_core_id Physical core ID from hardware */ -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]); // Phase 1: Wait for AICPU initialization signal diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 20e8183b..8b552aff 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -80,6 +80,7 @@ struct Handshake { volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV volatile uint64_t perf_records_addr; // Performance records address volatile uint32_t perf_buffer_status; // 0 = not full, 1 == full + volatile uint32_t physical_core_id; // Physical core ID } __attribute__((aligned(64))); /** @@ -136,6 +137,9 @@ class Runtime { // Execution parameters for AICPU scheduling int sche_cpu_num; // Number of AICPU threads for scheduling + // Register-based task dispatch (unused in tensormap_and_ringbuffer, but required by platform) + uint64_t regs{0}; // Device memory pointer to register address array + // PTO2 integration: kernel_id -> GM function_bin_addr mapping // NOTE: Made public for direct access from aicore code uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];