diff --git a/src/platform/a2a3/aicore/inner_kernel.h b/src/platform/a2a3/aicore/inner_kernel.h index fa31ee3f..3da25ea2 100644 --- a/src/platform/a2a3/aicore/inner_kernel.h +++ b/src/platform/a2a3/aicore/inner_kernel.h @@ -9,6 +9,9 @@ #ifndef PLATFORM_A2A3_AICORE_INNER_KERNEL_H_ #define PLATFORM_A2A3_AICORE_INNER_KERNEL_H_ +#include +#include "aicpu/aicpu_regs.h" + // AICore function attribute for CANN compiler #ifndef __aicore__ #define __aicore__ [aicore] @@ -17,4 +20,30 @@ // dcci (Data Cache Clean and Invalidate) is provided by CANN headers // No need to define it here - it's a hardware instruction +/** + * Read task_id from DATA_MAIN_BASE register + * + * Return values: 0=idle, AICORE_EXIT_SIGNAL=shutdown, other=task_id+1 + */ + __aicore__ inline uint32_t read_task_id_by_reg() { + uint32_t task_id; + // MOV reads from AICore SPR, bypasses cache + __asm__ volatile("MOV %0, DATA_MAIN_BASE\n" : "=l"(task_id)); + return task_id; +} + +/** + * Set AICore status to BUSY via COND register + */ + __aicore__ inline void set_aicore_busy() { + set_cond(AICoreStatus::BUSY); +} + +/** + * Set AICore status to IDLE via COND register + */ + __aicore__ inline void set_aicore_idle() { + set_cond(AICoreStatus::IDLE); +} + #endif // PLATFORM_A2A3_AICORE_INNER_KERNEL_H_ diff --git a/src/platform/a2a3/aicore/kernel.cpp b/src/platform/a2a3/aicore/kernel.cpp index b17615e9..8e1f641c 100644 --- a/src/platform/a2a3/aicore/kernel.cpp +++ b/src/platform/a2a3/aicore/kernel.cpp @@ -23,7 +23,7 @@ class Runtime; [[block_local]] int block_idx; [[block_local]] CoreType core_type; -extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type); +extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id); /** * Kernel entry point with control loop @@ -49,5 +49,6 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ Runtime block_idx = get_block_idx(); core_type = 
CoreType::AIC;
 #endif
-    aicore_execute(runtime, block_idx, core_type);
+    uint32_t physical_core_id = static_cast<uint32_t>(get_coreid()) & AICORE_COREID_MASK;
+    aicore_execute(runtime, block_idx, core_type, physical_core_id);
 }
diff --git a/src/platform/a2a3/aicpu/aicpu_regs.cpp b/src/platform/a2a3/aicpu/aicpu_regs.cpp
new file mode 100644
index 00000000..950d1ee0
--- /dev/null
+++ b/src/platform/a2a3/aicpu/aicpu_regs.cpp
@@ -0,0 +1,40 @@
+/**
+ * @file aicpu_regs.cpp
+ * @brief AICPU-side AICore register access implementation
+ */
+
+#include "aicpu/aicpu_regs.h"
+
+void enable_aicore_register(uint64_t reg_base_addr) {
+    volatile uint32_t* reg_enable =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_FAST_PATH_ENABLE_OFFSET);
+    *reg_enable = REG_SPR_FAST_PATH_OPEN;
+    __sync_synchronize();
+
+    // Clear stale task_id from previous kernel runs
+    volatile uint32_t* reg_task =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_DATA_MAIN_BASE_OFFSET);
+    *reg_task = 0;
+}
+
+void disable_aicore_register(uint64_t reg_base_addr) {
+    volatile uint32_t* reg_enable =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_FAST_PATH_ENABLE_OFFSET);
+    *reg_enable = REG_SPR_FAST_PATH_CLOSE;
+    __sync_synchronize();
+}
+
+void write_task_id_to_aicore(uint64_t reg_base_addr, uint32_t task_id) {
+    volatile uint32_t* reg_task =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_DATA_MAIN_BASE_OFFSET);
+    *reg_task = task_id;
+    __sync_synchronize();
+}
+
+AICoreStatus read_aicore_status(uint64_t reg_base_addr) {
+    volatile uint32_t* reg_cond =
+        reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_COND_OFFSET);
+    __sync_synchronize();
+    uint32_t status_val = *reg_cond;
+    return static_cast<AICoreStatus>(status_val);
+}
diff --git a/src/platform/a2a3/host/CMakeLists.txt b/src/platform/a2a3/host/CMakeLists.txt
index 85e28645..7dfabdb1 100644
--- a/src/platform/a2a3/host/CMakeLists.txt
+++ b/src/platform/a2a3/host/CMakeLists.txt
@@ -24,6 +24,7 @@ list(APPEND HOST_RUNTIME_SOURCES
     "${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/performance_collector.cpp" diff --git a/src/platform/a2a3/host/device_runner.cpp b/src/platform/a2a3/host/device_runner.cpp index a5ece59e..ec92aac1 100644 --- a/src/platform/a2a3/host/device_runner.cpp +++ b/src/platform/a2a3/host/device_runner.cpp @@ -11,6 +11,7 @@ // Include HAL constants from CANN (header only, library loaded dynamically) #include "ascend_hal.h" +#include "host/host_regs.h" // Register address retrieval // ============================================================================= // Lazy-loaded HAL (ascend_hal) for profiling host-register only @@ -324,6 +325,13 @@ int DeviceRunner::run(Runtime& runtime, worker_count_ = num_aicore; // Store for print_handshake_results in destructor runtime.sche_cpu_num = launch_aicpu_num; + // Get AICore register addresses for register-based task dispatch + rc = init_aicore_register_addresses(&runtime.regs, static_cast(device_id), mem_alloc_); + if (rc != 0) { + LOG_ERROR("init_aicore_register_addresses failed: %d", rc); + return rc; + } + // Calculate number of AIC cores (1/3 of total) int num_aic = block_dim; // Round up for 1/3 diff --git a/src/platform/a2a3/host/host_regs.cpp b/src/platform/a2a3/host/host_regs.cpp new file mode 100644 index 00000000..da614fec --- /dev/null +++ b/src/platform/a2a3/host/host_regs.cpp @@ -0,0 +1,178 @@ +/** + * @file host_regs.cpp + * @brief Host-side AICore register address retrieval implementation + */ + +#include "host/host_regs.h" +#include "host/memory_allocator.h" +#include "common/unified_log.h" +#include "common/platform_config.h" +#include "runtime/rt.h" +#include "ascend_hal.h" // CANN HAL API definitions (MODULE_TYPE_AICORE, INFO_TYPE_OCCUPY, etc.) 
+#include +#include + +/** + * Query valid AICore cores via HAL API + */ +static bool get_pg_mask(uint64_t& valid, int64_t device_id) { + uint64_t aicore_bitmap[PLATFORM_AICORE_BITMAP_LEN] = {0}; + int32_t size_n = static_cast(sizeof(uint64_t)) * PLATFORM_AICORE_BITMAP_LEN; + + auto halFuncDevInfo = + (int (*)(uint64_t deviceId, int32_t moduleType, int32_t infoType, void* buf, int32_t* size))dlsym( + nullptr, "halGetDeviceInfoByBuff"); + + if (halFuncDevInfo == nullptr) { + LOG_WARN("halGetDeviceInfoByBuff not found, assuming all cores valid"); + return false; + } + + auto ret = halFuncDevInfo(static_cast(device_id), + MODULE_TYPE_AICORE, + INFO_TYPE_OCCUPY, + reinterpret_cast(&aicore_bitmap[0]), + &size_n); + + if (ret != 0) { + LOG_ERROR("halGetDeviceInfoByBuff failed with rc=%d", ret); + return false; + } + + valid = aicore_bitmap[0]; + return true; +} + +/** + * Retrieve AICore register base addresses via HAL API + */ +static int get_aicore_reg_info(std::vector& aic, std::vector& aiv, + const int& addr_type, int64_t device_id) { + uint64_t valid = 0; + if (!get_pg_mask(valid, device_id)) { + // If can't get mask, assume all cores valid + valid = 0xFFFFFFFF; + LOG_WARN("Using default valid mask 0xFFFFFFFF"); + } + + uint64_t core_stride = 8 * 1024 * 1024; // 8M + uint64_t sub_core_stride = 0x100000ULL; + + auto is_valid = [&valid](int id) { + const uint64_t mask = (1ULL << 25) - 1; + return ((static_cast(valid) ^ mask) & (1ULL << id)) == 0; + }; + + auto halFunc = + (int (*)(int type, void* paramValue, size_t paramValueSize, void* outValue, size_t* outSizeRet))dlsym( + nullptr, "halMemCtl"); + + if (halFunc == nullptr) { + LOG_ERROR("halMemCtl not found in symbol table"); + return -1; + } + + struct AddrMapInPara in_map_para; + struct AddrMapOutPara out_map_para; + in_map_para.devid = device_id; + in_map_para.addr_type = addr_type; + + auto ret = halFunc(0, + reinterpret_cast(&in_map_para), + sizeof(struct AddrMapInPara), + reinterpret_cast(&out_map_para), + 
nullptr); + + if (ret != 0) { + LOG_ERROR("halMemCtl failed with rc=%d", ret); + return ret; + } + + LOG_INFO("Register base: ptr=0x%llx, len=0x%llx", out_map_para.ptr, out_map_para.len); + + // Iterate over all cores and subcores + for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) { + for (uint32_t j = 0; j < PLATFORM_SUB_CORES_PER_AICORE; j++) { + uint64_t vaddr = 0UL; + if (is_valid(i)) { + vaddr = out_map_para.ptr + (i * core_stride + j * sub_core_stride); + } + if (j == 0) { + aic.push_back(vaddr); + } else { + aiv.push_back(vaddr); + } + } + } + + return 0; +} + +void get_aicore_regs(std::vector& regs, uint64_t device_id) { + std::vector aiv; + std::vector aic; + + int rt = get_aicore_reg_info(aic, aiv, ADDR_MAP_TYPE_REG_AIC_CTRL, device_id); + + if (rt != 0) { + LOG_ERROR("get_aicore_reg_info failed, using placeholder addresses"); + // Fallback: generate placeholder addresses + for (int i = 0; i < 25; i++) { + aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000)); // 8M stride + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000); + aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000); + } + } + + // AIC cores first, then AIV cores + regs.insert(regs.end(), aic.begin(), aic.end()); + regs.insert(regs.end(), aiv.begin(), aiv.end()); + + LOG_INFO("get_aicore_regs: Retrieved %zu AIC and %zu AIV register addresses", + aic.size(), aiv.size()); +} + +int init_aicore_register_addresses( + uint64_t* runtime_regs_ptr, + uint64_t device_id, + MemoryAllocator& allocator) { + + if (runtime_regs_ptr == nullptr) { + LOG_ERROR("init_aicore_register_addresses: Invalid parameters"); + return -1; + } + + LOG_INFO("Retrieving and allocating AICore register addresses..."); + + // Step 1: Get register addresses from HAL + std::vector host_regs; + get_aicore_regs(host_regs, device_id); + + if (host_regs.empty()) { + LOG_ERROR("Failed to get AICore register addresses"); + return -1; + } + + // Step 2: Allocate device memory for register 
address array + size_t regs_size = host_regs.size() * sizeof(int64_t); + void* reg_ptr = allocator.alloc(regs_size); + if (reg_ptr == nullptr) { + LOG_ERROR("Failed to allocate device memory for register addresses"); + return -1; + } + + // Step 3: Copy register addresses to device memory + int ret = rtMemcpy(reg_ptr, regs_size, host_regs.data(), regs_size, RT_MEMCPY_HOST_TO_DEVICE); + if (ret != 0) { + LOG_ERROR("Failed to copy register addresses to device (rc=%d)", ret); + return -1; + } + + // Step 4: Store device pointer in runtime.regs + *runtime_regs_ptr = reinterpret_cast(reg_ptr); + + LOG_INFO("Successfully initialized register addresses: %zu addresses at device 0x%llx", + host_regs.size(), *runtime_regs_ptr); + + return 0; +} diff --git a/src/platform/a2a3sim/aicore/inner_kernel.h b/src/platform/a2a3sim/aicore/inner_kernel.h index 3a74156e..0f40ce21 100644 --- a/src/platform/a2a3sim/aicore/inner_kernel.h +++ b/src/platform/a2a3sim/aicore/inner_kernel.h @@ -24,6 +24,7 @@ // Cache coherency constants (no-op in simulation) #define ENTIRE_DATA_CACHE 0 +#define SINGLE_CACHE_LINE 0 #define CACHELINE_OUT 0 // ============================================================================= diff --git a/src/platform/include/aicpu/aicpu_regs.h b/src/platform/include/aicpu/aicpu_regs.h new file mode 100644 index 00000000..ca7d86a7 --- /dev/null +++ b/src/platform/include/aicpu/aicpu_regs.h @@ -0,0 +1,45 @@ +/** + * @file aicpu_regs.h + * @brief AICPU-side AICore register access via MMIO + * + * Provides MMIO-based register writes/reads for task dispatch and status polling. + * Memory barriers ensure cross-core visibility. 
+ */ + +#ifndef PLATFORM_A2A3_AICPU_AICPU_REGS_H_ +#define PLATFORM_A2A3_AICPU_AICPU_REGS_H_ + +#include +#include "common/platform_config.h" + +/** + * AICore execution status (read from COND register) + */ +enum class AICoreStatus : uint32_t { + IDLE = 0, + BUSY = 1, +}; + +/** + * Enable register-based communication (call during handshake) + */ +void enable_aicore_register(uint64_t reg_base_addr); + +/** + * Disable register-based communication (call during shutdown) + */ +void disable_aicore_register(uint64_t reg_base_addr); + +/** + * Write task_id to AICore's DATA_MAIN_BASE register + * + * Special values: 0=idle, AICORE_EXIT_SIGNAL=shutdown + */ +void write_task_id_to_aicore(uint64_t reg_base_addr, uint32_t task_id); + +/** + * Read AICore status from COND register + */ +AICoreStatus read_aicore_status(uint64_t reg_base_addr); + +#endif // PLATFORM_A2A3_AICPU_AICPU_REGS_H_ diff --git a/src/platform/include/common/perf_profiling.h b/src/platform/include/common/perf_profiling.h index e3fe4a47..fb99199d 100644 --- a/src/platform/include/common/perf_profiling.h +++ b/src/platform/include/common/perf_profiling.h @@ -38,11 +38,6 @@ #define RUNTIME_MAX_FANOUT 512 #endif -// Maximum cores that can be profiled simultaneously -#ifndef PLATFORM_MAX_CORES -#define PLATFORM_MAX_CORES 72 // 24 blocks × 3 cores/block -#endif - // ============================================================================= // Buffer Status Enumeration // ============================================================================= diff --git a/src/platform/include/common/platform_config.h b/src/platform/include/common/platform_config.h index 5992f563..cb1f4d0c 100644 --- a/src/platform/include/common/platform_config.h +++ b/src/platform/include/common/platform_config.h @@ -100,4 +100,47 @@ constexpr int PLATFORM_PROF_TIMEOUT_SECONDS = 2; * Number of empty polling iterations before checking timeout */ constexpr int PLATFORM_PROF_EMPTY_POLLS_CHECK_NUM = 1000; + +// 
============================================================================= +// Register Communication Configuration +// ============================================================================= + +// Register offsets for AICore SPR access +constexpr uint32_t REG_SPR_DATA_MAIN_BASE_OFFSET = 0xA0; // Task dispatch (AICPU→AICore) +constexpr uint32_t REG_SPR_COND_OFFSET = 0x4C8; // Status (AICore→AICPU): 0=IDLE, 1=BUSY +constexpr uint32_t REG_SPR_FAST_PATH_ENABLE_OFFSET = 0x18; + +// Fast path control values +constexpr uint32_t REG_SPR_FAST_PATH_OPEN = 0xE; +constexpr uint32_t REG_SPR_FAST_PATH_CLOSE = 0xF; + +// Exit signal for AICore shutdown +constexpr uint32_t AICORE_EXIT_SIGNAL = 0x7FFFFFF0; + +// Physical core ID mask for get_coreid() +constexpr uint32_t AICORE_COREID_MASK = 0x0FFF; + +// ============================================================================= +// Hardware Configuration Constants +// ============================================================================= + +/** + * AICore register bitmap buffer length + * Used for querying valid AICore cores via HAL API + */ +constexpr uint8_t PLATFORM_AICORE_BITMAP_LEN = 2; + +/** + * Number of sub-cores per AICore + * Hardware architecture: 1 AICore = 1 AIC + 2 AIV sub-cores + */ +constexpr uint32_t PLATFORM_SUB_CORES_PER_AICORE = PLATFORM_CORES_PER_BLOCKDIM; + +/** + * Maximum physical AICore count for DAV 2201 chip + */ +namespace DAV_2201 { +constexpr uint32_t PLATFORM_MAX_PHYSICAL_CORES = 25; +} + #endif // PLATFORM_COMMON_PLATFORM_CONFIG_H_ diff --git a/src/platform/include/host/host_regs.h b/src/platform/include/host/host_regs.h new file mode 100644 index 00000000..655f19f3 --- /dev/null +++ b/src/platform/include/host/host_regs.h @@ -0,0 +1,42 @@ +/** + * @file host_regs.h + * @brief AICore register address retrieval via CANN HAL APIs + * + * Provides register base addresses for AICPU to perform MMIO-based + * task dispatch to AICore cores. 
+ */ + +#ifndef PLATFORM_HOST_HOST_REGS_H_ +#define PLATFORM_HOST_HOST_REGS_H_ + +#include +#include + +// Forward declaration +class MemoryAllocator; + +/** + * Get AICore register base addresses for all cores + * + * @param regs Output vector (AIC cores followed by AIV cores) + * @param device_id Device ID + */ +void get_aicore_regs(std::vector& regs, uint64_t device_id); + +/** + * Initialize AICore register addresses for runtime + * + * Retrieves register addresses from HAL, allocates device memory, + * copies addresses to device, and stores the device pointer in runtime. + * + * @param runtime_regs_ptr Pointer to runtime.regs field + * @param device_id Device ID + * @param allocator Memory allocator for device memory + * @return 0 on success, negative on failure + */ +int init_aicore_register_addresses( + uint64_t* runtime_regs_ptr, + uint64_t device_id, + MemoryAllocator& allocator); + +#endif // PLATFORM_HOST_HOST_REGS_H_ diff --git a/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp b/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp index f002616b..b4c33b22 100644 --- a/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp +++ b/src/runtime/aicpu_build_graph/aicore/aicore_executor.cpp @@ -46,7 +46,7 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* kernel(reinterpret_cast<__gm__ int64_t*>(task->args)); } -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]); // Phase 1: Wait for AICPU initialization signal diff --git a/src/runtime/aicpu_build_graph/runtime/runtime.h b/src/runtime/aicpu_build_graph/runtime/runtime.h index 9ccee95d..7f4b7fb9 100644 --- a/src/runtime/aicpu_build_graph/runtime/runtime.h +++ 
b/src/runtime/aicpu_build_graph/runtime/runtime.h @@ -245,6 +245,9 @@ class Runtime { // Execution parameters for AICPU scheduling int sche_cpu_num; // Number of AICPU threads for scheduling + // Register-based task dispatch (unused in aicpu_build_graph, but required by platform) + uint64_t regs{0}; // Device memory pointer to register address array + /** * Orchestration payload (auto-populated by init_runtime_impl, consumed by AICPU orchestration). * diff --git a/src/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/runtime/host_build_graph/aicore/aicore_executor.cpp index 9781bec4..c41c88cc 100644 --- a/src/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -1,7 +1,7 @@ #include "aicore/aicore.h" #include "runtime.h" #include "common/perf_profiling.h" -#include "common/memory_barrier.h" +#include "common/platform_config.h" // Platform configuration (C/C++ compatible) typedef void (*KernelFunc)(__gm__ int64_t*); @@ -28,23 +28,19 @@ __aicore__ __attribute__((always_inline)) static void record_task_performance( CoreType core_type, uint64_t kernel_ready_time) { - // Check if buffer is available for writing + // dcci() for handshake visibility during profiling + dcci((__gm__ uint32_t*)&my_hank->perf_buffer_status, SINGLE_CACHE_LINE, CACHELINE_OUT); + if (my_hank->perf_buffer_status != 0) { - return; // Buffer full, skip recording + return; } - // Get current performance buffer pointer __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr; - - // Get current count uint32_t idx = perf_buf->count; - // Check if buffer has space if (idx < PLATFORM_PROF_BUFFER_SIZE) { - // Get pointer to the record slot __gm__ PerfRecord* record = (__gm__ PerfRecord*)&perf_buf->records[idx]; - // Write record data (only essential fields, fanout filled by AICPU) record->start_time = start_time; record->end_time = end_time; record->kernel_ready_time = kernel_ready_time; @@ -53,20 +49,16 @@ 
__aicore__ __attribute__((always_inline)) static void record_task_performance( record->core_id = block_idx; record->core_type = core_type; - // Increment count after writing record perf_buf->count = idx + 1; - - // Write memory barrier: ensure performance data is visible to Host - wmb(); - - // Check if buffer is full after this write + dcci(record, ENTIRE_DATA_CACHE, CACHELINE_OUT); if (perf_buf->count >= PLATFORM_PROF_BUFFER_SIZE) { - my_hank->perf_buffer_status = 1; // Notify AICPU: buffer full + my_hank->perf_buffer_status = 1; } } else { - // Buffer is already full my_hank->perf_buffer_status = 1; } + // dcci() for handshake visibility during profiling + dcci((__gm__ uint32_t*)&my_hank->perf_buffer_status, SINGLE_CACHE_LINE, CACHELINE_OUT); } __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* task) { @@ -75,9 +67,12 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task* } KernelFunc kernel = (KernelFunc)task->function_bin_addr; kernel(reinterpret_cast<__gm__ int64_t*>(task->args)); + + // Ensure all memory writes are visible to other cores + pipe_barrier(PIPE_ALL); } -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]); // Phase 1: Wait for AICPU initialization signal @@ -85,51 +80,45 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); } - // Phase 2: Signal AICore is ready and report core type - my_hank->core_type = core_type; // Report core type to AICPU - my_hank->aicore_done = block_idx + 1; // Signal ready (use block_idx + 1 to avoid 0) + // Report physical core ID and core type for AICPU + my_hank->physical_core_id = physical_core_id; + 
my_hank->core_type = core_type; + my_hank->aicore_done = block_idx + 1; + + dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); + + // Report initial idle status for task dispatch + set_aicore_idle(); - // Check if profiling is enabled bool profiling_enabled = runtime->enable_profiling; + uint64_t kernel_ready_time = get_sys_cnt(); - // Record kernel ready time (before entering main loop) - // This timestamp represents when the AICore is ready to execute tasks - // but hasn't started executing any task yet. - // Used for: 1) Startup overhead analysis, 2) Cross-core time alignment - uint64_t kernel_ready_time = 0; - if (profiling_enabled) { - kernel_ready_time = get_sys_cnt(); - } + // Main loop: poll DATA_MAIN_BASE for task_id + volatile uint32_t task_id = 0; + volatile uint32_t last_task_id = 0; - // Phase 3: Main execution loop - poll for tasks until quit signal while (true) { - dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); - - // Check for quit command from AICPU - if (my_hank->control == 1) { - break; // Exit kernel + task_id = read_task_id_by_reg(); + if (task_id == AICORE_EXIT_SIGNAL) { + break; } - // Execute task if assigned (task != 0 means valid task pointer) - if (my_hank->task_status == 1 && my_hank->task != 0) { - __gm__ Task* task_ptr = reinterpret_cast<__gm__ Task*>(my_hank->task); - - // Performance profiling: record start time - uint64_t start_time = 0; - start_time = get_sys_cnt(); + // Execute task if new (task_id encoding: 0=idle, task_id+1=task) + if (task_id != 0 && task_id != last_task_id) { + set_aicore_busy(); + __gm__ Task* task_ptr = &(runtime->tasks[task_id - 1]); + uint64_t start_time = get_sys_cnt(); - - // Execute the task execute_task(task_ptr); - // Performance profiling: record task execution if (profiling_enabled) { uint64_t end_time = get_sys_cnt(); - record_task_performance(my_hank, task_ptr, start_time, end_time, block_idx, core_type, kernel_ready_time); + record_task_performance(my_hank, task_ptr, start_time, end_time, + 
block_idx, core_type, kernel_ready_time); } - // Mark task as complete (task_status: 0=idle, 1=busy) - my_hank->task_status = 0; + last_task_id = task_id; + set_aicore_idle(); } } } diff --git a/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 4b378fa0..9b76d42a 100644 --- a/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -9,15 +9,19 @@ #include "runtime.h" #include "aicpu/device_log.h" #include "inner_aicpu.h" +#include "aicpu/aicpu_regs.h" // Register-based communication constexpr int MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS; constexpr int MAX_AIC_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD; constexpr int MAX_AIV_PER_THREAD = PLATFORM_MAX_AIV_PER_THREAD; constexpr int MAX_CORES_PER_THREAD = PLATFORM_MAX_CORES_PER_THREAD; +constexpr int MAX_CORES = PLATFORM_MAX_CORES; // Core information for discovery struct CoreInfo { - int worker_id; // Index in runtime.workers[] + int worker_id; // Index in runtime.workers[] + uint32_t physical_core_id; // Hardware physical core ID (from AICore) + uint64_t reg_addr; // Cached register address for fast access CoreType core_type; }; @@ -40,6 +44,12 @@ struct AicpuExecutor { int aic_count_{0}; int aiv_count_{0}; + // Fast lookup: core_id -> reg_addr + uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD]; + + // Track executing task_id per core (-1 = idle) + int executing_task_ids_[MAX_CORES]; + // ===== Task queue state ===== std::mutex ready_queue_aic_mutex_; int ready_queue_aic_[RUNTIME_MAX_TASKS]; @@ -122,6 +132,11 @@ int AicpuExecutor::init(Runtime* runtime) { LOG_INFO("Config: threads=%d, cores=%d, cores_per_thread=%d", thread_num_, cores_total_num_, thread_cores_num_); + // Initialize executing_task_ids_ to -1 (idle) + for (int i = 0; i < MAX_CORES; i++) { + executing_task_ids_[i] = -1; + } + // Assign discovered cores to threads assign_cores_to_threads(); @@ -228,24 +243,38 @@ int 
AicpuExecutor::handshake_all_cores(Runtime* runtime) { // Busy wait for core response } - // Read core type (written by AICore during handshake) CoreType type = hank->core_type; + uint32_t physical_core_id = hank->physical_core_id; + + // Get register address using physical_core_id + uint64_t* regs = reinterpret_cast(runtime->regs); + uint64_t reg_addr = regs[physical_core_id]; - // Classify and store core information if (type == CoreType::AIC) { aic_cores_[aic_count_].worker_id = i; + aic_cores_[aic_count_].physical_core_id = physical_core_id; + aic_cores_[aic_count_].reg_addr = reg_addr; aic_cores_[aic_count_].core_type = type; aic_count_++; } else if (type == CoreType::AIV) { aiv_cores_[aiv_count_].worker_id = i; + aiv_cores_[aiv_count_].physical_core_id = physical_core_id; + aiv_cores_[aiv_count_].reg_addr = reg_addr; aiv_cores_[aiv_count_].core_type = type; aiv_count_++; } else { - LOG_ERROR("Unknown core type %d for core %d", static_cast(type), i); + LOG_ERROR("Unknown core type from core %d", i); return -1; } - LOG_INFO(" Core %d: type=%s", i, core_type_to_string(type)); + core_id_to_reg_addr_[i] = reg_addr; + + LOG_INFO(" Core %d: type=%s, physical_id=%u, reg_addr=0x%lx", + i, core_type_to_string(type), physical_core_id, reg_addr); + + if (reg_addr != 0) { + enable_aicore_register(reg_addr); + } } LOG_INFO("Discovery complete: AIC=%d, AIV=%d, Total=%d", aic_count_, aiv_count_, cores_total_num_); @@ -340,7 +369,14 @@ int AicpuExecutor::shutdown_aicore(Runtime* runtime, int thread_idx, const int* int core_id = cur_thread_cores[i]; Handshake* hank = &all_hanks[core_id]; LOG_INFO("Thread %d: AICPU hank addr = 0x%lx", thread_idx, (uint64_t)hank); - hank->control = 1; + + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + if (reg_addr != 0) { + write_task_id_to_aicore(reg_addr, AICORE_EXIT_SIGNAL); + disable_aicore_register(reg_addr); + } else { + LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id); + } } LOG_INFO("Thread %d: 
Shutdown complete", thread_idx); return 0; @@ -377,15 +413,17 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - Handshake* h = &hank[core_id]; - if (h->task_status != 0 || h->task != 0) { + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = read_aicore_status(reg_addr); + + if (status != AICoreStatus::IDLE || executing_task_ids_[core_id] >= 0) { all_cores_idle = false; if (verification_warning_count == 0) { - LOG_WARN("Thread %d: Counter reached %d/%d but core %d still has work (status=%d, task=%p)", + LOG_WARN("Thread %d: Counter reached %d/%d but core %d still has work (COND=%d, task_id=%d)", thread_idx, completed_tasks_.load(std::memory_order_acquire), task_count, - core_id, h->task_status, (void*)h->task); + core_id, static_cast(status), executing_task_ids_[core_id]); } break; } @@ -417,15 +455,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const // Phase 1: Process completed tasks on my managed cores for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - Handshake* h = &hank[core_id]; - - // Core finished a task (idle + task not null) - if (h->task_status == 0 && h->task != 0) { - // Get completed task pointer before any buffer operations - Task* task = reinterpret_cast(h->task); - int completed_task_id = task->task_id; - - // Write AICPU dispatch/finish timestamps into the PerfRecord + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = read_aicore_status(reg_addr); + if (status == AICoreStatus::IDLE && executing_task_ids_[core_id] >= 0) { + int task_id = executing_task_ids_[core_id]; + int completed_task_id = task_id; + Handshake* h = &hank[core_id]; if (profiling_enabled) { uint64_t finish_ts = get_sys_cnt_aicpu(); PerfBuffer* perf_buf = (PerfBuffer*)h->perf_records_addr; @@ -445,10 +480,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int 
thread_idx, const if (profiling_enabled && h->perf_buffer_status == 1) { switch_perf_buffer(&runtime, core_id, thread_idx); } - // Clear the task pointer - h->task = 0; - int task_id = completed_task_id; + Task* task = runtime.get_task(task_id); LOG_INFO("Thread %d: Core %d completed task %d", thread_idx, core_id, task_id); @@ -484,10 +517,12 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const } } - // Update counters + // Update counters and clear task tracking cur_thread_tasks_in_flight--; cur_thread_completed++; made_progress = true; + // Clear task_id + executing_task_ids_[core_id] = -1; completed_tasks_.fetch_add(1, std::memory_order_release); } } @@ -497,10 +532,11 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const // Phase 2: Dispatch new tasks from matching ready queue to idle cores for (int i = 0; i < core_num; i++) { int core_id = cur_thread_cores[i]; - Handshake* h = &hank[core_id]; + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = read_aicore_status(reg_addr); - // Core is idle and available (idle + task is null) - if (h->task_status == 0 && h->task == 0) { + if (status == AICoreStatus::IDLE && executing_task_ids_[core_id] == -1) { + Handshake* h = &hank[core_id]; // Dispatch from matching queue based on core type if (h->core_type == CoreType::AIC) { // AIC core if (ready_count_aic_.load(std::memory_order_acquire) > 0) { @@ -511,16 +547,19 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const int task_id = ready_queue_aic_[ready_queue_aic_head_]; ready_queue_aic_head_ = (ready_queue_aic_head_ + 1) % RUNTIME_MAX_TASKS; ready_count_aic_.fetch_sub(1, std::memory_order_release); - Task* task = runtime.get_task(task_id); LOG_INFO("Thread %d: Dispatching AIC task %d to core %d (head=%d)", thread_idx, task_id, core_id, ready_queue_aic_head_); - h->task = reinterpret_cast(task); + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + + // Write 
task_id+1 to register + write_task_id_to_aicore(reg_addr, static_cast(task_id + 1)); + if (runtime.enable_profiling) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } - h->task_status = 1; // Mark as busy + executing_task_ids_[core_id] = task_id; cur_thread_tasks_in_flight++; made_progress = true; } @@ -534,16 +573,19 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const int task_id = ready_queue_aiv_[ready_queue_aiv_head_]; ready_queue_aiv_head_ = (ready_queue_aiv_head_ + 1) % RUNTIME_MAX_TASKS; ready_count_aiv_.fetch_sub(1, std::memory_order_release); - Task* task = runtime.get_task(task_id); LOG_INFO("Thread %d: Dispatching AIV task %d to core %d (head=%d)", thread_idx, task_id, core_id, ready_queue_aiv_head_); - h->task = reinterpret_cast(task); + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + + // Write task_id+1 to register + write_task_id_to_aicore(reg_addr, static_cast(task_id + 1)); + if (runtime.enable_profiling) { dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); } - h->task_status = 1; // Mark as busy + executing_task_ids_[core_id] = task_id; cur_thread_tasks_in_flight++; made_progress = true; } @@ -656,7 +698,6 @@ void AicpuExecutor::diagnose_stuck_state(Runtime& runtime, int thread_idx, int busy_cores = 0; int idle_cores = 0; - int anomaly_cores = 0; LOG_ERROR("Core Status:"); for (int i = 0; i < core_num; i++) { @@ -665,24 +706,30 @@ void AicpuExecutor::diagnose_stuck_state(Runtime& runtime, int thread_idx, const char* core_type_str = core_type_to_string(h->core_type); - if (h->task != 0) { - Task* task = reinterpret_cast(h->task); + uint64_t reg_addr = core_id_to_reg_addr_[core_id]; + AICoreStatus status = read_aicore_status(reg_addr); + + if (status != AICoreStatus::IDLE) { busy_cores++; - LOG_ERROR(" Core %d [%s, BUSY]: task_id=%d, func_id=%d, fanin=%d, fanout=%d", - core_id, core_type_str, - task->task_id, task->func_id, - task->fanin.load(std::memory_order_acquire), - task->fanout_count); - } else if 
(h->task_status != 0) { - anomaly_cores++; - LOG_ERROR(" Core %d [%s, ANOMALY]: status=BUSY but task=NULL", core_id, core_type_str); + int task_id = executing_task_ids_[core_id]; + if (task_id >= 0) { + Task* task = runtime.get_task(task_id); + LOG_ERROR(" Core %d [%s, BUSY]: COND=%d, task_id=%d, func_id=%d, fanin=%d, fanout=%d", + core_id, core_type_str, static_cast(status), + task->task_id, task->func_id, + task->fanin.load(std::memory_order_acquire), + task->fanout_count); + } else { + LOG_ERROR(" Core %d [%s, BUSY]: COND=%d but task_id not tracked", + core_id, core_type_str, static_cast(status)); + } } else { idle_cores++; } } - LOG_ERROR("Summary: %d busy, %d idle, %d anomaly", busy_cores, idle_cores, anomaly_cores); + LOG_ERROR("Summary: %d busy, %d idle", busy_cores, idle_cores); // Diagnose deadlock vs livelock if (busy_cores == 0 && aic_ready == 0 && aiv_ready == 0 && completed < total) { diff --git a/src/runtime/host_build_graph/runtime/runtime.h b/src/runtime/host_build_graph/runtime/runtime.h index 919cada9..cfefabd8 100644 --- a/src/runtime/host_build_graph/runtime/runtime.h +++ b/src/runtime/host_build_graph/runtime/runtime.h @@ -95,16 +95,20 @@ * - task_status: Written by both (AICPU=1 on dispatch, AICore=0 on completion) * - control: Written by AICPU, read by AICore (0 = continue, 1 = quit) * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV) + * - perf_records_addr: Written by AICPU, read by AICore (performance records address) + * - perf_buffer_status: Written by both (AICPU=1 on buffer full, AICore=0 on buffer empty) + * - physical_core_id: Written by AICPU, read by AICore (physical core ID) */ struct Handshake { - volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready - volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready - volatile uint64_t task; // Task pointer: 0=no task, non-zero=Task* address - volatile int32_t task_status; // Task execution status: 0=idle, 
1=busy - volatile int32_t control; // Control signal: 0=execute, 1=quit - volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV - volatile uint64_t perf_records_addr; // Performance records address - volatile uint32_t perf_buffer_status; // 0 = not full, 1 == full + volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready + volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready + volatile uint64_t task; // Task pointer: 0=no task, non-zero=Task* address + volatile int32_t task_status; // Task execution status: 0=idle, 1=busy + volatile int32_t control; // Control signal: 0=execute, 1=quit + volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV + volatile uint64_t perf_records_addr; // Performance records address + volatile uint32_t perf_buffer_status; // 0 = not full, 1 = full + volatile uint32_t physical_core_id; // Physical core ID } __attribute__((aligned(64))); /** @@ -182,12 +186,17 @@ class Runtime { // Execution parameters for AICPU scheduling int sche_cpu_num; // Number of AICPU threads for scheduling + // Register-based task dispatch + uint64_t regs{0}; // Device memory pointer to register address array + // Profiling support bool enable_profiling; // Enable profiling flag uint64_t perf_data_base; // Performance data shared memory base address (device-side) -private: + // Task storage Task tasks[RUNTIME_MAX_TASKS]; // Fixed-size task array + +private: int next_task_id; // Next available task ID // Initial ready tasks (computed once, read-only after) diff --git a/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 91f0e393..09a0c599 100644 --- a/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -111,8 +111,9 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ void* * @param 
runtime Pointer to Runtime in global memory * @param block_idx Block index (core ID) * @param core_type Core type (AIC or AIV) + * @param physical_core_id Physical core ID from hardware */ -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type) { +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id) { __gm__ Handshake* my_hank = (__gm__ Handshake*)(&runtime->workers[block_idx]); // Phase 1: Wait for AICPU initialization signal diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 20e8183b..72f90198 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -136,6 +136,9 @@ class Runtime { // Execution parameters for AICPU scheduling int sche_cpu_num; // Number of AICPU threads for scheduling + // Register-based task dispatch (unused in tensormap_and_ringbuffer, but required by platform) + uint64_t regs{0}; // Device memory pointer to register address array + // PTO2 integration: kernel_id -> GM function_bin_addr mapping // NOTE: Made public for direct access from aicore code uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];