Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions src/platform/a2a3/aicore/inner_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
#ifndef PLATFORM_A2A3_AICORE_INNER_KERNEL_H_
#define PLATFORM_A2A3_AICORE_INNER_KERNEL_H_

#include <cstdint>
#include "aicpu/aicpu_regs.h"

// AICore function attribute for CANN compiler
#ifndef __aicore__
#define __aicore__ [aicore]
Expand All @@ -17,4 +20,30 @@
// dcci (Data Cache Clean and Invalidate) is provided by CANN headers
// No need to define it here - it's a hardware instruction

/**
* Read task_id from DATA_MAIN_BASE register
*
* Return values: 0=idle, AICORE_EXIT_SIGNAL=shutdown, other=task_id+1
*/
__aicore__ inline uint32_t read_task_id_by_reg() {
uint32_t task_id;
// MOV reads from AICore SPR, bypasses cache
__asm__ volatile("MOV %0, DATA_MAIN_BASE\n" : "=l"(task_id));
return task_id;
}

/**
* Set AICore status to BUSY via COND register
*/
__aicore__ inline void set_aicore_busy() {
set_cond(AICoreStatus::BUSY);
}

/**
* Set AICore status to IDLE via COND register
*/
__aicore__ inline void set_aicore_idle() {
set_cond(AICoreStatus::IDLE);
}

#endif // PLATFORM_A2A3_AICORE_INNER_KERNEL_H_
5 changes: 3 additions & 2 deletions src/platform/a2a3/aicore/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Runtime;
[[block_local]] int block_idx;
[[block_local]] CoreType core_type;

extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type);
extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id);

/**
* Kernel entry point with control loop
Expand All @@ -49,5 +49,6 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ Runtime
block_idx = get_block_idx();
core_type = CoreType::AIC;
#endif
aicore_execute(runtime, block_idx, core_type);
uint32_t physical_core_id = static_cast<uint32_t>(get_coreid()) & AICORE_COREID_MASK;
aicore_execute(runtime, block_idx, core_type, physical_core_id);
}
40 changes: 40 additions & 0 deletions src/platform/a2a3/aicpu/aicpu_regs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/**
* @file aicpu_regs.cpp
* @brief AICPU-side AICore register access implementation
*/

#include "aicpu/aicpu_regs.h"

void enable_aicore_register(uint64_t reg_base_addr) {
volatile uint32_t* reg_enable =
reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_FAST_PATH_ENABLE_OFFSET);
*reg_enable = REG_SPR_FAST_PATH_OPEN;
__sync_synchronize();

// Clear stale task_id from previous kernel runs
volatile uint32_t* reg_task =
reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_DATA_MAIN_BASE_OFFSET);
*reg_task = 0;
}

void disable_aicore_register(uint64_t reg_base_addr) {
volatile uint32_t* reg_enable =
reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_FAST_PATH_ENABLE_OFFSET);
*reg_enable = REG_SPR_FAST_PATH_CLOSE;
__sync_synchronize();
}

void write_task_id_to_aicore(uint64_t reg_base_addr, uint32_t task_id) {
volatile uint32_t* reg_task =
reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_DATA_MAIN_BASE_OFFSET);
*reg_task = task_id;
__sync_synchronize();
}

AICoreStatus read_aicore_status(uint64_t reg_base_addr) {
volatile uint32_t* reg_cond =
reinterpret_cast<volatile uint32_t*>(reg_base_addr + REG_SPR_COND_OFFSET);
__sync_synchronize();
uint32_t status_val = *reg_cond;
return static_cast<AICoreStatus>(status_val);
}
1 change: 1 addition & 0 deletions src/platform/a2a3/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/performance_collector.cpp"
Expand Down
8 changes: 8 additions & 0 deletions src/platform/a2a3/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

// Include HAL constants from CANN (header only, library loaded dynamically)
#include "ascend_hal.h"
#include "host/host_regs.h" // Register address retrieval

// =============================================================================
// Lazy-loaded HAL (ascend_hal) for profiling host-register only
Expand Down Expand Up @@ -324,6 +325,13 @@ int DeviceRunner::run(Runtime& runtime,
worker_count_ = num_aicore; // Store for print_handshake_results in destructor
runtime.sche_cpu_num = launch_aicpu_num;

// Get AICore register addresses for register-based task dispatch
rc = init_aicore_register_addresses(&runtime.regs, static_cast<uint64_t>(device_id), mem_alloc_);
if (rc != 0) {
LOG_ERROR("init_aicore_register_addresses failed: %d", rc);
return rc;
}

// Calculate number of AIC cores (1/3 of total)
int num_aic = block_dim; // Round up for 1/3

Expand Down
178 changes: 178 additions & 0 deletions src/platform/a2a3/host/host_regs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/**
* @file host_regs.cpp
* @brief Host-side AICore register address retrieval implementation
*/

#include "host/host_regs.h"
#include "host/memory_allocator.h"
#include "common/unified_log.h"
#include "common/platform_config.h"
#include "runtime/rt.h"
#include "ascend_hal.h" // CANN HAL API definitions (MODULE_TYPE_AICORE, INFO_TYPE_OCCUPY, etc.)
#include <dlfcn.h>
#include <iostream>

/**
* Query valid AICore cores via HAL API
*/
static bool get_pg_mask(uint64_t& valid, int64_t device_id) {
uint64_t aicore_bitmap[PLATFORM_AICORE_BITMAP_LEN] = {0};
int32_t size_n = static_cast<int32_t>(sizeof(uint64_t)) * PLATFORM_AICORE_BITMAP_LEN;

auto halFuncDevInfo =
(int (*)(uint64_t deviceId, int32_t moduleType, int32_t infoType, void* buf, int32_t* size))dlsym(
nullptr, "halGetDeviceInfoByBuff");

if (halFuncDevInfo == nullptr) {
LOG_WARN("halGetDeviceInfoByBuff not found, assuming all cores valid");
return false;
}

auto ret = halFuncDevInfo(static_cast<uint32_t>(device_id),
MODULE_TYPE_AICORE,
INFO_TYPE_OCCUPY,
reinterpret_cast<void*>(&aicore_bitmap[0]),
&size_n);

if (ret != 0) {
LOG_ERROR("halGetDeviceInfoByBuff failed with rc=%d", ret);
return false;
}

valid = aicore_bitmap[0];
return true;
}

/**
* Retrieve AICore register base addresses via HAL API
*/
static int get_aicore_reg_info(std::vector<int64_t>& aic, std::vector<int64_t>& aiv,
const int& addr_type, int64_t device_id) {
uint64_t valid = 0;
if (!get_pg_mask(valid, device_id)) {
// If can't get mask, assume all cores valid
valid = 0xFFFFFFFF;
LOG_WARN("Using default valid mask 0xFFFFFFFF");
}

uint64_t core_stride = 8 * 1024 * 1024; // 8M
uint64_t sub_core_stride = 0x100000ULL;

auto is_valid = [&valid](int id) {
const uint64_t mask = (1ULL << 25) - 1;
return ((static_cast<uint64_t>(valid) ^ mask) & (1ULL << id)) == 0;
};

auto halFunc =
(int (*)(int type, void* paramValue, size_t paramValueSize, void* outValue, size_t* outSizeRet))dlsym(
nullptr, "halMemCtl");

if (halFunc == nullptr) {
LOG_ERROR("halMemCtl not found in symbol table");
return -1;
}

struct AddrMapInPara in_map_para;
struct AddrMapOutPara out_map_para;
in_map_para.devid = device_id;
in_map_para.addr_type = addr_type;

auto ret = halFunc(0,
reinterpret_cast<void*>(&in_map_para),
sizeof(struct AddrMapInPara),
reinterpret_cast<void*>(&out_map_para),
nullptr);

if (ret != 0) {
LOG_ERROR("halMemCtl failed with rc=%d", ret);
return ret;
}

LOG_INFO("Register base: ptr=0x%llx, len=0x%llx", out_map_para.ptr, out_map_para.len);

// Iterate over all cores and subcores
for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) {
for (uint32_t j = 0; j < PLATFORM_SUB_CORES_PER_AICORE; j++) {
uint64_t vaddr = 0UL;
if (is_valid(i)) {
vaddr = out_map_para.ptr + (i * core_stride + j * sub_core_stride);
}
if (j == 0) {
aic.push_back(vaddr);
} else {
aiv.push_back(vaddr);
}
}
}

return 0;
}

void get_aicore_regs(std::vector<int64_t>& regs, uint64_t device_id) {
std::vector<int64_t> aiv;
std::vector<int64_t> aic;

int rt = get_aicore_reg_info(aic, aiv, ADDR_MAP_TYPE_REG_AIC_CTRL, device_id);

if (rt != 0) {
LOG_ERROR("get_aicore_reg_info failed, using placeholder addresses");
// Fallback: generate placeholder addresses
for (int i = 0; i < 25; i++) {
aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000)); // 8M stride
aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000);
aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000);
}
}

// AIC cores first, then AIV cores
regs.insert(regs.end(), aic.begin(), aic.end());
regs.insert(regs.end(), aiv.begin(), aiv.end());

LOG_INFO("get_aicore_regs: Retrieved %zu AIC and %zu AIV register addresses",
aic.size(), aiv.size());
}

int init_aicore_register_addresses(
uint64_t* runtime_regs_ptr,
uint64_t device_id,
MemoryAllocator& allocator) {

if (runtime_regs_ptr == nullptr) {
LOG_ERROR("init_aicore_register_addresses: Invalid parameters");
return -1;
}

LOG_INFO("Retrieving and allocating AICore register addresses...");

// Step 1: Get register addresses from HAL
std::vector<int64_t> host_regs;
get_aicore_regs(host_regs, device_id);

if (host_regs.empty()) {
LOG_ERROR("Failed to get AICore register addresses");
return -1;
}

// Step 2: Allocate device memory for register address array
size_t regs_size = host_regs.size() * sizeof(int64_t);
void* reg_ptr = allocator.alloc(regs_size);
if (reg_ptr == nullptr) {
LOG_ERROR("Failed to allocate device memory for register addresses");
return -1;
}

// Step 3: Copy register addresses to device memory
int ret = rtMemcpy(reg_ptr, regs_size, host_regs.data(), regs_size, RT_MEMCPY_HOST_TO_DEVICE);
if (ret != 0) {
LOG_ERROR("Failed to copy register addresses to device (rc=%d)", ret);
return -1;
}

// Step 4: Store device pointer in runtime.regs
*runtime_regs_ptr = reinterpret_cast<uint64_t>(reg_ptr);

LOG_INFO("Successfully initialized register addresses: %zu addresses at device 0x%llx",
host_regs.size(), *runtime_regs_ptr);

return 0;
}
1 change: 1 addition & 0 deletions src/platform/a2a3sim/aicore/inner_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

// Cache coherency constants (no-op in simulation)
#define ENTIRE_DATA_CACHE 0
#define SINGLE_CACHE_LINE 0
#define CACHELINE_OUT 0

// =============================================================================
Expand Down
45 changes: 45 additions & 0 deletions src/platform/include/aicpu/aicpu_regs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/**
* @file aicpu_regs.h
* @brief AICPU-side AICore register access via MMIO
*
* Provides MMIO-based register writes/reads for task dispatch and status polling.
* Memory barriers ensure cross-core visibility.
*/

#ifndef PLATFORM_A2A3_AICPU_AICPU_REGS_H_
#define PLATFORM_A2A3_AICPU_AICPU_REGS_H_

#include <cstdint>
#include "common/platform_config.h"

/**
* AICore execution status (read from COND register)
*/
enum class AICoreStatus : uint32_t {
IDLE = 0,
BUSY = 1,
};

/**
* Enable register-based communication (call during handshake)
*/
void enable_aicore_register(uint64_t reg_base_addr);

/**
* Disable register-based communication (call during shutdown)
*/
void disable_aicore_register(uint64_t reg_base_addr);

/**
* Write task_id to AICore's DATA_MAIN_BASE register
*
* Special values: 0=idle, AICORE_EXIT_SIGNAL=shutdown
*/
void write_task_id_to_aicore(uint64_t reg_base_addr, uint32_t task_id);

/**
* Read AICore status from COND register
*/
AICoreStatus read_aicore_status(uint64_t reg_base_addr);

#endif // PLATFORM_A2A3_AICPU_AICPU_REGS_H_
5 changes: 0 additions & 5 deletions src/platform/include/common/perf_profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,6 @@
#define RUNTIME_MAX_FANOUT 512
#endif

// Maximum cores that can be profiled simultaneously
#ifndef PLATFORM_MAX_CORES
#define PLATFORM_MAX_CORES 72 // 24 blocks × 3 cores/block
#endif

// =============================================================================
// Buffer Status Enumeration
// =============================================================================
Expand Down
Loading
Loading