Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions src/platform/a2a3/aicore/inner_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
#ifndef PLATFORM_A2A3_AICORE_INNER_KERNEL_H_
#define PLATFORM_A2A3_AICORE_INNER_KERNEL_H_

#include <cstdint>
#include "common/platform_config.h"

// AICore function attribute for CANN compiler
#ifndef __aicore__
#define __aicore__ [aicore]
Expand All @@ -17,4 +20,43 @@
// dcci (Data Cache Clean and Invalidate) is provided by CANN headers
// No need to define it here - it's a hardware instruction

// AICoreStatus is defined in aicpu/aicpu_regs.h for AICPU side.
// Redeclare here for AICore side (avoids pulling in AICPU headers).
enum class AICoreStatus : uint32_t {
IDLE = 0,
BUSY = 1,
};

/**
* Read an AICore register via SPR access
*
* @param reg Register identifier
* @return Register value (zero-extended to uint64_t)
*/
__aicore__ inline uint64_t read_reg(RegId reg) {
switch (reg) {
case RegId::DATA_MAIN_BASE: {
uint32_t val;
__asm__ volatile("MOV %0, DATA_MAIN_BASE\n" : "=l"(val));
return static_cast<uint64_t>(val);
}
default: return 0;
}
}

/**
* Write to an AICore register
*
* @param reg Register identifier
* @param value Value to write
*/
__aicore__ inline void write_reg(RegId reg, uint64_t value) {
switch (reg) {
case RegId::COND:
set_cond(static_cast<AICoreStatus>(static_cast<uint32_t>(value)));
break;
default: break;
}
}

#endif // PLATFORM_A2A3_AICORE_INNER_KERNEL_H_
5 changes: 3 additions & 2 deletions src/platform/a2a3/aicore/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Runtime;
[[block_local]] int block_idx;
[[block_local]] CoreType core_type;

extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type);
extern __aicore__ void aicore_execute(__gm__ Runtime* runtime, int block_idx, CoreType core_type, uint32_t physical_core_id);

/**
* Kernel entry point with control loop
Expand All @@ -49,5 +49,6 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ Runtime
block_idx = get_block_idx();
core_type = CoreType::AIC;
#endif
aicore_execute(runtime, block_idx, core_type);
uint32_t physical_core_id = static_cast<uint32_t>(get_coreid()) & AICORE_COREID_MASK;
aicore_execute(runtime, block_idx, core_type, physical_core_id);
}
23 changes: 23 additions & 0 deletions src/platform/a2a3/aicpu/aicpu_regs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
* @file aicpu_regs.cpp
* @brief AICPU-side register access implementation (a2a3 real hardware)
*
* Uses volatile MMIO pointer access with memory barriers for
* cross-core register communication.
*/

#include "aicpu/aicpu_regs.h"

uint64_t read_reg(uint64_t reg_base_addr, RegId reg) {
volatile uint32_t* ptr = reinterpret_cast<volatile uint32_t*>(
reg_base_addr + reg_offset(reg));
__sync_synchronize();
return static_cast<uint64_t>(*ptr);
}

void write_reg(uint64_t reg_base_addr, RegId reg, uint64_t value) {
volatile uint32_t* ptr = reinterpret_cast<volatile uint32_t*>(
reg_base_addr + reg_offset(reg));
*ptr = static_cast<uint32_t>(value);
__sync_synchronize();
}
1 change: 1 addition & 0 deletions src/platform/a2a3/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/memory_allocator.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/pto_runtime_c_api.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/platform_compile_info.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/host_regs.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/performance_collector.cpp"
Expand Down
8 changes: 8 additions & 0 deletions src/platform/a2a3/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

// Include HAL constants from CANN (header only, library loaded dynamically)
#include "ascend_hal.h"
#include "host/host_regs.h" // Register address retrieval

// =============================================================================
// Lazy-loaded HAL (ascend_hal) for profiling host-register only
Expand Down Expand Up @@ -324,6 +325,13 @@ int DeviceRunner::run(Runtime& runtime,
worker_count_ = num_aicore; // Store for print_handshake_results in destructor
runtime.sche_cpu_num = launch_aicpu_num;

// Get AICore register addresses for register-based task dispatch
rc = init_aicore_register_addresses(&runtime.regs, static_cast<uint64_t>(device_id), mem_alloc_);
if (rc != 0) {
LOG_ERROR("init_aicore_register_addresses failed: %d", rc);
return rc;
}

// Calculate number of AIC cores (1/3 of total)
int num_aic = block_dim; // Round up for 1/3

Expand Down
178 changes: 178 additions & 0 deletions src/platform/a2a3/host/host_regs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/**
* @file host_regs.cpp
* @brief Host-side AICore register address retrieval implementation
*/

#include "host/host_regs.h"
#include "host/memory_allocator.h"
#include "common/unified_log.h"
#include "common/platform_config.h"
#include "runtime/rt.h"
#include "ascend_hal.h" // CANN HAL API definitions (MODULE_TYPE_AICORE, INFO_TYPE_OCCUPY, etc.)
#include <dlfcn.h>
#include <iostream>

/**
* Query valid AICore cores via HAL API
*/
static bool get_pg_mask(uint64_t& valid, int64_t device_id) {
uint64_t aicore_bitmap[PLATFORM_AICORE_BITMAP_LEN] = {0};
int32_t size_n = static_cast<int32_t>(sizeof(uint64_t)) * PLATFORM_AICORE_BITMAP_LEN;

auto halFuncDevInfo =
(int (*)(uint64_t deviceId, int32_t moduleType, int32_t infoType, void* buf, int32_t* size))dlsym(
nullptr, "halGetDeviceInfoByBuff");

if (halFuncDevInfo == nullptr) {
LOG_WARN("halGetDeviceInfoByBuff not found, assuming all cores valid");
return false;
}

auto ret = halFuncDevInfo(static_cast<uint32_t>(device_id),
MODULE_TYPE_AICORE,
INFO_TYPE_OCCUPY,
reinterpret_cast<void*>(&aicore_bitmap[0]),
&size_n);

if (ret != 0) {
LOG_ERROR("halGetDeviceInfoByBuff failed with rc=%d", ret);
return false;
}

valid = aicore_bitmap[0];
return true;
}

/**
* Retrieve AICore register base addresses via HAL API
*/
static int get_aicore_reg_info(std::vector<int64_t>& aic, std::vector<int64_t>& aiv,
const int& addr_type, int64_t device_id) {
uint64_t valid = 0;
if (!get_pg_mask(valid, device_id)) {
// If can't get mask, assume all cores valid
valid = 0xFFFFFFFF;
LOG_WARN("Using default valid mask 0xFFFFFFFF");
}

uint64_t core_stride = 8 * 1024 * 1024; // 8M
uint64_t sub_core_stride = 0x100000ULL;

auto is_valid = [&valid](int id) {
const uint64_t mask = (1ULL << 25) - 1;
return ((static_cast<uint64_t>(valid) ^ mask) & (1ULL << id)) == 0;
};

auto halFunc =
(int (*)(int type, void* paramValue, size_t paramValueSize, void* outValue, size_t* outSizeRet))dlsym(
nullptr, "halMemCtl");

if (halFunc == nullptr) {
LOG_ERROR("halMemCtl not found in symbol table");
return -1;
}

struct AddrMapInPara in_map_para;
struct AddrMapOutPara out_map_para;
in_map_para.devid = device_id;
in_map_para.addr_type = addr_type;

auto ret = halFunc(0,
reinterpret_cast<void*>(&in_map_para),
sizeof(struct AddrMapInPara),
reinterpret_cast<void*>(&out_map_para),
nullptr);

if (ret != 0) {
LOG_ERROR("halMemCtl failed with rc=%d", ret);
return ret;
}

LOG_INFO("Register base: ptr=0x%llx, len=0x%llx", out_map_para.ptr, out_map_para.len);

// Iterate over all cores and subcores
for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) {
for (uint32_t j = 0; j < PLATFORM_SUB_CORES_PER_AICORE; j++) {
uint64_t vaddr = 0UL;
if (is_valid(i)) {
vaddr = out_map_para.ptr + (i * core_stride + j * sub_core_stride);
}
if (j == 0) {
aic.push_back(vaddr);
} else {
aiv.push_back(vaddr);
}
}
}

return 0;
}

void get_aicore_regs(std::vector<int64_t>& regs, uint64_t device_id) {
std::vector<int64_t> aiv;
std::vector<int64_t> aic;

int rt = get_aicore_reg_info(aic, aiv, ADDR_MAP_TYPE_REG_AIC_CTRL, device_id);

if (rt != 0) {
LOG_ERROR("get_aicore_reg_info failed, using placeholder addresses");
// Fallback: generate placeholder addresses
for (int i = 0; i < 25; i++) {
aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000)); // 8M stride
aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000);
aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000);
}
}

// AIC cores first, then AIV cores
regs.insert(regs.end(), aic.begin(), aic.end());
regs.insert(regs.end(), aiv.begin(), aiv.end());

LOG_INFO("get_aicore_regs: Retrieved %zu AIC and %zu AIV register addresses",
aic.size(), aiv.size());
}

int init_aicore_register_addresses(
uint64_t* runtime_regs_ptr,
uint64_t device_id,
MemoryAllocator& allocator) {

if (runtime_regs_ptr == nullptr) {
LOG_ERROR("init_aicore_register_addresses: Invalid parameters");
return -1;
}

LOG_INFO("Retrieving and allocating AICore register addresses...");

// Step 1: Get register addresses from HAL
std::vector<int64_t> host_regs;
get_aicore_regs(host_regs, device_id);

if (host_regs.empty()) {
LOG_ERROR("Failed to get AICore register addresses");
return -1;
}

// Step 2: Allocate device memory for register address array
size_t regs_size = host_regs.size() * sizeof(int64_t);
void* reg_ptr = allocator.alloc(regs_size);
if (reg_ptr == nullptr) {
LOG_ERROR("Failed to allocate device memory for register addresses");
return -1;
}

// Step 3: Copy register addresses to device memory
int ret = rtMemcpy(reg_ptr, regs_size, host_regs.data(), regs_size, RT_MEMCPY_HOST_TO_DEVICE);
if (ret != 0) {
LOG_ERROR("Failed to copy register addresses to device (rc=%d)", ret);
return -1;
}

// Step 4: Store device pointer in runtime.regs
*runtime_regs_ptr = reinterpret_cast<uint64_t>(reg_ptr);

LOG_INFO("Successfully initialized register addresses: %zu addresses at device 0x%llx",
host_regs.size(), *runtime_regs_ptr);

return 0;
}
50 changes: 50 additions & 0 deletions src/platform/a2a3sim/aicore/inner_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,13 @@

// Cache coherency constants (no-op in simulation)
#define ENTIRE_DATA_CACHE 0
#define SINGLE_CACHE_LINE 0
#define CACHELINE_OUT 0

// pipe_barrier - memory barrier in simulation (hardware pipeline synchronization)
#define PIPE_ALL 0
#define pipe_barrier(pipe) __sync_synchronize()

// =============================================================================
// System Counter Simulation
// =============================================================================
Expand Down Expand Up @@ -56,4 +61,49 @@ inline uint64_t get_sys_cnt() {
return ticks;
}

// =============================================================================
// Register Access Simulation
// =============================================================================

/**
* Per-thread simulated register base address.
* Set by the kernel wrapper before calling aicore_execute().
* Points to a SIM_REG_BLOCK_SIZE-byte block allocated by DeviceRunner.
*/
extern thread_local volatile uint8_t* g_sim_reg_base;

/**
* AICore execution status (matches aicpu/aicpu_regs.h definition)
*/
enum class AICoreStatus : uint32_t {
IDLE = 0,
BUSY = 1,
};

/**
* Read an AICore register from simulated register memory
*
* @param reg Register identifier
* @return Register value (zero-extended to uint64_t)
*/
inline uint64_t read_reg(RegId reg) {
uint32_t offset = reg_offset(reg);
__sync_synchronize();
return static_cast<uint64_t>(
*reinterpret_cast<volatile uint32_t*>(g_sim_reg_base + offset));
}

/**
* Write to an AICore register in simulated register memory
*
* @param reg Register identifier
* @param value Value to write
*/
inline void write_reg(RegId reg, uint64_t value) {
uint32_t offset = reg_offset(reg);
*reinterpret_cast<volatile uint32_t*>(g_sim_reg_base + offset) =
static_cast<uint32_t>(value);
__sync_synchronize();
}

#endif // PLATFORM_A2A3SIM_AICORE_INNER_KERNEL_H_
Loading
Loading