Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ constexpr int GRID_K = 4;
constexpr int GRID_N = 4;
constexpr int BATCH = 1;

constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float);
constexpr uint64_t TILE_BYTES = TILE * TILE * sizeof(float);
constexpr int NUM_P_BUFFERS = BATCH * GRID_M * GRID_N;

constexpr int DEV_A = 0;
Expand Down Expand Up @@ -82,9 +82,9 @@ extern "C" int orchestration(Runtime* runtime) {
for (int n_idx = 0; n_idx < GRID_N; n_idx++) {
for (int k_idx = 0; k_idx < GRID_K; k_idx++) {
// Calculate tile offsets
size_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES;
size_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES;
size_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES;
uint64_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES;
uint64_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES;
uint64_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES;

int c_tile_idx = batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ extern "C" int orchestration(Runtime* runtime) {

// Allocate intermediate tensors on device (HBM, accessible by AIV cores).
// Note: malloc() on AICPU returns AICPU-local memory which AIV cores cannot access.
size_t bytes = static_cast<size_t>(size) * sizeof(float);
uint64_t bytes = static_cast<uint64_t>(size) * sizeof(float);
void* dev_c = api.device_malloc(bytes);
void* dev_d = api.device_malloc(bytes);
void* dev_e = api.device_malloc(bytes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ constexpr int GRID_K = 4;
constexpr int GRID_N = 4;
constexpr int BATCH = 1;

constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float);
constexpr uint64_t TILE_BYTES = TILE * TILE * sizeof(float);

int build_bgemm_graph(Runtime* runtime, uint64_t* args, int arg_count) {
if (arg_count < 7) {
Expand All @@ -41,9 +41,9 @@ int build_bgemm_graph(Runtime* runtime, uint64_t* args, int arg_count) {
void* host_A = reinterpret_cast<void*>(args[0]);
void* host_B = reinterpret_cast<void*>(args[1]);
void* host_C = reinterpret_cast<void*>(args[2]);
size_t size_A = static_cast<size_t>(args[3]);
size_t size_B = static_cast<size_t>(args[4]);
size_t size_C = static_cast<size_t>(args[5]);
uint64_t size_A = static_cast<uint64_t>(args[3]);
uint64_t size_B = static_cast<uint64_t>(args[4]);
uint64_t size_C = static_cast<uint64_t>(args[5]);

std::cout << "\n=== build_bgemm_graph ===" << '\n';
std::cout << "Grid: " << GRID_M << " x " << GRID_K << " x " << GRID_N << '\n';
Expand Down Expand Up @@ -94,9 +94,9 @@ int build_bgemm_graph(Runtime* runtime, uint64_t* args, int arg_count) {
for (int n_idx = 0; n_idx < GRID_N; n_idx++) {
for (int k_idx = 0; k_idx < GRID_K; k_idx++) {
// Calculate tile offsets
size_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES;
size_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES;
size_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES;
uint64_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES;
uint64_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES;
uint64_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES;

int c_tile_idx = batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ int build_matmul_graph(Runtime* runtime, uint64_t* args, int arg_count) {
void* host_w1 = reinterpret_cast<void*>(args[1]);
void* host_w2 = reinterpret_cast<void*>(args[2]);
void* host_f = reinterpret_cast<void*>(args[3]);
size_t size_a = static_cast<size_t>(args[4]);
size_t size_w1 = static_cast<size_t>(args[5]);
size_t size_w2 = static_cast<size_t>(args[6]);
size_t size_f = static_cast<size_t>(args[7]);
uint64_t size_a = static_cast<uint64_t>(args[4]);
uint64_t size_w1 = static_cast<uint64_t>(args[5]);
uint64_t size_w2 = static_cast<uint64_t>(args[6]);
uint64_t size_f = static_cast<uint64_t>(args[7]);
int SIZE = static_cast<int>(args[8]);

std::cout << "\n=== build_matmul_graph: Creating Task Runtime ===" << '\n';
Expand Down Expand Up @@ -92,8 +92,8 @@ int build_matmul_graph(Runtime* runtime, uint64_t* args, int arg_count) {
// Allocate intermediate tensors (b, c, d)
// dev_b is half precision (output of log_sqrt kernel, input to matmul)
// dev_c, dev_d are float precision (output of matmul kernels)
size_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes
size_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes
uint64_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes
uint64_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes
void* dev_b = runtime->host_api.device_malloc(BYTES_HALF); // sqrt(log(A)) - half output
void* dev_c = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W1 - float output
void* dev_d = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W2 - float output
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count)
void* host_out = reinterpret_cast<void*>(args[5]);
int64_t* host_config = reinterpret_cast<int64_t*>(args[6]);

size_t query_size = static_cast<size_t>(args[7]);
size_t key_cache_size = static_cast<size_t>(args[8]);
size_t value_cache_size = static_cast<size_t>(args[9]);
size_t block_table_size = static_cast<size_t>(args[10]);
size_t context_lens_size = static_cast<size_t>(args[11]);
size_t out_size = static_cast<size_t>(args[12]);
size_t config_size = static_cast<size_t>(args[13]);
uint64_t query_size = static_cast<uint64_t>(args[7]);
uint64_t key_cache_size = static_cast<uint64_t>(args[8]);
uint64_t value_cache_size = static_cast<uint64_t>(args[9]);
uint64_t block_table_size = static_cast<uint64_t>(args[10]);
uint64_t context_lens_size = static_cast<uint64_t>(args[11]);
uint64_t out_size = static_cast<uint64_t>(args[12]);
uint64_t config_size = static_cast<uint64_t>(args[13]);

int batch = static_cast<int>(host_config[0]);
int num_heads = static_cast<int>(host_config[1]);
Expand Down Expand Up @@ -79,11 +79,11 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count)
runtime->record_tensor_pair(host_out, dev_out, out_size);

// Buffer sizes depend on q_tile_size and block_size
size_t sij_size = static_cast<size_t>(q_tile_size) * block_size * sizeof(float);
size_t pij_size = static_cast<size_t>(q_tile_size) * block_size * sizeof(uint16_t);
size_t mij_size = static_cast<size_t>(q_tile_size) * sizeof(float);
size_t lij_size = mij_size;
size_t oi_new_size = static_cast<size_t>(q_tile_size) * head_dim * sizeof(float);
uint64_t sij_size = static_cast<uint64_t>(q_tile_size) * block_size * sizeof(float);
uint64_t pij_size = static_cast<uint64_t>(q_tile_size) * block_size * sizeof(uint16_t);
uint64_t mij_size = static_cast<uint64_t>(q_tile_size) * sizeof(float);
uint64_t lij_size = mij_size;
uint64_t oi_new_size = static_cast<uint64_t>(q_tile_size) * head_dim * sizeof(float);

// Per-batch-per-block intermediate buffers
int total_buffers = batch * max_num_blocks;
Expand All @@ -103,9 +103,9 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count)

// Per-(batch, head_tile) accumulators
int total_accums = batch * num_head_tiles;
size_t mi_size = static_cast<size_t>(q_tile_size) * sizeof(float);
size_t li_size = mi_size;
size_t oi_size = static_cast<size_t>(q_tile_size) * head_dim * sizeof(float);
uint64_t mi_size = static_cast<uint64_t>(q_tile_size) * sizeof(float);
uint64_t li_size = mi_size;
uint64_t oi_size = static_cast<uint64_t>(q_tile_size) * head_dim * sizeof(float);

void** dev_mi_arr = new void*[total_accums];
void** dev_li_arr = new void*[total_accums];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ int build_example_graph(Runtime* runtime, uint64_t* args, int arg_count) {
void* host_a = reinterpret_cast<void*>(args[0]);
void* host_b = reinterpret_cast<void*>(args[1]);
void* host_f = reinterpret_cast<void*>(args[2]);
size_t size_a = static_cast<size_t>(args[3]);
size_t size_b = static_cast<size_t>(args[4]);
size_t size_f = static_cast<size_t>(args[5]);
uint64_t size_a = static_cast<uint64_t>(args[3]);
uint64_t size_b = static_cast<uint64_t>(args[4]);
uint64_t size_f = static_cast<uint64_t>(args[5]);
int SIZE = static_cast<int>(args[6]);

std::cout << "\n=== build_example_graph: Creating Task Runtime ===" << '\n';
Expand Down Expand Up @@ -70,7 +70,7 @@ int build_example_graph(Runtime* runtime, uint64_t* args, int arg_count) {
std::cout << "Tensor f (output): " << size_f << " bytes allocated\n";

// Allocate intermediate tensors (c, d, e)
size_t BYTES = SIZE * sizeof(float);
uint64_t BYTES = SIZE * sizeof(float);
void* dev_c = runtime->host_api.device_malloc(BYTES);
void* dev_d = runtime->host_api.device_malloc(BYTES);
void* dev_e = runtime->host_api.device_malloc(BYTES);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) {
void* dev_A = (void*)(uintptr_t)args[ARG_PTR_A];
void* dev_B = (void*)(uintptr_t)args[ARG_PTR_B];
void* dev_C = (void*)(uintptr_t)args[ARG_PTR_C];
size_t size_A = (size_t)args[ARG_SIZE_A];
size_t size_B = (size_t)args[ARG_SIZE_B];
size_t size_C = (size_t)args[ARG_SIZE_C];
uint64_t size_A = (uint64_t)args[ARG_SIZE_A];
uint64_t size_B = (uint64_t)args[ARG_SIZE_B];
uint64_t size_C = (uint64_t)args[ARG_SIZE_C];

printf("[bgemm_orch] Grid: %dx%dx%d, Batch: %d, Tile: %d\n",
GRID_M, GRID_K, GRID_N, BATCH, TILE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij,
using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;

TileVecMxN sijTile;
TileSijDyn sijDynTile(static_cast<size_t>(valid_len));
TileSijDyn sijDynTile(valid_len);
TileSijPad sijPadTile;
TileVecMxN pijTile;
TileVecMxN tmpTile;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) {
int64_t* host_config = (int64_t*)(uintptr_t)args[6];

// Extract sizes (next 7 args after pointers)
size_t query_size = (size_t)args[7];
size_t key_cache_size = (size_t)args[8];
size_t value_cache_size = (size_t)args[9];
uint64_t query_size = (uint64_t)args[7];
uint64_t key_cache_size = (uint64_t)args[8];
uint64_t value_cache_size = (uint64_t)args[9];

// Extract config parameters
uint64_t batch = (uint64_t)(int)host_config[0];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,14 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) {
void* arg_a_ptr = (void*)(uintptr_t)args[ARG_PTR_A];
void* arg_b_ptr = (void*)(uintptr_t)args[ARG_PTR_B];
void* arg_f_ptr = (void*)(uintptr_t)args[ARG_PTR_F];
size_t size_a = (size_t)args[ARG_SIZE_A];
size_t size_b = (size_t)args[ARG_SIZE_B];
size_t size_f = (size_t)args[ARG_SIZE_F];
uint64_t size_a = (uint64_t)args[ARG_SIZE_A];
uint64_t size_b = (uint64_t)args[ARG_SIZE_B];
uint64_t size_f = (uint64_t)args[ARG_SIZE_F];
int SIZE = (int)(args[ARG_SIZE] & 0x7FFFFFFF);

printf("===============SIZE=%d\n", SIZE);

size_t BYTES = (size_t)SIZE * sizeof(float);
uint64_t BYTES = (uint64_t)SIZE * sizeof(float);

Tensor ext_a = make_tensor_external(arg_a_ptr, size_a);
Tensor ext_b = make_tensor_external(arg_b_ptr, size_b);
Expand Down
6 changes: 3 additions & 3 deletions src/platform/a2a3/aicpu/device_malloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ static void resolve_hal_mem_functions() {
g_hal_resolved = true;
}

void* aicpu_device_malloc(size_t size) {
void* aicpu_device_malloc(uint64_t size) {
resolve_hal_mem_functions();

if (g_halMemAlloc == nullptr) {
Expand All @@ -49,9 +49,9 @@ void* aicpu_device_malloc(size_t size) {
// bit14~16: phy mem type (MEM_TYPE_HBM=0x1 << 14)
constexpr unsigned long long MEM_TYPE_HBM = 0x1ULL << 14;
unsigned long long flag = MEM_TYPE_HBM;
int rc = g_halMemAlloc(&ptr, static_cast<unsigned long long>(size), flag);
int rc = g_halMemAlloc(&ptr, size, flag);
if (rc != 0 || ptr == nullptr) {
DEV_ERROR("halMemAlloc failed: rc=%d size=%zu flag=0x%llx", rc, size, flag);
DEV_ERROR("halMemAlloc failed: rc=%d size=%llu flag=0x%llx", rc, size, flag);
return nullptr;
}
return ptr;
Expand Down
36 changes: 18 additions & 18 deletions src/platform/a2a3/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
namespace {
void* g_hal_handle = nullptr;

using HalHostRegisterFn = int (*)(void* dev_ptr, size_t size, unsigned int flags, int device_id, void** host_ptr);
using HalHostRegisterFn = int (*)(void* dev_ptr, uint64_t size, unsigned int flags, int device_id, void** host_ptr);
using HalHostUnregisterFn = int (*)(void* host_ptr, int device_id);

int load_hal_if_needed() {
Expand Down Expand Up @@ -57,8 +57,8 @@ int KernelArgsHelper::init_device_args(const DeviceArgs& host_device_args, Memor

// Allocate device memory for device_args
if (args.device_args == nullptr) {
uint64_t device_args_size = sizeof(DeviceArgs);
void* device_args_dev = allocator_->alloc(device_args_size);
uint64_t device_args_size = static_cast<uint64_t>(sizeof(DeviceArgs));
void* device_args_dev = allocator_->alloc(static_cast<size_t>(device_args_size));
if (device_args_dev == nullptr) {
LOG_ERROR("Alloc for device_args failed");
return -1;
Expand Down Expand Up @@ -90,8 +90,8 @@ int KernelArgsHelper::init_runtime_args(const Runtime& host_runtime, MemoryAlloc
allocator_ = &allocator;

if (args.runtime_args == nullptr) {
uint64_t runtime_size = sizeof(Runtime);
void* runtime_dev = allocator_->alloc(runtime_size);
uint64_t runtime_size = static_cast<uint64_t>(sizeof(Runtime));
void* runtime_dev = allocator_->alloc(static_cast<size_t>(runtime_size));
if (runtime_dev == nullptr) {
LOG_ERROR("Alloc for runtime_args failed");
return -1;
Expand Down Expand Up @@ -129,8 +129,8 @@ int AicpuSoInfo::init(const std::vector<uint8_t>& aicpu_so_binary, MemoryAllocat
return -1;
}

size_t file_size = aicpu_so_binary.size();
void* d_aicpu_data = allocator_->alloc(file_size);
uint64_t file_size = static_cast<uint64_t>(aicpu_so_binary.size());
void* d_aicpu_data = allocator_->alloc(static_cast<size_t>(file_size));
if (d_aicpu_data == nullptr) {
LOG_ERROR("Alloc failed for AICPU SO");
return -1;
Expand Down Expand Up @@ -256,19 +256,19 @@ int DeviceRunner::ensure_binaries_loaded(
return 0;
}

void* DeviceRunner::allocate_tensor(size_t bytes) { return mem_alloc_.alloc(bytes); }
// Requests `bytes` bytes from the runner's mem_alloc_ allocator and returns
// the pointer it produces unchanged; no validation is performed here.
void* DeviceRunner::allocate_tensor(uint64_t bytes) {
    void* dev_ptr = mem_alloc_.alloc(bytes);
    return dev_ptr;
}

// Returns `dev_ptr` to the runner's mem_alloc_ allocator.
// A null pointer is ignored, so callers may pass one without checking first.
void DeviceRunner::free_tensor(void* dev_ptr) {
    if (dev_ptr == nullptr) {
        return;
    }
    mem_alloc_.free(dev_ptr);
}

int DeviceRunner::copy_to_device(void* dev_ptr, const void* host_ptr, size_t bytes) {
// Copies `bytes` bytes from `host_ptr` to `dev_ptr` by calling rtMemcpy with
// RT_MEMCPY_HOST_TO_DEVICE. `bytes` is passed for both size arguments.
// Returns rtMemcpy's result as-is; the pointers are not checked here.
int DeviceRunner::copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t bytes) {
    int rc = rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE);
    return rc;
}

int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, size_t bytes) {
// Copies `bytes` bytes from `dev_ptr` to `host_ptr` by calling rtMemcpy with
// RT_MEMCPY_DEVICE_TO_HOST. `bytes` is passed for both size arguments.
// Returns rtMemcpy's result as-is; the pointers are not checked here.
int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t bytes) {
    int rc = rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST);
    return rc;
}

Expand Down Expand Up @@ -436,7 +436,7 @@ void DeviceRunner::print_handshake_results() {

// Allocate temporary buffer to read handshake data from device
std::vector<Handshake> workers(worker_count_);
size_t total_size = sizeof(Handshake) * worker_count_;
uint64_t total_size = sizeof(Handshake) * worker_count_;
rtMemcpy(workers.data(), total_size, kernel_args_.args.runtime_args->workers, total_size, RT_MEMCPY_DEVICE_TO_HOST);

LOG_DEBUG("Handshake results for %d cores:", worker_count_);
Expand Down Expand Up @@ -537,7 +537,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime* runtime) {
return -1;
}

size_t bin_size = aicore_kernel_binary_.size();
uint64_t bin_size = static_cast<uint64_t>(aicore_kernel_binary_.size());
const void* bin_data = aicore_kernel_binary_.data();

rtDevBinary_t binary;
Expand Down Expand Up @@ -579,7 +579,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime* runtime) {
// Kernel Binary Upload (returns device address for caller to store in Runtime)
// =============================================================================

uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size) {
uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, uint64_t bin_size) {
if (bin_data == nullptr || bin_size == 0) {
LOG_ERROR("Invalid kernel binary data");
return 0;
Expand All @@ -602,7 +602,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data

// Allocate device GM memory (size field + binary data)
uint64_t alloc_size = sizeof(uint64_t) + bin_size;
void* gm_addr = mem_alloc_.alloc(alloc_size);
void* gm_addr = mem_alloc_.alloc(static_cast<size_t>(alloc_size));
if (gm_addr == nullptr) {
LOG_ERROR("Failed to allocate device GM memory for kernel func_id=%d", func_id);
return 0;
Expand All @@ -612,7 +612,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data
std::vector<uint8_t> host_buf(alloc_size);
uint64_t* size_ptr = reinterpret_cast<uint64_t*>(host_buf.data());
*size_ptr = bin_size;
std::memcpy(host_buf.data() + sizeof(uint64_t), bin_data, bin_size);
std::memcpy(host_buf.data() + sizeof(uint64_t), bin_data, static_cast<size_t>(bin_size));

// Copy to device
int rc = rtMemcpy(gm_addr, alloc_size, host_buf.data(), alloc_size, RT_MEMCPY_HOST_TO_DEVICE);
Expand All @@ -635,13 +635,13 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data

int DeviceRunner::init_performance_profiling(Runtime& runtime, int num_aicore, int device_id) {
// Define allocation callback (a2a3: use MemoryAllocator)
auto alloc_cb = [](size_t size, void* user_data) -> void* {
auto alloc_cb = [](uint64_t size, void* user_data) -> void* {
auto* allocator = static_cast<MemoryAllocator*>(user_data);
return allocator->alloc(size);
return allocator->alloc(static_cast<size_t>(size));
};

// Define registration callback (a2a3: use halHostRegister for shared memory)
auto register_cb = [](void* dev_ptr, size_t size, int device_id,
auto register_cb = [](void* dev_ptr, uint64_t size, int device_id,
void* user_data, void** host_ptr) -> int {
(void)user_data; // Not needed for registration
if (load_hal_if_needed() != 0) {
Expand Down
Loading
Loading