diff --git a/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp index 496ecef9..30da58af 100644 --- a/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -29,7 +29,7 @@ constexpr int GRID_K = 4; constexpr int GRID_N = 4; constexpr int BATCH = 1; -constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float); +constexpr uint64_t TILE_BYTES = TILE * TILE * sizeof(float); constexpr int NUM_P_BUFFERS = BATCH * GRID_M * GRID_N; constexpr int DEV_A = 0; @@ -82,9 +82,9 @@ extern "C" int orchestration(Runtime* runtime) { for (int n_idx = 0; n_idx < GRID_N; n_idx++) { for (int k_idx = 0; k_idx < GRID_K; k_idx++) { // Calculate tile offsets - size_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES; - size_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES; - size_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES; + uint64_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES; + uint64_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES; + uint64_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES; int c_tile_idx = batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx; diff --git a/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp b/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp index 09c41f8f..900a2371 100644 --- a/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp +++ b/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp @@ -63,7 +63,7 @@ extern "C" int orchestration(Runtime* runtime) { // Allocate intermediate tensors on device (HBM, accessible by AIV cores). 
// Note: malloc() on AICPU returns AICPU-local memory which AIV cores cannot access. - size_t bytes = static_cast(size) * sizeof(float); + uint64_t bytes = static_cast(size) * sizeof(float); void* dev_c = api.device_malloc(bytes); void* dev_d = api.device_malloc(bytes); void* dev_e = api.device_malloc(bytes); diff --git a/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp index 5c6254d5..ae831e2d 100644 --- a/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -30,7 +30,7 @@ constexpr int GRID_K = 4; constexpr int GRID_N = 4; constexpr int BATCH = 1; -constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float); +constexpr uint64_t TILE_BYTES = TILE * TILE * sizeof(float); int build_bgemm_graph(Runtime* runtime, uint64_t* args, int arg_count) { if (arg_count < 7) { @@ -41,9 +41,9 @@ int build_bgemm_graph(Runtime* runtime, uint64_t* args, int arg_count) { void* host_A = reinterpret_cast(args[0]); void* host_B = reinterpret_cast(args[1]); void* host_C = reinterpret_cast(args[2]); - size_t size_A = static_cast(args[3]); - size_t size_B = static_cast(args[4]); - size_t size_C = static_cast(args[5]); + uint64_t size_A = static_cast(args[3]); + uint64_t size_B = static_cast(args[4]); + uint64_t size_C = static_cast(args[5]); std::cout << "\n=== build_bgemm_graph ===" << '\n'; std::cout << "Grid: " << GRID_M << " x " << GRID_K << " x " << GRID_N << '\n'; @@ -94,9 +94,9 @@ int build_bgemm_graph(Runtime* runtime, uint64_t* args, int arg_count) { for (int n_idx = 0; n_idx < GRID_N; n_idx++) { for (int k_idx = 0; k_idx < GRID_K; k_idx++) { // Calculate tile offsets - size_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES; - size_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES; - size_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + 
n_idx) * TILE_BYTES; + uint64_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES; + uint64_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES; + uint64_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES; int c_tile_idx = batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx; diff --git a/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp b/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp index 6c0578e5..041b339c 100644 --- a/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp +++ b/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp @@ -37,10 +37,10 @@ int build_matmul_graph(Runtime* runtime, uint64_t* args, int arg_count) { void* host_w1 = reinterpret_cast(args[1]); void* host_w2 = reinterpret_cast(args[2]); void* host_f = reinterpret_cast(args[3]); - size_t size_a = static_cast(args[4]); - size_t size_w1 = static_cast(args[5]); - size_t size_w2 = static_cast(args[6]); - size_t size_f = static_cast(args[7]); + uint64_t size_a = static_cast(args[4]); + uint64_t size_w1 = static_cast(args[5]); + uint64_t size_w2 = static_cast(args[6]); + uint64_t size_f = static_cast(args[7]); int SIZE = static_cast(args[8]); std::cout << "\n=== build_matmul_graph: Creating Task Runtime ===" << '\n'; @@ -92,8 +92,8 @@ int build_matmul_graph(Runtime* runtime, uint64_t* args, int arg_count) { // Allocate intermediate tensors (b, c, d) // dev_b is half precision (output of log_sqrt kernel, input to matmul) // dev_c, dev_d are float precision (output of matmul kernels) - size_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes - size_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes + uint64_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes + uint64_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes void* dev_b = runtime->host_api.device_malloc(BYTES_HALF); // sqrt(log(A)) - half output void* dev_c = 
runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W1 - float output void* dev_d = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W2 - float output diff --git a/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 45c6ac75..3bd7ef4c 100644 --- a/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -37,13 +37,13 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) void* host_out = reinterpret_cast(args[5]); int64_t* host_config = reinterpret_cast(args[6]); - size_t query_size = static_cast(args[7]); - size_t key_cache_size = static_cast(args[8]); - size_t value_cache_size = static_cast(args[9]); - size_t block_table_size = static_cast(args[10]); - size_t context_lens_size = static_cast(args[11]); - size_t out_size = static_cast(args[12]); - size_t config_size = static_cast(args[13]); + uint64_t query_size = static_cast(args[7]); + uint64_t key_cache_size = static_cast(args[8]); + uint64_t value_cache_size = static_cast(args[9]); + uint64_t block_table_size = static_cast(args[10]); + uint64_t context_lens_size = static_cast(args[11]); + uint64_t out_size = static_cast(args[12]); + uint64_t config_size = static_cast(args[13]); int batch = static_cast(host_config[0]); int num_heads = static_cast(host_config[1]); @@ -79,11 +79,11 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) runtime->record_tensor_pair(host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size - size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); - size_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); - size_t mij_size = static_cast(q_tile_size) * sizeof(float); - size_t lij_size = mij_size; - size_t 
oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); + uint64_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + uint64_t mij_size = static_cast(q_tile_size) * sizeof(float); + uint64_t lij_size = mij_size; + uint64_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); // Per-batch-per-block intermediate buffers int total_buffers = batch * max_num_blocks; @@ -103,9 +103,9 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) // Per-(batch, head_tile) accumulators int total_accums = batch * num_head_tiles; - size_t mi_size = static_cast(q_tile_size) * sizeof(float); - size_t li_size = mi_size; - size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t mi_size = static_cast(q_tile_size) * sizeof(float); + uint64_t li_size = mi_size; + uint64_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); void** dev_mi_arr = new void*[total_accums]; void** dev_li_arr = new void*[total_accums]; diff --git a/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp b/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp index 26ee6b07..6c649e70 100644 --- a/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp +++ b/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp @@ -29,9 +29,9 @@ int build_example_graph(Runtime* runtime, uint64_t* args, int arg_count) { void* host_a = reinterpret_cast(args[0]); void* host_b = reinterpret_cast(args[1]); void* host_f = reinterpret_cast(args[2]); - size_t size_a = static_cast(args[3]); - size_t size_b = static_cast(args[4]); - size_t size_f = static_cast(args[5]); + uint64_t size_a = static_cast(args[3]); + uint64_t size_b = static_cast(args[4]); + uint64_t size_f = static_cast(args[5]); int SIZE = static_cast(args[6]); std::cout << "\n=== 
build_example_graph: Creating Task Runtime ===" << '\n'; @@ -70,7 +70,7 @@ int build_example_graph(Runtime* runtime, uint64_t* args, int arg_count) { std::cout << "Tensor f (output): " << size_f << " bytes allocated\n"; // Allocate intermediate tensors (c, d, e) - size_t BYTES = SIZE * sizeof(float); + uint64_t BYTES = SIZE * sizeof(float); void* dev_c = runtime->host_api.device_malloc(BYTES); void* dev_d = runtime->host_api.device_malloc(BYTES); void* dev_e = runtime->host_api.device_malloc(BYTES); diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp index 70388d1a..5e78aa39 100644 --- a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -69,9 +69,9 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { void* dev_A = (void*)(uintptr_t)args[ARG_PTR_A]; void* dev_B = (void*)(uintptr_t)args[ARG_PTR_B]; void* dev_C = (void*)(uintptr_t)args[ARG_PTR_C]; - size_t size_A = (size_t)args[ARG_SIZE_A]; - size_t size_B = (size_t)args[ARG_SIZE_B]; - size_t size_C = (size_t)args[ARG_SIZE_C]; + uint64_t size_A = (uint64_t)args[ARG_SIZE_A]; + uint64_t size_B = (uint64_t)args[ARG_SIZE_B]; + uint64_t size_C = (uint64_t)args[ARG_SIZE_C]; printf("[bgemm_orch] Grid: %dx%dx%d, Batch: %d, Tile: %d\n", GRID_M, GRID_K, GRID_N, BATCH, TILE); diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index aed3cb35..549477cd 100644 --- a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -65,7 +65,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, using TileScalarDN = 
Tile; TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijDyn sijDynTile(valid_len); TileSijPad sijPadTile; TileVecMxN pijTile; TileVecMxN tmpTile; diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 50567d0c..40d6f470 100644 --- a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -71,9 +71,9 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { int64_t* host_config = (int64_t*)(uintptr_t)args[6]; // Extract sizes (next 7 args after pointers) - size_t query_size = (size_t)args[7]; - size_t key_cache_size = (size_t)args[8]; - size_t value_cache_size = (size_t)args[9]; + uint64_t query_size = (uint64_t)args[7]; + uint64_t key_cache_size = (uint64_t)args[8]; + uint64_t value_cache_size = (uint64_t)args[9]; // Extract config parameters uint64_t batch = (uint64_t)(int)host_config[0]; diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp b/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp index aa9c8f75..ba830127 100644 --- a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp +++ b/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp @@ -88,14 +88,14 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { void* arg_a_ptr = (void*)(uintptr_t)args[ARG_PTR_A]; void* arg_b_ptr = (void*)(uintptr_t)args[ARG_PTR_B]; void* arg_f_ptr = (void*)(uintptr_t)args[ARG_PTR_F]; - size_t size_a = (size_t)args[ARG_SIZE_A]; - size_t size_b = (size_t)args[ARG_SIZE_B]; - size_t size_f = 
(size_t)args[ARG_SIZE_F]; + uint64_t size_a = (uint64_t)args[ARG_SIZE_A]; + uint64_t size_b = (uint64_t)args[ARG_SIZE_B]; + uint64_t size_f = (uint64_t)args[ARG_SIZE_F]; int SIZE = (int)(args[ARG_SIZE] & 0x7FFFFFFF); printf("===============SIZE=%d\n", SIZE); - size_t BYTES = (size_t)SIZE * sizeof(float); + uint64_t BYTES = (uint64_t)SIZE * sizeof(float); Tensor ext_a = make_tensor_external(arg_a_ptr, size_a); Tensor ext_b = make_tensor_external(arg_b_ptr, size_b); diff --git a/src/platform/a2a3/aicpu/device_malloc.cpp b/src/platform/a2a3/aicpu/device_malloc.cpp index 8dc0a870..5904aa95 100644 --- a/src/platform/a2a3/aicpu/device_malloc.cpp +++ b/src/platform/a2a3/aicpu/device_malloc.cpp @@ -34,7 +34,7 @@ static void resolve_hal_mem_functions() { g_hal_resolved = true; } -void* aicpu_device_malloc(size_t size) { +void* aicpu_device_malloc(uint64_t size) { resolve_hal_mem_functions(); if (g_halMemAlloc == nullptr) { @@ -49,9 +49,9 @@ void* aicpu_device_malloc(size_t size) { // bit14~16: phy mem type (MEM_TYPE_HBM=0x1 << 14) constexpr unsigned long long MEM_TYPE_HBM = 0x1ULL << 14; unsigned long long flag = MEM_TYPE_HBM; - int rc = g_halMemAlloc(&ptr, static_cast(size), flag); + int rc = g_halMemAlloc(&ptr, size, flag); if (rc != 0 || ptr == nullptr) { - DEV_ERROR("halMemAlloc failed: rc=%d size=%zu flag=0x%llx", rc, size, flag); + DEV_ERROR("halMemAlloc failed: rc=%d size=%llu flag=0x%llx", rc, size, flag); return nullptr; } return ptr; diff --git a/src/platform/a2a3/host/device_runner.cpp b/src/platform/a2a3/host/device_runner.cpp index a5ece59e..5e6a91a4 100644 --- a/src/platform/a2a3/host/device_runner.cpp +++ b/src/platform/a2a3/host/device_runner.cpp @@ -19,7 +19,7 @@ namespace { void* g_hal_handle = nullptr; -using HalHostRegisterFn = int (*)(void* dev_ptr, size_t size, unsigned int flags, int device_id, void** host_ptr); +using HalHostRegisterFn = int (*)(void* dev_ptr, uint64_t size, unsigned int flags, int device_id, void** host_ptr); using 
HalHostUnregisterFn = int (*)(void* host_ptr, int device_id); int load_hal_if_needed() { @@ -57,8 +57,8 @@ int KernelArgsHelper::init_device_args(const DeviceArgs& host_device_args, Memor // Allocate device memory for device_args if (args.device_args == nullptr) { - uint64_t device_args_size = sizeof(DeviceArgs); - void* device_args_dev = allocator_->alloc(device_args_size); + uint64_t device_args_size = static_cast(sizeof(DeviceArgs)); + void* device_args_dev = allocator_->alloc(static_cast(device_args_size)); if (device_args_dev == nullptr) { LOG_ERROR("Alloc for device_args failed"); return -1; @@ -90,8 +90,8 @@ int KernelArgsHelper::init_runtime_args(const Runtime& host_runtime, MemoryAlloc allocator_ = &allocator; if (args.runtime_args == nullptr) { - uint64_t runtime_size = sizeof(Runtime); - void* runtime_dev = allocator_->alloc(runtime_size); + uint64_t runtime_size = static_cast(sizeof(Runtime)); + void* runtime_dev = allocator_->alloc(static_cast(runtime_size)); if (runtime_dev == nullptr) { LOG_ERROR("Alloc for runtime_args failed"); return -1; @@ -129,8 +129,8 @@ int AicpuSoInfo::init(const std::vector& aicpu_so_binary, MemoryAllocat return -1; } - size_t file_size = aicpu_so_binary.size(); - void* d_aicpu_data = allocator_->alloc(file_size); + uint64_t file_size = static_cast(aicpu_so_binary.size()); + void* d_aicpu_data = allocator_->alloc(static_cast(file_size)); if (d_aicpu_data == nullptr) { LOG_ERROR("Alloc failed for AICPU SO"); return -1; @@ -256,7 +256,7 @@ int DeviceRunner::ensure_binaries_loaded( return 0; } -void* DeviceRunner::allocate_tensor(size_t bytes) { return mem_alloc_.alloc(bytes); } +void* DeviceRunner::allocate_tensor(uint64_t bytes) { return mem_alloc_.alloc(bytes); } void DeviceRunner::free_tensor(void* dev_ptr) { if (dev_ptr != nullptr) { @@ -264,11 +264,11 @@ void DeviceRunner::free_tensor(void* dev_ptr) { } } -int DeviceRunner::copy_to_device(void* dev_ptr, const void* host_ptr, size_t bytes) { +int 
DeviceRunner::copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t bytes) { return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE); } -int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, size_t bytes) { +int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t bytes) { return rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST); } @@ -436,7 +436,7 @@ void DeviceRunner::print_handshake_results() { // Allocate temporary buffer to read handshake data from device std::vector workers(worker_count_); - size_t total_size = sizeof(Handshake) * worker_count_; + uint64_t total_size = sizeof(Handshake) * worker_count_; rtMemcpy(workers.data(), total_size, kernel_args_.args.runtime_args->workers, total_size, RT_MEMCPY_DEVICE_TO_HOST); LOG_DEBUG("Handshake results for %d cores:", worker_count_); @@ -537,7 +537,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime* runtime) { return -1; } - size_t bin_size = aicore_kernel_binary_.size(); + uint64_t bin_size = static_cast(aicore_kernel_binary_.size()); const void* bin_data = aicore_kernel_binary_.data(); rtDevBinary_t binary; @@ -579,7 +579,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime* runtime) { // Kernel Binary Upload (returns device address for caller to store in Runtime) // ============================================================================= -uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size) { +uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, uint64_t bin_size) { if (bin_data == nullptr || bin_size == 0) { LOG_ERROR("Invalid kernel binary data"); return 0; @@ -602,7 +602,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data // Allocate device GM memory (size field + binary data) uint64_t alloc_size = sizeof(uint64_t) + bin_size; - void* gm_addr = mem_alloc_.alloc(alloc_size); + 
void* gm_addr = mem_alloc_.alloc(static_cast(alloc_size)); if (gm_addr == nullptr) { LOG_ERROR("Failed to allocate device GM memory for kernel func_id=%d", func_id); return 0; @@ -612,7 +612,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data std::vector host_buf(alloc_size); uint64_t* size_ptr = reinterpret_cast(host_buf.data()); *size_ptr = bin_size; - std::memcpy(host_buf.data() + sizeof(uint64_t), bin_data, bin_size); + std::memcpy(host_buf.data() + sizeof(uint64_t), bin_data, static_cast(bin_size)); // Copy to device int rc = rtMemcpy(gm_addr, alloc_size, host_buf.data(), alloc_size, RT_MEMCPY_HOST_TO_DEVICE); @@ -635,13 +635,13 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data int DeviceRunner::init_performance_profiling(Runtime& runtime, int num_aicore, int device_id) { // Define allocation callback (a2a3: use MemoryAllocator) - auto alloc_cb = [](size_t size, void* user_data) -> void* { + auto alloc_cb = [](uint64_t size, void* user_data) -> void* { auto* allocator = static_cast(user_data); - return allocator->alloc(size); + return allocator->alloc(static_cast(size)); }; // Define registration callback (a2a3: use halHostRegister for shared memory) - auto register_cb = [](void* dev_ptr, size_t size, int device_id, + auto register_cb = [](void* dev_ptr, uint64_t size, int device_id, void* user_data, void** host_ptr) -> int { (void)user_data; // Not needed for registration if (load_hal_if_needed() != 0) { diff --git a/src/platform/a2a3/host/device_runner.h b/src/platform/a2a3/host/device_runner.h index c92bc0dc..14e09322 100644 --- a/src/platform/a2a3/host/device_runner.h +++ b/src/platform/a2a3/host/device_runner.h @@ -159,7 +159,7 @@ class DeviceRunner { * @param bytes Size of tensor in bytes * @return Device pointer on success, nullptr on failure */ - void* allocate_tensor(size_t bytes); + void* allocate_tensor(uint64_t bytes); /** * Free device tensor memory @@ -176,7 +176,7 @@ class 
DeviceRunner { * @param bytes Number of bytes to copy * @return 0 on success, error code on failure */ - int copy_to_device(void* dev_ptr, const void* host_ptr, size_t bytes); + int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t bytes); /** * Copy data from device to host @@ -186,7 +186,7 @@ class DeviceRunner { * @param bytes Number of bytes to copy * @return 0 on success, error code on failure */ - int copy_from_device(void* host_ptr, const void* dev_ptr, size_t bytes); + int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t bytes); /** * Execute a runtime @@ -303,7 +303,7 @@ class DeviceRunner { * @param bin_size Size of binary data in bytes * @return Device GM address of kernel on success, 0 on error */ - uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size); + uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, uint64_t bin_size); /** * Ensure device is set and streams are created (minimal initialization) diff --git a/src/platform/a2a3/host/pto_runtime_c_api.cpp b/src/platform/a2a3/host/pto_runtime_c_api.cpp index 935b5e93..fa62922c 100644 --- a/src/platform/a2a3/host/pto_runtime_c_api.cpp +++ b/src/platform/a2a3/host/pto_runtime_c_api.cpp @@ -20,7 +20,7 @@ extern "C" { */ int init_runtime_impl(Runtime* runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -28,16 +28,16 @@ int init_runtime_impl(Runtime* runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); /* Forward declarations for device memory functions used in init_runtime */ -void* device_malloc(size_t size); +void* device_malloc(uint64_t size); void device_free(void* dev_ptr); -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); -int 
copy_from_device(void* host_ptr, const void* dev_ptr, size_t size); -uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size); +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size); +int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size); +uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, uint64_t bin_size); /* =========================================================================== */ @@ -45,11 +45,11 @@ uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size /* =========================================================================== */ -size_t get_runtime_size(void) { return sizeof(Runtime); } +uint64_t get_runtime_size(void) { return sizeof(Runtime); } int init_runtime(RuntimeHandle runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -57,7 +57,7 @@ int init_runtime(RuntimeHandle runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { if (runtime == NULL) { return -1; @@ -103,7 +103,7 @@ int init_runtime(RuntimeHandle runtime, /* =========================================================================== */ -void* device_malloc(size_t size) { +void* device_malloc(uint64_t size) { try { DeviceRunner& runner = DeviceRunner::get(); return runner.allocate_tensor(size); @@ -124,7 +124,7 @@ void device_free(void* dev_ptr) { } } -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size) { +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size) { if (dev_ptr == NULL || host_ptr == NULL) { return -1; } @@ -136,7 +136,7 @@ int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size) { } } -int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size) { +int 
copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size) { if (host_ptr == NULL || dev_ptr == NULL) { return -1; } @@ -148,7 +148,7 @@ int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size) { } } -uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size) { +uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, uint64_t bin_size) { try { DeviceRunner& runner = DeviceRunner::get(); return runner.upload_kernel_binary(func_id, bin_data, bin_size); @@ -162,9 +162,9 @@ int launch_runtime(RuntimeHandle runtime, int block_dim, int device_id, const uint8_t* aicpu_binary, - size_t aicpu_size, + uint64_t aicpu_size, const uint8_t* aicore_binary, - size_t aicore_size) { + uint64_t aicore_size) { if (runtime == NULL) { return -1; } @@ -218,7 +218,7 @@ int set_device(int device_id) { void record_tensor_pair(RuntimeHandle runtime, void* host_ptr, void* dev_ptr, - size_t size) { + uint64_t size) { if (runtime == NULL) { return; } diff --git a/src/platform/a2a3sim/aicpu/device_malloc.cpp b/src/platform/a2a3sim/aicpu/device_malloc.cpp index 51d6e877..bf6a5cf4 100644 --- a/src/platform/a2a3sim/aicpu/device_malloc.cpp +++ b/src/platform/a2a3sim/aicpu/device_malloc.cpp @@ -10,7 +10,7 @@ #include -void* aicpu_device_malloc(size_t size) { +void* aicpu_device_malloc(uint64_t size) { return malloc(size); } diff --git a/src/platform/a2a3sim/host/device_runner.cpp b/src/platform/a2a3sim/host/device_runner.cpp index 491b33c8..252870cc 100644 --- a/src/platform/a2a3sim/host/device_runner.cpp +++ b/src/platform/a2a3sim/host/device_runner.cpp @@ -100,7 +100,7 @@ int DeviceRunner::ensure_binaries_loaded(const std::vector& aicpu_so_bi return 0; } -void* DeviceRunner::allocate_tensor(size_t bytes) { +void* DeviceRunner::allocate_tensor(uint64_t bytes) { return mem_alloc_.alloc(bytes); } @@ -110,15 +110,15 @@ void DeviceRunner::free_tensor(void* dev_ptr) { } } -int DeviceRunner::copy_to_device(void* dev_ptr, const 
void* host_ptr, size_t bytes) { +int DeviceRunner::copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t bytes) { // In simulation, this is just a memcpy - std::memcpy(dev_ptr, host_ptr, bytes); + std::memcpy(dev_ptr, host_ptr, static_cast(bytes)); return 0; } -int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, size_t bytes) { +int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t bytes) { // In simulation, this is just a memcpy - std::memcpy(host_ptr, dev_ptr, bytes); + std::memcpy(host_ptr, dev_ptr, static_cast(bytes)); return 0; } @@ -353,7 +353,7 @@ int DeviceRunner::finalize() { // Kernel Binary Upload (returns function address for caller to store in Runtime) // ============================================================================= -uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size) { +uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, uint64_t bin_size) { if (bin_data == nullptr || bin_size == 0) { LOG_ERROR("Invalid kernel data"); return 0; @@ -419,9 +419,9 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data int DeviceRunner::init_performance_profiling(Runtime& runtime, int num_aicore, int device_id) { // Define allocation callback (a2a3sim: use malloc) - auto alloc_cb = [](size_t size, void* user_data) -> void* { + auto alloc_cb = [](uint64_t size, void* user_data) -> void* { (void)user_data; // Not needed for malloc - return malloc(size); + return malloc(static_cast(size)); }; // Simulation: no registration needed (pass nullptr) diff --git a/src/platform/a2a3sim/host/device_runner.h b/src/platform/a2a3sim/host/device_runner.h index 3ae8ef34..0db59ecc 100644 --- a/src/platform/a2a3sim/host/device_runner.h +++ b/src/platform/a2a3sim/host/device_runner.h @@ -74,7 +74,7 @@ class DeviceRunner { * @param bytes Size of tensor in bytes * @return Pointer on success, nullptr on failure */ - void* 
allocate_tensor(size_t bytes); + void* allocate_tensor(uint64_t bytes); /** * Free tensor memory @@ -91,7 +91,7 @@ class DeviceRunner { * @param bytes Number of bytes to copy * @return 0 on success */ - int copy_to_device(void* dev_ptr, const void* host_ptr, size_t bytes); + int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t bytes); /** * Copy data (memcpy in simulation) @@ -101,7 +101,7 @@ class DeviceRunner { * @param bytes Number of bytes to copy * @return 0 on success */ - int copy_from_device(void* host_ptr, const void* dev_ptr, size_t bytes); + int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t bytes); /** * Execute a runtime using threads @@ -179,7 +179,7 @@ class DeviceRunner { * @param bin_size Size of binary data in bytes * @return Function pointer address on success, 0 on error */ - uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size); + uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, uint64_t bin_size); private: DeviceRunner() = default; diff --git a/src/platform/a2a3sim/host/pto_runtime_c_api.cpp b/src/platform/a2a3sim/host/pto_runtime_c_api.cpp index dd6052ff..7e402539 100644 --- a/src/platform/a2a3sim/host/pto_runtime_c_api.cpp +++ b/src/platform/a2a3sim/host/pto_runtime_c_api.cpp @@ -23,7 +23,7 @@ extern "C" { */ int init_runtime_impl(Runtime* runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -31,29 +31,29 @@ int init_runtime_impl(Runtime* runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); /* Forward declarations */ -void* device_malloc(size_t size); +void* device_malloc(uint64_t size); void device_free(void* dev_ptr); -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); 
-int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size); -uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size); +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size); +int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size); +uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, uint64_t bin_size); /* =========================================================================== * Runtime API Implementation * =========================================================================== */ -size_t get_runtime_size(void) { +uint64_t get_runtime_size(void) { return sizeof(Runtime); } int init_runtime(RuntimeHandle runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -61,7 +61,7 @@ int init_runtime(RuntimeHandle runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { if (runtime == NULL) { return -1; @@ -102,7 +102,7 @@ int init_runtime(RuntimeHandle runtime, * =========================================================================== */ -void* device_malloc(size_t size) { +void* device_malloc(uint64_t size) { try { DeviceRunner& runner = DeviceRunner::get(); return runner.allocate_tensor(size); @@ -123,7 +123,7 @@ void device_free(void* dev_ptr) { } } -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size) { +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size) { if (dev_ptr == NULL || host_ptr == NULL) { return -1; } @@ -135,7 +135,7 @@ int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size) { } } -int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size) { +int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size) { if (host_ptr == NULL || dev_ptr == NULL) { 
return -1; } @@ -147,7 +147,7 @@ int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size) { } } -uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size) { +uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, uint64_t bin_size) { try { DeviceRunner& runner = DeviceRunner::get(); return runner.upload_kernel_binary(func_id, bin_data, bin_size); @@ -161,9 +161,9 @@ int launch_runtime(RuntimeHandle runtime, int block_dim, int device_id, const uint8_t* aicpu_binary, - size_t aicpu_size, + uint64_t aicpu_size, const uint8_t* aicore_binary, - size_t aicore_size) { + uint64_t aicore_size) { if (runtime == NULL) { return -1; } @@ -235,7 +235,7 @@ int enable_runtime_profiling(RuntimeHandle runtime, int enabled) { void record_tensor_pair(RuntimeHandle runtime, void* host_ptr, void* dev_ptr, - size_t size) { + uint64_t size) { if (runtime == NULL) { return; } diff --git a/src/platform/include/aicpu/device_malloc.h b/src/platform/include/aicpu/device_malloc.h index b78e7dfc..cb911ceb 100644 --- a/src/platform/include/aicpu/device_malloc.h +++ b/src/platform/include/aicpu/device_malloc.h @@ -14,7 +14,7 @@ #ifndef PLATFORM_DEVICE_MALLOC_H_ #define PLATFORM_DEVICE_MALLOC_H_ -#include +#include /** * Allocate device memory (HBM on real hardware, heap on simulation). @@ -29,7 +29,7 @@ * @param size Number of bytes to allocate * @return Pointer to allocated memory, or nullptr on failure */ -void* aicpu_device_malloc(size_t size); +void* aicpu_device_malloc(uint64_t size); /** * Free device memory previously allocated by aicpu_device_malloc(). 
diff --git a/src/platform/include/common/perf_profiling.h b/src/platform/include/common/perf_profiling.h index e3fe4a47..0e801989 100644 --- a/src/platform/include/common/perf_profiling.h +++ b/src/platform/include/common/perf_profiling.h @@ -199,7 +199,7 @@ extern "C" { * @param num_cores Number of cores (block_dim × PLATFORM_CORES_PER_BLOCKDIM) * @return Total bytes */ -inline size_t calc_perf_data_size(int num_cores) { +inline uint64_t calc_perf_data_size(int num_cores) { return sizeof(PerfDataHeader) + num_cores * sizeof(DoubleBuffer); } diff --git a/src/platform/include/host/performance_collector.h b/src/platform/include/host/performance_collector.h index a255027c..2fdc4097 100644 --- a/src/platform/include/host/performance_collector.h +++ b/src/platform/include/host/performance_collector.h @@ -30,7 +30,7 @@ * @param user_data User-provided context pointer * @return Allocated device memory pointer, or nullptr on failure */ -using PerfAllocCallback = void* (*)(size_t size, void* user_data); +using PerfAllocCallback = void* (*)(uint64_t size, void* user_data); /** * Memory registration callback (for Host-Device shared memory) @@ -42,7 +42,7 @@ using PerfAllocCallback = void* (*)(size_t size, void* user_data); * @param[out] host_ptr Host-mapped pointer * @return 0 on success, error code on failure */ -using PerfRegisterCallback = int (*)(void* dev_ptr, size_t size, int device_id, +using PerfRegisterCallback = int (*)(void* dev_ptr, uint64_t size, int device_id, void* user_data, void** host_ptr); /** diff --git a/src/platform/include/host/pto_runtime_c_api.h b/src/platform/include/host/pto_runtime_c_api.h index 637aad3a..1b1f6bf8 100644 --- a/src/platform/include/host/pto_runtime_c_api.h +++ b/src/platform/include/host/pto_runtime_c_api.h @@ -65,7 +65,7 @@ typedef void* RuntimeHandle; * * @return Size of Runtime structure in bytes */ -size_t get_runtime_size(void); +uint64_t get_runtime_size(void); /** * Initialize a runtime with dynamic orchestration and kernel 
binaries. @@ -94,7 +94,7 @@ size_t get_runtime_size(void); */ int init_runtime(RuntimeHandle runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -102,7 +102,7 @@ int init_runtime(RuntimeHandle runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count); /* =========================================================================== @@ -116,7 +116,7 @@ int init_runtime(RuntimeHandle runtime, * @param size Size in bytes to allocate * @return Device pointer on success, NULL on failure */ -void* device_malloc(size_t size); +void* device_malloc(uint64_t size); /** * Free device memory. @@ -133,7 +133,7 @@ void device_free(void* dev_ptr); * @param size Size in bytes to copy * @return 0 on success, error code on failure */ -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size); /** * Copy data from device to host. @@ -143,7 +143,7 @@ int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); * @param size Size in bytes to copy * @return 0 on success, error code on failure */ -int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size); +int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size); /** * Execute a runtime on the device. @@ -167,9 +167,9 @@ int launch_runtime(RuntimeHandle runtime, int block_dim, int device_id, const uint8_t* aicpu_binary, - size_t aicpu_size, + uint64_t aicpu_size, const uint8_t* aicore_binary, - size_t aicore_size); + uint64_t aicore_size); /** * Finalize and cleanup a runtime instance. 
@@ -217,7 +217,7 @@ int set_device(int device_id); void record_tensor_pair(RuntimeHandle runtime, void* host_ptr, void* dev_ptr, - size_t size); + uint64_t size); /** diff --git a/src/platform/src/performance_collector.cpp b/src/platform/src/performance_collector.cpp index a018f19e..f3c23f95 100644 --- a/src/platform/src/performance_collector.cpp +++ b/src/platform/src/performance_collector.cpp @@ -41,10 +41,10 @@ int PerformanceCollector::initialize(Runtime& runtime, device_id_ = device_id; // Step 1: Calculate total memory size - size_t total_size = calc_perf_data_size(num_aicore); - size_t header_size = sizeof(PerfDataHeader); - size_t single_db_size = sizeof(DoubleBuffer); - size_t buffers_size = num_aicore * single_db_size; + uint64_t total_size = calc_perf_data_size(num_aicore); + uint64_t header_size = sizeof(PerfDataHeader); + uint64_t single_db_size = sizeof(DoubleBuffer); + uint64_t buffers_size = num_aicore * single_db_size; LOG_DEBUG("Memory allocation plan:"); LOG_DEBUG(" Number of cores: %d", num_aicore); @@ -297,7 +297,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) { outfile << " \"version\": 1,\n"; outfile << " \"tasks\": [\n"; - for (size_t i = 0; i < sorted_records.size(); ++i) { + for (uint64_t i = 0; i < sorted_records.size(); ++i) { const auto& record = sorted_records[i]; // Convert times to microseconds diff --git a/src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp b/src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp index 0fef4011..7e4d7380 100644 --- a/src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp +++ b/src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp @@ -38,19 +38,19 @@ extern "C" void aicpu_runtime_publish_task(Runtime* runtime, int task_id); namespace { using AicpuBuilderFunc = int (*)(Runtime*); -int write_bytes_to_file(const char* path, const uint8_t* data, size_t size) { +int write_bytes_to_file(const char* path, const uint8_t* data, uint64_t size) { int fd = ::open(path, 
O_WRONLY | O_CREAT | O_TRUNC, 0755); if (fd < 0) { return -1; } - size_t off = 0; + uint64_t off = 0; while (off < size) { - ssize_t n = ::write(fd, data + off, size - off); + ssize_t n = ::write(fd, data + off, static_cast(size - off)); if (n <= 0) { ::close(fd); return -1; } - off += static_cast(n); + off += static_cast(n); } ::close(fd); return 0; @@ -81,7 +81,7 @@ int build_graph_via_aicpu_plugin(Runtime* runtime, int thread_idx) { } const void* so_data_v = runtime->get_aicpu_orch_so_data(); - size_t so_size = runtime->get_aicpu_orch_so_size(); + uint64_t so_size = runtime->get_aicpu_orch_so_size(); if (so_data_v == nullptr || so_size == 0) { DEV_ERROR("Thread %d: AICPU orch plugin not embedded (size=0). Host orchestration must embed plugin bytes.", thread_idx); @@ -116,7 +116,7 @@ int build_graph_via_aicpu_plugin(Runtime* runtime, int thread_idx) { DEV_INFO("Thread %d: Trying AICPU orch plugin path %s (bytes=%lu, sym=%s)", thread_idx, so_path, - static_cast(so_size), + so_size, sym); if (write_bytes_to_file(so_path, so_data, so_size) != 0) { diff --git a/src/runtime/aicpu_build_graph/host/runtime_maker.cpp b/src/runtime/aicpu_build_graph/host/runtime_maker.cpp index 27bc078a..92181c98 100644 --- a/src/runtime/aicpu_build_graph/host/runtime_maker.cpp +++ b/src/runtime/aicpu_build_graph/host/runtime_maker.cpp @@ -105,7 +105,7 @@ extern "C" { */ int init_runtime_impl(Runtime* runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -113,7 +113,7 @@ int init_runtime_impl(Runtime* runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { if (runtime == nullptr) { std::cerr << "Error: Runtime pointer is null\n"; @@ -163,7 +163,7 @@ int init_runtime_impl(Runtime* runtime, } else { // Pointer argument: allocate device memory. 
void* host_ptr = reinterpret_cast(func_args[i]); - size_t nbytes = static_cast(asize); + uint64_t nbytes = asize; void* dev_ptr = runtime->host_api.device_malloc(nbytes); if (dev_ptr == nullptr) { diff --git a/src/runtime/aicpu_build_graph/runtime/runtime.cpp b/src/runtime/aicpu_build_graph/runtime/runtime.cpp index 6388c482..2e2b092f 100644 --- a/src/runtime/aicpu_build_graph/runtime/runtime.cpp +++ b/src/runtime/aicpu_build_graph/runtime/runtime.cpp @@ -49,7 +49,7 @@ Runtime::Runtime() { aicpu_build_api = {}; } -bool Runtime::try_set_aicpu_orch_so(const void* data, size_t size) { +bool Runtime::try_set_aicpu_orch_so(const void* data, uint64_t size) { if (data == nullptr || size == 0) { aicpu_orch_so_size = 0; return false; @@ -63,15 +63,15 @@ bool Runtime::try_set_aicpu_orch_so(const void* data, size_t size) { return false; } memcpy(aicpu_orch_so_storage, data, size); - aicpu_orch_so_size = static_cast(size); + aicpu_orch_so_size = size; return true; } -void Runtime::set_aicpu_orch_so(const void* data, size_t size) { (void)try_set_aicpu_orch_so(data, size); } +void Runtime::set_aicpu_orch_so(const void* data, uint64_t size) { (void)try_set_aicpu_orch_so(data, size); } const void* Runtime::get_aicpu_orch_so_data() const { return aicpu_orch_so_size > 0 ? 
aicpu_orch_so_storage : nullptr; } -size_t Runtime::get_aicpu_orch_so_size() const { return static_cast(aicpu_orch_so_size); } +uint64_t Runtime::get_aicpu_orch_so_size() const { return aicpu_orch_so_size; } // ============================================================================= // Task Management @@ -255,7 +255,7 @@ void Runtime::print_runtime() const { // Tensor Pair Management // ============================================================================= -void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size) { +void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size) { if (tensor_pair_count >= RUNTIME_MAX_TENSOR_PAIRS) { fprintf(stderr, "[Runtime] ERROR: Tensor pairs full (max=%d)\n", RUNTIME_MAX_TENSOR_PAIRS); return; diff --git a/src/runtime/aicpu_build_graph/runtime/runtime.h b/src/runtime/aicpu_build_graph/runtime/runtime.h index 9ccee95d..150970ab 100644 --- a/src/runtime/aicpu_build_graph/runtime/runtime.h +++ b/src/runtime/aicpu_build_graph/runtime/runtime.h @@ -124,7 +124,7 @@ struct Handshake { struct TensorPair { void* host_ptr; void* dev_ptr; - size_t size; + uint64_t size; }; /** @@ -161,7 +161,7 @@ struct AicpuBuildApi { Runtime* runtime, uint64_t* args, int num_args, int func_id, CoreType core_type, uint64_t function_bin_addr); void (*add_successor_conditional)(Runtime* runtime, int from_task, int to_task); void (*publish_task)(Runtime* runtime, int task_id); - void* (*device_malloc)(size_t size); + void* (*device_malloc)(uint64_t size); void (*device_free)(void* ptr); }; @@ -170,11 +170,11 @@ struct AicpuBuildApi { * Allows runtime to use pluggable device memory backends. 
*/ struct HostApi { - void* (*device_malloc)(size_t size); + void* (*device_malloc)(uint64_t size); void (*device_free)(void* dev_ptr); - int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size); - int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size); - uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size); + int (*copy_to_device)(void* dev_ptr, const void* host_ptr, uint64_t size); + int (*copy_from_device)(void* host_ptr, const void* dev_ptr, uint64_t size); + uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, uint64_t bin_size); }; /** @@ -286,16 +286,16 @@ class Runtime { * orchestration plugin `.so` (instead of relinking/reuploading the full runtime). */ uint8_t aicpu_orch_so_storage[RUNTIME_MAX_AICPU_ORCH_SO_SIZE]; - uint32_t aicpu_orch_so_size; + uint64_t aicpu_orch_so_size; char aicpu_orch_func_name[64]; // Attempt to embed AICPU orchestration plugin bytes into Runtime. // Returns false on invalid input or if the plugin is larger than the // built-in storage. - bool try_set_aicpu_orch_so(const void* data, size_t size); - void set_aicpu_orch_so(const void* data, size_t size); + bool try_set_aicpu_orch_so(const void* data, uint64_t size); + void set_aicpu_orch_so(const void* data, uint64_t size); const void* get_aicpu_orch_so_data() const; - size_t get_aicpu_orch_so_size() const; + uint64_t get_aicpu_orch_so_size() const; /** * Build mode: @@ -465,7 +465,7 @@ class Runtime { * @param dev_ptr Device memory pointer (source for copy-back) * @param size Size of tensor in bytes */ - void record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size); + void record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size); /** * Record a device allocation for cleanup during finalize. 
diff --git a/src/runtime/host_build_graph/host/runtime_maker.cpp b/src/runtime/host_build_graph/host/runtime_maker.cpp index 22fda049..de5eb00a 100644 --- a/src/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/runtime/host_build_graph/host/runtime_maker.cpp @@ -61,7 +61,7 @@ extern "C" { */ int init_runtime_impl(Runtime *runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -69,7 +69,7 @@ int init_runtime_impl(Runtime *runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { // Unused parameters for host orchestration (void)arg_types; @@ -111,7 +111,7 @@ int init_runtime_impl(Runtime *runtime, } ssize_t written = write(fd, orch_so_binary, orch_so_size); - if (written < 0 || static_cast(written) != orch_so_size) { + if (written < 0 || static_cast(written) != orch_so_size) { LOG_ERROR("Failed to write orchestration SO to temp file"); close(fd); unlink(fd_path); diff --git a/src/runtime/host_build_graph/runtime/runtime.cpp b/src/runtime/host_build_graph/runtime/runtime.cpp index 1324acbc..efa1d690 100644 --- a/src/runtime/host_build_graph/runtime/runtime.cpp +++ b/src/runtime/host_build_graph/runtime/runtime.cpp @@ -192,7 +192,7 @@ void Runtime::print_runtime() const { // Tensor Pair Management // ============================================================================= -void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size) { +void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size) { if (tensor_pair_count >= RUNTIME_MAX_TENSOR_PAIRS) { LOG_ERROR("[Runtime] Tensor pairs full (max=%d)", RUNTIME_MAX_TENSOR_PAIRS); return; diff --git a/src/runtime/host_build_graph/runtime/runtime.h b/src/runtime/host_build_graph/runtime/runtime.h index 919cada9..6ea62ce9 100644 --- 
a/src/runtime/host_build_graph/runtime/runtime.h +++ b/src/runtime/host_build_graph/runtime/runtime.h @@ -114,7 +114,7 @@ struct Handshake { struct TensorPair { void* host_ptr; void* dev_ptr; - size_t size; + uint64_t size; }; /** @@ -122,11 +122,11 @@ struct TensorPair { * Allows runtime to use pluggable device memory backends. */ struct HostApi { - void* (*device_malloc)(size_t size); + void* (*device_malloc)(uint64_t size); void (*device_free)(void* dev_ptr); - int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size); - int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size); - uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size); + int (*copy_to_device)(void* dev_ptr, const void* host_ptr, uint64_t size); + int (*copy_from_device)(void* host_ptr, const void* dev_ptr, uint64_t size); + uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, uint64_t bin_size); }; /** @@ -287,7 +287,7 @@ class Runtime { * @param dev_ptr Device memory pointer (source for copy-back) * @param size Size of tensor in bytes */ - void record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size); + void record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size); /** * Get pointer to tensor pairs array. 
diff --git a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index e8f5699c..da7ab109 100644 --- a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -408,9 +408,9 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, DEV_INFO("Thread %d: task_descriptors=%p, dep_list_pool=%p", thread_idx, (void*)task_descriptors, (void*)dep_list_pool); - int32_t window_size = header->task_window_size; - if (window_size <= 0 || window_size > PTO2_MAX_SLOTS) window_size = PTO2_MAX_SLOTS; - int32_t window_mask = window_size - 1; + uint64_t window_size = header->task_window_size; + if (window_size == 0 || window_size > PTO2_MAX_SLOTS) window_size = PTO2_MAX_SLOTS; + uint64_t window_mask = window_size - 1; Handshake* hank = static_cast(runtime->workers); DEV_INFO("Thread %d: hank=%p, window_size=%d", @@ -734,7 +734,7 @@ int AicpuExecutor::run(Runtime* runtime) { // Get SO binary from runtime const void* so_data = runtime->get_device_orch_so_data(); - size_t so_size = runtime->get_device_orch_so_size(); + uint64_t so_size = runtime->get_device_orch_so_size(); if (so_data == nullptr || so_size == 0) { DEV_ERROR("Thread 3: Device orchestration SO not set"); @@ -768,7 +768,7 @@ int AicpuExecutor::run(Runtime* runtime) { } ssize_t written = write(fd, so_data, so_size); close(fd); - if (written != static_cast(so_size)) { + if (written < 0 || static_cast(written) != so_size) { DEV_INFO("Thread 3: Cannot write SO to %s (errno=%d), trying next path", so_path, errno); unlink(so_path); @@ -825,9 +825,9 @@ int AicpuExecutor::run(Runtime* runtime) { } // Read config from orchestration SO (or use defaults) - int32_t task_window_size = PTO2_TASK_WINDOW_SIZE; - int32_t dep_list_pool_size = PTO2_DEP_LIST_POOL_SIZE; - int32_t heap_size = PTO2_HEAP_SIZE; + uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE; + uint64_t 
dep_list_pool_size = PTO2_DEP_LIST_POOL_SIZE; + uint64_t heap_size = PTO2_HEAP_SIZE; int expected_arg_count = 0; if (config_func) { PTO2OrchestrationConfig cfg = config_func(args, arg_count); @@ -850,7 +850,7 @@ int AicpuExecutor::run(Runtime* runtime) { void* gm_heap = runtime->get_pto2_gm_heap_ptr(); // Create shared memory handle and runtime (ops table populated inside) - int32_t sm_size = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); + uint64_t sm_size = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); PTO2SharedMemoryHandle* sm_handle = pto2_sm_create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size, dep_list_pool_size); @@ -880,8 +880,8 @@ int AicpuExecutor::run(Runtime* runtime) { } // Set orchestrator's aicpu parallel mode pointers - int32_t ws = header->task_window_size; - if (ws <= 0 || ws > PTO2_MAX_SLOTS) ws = PTO2_MAX_SLOTS; + uint64_t ws = header->task_window_size; + if (ws == 0 || ws > PTO2_MAX_SLOTS) ws = PTO2_MAX_SLOTS; rt->orchestrator.aicpu_fanin_refcount = s_pto2_fanin_refcount; rt->orchestrator.aicpu_task_completed = s_pto2_task_completed; rt->orchestrator.aicpu_window_mask = ws - 1; diff --git a/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 3a5493fe..40886d73 100644 --- a/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -59,7 +59,7 @@ static long long _now_ms() { */ extern "C" int init_runtime_impl(Runtime *runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -67,7 +67,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { // Suppress unused parameter warning 
(void)orch_func_name; @@ -127,7 +127,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, case ARG_INPUT_PTR: { // Input pointer: allocate device memory, copy data void* host_ptr = reinterpret_cast(func_args[i]); - size_t size = arg_sizes[i]; + uint64_t size = arg_sizes[i]; void* dev_ptr = runtime->host_api.device_malloc(size); if (dev_ptr == nullptr) { @@ -152,7 +152,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, case ARG_OUTPUT_PTR: { // Output pointer: allocate device memory, record for copy-back void* host_ptr = reinterpret_cast(func_args[i]); - size_t size = arg_sizes[i]; + uint64_t size = arg_sizes[i]; void* dev_ptr = runtime->host_api.device_malloc(size); if (dev_ptr == nullptr) { @@ -170,7 +170,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, case ARG_INOUT_PTR: { // Input/output pointer: allocate, copy, record for copy-back void* host_ptr = reinterpret_cast(func_args[i]); - size_t size = arg_sizes[i]; + uint64_t size = arg_sizes[i]; void* dev_ptr = runtime->host_api.device_malloc(size); if (dev_ptr == nullptr) { @@ -233,15 +233,15 @@ extern "C" int init_runtime_impl(Runtime *runtime, // Allocate PTO2 shared memory long long t_sm_start = _now_ms(); - int32_t sm_size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE, PTO2_DEP_LIST_POOL_SIZE); - void* sm_ptr = runtime->host_api.device_malloc(static_cast(sm_size)); + uint64_t sm_size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE, PTO2_DEP_LIST_POOL_SIZE); + void* sm_ptr = runtime->host_api.device_malloc(sm_size); long long t_sm_end = _now_ms(); if (sm_ptr == nullptr) { std::cerr << "Error: Failed to allocate PTO2 shared memory\n"; return -1; } runtime->set_pto2_gm_sm_ptr(sm_ptr); - runtime->record_tensor_pair(nullptr, sm_ptr, static_cast(sm_size)); + runtime->record_tensor_pair(nullptr, sm_ptr, sm_size); // Set up device orchestration state runtime->set_orch_built_on_host(false); @@ -289,7 +289,7 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { // PTO2 (device orchestration): graph 
output may be in packed buffer void* pto2_sm = runtime->get_pto2_gm_sm_ptr(); uint64_t graph_out_ptr = 0; - int32_t graph_out_size = 0; + uint64_t graph_out_size = 0; if (pto2_sm != nullptr) { // Copy header from device to host to read graph_output_ptr/size @@ -324,12 +324,12 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { } void* src_ptr = pair.dev_ptr; - size_t copy_size = pair.size; + uint64_t copy_size = pair.size; // Use graph_output_ptr for the first output tensor if available if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { src_ptr = reinterpret_cast(static_cast(graph_out_ptr)); - copy_size = static_cast(graph_out_size); + copy_size = graph_out_size; std::cout << "Using packed output buffer for tensor " << i << "\n"; first_output_tensor = false; } diff --git a/src/runtime/tensormap_and_ringbuffer/orchestration/tensor_orch.cpp b/src/runtime/tensormap_and_ringbuffer/orchestration/tensor_orch.cpp index 4b1105d9..8ee10dc7 100644 --- a/src/runtime/tensormap_and_ringbuffer/orchestration/tensor_orch.cpp +++ b/src/runtime/tensormap_and_ringbuffer/orchestration/tensor_orch.cpp @@ -187,7 +187,7 @@ uint64_t Tensor::offset_ndim_to_1d(const std::vector& offset_ndims) co } bool Tensor::valid_view(const uint64_t shapes[], const uint64_t offsets[]) const { - for (size_t i = 0; i < ndims; i++) { + for (uint64_t i = 0; i < ndims; i++) { if (shapes[i] + offsets[i] > repeats[i]) { return false; } @@ -199,7 +199,7 @@ Tensor Tensor::view(const uint64_t shapes[], const uint64_t offsets[]) const { debug_assert(valid_view(shapes, offsets)); Tensor result(*this); result.start_offset = start_offset + offset_ndim_to_1d(offsets); - for (size_t i = 0; i < ndims; i++) { + for (uint64_t i = 0; i < ndims; i++) { result.repeats[i] = shapes[i]; } return result; @@ -208,7 +208,7 @@ Tensor Tensor::view(const uint64_t shapes[], const uint64_t offsets[]) const { Tensor Tensor::view(const std::vector& shapes, const std::vector& offsets) const { Tensor 
result(*this); result.start_offset = start_offset + offset_ndim_to_1d(offsets); - for (size_t i = 0; i < ndims; i++) { + for (uint64_t i = 0; i < ndims; i++) { result.repeats[i] = shapes[i]; } return result; @@ -231,11 +231,11 @@ bool Tensor::is_contiguous() const { bool Tensor::valid_reshape(const uint64_t shapes[], uint64_t new_ndims) const { uint64_t x = 1; - for (size_t i = 0; i < ndims; i++) { + for (uint64_t i = 0; i < ndims; i++) { x *= repeats[i]; } uint64_t y = 1; - for (size_t i = 0; i < new_ndims; i++) { + for (uint64_t i = 0; i < new_ndims; i++) { y *= shapes[i]; } return x == y; diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index b4bb1fed..c7270e2f 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -72,7 +72,7 @@ static inline void task_fanout_unlock(PTO2TaskDescriptor* task) { PTO2_STORE_REL // ============================================================================= bool pto2_orchestrator_init( - PTO2OrchestratorState* orch, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, int32_t heap_size) { + PTO2OrchestratorState* orch, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, uint64_t heap_size) { memset(orch, 0, sizeof(PTO2OrchestratorState)); orch->sm_handle = sm_handle; @@ -99,8 +99,8 @@ bool pto2_orchestrator_init( orch->tensormap_last_cleanup = 0; // Initialize scope stack: one flat buffer for task IDs + one array for begin offsets - int32_t max_depth = PTO2_MAX_SCOPE_DEPTH; - int32_t init_cap = PTO2_SCOPE_TASKS_INIT_CAP; + uint64_t max_depth = PTO2_MAX_SCOPE_DEPTH; + uint64_t init_cap = PTO2_SCOPE_TASKS_INIT_CAP; orch->scope_tasks = (int32_t*)malloc(init_cap * sizeof(int32_t)); orch->scope_begins = (int32_t*)malloc(max_depth * sizeof(int32_t)); if (!orch->scope_tasks || !orch->scope_begins) { @@ -163,7 +163,7 @@ void 
pto2_orchestrator_set_scheduler_mode( static void scope_tasks_push(PTO2OrchestratorState* orch, int32_t task_id) { if (orch->scope_tasks_size >= orch->scope_tasks_capacity) { - int32_t new_cap = orch->scope_tasks_capacity * 2; + uint64_t new_cap = orch->scope_tasks_capacity * 2; int32_t* new_buf = (int32_t*)realloc(orch->scope_tasks, new_cap * sizeof(int32_t)); assert(new_buf && "Failed to grow scope task buffer"); orch->scope_tasks = new_buf; @@ -173,7 +173,7 @@ static void scope_tasks_push(PTO2OrchestratorState* orch, int32_t task_id) { } void pto2_scope_begin(PTO2OrchestratorState* orch) { - assert(orch->scope_stack_top < orch->scope_stack_capacity - 1 && "Scope stack overflow"); + assert(orch->scope_stack_top < (int32_t)(orch->scope_stack_capacity - 1) && "Scope stack overflow"); ++orch->scope_stack_top; orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size; @@ -187,7 +187,7 @@ void pto2_scope_end(PTO2OrchestratorState* orch) { #endif int32_t begin = orch->scope_begins[orch->scope_stack_top--]; - int32_t count = orch->scope_tasks_size - begin; + uint64_t count = orch->scope_tasks_size - begin; if (orch->scheduler && count > 0) { pto2_scheduler_on_scope_end(orch->scheduler, &orch->scope_tasks[begin], count); @@ -302,7 +302,7 @@ void pto2_submit_task(PTO2OrchestratorState* orch, scope_tasks_push(orch, task_id); // Temporary storage for collecting output sizes - int32_t total_output_size = 0; + uint64_t total_output_size = 0; // Temporary storage for fanin int32_t fanin_temp[PTO2_MAX_INPUTS]; @@ -504,8 +504,8 @@ void pto2_orchestrator_print_stats(PTO2OrchestratorState* orch) { printf("Bytes allocated: %lld\n", (long long)orch->bytes_allocated); printf("Current scope depth: %d\n", orch->scope_stack_top + 1); printf("Task ring active: %d\n", pto2_task_ring_active_count(&orch->task_ring)); - printf("Heap ring used: %d / %d\n", orch->heap_ring.top, orch->heap_ring.size); - printf("Dep pool used: %d / %d\n", pto2_dep_pool_used(&orch->dep_pool), 
orch->dep_pool.capacity); + printf("Heap ring used: %lu / %lu\n", (unsigned long)orch->heap_ring.top, (unsigned long)orch->heap_ring.size); + printf("Dep pool used: %zu / %zu\n", pto2_dep_pool_used(&orch->dep_pool), orch->dep_pool.capacity); printf("TensorMap valid: %d\n", pto2_tensormap_valid_count(&orch->tensor_map)); printf("===============================\n"); } @@ -516,8 +516,8 @@ void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState* orch) { for (int i = 0; i <= orch->scope_stack_top; i++) { int32_t begin = orch->scope_begins[i]; - int32_t end = (i < orch->scope_stack_top) ? orch->scope_begins[i + 1] : orch->scope_tasks_size; - printf(" [%d] tasks_owned = %d\n", i, end - begin); + uint64_t end = (i < orch->scope_stack_top) ? orch->scope_begins[i + 1] : orch->scope_tasks_size; + printf(" [%d] tasks_owned = %zu\n", i, end - begin); } printf("==================\n"); diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index fbeb3ff0..5fd67b8d 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -52,11 +52,11 @@ struct PTO2OrchestratorState { // scope_begins[i] is the index into scope_tasks where scope i starts. // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size). 
int32_t* scope_tasks; // Flat buffer of task IDs (all scopes concatenated) - int32_t scope_tasks_size; // Number of task IDs currently in the buffer - int32_t scope_tasks_capacity; // Allocated capacity of scope_tasks + uint64_t scope_tasks_size; // Number of task IDs currently in the buffer + uint64_t scope_tasks_capacity; // Allocated capacity of scope_tasks int32_t* scope_begins; // scope_begins[i] = start index of scope i in scope_tasks int32_t scope_stack_top; // Current top of stack (-1 = no scope open) - int32_t scope_stack_capacity; // Max nesting depth (PTO2_MAX_SCOPE_DEPTH) + uint64_t scope_stack_capacity; // Max nesting depth (PTO2_MAX_SCOPE_DEPTH) // === SCHEDULER REFERENCE === // Note: In simulated mode, orchestrator and scheduler share address space @@ -66,7 +66,7 @@ struct PTO2OrchestratorState { // === GM HEAP (for output buffers) === void* gm_heap_base; // Base address of GM heap - int32_t gm_heap_size; // Size of GM heap + uint64_t gm_heap_size; // Size of GM heap // === STATISTICS === int64_t tasks_submitted; @@ -105,7 +105,7 @@ struct PTO2OrchestratorState { * @return true on success */ bool pto2_orchestrator_init( - PTO2OrchestratorState* orch, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, int32_t heap_size); + PTO2OrchestratorState* orch, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, uint64_t heap_size); /** * Destroy orchestrator state and free resources diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp index da9bfcc7..22feb6d8 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp @@ -21,8 +21,8 @@ // Heap Ring Buffer Implementation // ============================================================================= -void pto2_heap_ring_init(PTO2HeapRing* ring, void* base, int32_t size, - volatile int32_t* tail_ptr) { +void 
pto2_heap_ring_init(PTO2HeapRing* ring, void* base, uint64_t size, + volatile uint64_t* tail_ptr) { ring->base = base; ring->size = size; ring->top = 0; @@ -34,14 +34,14 @@ void pto2_heap_ring_init(PTO2HeapRing* ring, void* base, int32_t size, // Heap ring spin limit - after this, report deadlock and exit #define PTO2_HEAP_SPIN_LIMIT 100000 -void* pto2_heap_ring_alloc(PTO2HeapRing* ring, int32_t size) { +void* pto2_heap_ring_alloc(PTO2HeapRing* ring, uint64_t size) { // Align size for DMA efficiency size = PTO2_ALIGN_UP(size, PTO2_ALIGN_SIZE); - + // Spin-wait if insufficient space (back-pressure from Scheduler) int spin_count = 0; bool notified = false; - + while (1) { void* ptr = pto2_heap_ring_try_alloc(ring, size); if (ptr != NULL) { @@ -52,69 +52,69 @@ void* pto2_heap_ring_alloc(PTO2HeapRing* ring, int32_t size) { #endif return ptr; } - + // No space available, spin-wait spin_count++; - + #if PTO2_SPIN_VERBOSE_LOGGING // Periodic block notification if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0 && spin_count < PTO2_HEAP_SPIN_LIMIT) { - int32_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); - int32_t available = pto2_heap_ring_available(ring); - fprintf(stderr, "[HeapRing] BLOCKED: requesting %d bytes, available=%d, " - "top=%d, tail=%d, spins=%d\n", + uint64_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); + uint64_t available = pto2_heap_ring_available(ring); + fprintf(stderr, "[HeapRing] BLOCKED: requesting %lu bytes, available=%lu, " + "top=%lu, tail=%lu, spins=%d\n", size, available, ring->top, tail, spin_count); notified = true; } #endif - + if (spin_count >= PTO2_HEAP_SPIN_LIMIT) { - int32_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); - int32_t available = pto2_heap_ring_available(ring); + uint64_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); + uint64_t available = pto2_heap_ring_available(ring); fprintf(stderr, "\n"); fprintf(stderr, "========================================\n"); fprintf(stderr, "FATAL: Heap Ring Deadlock Detected!\n"); fprintf(stderr, 
"========================================\n"); fprintf(stderr, "Orchestrator blocked waiting for heap space after %d spins.\n", spin_count); - fprintf(stderr, " - Requested: %d bytes\n", size); - fprintf(stderr, " - Available: %d bytes\n", available); - fprintf(stderr, " - Heap top: %d\n", ring->top); - fprintf(stderr, " - Heap tail: %d\n", tail); - fprintf(stderr, " - Heap size: %d\n", ring->size); + fprintf(stderr, " - Requested: %lu bytes\n", size); + fprintf(stderr, " - Available: %lu bytes\n", available); + fprintf(stderr, " - Heap top: %lu\n", ring->top); + fprintf(stderr, " - Heap tail: %lu\n", tail); + fprintf(stderr, " - Heap size: %lu\n", ring->size); fprintf(stderr, "\n"); fprintf(stderr, "Solution: Increase PTO2_HEAP_SIZE (e.g. 256*1024 for 4 x 64KB outputs).\n"); fprintf(stderr, "========================================\n"); fprintf(stderr, "\n"); exit(1); } - + PTO2_SPIN_PAUSE(); } } -void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, int32_t size) { +void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, uint64_t size) { // Align size for DMA efficiency size = PTO2_ALIGN_UP(size, PTO2_ALIGN_SIZE); - + // Read latest tail from shared memory (Scheduler updates this) - int32_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); - int32_t top = ring->top; - + uint64_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); + uint64_t top = ring->top; + if (top >= tail) { // Case 1: top is at or ahead of tail (normal case) // [....tail====top......] 
// ^-- space_at_end = size - top - - int32_t space_at_end = ring->size - top; - + + uint64_t space_at_end = ring->size - top; + if (space_at_end >= size) { // Enough space at end - allocate here void* ptr = (char*)ring->base + top; ring->top = top + size; return ptr; } - + // Not enough space at end - check if we can wrap to beginning // IMPORTANT: Don't split buffer, skip remaining space at end if (tail > size) { @@ -122,35 +122,35 @@ void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, int32_t size) { ring->top = size; return ring->base; } - + // Not enough space anywhere - return NULL return NULL; - + } else { // Case 2: top has wrapped, tail is ahead // [====top....tail=====] // ^-- free space = tail - top - - int32_t gap = tail - top; + + uint64_t gap = tail - top; if (gap >= size) { void* ptr = (char*)ring->base + top; ring->top = top + size; return ptr; } - + // Not enough space - return NULL return NULL; } } -int32_t pto2_heap_ring_available(PTO2HeapRing* ring) { - int32_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); - int32_t top = ring->top; - +uint64_t pto2_heap_ring_available(PTO2HeapRing* ring) { + uint64_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); + uint64_t top = ring->top; + if (top >= tail) { // Space at end + space at beginning (if any) - int32_t at_end = ring->size - top; - int32_t at_begin = tail; + uint64_t at_end = ring->size - top; + uint64_t at_begin = tail; return at_end > at_begin ? 
at_end : at_begin; // Max usable } else { // Contiguous space between top and tail @@ -167,7 +167,7 @@ void pto2_heap_ring_reset(PTO2HeapRing* ring) { // ============================================================================= void pto2_task_ring_init(PTO2TaskRing* ring, PTO2TaskDescriptor* descriptors, - int32_t window_size, volatile int32_t* last_alive_ptr) { + uint64_t window_size, volatile int32_t* last_alive_ptr) { ring->descriptors = descriptors; ring->window_size = window_size; ring->current_index = 0; @@ -204,7 +204,7 @@ int32_t pto2_task_ring_alloc(PTO2TaskRing* ring) { int32_t last_alive = PTO2_LOAD_ACQUIRE(ring->last_alive_ptr); int32_t active_count = ring->current_index - last_alive; fprintf(stderr, "[TaskRing] BLOCKED (Flow Control): current=%d, last_alive=%d, " - "active=%d/%d (%.1f%%), spins=%d\n", + "active=%d/%zu (%.1f%%), spins=%d\n", ring->current_index, last_alive, active_count, ring->window_size, 100.0 * active_count / ring->window_size, spin_count); notified = true; @@ -227,7 +227,7 @@ int32_t pto2_task_ring_alloc(PTO2TaskRing* ring) { fprintf(stderr, " - Current task index: %d\n", ring->current_index); fprintf(stderr, " - Last task alive: %d\n", last_alive); fprintf(stderr, " - Active tasks: %d\n", active_count); - fprintf(stderr, " - Window size: %d\n", ring->window_size); + fprintf(stderr, " - Window size: %zu\n", ring->window_size); fprintf(stderr, " - Window utilization: %.1f%%\n", 100.0 * active_count / ring->window_size); fprintf(stderr, "\n"); @@ -239,7 +239,7 @@ int32_t pto2_task_ring_alloc(PTO2TaskRing* ring) { fprintf(stderr, " This creates a circular dependency (deadlock).\n"); fprintf(stderr, "\n"); fprintf(stderr, "Solution:\n"); - fprintf(stderr, " Current task_window_size: %d\n", ring->window_size); + fprintf(stderr, " Current task_window_size: %zu\n", ring->window_size); fprintf(stderr, " Default PTO2_TASK_WINDOW_SIZE: %d\n", PTO2_TASK_WINDOW_SIZE); fprintf(stderr, " Recommended: %d (at least 2x current active tasks)\n", 
active_count * 2); @@ -268,7 +268,7 @@ int32_t pto2_task_ring_try_alloc(PTO2TaskRing* ring) { // Check if there's room for one more task // Leave at least 1 slot empty to distinguish full from empty - if (active_count < ring->window_size - 1) { + if (active_count < (int32_t)(ring->window_size - 1)) { int32_t task_id = current; int32_t slot = task_id & (ring->window_size - 1); @@ -295,7 +295,7 @@ int32_t pto2_task_ring_active_count(PTO2TaskRing* ring) { bool pto2_task_ring_has_space(PTO2TaskRing* ring) { int32_t active = pto2_task_ring_active_count(ring); - return active < ring->window_size - 1; + return active < (int32_t)(ring->window_size - 1); } void pto2_task_ring_reset(PTO2TaskRing* ring) { @@ -309,7 +309,7 @@ void pto2_task_ring_reset(PTO2TaskRing* ring) { // Dependency List Pool Implementation // ============================================================================= -void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, int32_t capacity) { +void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, uint64_t capacity) { pool->base = base; pool->capacity = capacity; pool->top = 1; // Start from 1, 0 means NULL/empty @@ -324,7 +324,7 @@ int32_t pto2_dep_pool_alloc_one(PTO2DepListPool* pool) { // Wrap around to beginning (old entries reclaimed with task ring) pool->top = 1; // Start from 1, 0 means NULL } - return pool->top++; + return static_cast<int32_t>(pool->top++); } int32_t pto2_dep_list_prepend(PTO2DepListPool* pool, int32_t current_head, int32_t task_id) { @@ -346,8 +346,8 @@ int32_t pto2_dep_list_prepend(PTO2DepListPool* pool, int32_t current_head, int32 void pto2_dep_list_iterate(PTO2DepListPool* pool, int32_t head, void (*callback)(int32_t task_id, void* ctx), void* ctx) { int32_t current = head; - - while (current > 0 && current < pool->capacity) { + + while (current > 0 && current < (int32_t)pool->capacity) { PTO2DepListEntry* entry = &pool->base[current]; callback(entry->task_id, ctx); current = entry->next_offset; @@
-357,8 +357,8 @@ void pto2_dep_list_iterate(PTO2DepListPool* pool, int32_t head, int32_t pto2_dep_list_count(PTO2DepListPool* pool, int32_t head) { int32_t count = 0; int32_t current = head; - - while (current > 0 && current < pool->capacity) { + + while (current > 0 && current < (int32_t)pool->capacity) { count++; current = pool->base[current].next_offset; } @@ -377,10 +377,10 @@ void pto2_dep_pool_reset(PTO2DepListPool* pool) { pool->base[0].next_offset = 0; } -int32_t pto2_dep_pool_used(PTO2DepListPool* pool) { +uint64_t pto2_dep_pool_used(PTO2DepListPool* pool) { return pool->top - 1; // Exclude entry 0 (NULL marker) } -int32_t pto2_dep_pool_available(PTO2DepListPool* pool) { +uint64_t pto2_dep_pool_available(PTO2DepListPool* pool) { return pool->capacity - pool->top; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index bb15b468..500a98db 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -41,12 +41,12 @@ */ typedef struct { void* base; // GM_Heap_Base pointer - int32_t size; // GM_Heap_Size (total heap size in bytes) - int32_t top; // Allocation pointer (local copy) - + uint64_t size; // GM_Heap_Size (total heap size in bytes) + uint64_t top; // Allocation pointer (local copy) + // Reference to shared memory tail (for back-pressure) - volatile int32_t* tail_ptr; // Points to header->heap_tail - + volatile uint64_t* tail_ptr; // Points to header->heap_tail + } PTO2HeapRing; /** @@ -57,35 +57,35 @@ typedef struct { * @param size Total heap size in bytes * @param tail_ptr Pointer to shared memory heap_tail */ -void pto2_heap_ring_init(PTO2HeapRing* ring, void* base, int32_t size, - volatile int32_t* tail_ptr); +void pto2_heap_ring_init(PTO2HeapRing* ring, void* base, uint64_t size, + volatile uint64_t* tail_ptr); /** * Allocate memory from heap ring - * + * * O(1) bump 
allocation with wrap-around. * May STALL (spin-wait) if insufficient space (back-pressure). * Never splits a buffer across the wrap-around boundary. - * + * * @param ring Heap ring * @param size Requested size in bytes * @return Pointer to allocated memory, never NULL (stalls instead) */ -void* pto2_heap_ring_alloc(PTO2HeapRing* ring, int32_t size); +void* pto2_heap_ring_alloc(PTO2HeapRing* ring, uint64_t size); /** * Try to allocate memory without stalling - * + * * @param ring Heap ring * @param size Requested size in bytes * @return Pointer to allocated memory, or NULL if no space */ -void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, int32_t size); +void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, uint64_t size); /** * Get available space in heap ring */ -int32_t pto2_heap_ring_available(PTO2HeapRing* ring); +uint64_t pto2_heap_ring_available(PTO2HeapRing* ring); /** * Reset heap ring to initial state @@ -104,24 +104,24 @@ void pto2_heap_ring_reset(PTO2HeapRing* ring); */ typedef struct { PTO2TaskDescriptor* descriptors; // Task descriptor array (from shared memory) - int32_t window_size; // Window size (power of 2) + uint64_t window_size; // Window size (power of 2) int32_t current_index; // Next task to allocate (absolute ID) - + // Reference to shared memory last_task_alive (for back-pressure) volatile int32_t* last_alive_ptr; // Points to header->last_task_alive - + } PTO2TaskRing; /** * Initialize task ring buffer - * + * * @param ring Task ring to initialize * @param descriptors Task descriptor array from shared memory * @param window_size Window size (must be power of 2) * @param last_alive_ptr Pointer to shared memory last_task_alive */ void pto2_task_ring_init(PTO2TaskRing* ring, PTO2TaskDescriptor* descriptors, - int32_t window_size, volatile int32_t* last_alive_ptr); + uint64_t window_size, volatile int32_t* last_alive_ptr); /** * Allocate a task slot from task ring @@ -176,19 +176,19 @@ void pto2_task_ring_reset(PTO2TaskRing* ring); */ typedef 
struct { PTO2DepListEntry* base; // Pool base address (from shared memory) - int32_t capacity; // Total number of entries - int32_t top; // Next allocation position (starts from 1, 0=NULL) - + uint64_t capacity; // Total number of entries + uint64_t top; // Next allocation position (starts from 1, 0=NULL) + } PTO2DepListPool; /** * Initialize dependency list pool - * + * * @param pool Pool to initialize * @param base Pool base address from shared memory * @param capacity Total number of entries */ -void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, int32_t capacity); +void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, uint64_t capacity); /** * Allocate a single entry from the pool @@ -243,7 +243,7 @@ void pto2_dep_pool_reset(PTO2DepListPool* pool); /** * Get pool usage statistics */ -int32_t pto2_dep_pool_used(PTO2DepListPool* pool); -int32_t pto2_dep_pool_available(PTO2DepListPool* pool); +uint64_t pto2_dep_pool_used(PTO2DepListPool* pool); +uint64_t pto2_dep_pool_available(PTO2DepListPool* pool); #endif // PTO_RING_BUFFER_H diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 4afa6418..67a9570f 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -53,9 +53,9 @@ PTO2Runtime* pto2_runtime_create(PTO2RuntimeMode mode) { } PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_size) { + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_size) { // Allocate runtime context PTO2Runtime* rt = (PTO2Runtime*)calloc(1, sizeof(PTO2Runtime)); if (!rt) { @@ -116,7 +116,7 @@ PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, PTO2Runtime* pto2_runtime_create_from_sm(PTO2RuntimeMode mode, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, - int32_t 
heap_size) { + uint64_t heap_size) { if (!sm_handle) return NULL; PTO2Runtime* rt = (PTO2Runtime*)calloc(1, sizeof(PTO2Runtime)); diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 959370a5..74cc8bc8 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -80,7 +80,7 @@ struct PTO2Runtime { // GM Heap for output buffers void* gm_heap; - int32_t gm_heap_size; + uint64_t gm_heap_size; bool gm_heap_owned; // True if we allocated it // Mode @@ -112,9 +112,9 @@ PTO2Runtime* pto2_runtime_create(PTO2RuntimeMode mode); * @return Runtime context, or NULL on failure */ PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_size); + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_size); /** * Create runtime from existing shared memory and GM heap (e.g. on device). 
@@ -129,7 +129,7 @@ PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, PTO2Runtime* pto2_runtime_create_from_sm(PTO2RuntimeMode mode, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, - int32_t heap_size); + uint64_t heap_size); /** * Destroy runtime and free all resources @@ -295,9 +295,9 @@ struct PTO2OrchestrationBeginInfo { uint64_t* args; int arg_count; int expected_arg_count; - int32_t task_window_size; - int32_t dep_list_pool_size; - int32_t heap_size; + uint64_t task_window_size; + uint64_t dep_list_pool_size; + uint64_t heap_size; void* gm_heap_ptr = nullptr; }; @@ -329,7 +329,7 @@ class PTO2OrchestrationGuard { } header_ = static_cast(begin_info.sm_ptr); - int32_t sm_size = pto2_sm_calculate_size(begin_info.task_window_size, + uint64_t sm_size = pto2_sm_calculate_size(begin_info.task_window_size, begin_info.dep_list_pool_size); PTO2SharedMemoryHandle* sm_handle = pto2_sm_create_from_buffer(begin_info.sm_ptr, sm_size, @@ -339,13 +339,13 @@ class PTO2OrchestrationGuard { if (!sm_handle) return; void* gm_heap = begin_info.gm_heap_ptr; - int32_t gm_heap_size = begin_info.heap_size; + uint64_t gm_heap_size = begin_info.heap_size; if (begin_info.arg_count >= 2) { uint64_t heap_arg = begin_info.args[begin_info.arg_count - 2]; uint64_t size_arg = begin_info.args[begin_info.arg_count - 1]; if (heap_arg != 0 && size_arg != 0) { gm_heap = reinterpret_cast(static_cast(heap_arg)); - gm_heap_size = static_cast(size_arg & 0x7FFFFFFF); + gm_heap_size = size_arg; } } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index b405f773..b89526b5 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -30,17 +30,17 @@ const char* pto2_task_state_name(PTO2TaskState state) { // Ready Queue Implementation // ============================================================================= -bool 
pto2_ready_queue_init(PTO2ReadyQueue* queue, int32_t capacity) { +bool pto2_ready_queue_init(PTO2ReadyQueue* queue, uint64_t capacity) { queue->task_ids = (int32_t*)malloc(capacity * sizeof(int32_t)); if (!queue->task_ids) { return false; } - + queue->head = 0; queue->tail = 0; queue->capacity = capacity; queue->count = 0; - + return true; } @@ -94,7 +94,7 @@ bool pto2_scheduler_init(PTO2SchedulerState* sched, sched->dep_pool = dep_pool; // Get runtime task_window_size from shared memory header - int32_t window_size = sm_handle->header->task_window_size; + uint64_t window_size = sm_handle->header->task_window_size; sched->task_window_size = window_size; sched->task_window_mask = window_size - 1; // For fast modulo (window_size must be power of 2) @@ -162,10 +162,10 @@ void pto2_scheduler_destroy(PTO2SchedulerState* sched) { void pto2_scheduler_reset(PTO2SchedulerState* sched) { sched->last_task_alive = 0; sched->heap_tail = 0; - - memset(sched->task_state, 0, PTO2_TASK_WINDOW_SIZE * sizeof(PTO2TaskState)); - memset(sched->fanin_refcount, 0, PTO2_TASK_WINDOW_SIZE * sizeof(int32_t)); - memset(sched->fanout_refcount, 0, PTO2_TASK_WINDOW_SIZE * sizeof(int32_t)); + + memset(sched->task_state, 0, sched->task_window_size * sizeof(PTO2TaskState)); + memset(sched->fanin_refcount, 0, sched->task_window_size * sizeof(int32_t)); + memset(sched->fanout_refcount, 0, sched->task_window_size * sizeof(int32_t)); for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { pto2_ready_queue_reset(&sched->ready_queues[i]); @@ -363,7 +363,7 @@ void pto2_scheduler_advance_ring_pointers(PTO2SchedulerState* sched) { // heap_tail = offset of end of last consumed task's buffer // Note: This requires knowing the heap base, which should be passed in // For now, we just track the relative position - sched->heap_tail = (int32_t)(intptr_t)last_consumed->packed_buffer_end; + sched->heap_tail = reinterpret_cast<uint64_t>(last_consumed->packed_buffer_end); } } @@ -403,7 +403,7 @@ bool
pto2_scheduler_is_done(PTO2SchedulerState* sched) { void pto2_scheduler_print_stats(PTO2SchedulerState* sched) { printf("=== Scheduler Statistics ===\n"); printf("last_task_alive: %d\n", sched->last_task_alive); - printf("heap_tail: %d\n", sched->heap_tail); + printf("heap_tail: %lu\n", sched->heap_tail); printf("tasks_completed: %lld\n", (long long)sched->tasks_completed); printf("tasks_consumed: %lld\n", (long long)sched->tasks_consumed); printf("============================\n"); diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index 3dccde69..87556321 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -54,11 +54,11 @@ typedef struct PTO2SchedulerState { // Local copies of ring pointers (written to shared memory after update) int32_t last_task_alive; // Task ring tail - int32_t heap_tail; // Heap ring tail + uint64_t heap_tail; // Heap ring tail // === DYNAMIC CONFIGURATION === - int32_t task_window_size; // Task window size (power of 2) - int32_t task_window_mask; // task_window_size - 1 (for fast modulo) + uint64_t task_window_size; // Task window size (power of 2) + uint64_t task_window_mask; // task_window_size - 1 (for fast modulo) // === PRIVATE DATA (not in shared memory) === @@ -121,7 +121,7 @@ void pto2_scheduler_reset(PTO2SchedulerState* sched); /** * Initialize a ready queue */ -bool pto2_ready_queue_init(PTO2ReadyQueue* queue, int32_t capacity); +bool pto2_ready_queue_init(PTO2ReadyQueue* queue, uint64_t capacity); /** * Destroy ready queue diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp index 38d3411e..08baee6e 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp @@ -16,18 
+16,18 @@ // Size Calculation // ============================================================================= -int32_t pto2_sm_calculate_size(int32_t task_window_size, int32_t dep_list_pool_size) { - int32_t size = 0; - +uint64_t pto2_sm_calculate_size(uint64_t task_window_size, uint64_t dep_list_pool_size) { + uint64_t size = 0; + // Header (aligned to cache line) size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - + // Task descriptors size += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - + // Dependency list pool (entry 0 is reserved as NULL) size += PTO2_ALIGN_UP((dep_list_pool_size + 1) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); - + return size; } @@ -35,18 +35,18 @@ int32_t pto2_sm_calculate_size(int32_t task_window_size, int32_t dep_list_pool_s // Creation and Destruction // ============================================================================= -PTO2SharedMemoryHandle* pto2_sm_create(int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size) { +PTO2SharedMemoryHandle* pto2_sm_create(uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size) { // Allocate handle PTO2SharedMemoryHandle* handle = (PTO2SharedMemoryHandle*)calloc(1, sizeof(PTO2SharedMemoryHandle)); if (!handle) { return NULL; } - + // Calculate total size - int32_t sm_size = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); - + uint64_t sm_size = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); + // Allocate shared memory (aligned for DMA efficiency) #if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L if (posix_memalign(&handle->sm_base, PTO2_ALIGN_SIZE, sm_size) != 0) { @@ -94,13 +94,13 @@ PTO2SharedMemoryHandle* pto2_sm_create_default(void) { } PTO2SharedMemoryHandle* pto2_sm_create_from_buffer(void* sm_base, - int32_t sm_size, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size) { - if (!sm_base || sm_size <= 0) return NULL; + 
uint64_t sm_size, + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size) { + if (!sm_base || sm_size == 0) return NULL; - int32_t required = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); + uint64_t required = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); if (sm_size < required) return NULL; PTO2SharedMemoryHandle* handle = (PTO2SharedMemoryHandle*)calloc(1, sizeof(PTO2SharedMemoryHandle)); @@ -136,9 +136,9 @@ void pto2_sm_destroy(PTO2SharedMemoryHandle* handle) { // ============================================================================= void pto2_sm_init_header(PTO2SharedMemoryHandle* handle, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size) { + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size) { PTO2SharedMemoryHeader* header = handle->header; // Flow control pointers (start at 0) @@ -154,9 +154,9 @@ void pto2_sm_init_header(PTO2SharedMemoryHandle* handle, header->dep_list_pool_size = dep_list_pool_size; // Calculate offsets - int32_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); header->task_descriptors_offset = offset; - + offset += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); header->dep_list_pool_offset = offset; @@ -203,21 +203,21 @@ void pto2_sm_print_layout(PTO2SharedMemoryHandle* handle) { printf("=== PTO2 Shared Memory Layout ===\n"); printf("Base address: %p\n", handle->sm_base); - printf("Total size: %d bytes\n", h->total_size); + printf("Total size: %zu bytes\n", h->total_size); printf("\n"); - printf("Task window size: %d\n", h->task_window_size); - printf("Heap size: %d bytes\n", h->heap_size); - printf("DepList pool size: %d entries\n", h->dep_list_pool_size); + printf("Task window size: %zu\n", h->task_window_size); + printf("Heap size: %zu bytes\n", h->heap_size); + printf("DepList 
pool size: %zu entries\n", h->dep_list_pool_size); printf("\n"); printf("Offsets:\n"); - printf(" TaskDescriptors: %d (0x%x)\n", h->task_descriptors_offset, h->task_descriptors_offset); - printf(" DepListPool: %d (0x%x)\n", h->dep_list_pool_offset, h->dep_list_pool_offset); + printf(" TaskDescriptors: %zu (0x%zx)\n", h->task_descriptors_offset, h->task_descriptors_offset); + printf(" DepListPool: %zu (0x%zx)\n", h->dep_list_pool_offset, h->dep_list_pool_offset); printf("\n"); printf("Flow control:\n"); printf(" current_task_index: %d\n", h->current_task_index); printf(" last_task_alive: %d\n", h->last_task_alive); - printf(" heap_top: %d\n", h->heap_top); - printf(" heap_tail: %d\n", h->heap_tail); + printf(" heap_top: %lu\n", h->heap_top); + printf(" heap_tail: %lu\n", h->heap_tail); printf(" orchestrator_done: %d\n", h->orchestrator_done); printf("================================\n"); } @@ -240,8 +240,8 @@ bool pto2_sm_validate(PTO2SharedMemoryHandle* handle) { // Check flow control pointer sanity if (h->current_task_index < 0) return false; if (h->last_task_alive < 0) return false; - if (h->heap_top < 0 || h->heap_top > h->heap_size) return false; - if (h->heap_tail < 0 || h->heap_tail > h->heap_size) return false; + if (h->heap_top > h->heap_size) return false; + if (h->heap_tail > h->heap_size) return false; return true; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index c758a791..7c689a2f 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -44,32 +44,32 @@ typedef struct { // Written by Orchestrator, Read by Scheduler volatile int32_t current_task_index; // Task ring head (next to allocate) - volatile int32_t heap_top; // Heap ring allocation pointer + volatile uint64_t heap_top; // Heap ring allocation pointer volatile int32_t orchestrator_done; // Flag: 
orchestration complete - + // Written by Scheduler, Read by Orchestrator (for back-pressure) volatile int32_t last_task_alive; // Task ring tail (oldest active task) - volatile int32_t heap_tail; // Heap ring free pointer + volatile uint64_t heap_tail; // Heap ring free pointer // === LAYOUT INFO (set once at init) === - int32_t task_window_size; // PTO2_TASK_WINDOW_SIZE - int32_t heap_size; // Total heap size - int32_t dep_list_pool_size; // Dependency list pool size + uint64_t task_window_size; // PTO2_TASK_WINDOW_SIZE + uint64_t heap_size; // Total heap size + uint64_t dep_list_pool_size; // Dependency list pool size // Offsets into shared memory (relative to SM_Base) - int32_t task_descriptors_offset; // Offset to TaskDescriptor array - int32_t dep_list_pool_offset; // Offset to DepListPool - + uint64_t task_descriptors_offset; // Offset to TaskDescriptor array + uint64_t dep_list_pool_offset; // Offset to DepListPool + // Total shared memory size (for validation) - int32_t total_size; + uint64_t total_size; // Graph output for copy-back (set by orchestrator when using packed buffer) // Host finalize copies from this address instead of dev_ptr when non-zero volatile uint64_t graph_output_ptr; // Address where final output was written (packed buffer) - volatile int32_t graph_output_size; // Size in bytes - - // Padding to cache line - int32_t _padding[2]; + volatile uint64_t graph_output_size; // Size in bytes + + // Padding to cache line (adjusted for uint64_t changes) + uint64_t _padding[1]; } PTO2SharedMemoryHeader; @@ -83,8 +83,8 @@ typedef struct { */ typedef struct { void* sm_base; // Base address of shared memory - int32_t sm_size; // Total size of shared memory - + uint64_t sm_size; // Total size of shared memory + // Quick pointers into shared memory regions PTO2SharedMemoryHeader* header; PTO2TaskDescriptor* task_descriptors; @@ -101,12 +101,12 @@ typedef struct { /** * Calculate required shared memory size - * + * * @param task_window_size Number of 
task slots * @param dep_list_pool_size Number of dependency list entries * @return Total bytes required */ -int32_t pto2_sm_calculate_size(int32_t task_window_size, int32_t dep_list_pool_size); +uint64_t pto2_sm_calculate_size(uint64_t task_window_size, uint64_t dep_list_pool_size); /** * Create shared memory for Orchestrator and Scheduler @@ -119,9 +119,9 @@ int32_t pto2_sm_calculate_size(int32_t task_window_size, int32_t dep_list_pool_s * @param dep_list_pool_size Number of dependency list entries * @return Handle with both views, or NULL on failure */ -PTO2SharedMemoryHandle* pto2_sm_create(int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size); +PTO2SharedMemoryHandle* pto2_sm_create(uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size); /** * Create shared memory with default sizes @@ -140,10 +140,10 @@ PTO2SharedMemoryHandle* pto2_sm_create_default(void); * @return Handle, or NULL on failure */ PTO2SharedMemoryHandle* pto2_sm_create_from_buffer(void* sm_base, - int32_t sm_size, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size); + uint64_t sm_size, + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size); /** * Destroy shared memory and free resources @@ -155,9 +155,9 @@ void pto2_sm_destroy(PTO2SharedMemoryHandle* handle); * Called after memory is allocated */ void pto2_sm_init_header(PTO2SharedMemoryHandle* handle, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size); + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size); /** * Reset shared memory to initial state (for reuse) @@ -169,9 +169,9 @@ void pto2_sm_reset(PTO2SharedMemoryHandle* handle); * Get task descriptor by task ID * Uses runtime window_size for ring buffer indexing (not compile-time constant) */ -static inline PTO2TaskDescriptor* pto2_sm_get_task(PTO2SharedMemoryHandle* handle, +static inline PTO2TaskDescriptor* 
pto2_sm_get_task(PTO2SharedMemoryHandle* handle, int32_t task_id) { - int32_t window_mask = handle->header->task_window_size - 1; + uint64_t window_mask = handle->header->task_window_size - 1; return &handle->task_descriptors[task_id & window_mask]; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp index aebace93..ce7e923d 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp @@ -27,7 +27,7 @@ // Initialization and Destruction // ============================================================================= -bool pto2_tensormap_init(PTO2TensorMap* tm, int32_t num_buckets, int32_t pool_size) { +bool pto2_tensormap_init(PTO2TensorMap* tm, uint64_t num_buckets, uint64_t pool_size) { // Validate power of 2 for fast modulo if ((num_buckets & (num_buckets - 1)) != 0) { return false; // num_buckets must be power of 2 @@ -40,7 +40,7 @@ bool pto2_tensormap_init(PTO2TensorMap* tm, int32_t num_buckets, int32_t pool_si } // Initialize all buckets to empty (-1) - for (int32_t i = 0; i < num_buckets; i++) { + for (uint64_t i = 0; i < num_buckets; i++) { tm->buckets[i] = -1; } @@ -58,7 +58,7 @@ bool pto2_tensormap_init(PTO2TensorMap* tm, int32_t num_buckets, int32_t pool_si tm->pool_head = 0; // Initialize all entries as not in bucket - for (int32_t i = 0; i < pool_size; i++) { + for (uint64_t i = 0; i < pool_size; i++) { tm->entry_pool[i].in_bucket = false; tm->entry_pool[i].next_in_bucket = -1; tm->entry_pool[i].prev_in_bucket = -1; @@ -110,12 +110,12 @@ void pto2_tensormap_destroy(PTO2TensorMap* tm) { void pto2_tensormap_reset(PTO2TensorMap* tm) { // Reset all buckets to empty - for (int32_t i = 0; i < tm->num_buckets; i++) { + for (uint64_t i = 0; i < tm->num_buckets; i++) { tm->buckets[i] = -1; } // Reset all entries - for (int32_t i = 0; i < tm->pool_size; i++) { + for (uint64_t i = 0; i < 
tm->pool_size; i++) { tm->entry_pool[i].in_bucket = false; tm->entry_pool[i].next_in_bucket = -1; tm->entry_pool[i].prev_in_bucket = -1; @@ -155,7 +155,7 @@ uint32_t pto2_tensormap_hash(PTO2TensorMap* tm, Tensor* tensor) { // Region A: base=X, offset=0 → bucket 5 // Region B: base=X, offset=128 → bucket 5 (CORRECT! Same bucket) // - uint64_t key = (uint64_t)(uintptr_t)tensor->buffer.addr; + uint64_t key = tensor->buffer.addr; // Improve distribution by mixing bits (pointers often have aligned low bits) key = key ^ (key >> 16); @@ -298,13 +298,13 @@ void pto2_tensormap_lookup(PTO2TensorMap* tm, Tensor* tensor, PTO2LookupResult* void pto2_tensormap_insert(PTO2TensorMap* tm, Tensor* tensor, int32_t producer_task_id, bool with_alloc) { // Allocate entry from ring buffer pool - int32_t entry_offset = tm->pool_head; + uint64_t entry_offset = tm->pool_head; PTO2TensorMapEntry* entry = &tm->entry_pool[entry_offset]; // Advance pool head (wrap around) tm->pool_head = (tm->pool_head + 1) % tm->pool_size; - size_t wait_count = 0; + uint64_t wait_count = 0; while (entry->in_bucket) { pto2_orchestrator_sync_tensormap(tm); always_assert(wait_count++ <= 1000000000UL); @@ -321,9 +321,9 @@ void pto2_tensormap_insert(PTO2TensorMap* tm, Tensor* tensor, int32_t producer_t entry->prev_in_bucket = -1; // New head has no predecessor // Update old head's prev pointer if (entry->next_in_bucket >= 0) { - tm->entry_pool[entry->next_in_bucket].prev_in_bucket = entry_offset; + tm->entry_pool[entry->next_in_bucket].prev_in_bucket = (int32_t)entry_offset; } - tm->buckets[bucket] = entry_offset; + tm->buckets[bucket] = (int32_t)entry_offset; entry->in_bucket = true; // Link to task's entry list (for cleanup) @@ -332,9 +332,9 @@ void pto2_tensormap_insert(PTO2TensorMap* tm, Tensor* tensor, int32_t producer_t entry->prev_in_task = -1; // New head has no predecessor // Update old head's prev pointer if (entry->next_in_task >= 0) { - tm->entry_pool[entry->next_in_task].prev_in_task = entry_offset; + 
tm->entry_pool[entry->next_in_task].prev_in_task = (int32_t)entry_offset; } - tm->task_entry_head[task_slot] = entry_offset; + tm->task_entry_head[task_slot] = (int32_t)entry_offset; } // ============================================================================= @@ -350,7 +350,7 @@ void pto2_tensormap_print_stats(PTO2TensorMap* tm) { int32_t non_empty_buckets = 0; // Count entries - for (int32_t i = 0; i < tm->pool_size; i++) { + for (uint64_t i = 0; i < tm->pool_size; i++) { if (tm->entry_pool[i].in_bucket) { if (pto2_tensormap_entry_valid(tm, &tm->entry_pool[i])) { valid++; @@ -361,7 +361,7 @@ } // Count bucket stats - for (int32_t b = 0; b < tm->num_buckets; b++) { + for (uint64_t b = 0; b < tm->num_buckets; b++) { int32_t chain_len = 0; int32_t offset = tm->buckets[b]; @@ -382,9 +382,9 @@ } printf("=== TensorMap Statistics ===\n"); - printf("Pool size: %d\n", tm->pool_size); - printf("Pool head: %d\n", tm->pool_head); - printf("Num buckets: %d\n", tm->num_buckets); + printf("Pool size: %llu\n", (unsigned long long)tm->pool_size); + printf("Pool head: %llu\n", (unsigned long long)tm->pool_head); + printf("Num buckets: %llu\n", (unsigned long long)tm->num_buckets); printf("Valid entries: %d\n", valid); printf("Stale entries: %d\n", stale); printf("Empty buckets: %d\n", empty_buckets); @@ -397,7 +397,7 @@ int32_t pto2_tensormap_valid_count(PTO2TensorMap* tm) { int32_t count = 0; - for (int32_t i = 0; i < tm->pool_size; i++) { + for (uint64_t i = 0; i < tm->pool_size; i++) { if (tm->entry_pool[i].in_bucket && pto2_tensormap_entry_valid(tm, &tm->entry_pool[i])) { count++; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 6e974cde..ce0091c0 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ 
b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -74,13 +74,13 @@ typedef struct { */ typedef struct { // Hash table buckets (fixed size, power of 2) - int32_t* buckets; // Array of offsets into entry_pool (-1 = empty) - int32_t num_buckets; // Must be power of 2 for fast modulo + int32_t* buckets; // Array of offsets into entry_pool (-1 = empty) + uint64_t num_buckets; // Must be power of 2 for fast modulo // Entry pool as ring buffer PTO2TensorMapEntry* entry_pool; // Ring buffer of entries - int32_t pool_size; // Total pool capacity - int32_t pool_head; // Next allocation position (wraps around) + uint64_t pool_size; // Total pool capacity + uint64_t pool_head; // Next allocation position (wraps around) // Per-task entry tracking (for efficient bucket cleanup) int32_t* task_entry_head; // Per-task head offset (-1 = no entries) @@ -104,7 +104,7 @@ typedef struct { * @param pool_size Size of entry pool * @return true on success, false on allocation failure */ -bool pto2_tensormap_init(PTO2TensorMap* tm, int32_t num_buckets, int32_t pool_size); +bool pto2_tensormap_init(PTO2TensorMap* tm, uint64_t num_buckets, uint64_t pool_size); /** * Initialize TensorMap with default sizes diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp index f1d8be3d..5fe14382 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp @@ -43,7 +43,7 @@ Runtime::Runtime() { // Tensor Pair Management // ============================================================================= -void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size) { +void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size) { if (tensor_pair_count >= RUNTIME_MAX_TENSOR_PAIRS) { fprintf(stderr, "[Runtime] ERROR: Tensor pairs full (max=%d)\n", RUNTIME_MAX_TENSOR_PAIRS); return; @@ -86,7 +86,7 @@ void 
Runtime::set_pto2_gm_heap(void* p) { pto2_gm_heap_ptr_ = p; } void Runtime::set_orch_args(uint64_t* args, int count) { orch_arg_count_ = count <= RUNTIME_MAX_ARGS ? count : RUNTIME_MAX_ARGS; if (args && orch_arg_count_ > 0) { - memcpy(orch_args_storage_, args, (size_t)orch_arg_count_ * sizeof(uint64_t)); + memcpy(orch_args_storage_, args, (uint64_t)orch_arg_count_ * sizeof(uint64_t)); // Note: We no longer store orch_args_ pointer as it would contain host address // get_orch_args() now computes address from embedded storage directly } @@ -94,14 +94,14 @@ void Runtime::set_orch_args(uint64_t* args, int count) { // Device orchestration SO binary (for dlopen on AICPU thread 3) // Copies data to internal storage to avoid lifetime issues with Python ctypes arrays -void Runtime::set_device_orch_so(const void* data, size_t size) { +void Runtime::set_device_orch_so(const void* data, uint64_t size) { if (data == nullptr || size == 0) { device_orch_so_size_ = 0; return; } if (size > RUNTIME_MAX_ORCH_SO_SIZE) { - fprintf(stderr, "[Runtime] ERROR: Orchestration SO too large (%zu > %d)\n", - size, RUNTIME_MAX_ORCH_SO_SIZE); + fprintf(stderr, "[Runtime] ERROR: Orchestration SO too large (%llu > %d)\n", + (unsigned long long)size, RUNTIME_MAX_ORCH_SO_SIZE); device_orch_so_size_ = 0; return; } @@ -113,7 +113,7 @@ const void* Runtime::get_device_orch_so_data() const { return device_orch_so_size_ > 0 ? 
device_orch_so_storage_ : nullptr; } -size_t Runtime::get_device_orch_so_size() const { +uint64_t Runtime::get_device_orch_so_size() const { return device_orch_so_size_; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 20e8183b..7902ee63 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -89,7 +89,7 @@ struct Handshake { struct TensorPair { void* host_ptr; void* dev_ptr; - size_t size; + uint64_t size; }; /** @@ -97,11 +97,11 @@ struct TensorPair { * Allows runtime to use pluggable device memory backends. */ struct HostApi { - void* (*device_malloc)(size_t size); + void* (*device_malloc)(uint64_t size); void (*device_free)(void* dev_ptr); - int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size); - int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size); - uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size); + int (*copy_to_device)(void* dev_ptr, const void* host_ptr, uint64_t size); + int (*copy_from_device)(void* host_ptr, const void* dev_ptr, uint64_t size); + uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, uint64_t bin_size); }; /** @@ -160,7 +160,7 @@ class Runtime { // Device orchestration SO binary (for dlopen on AICPU thread 3) // Stored as a copy to avoid lifetime issues with Python ctypes arrays uint8_t device_orch_so_storage_[RUNTIME_MAX_ORCH_SO_SIZE]; - size_t device_orch_so_size_; + uint64_t device_orch_so_size_; public: /** @@ -175,7 +175,7 @@ class Runtime { /** * Record a host-device tensor pair for copy-back during finalize. */ - void record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size); + void record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size); /** * Get pointer to tensor pairs array. 
@@ -207,9 +207,9 @@ class Runtime { void set_orch_args(uint64_t* args, int count); // Device orchestration SO binary (for dlopen on AICPU thread 3) - void set_device_orch_so(const void* data, size_t size); + void set_device_orch_so(const void* data, uint64_t size); const void* get_device_orch_so_data() const; - size_t get_device_orch_so_size() const; + uint64_t get_device_orch_so_size() const; uint64_t get_function_bin_addr(int func_id) const; void set_function_bin_addr(int func_id, uint64_t addr); diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index ee9ca815..e7d58b06 100644 --- a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -60,7 +60,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ uint8_t* sij_raw, float scale using TileScalarDN = Tile; TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijDyn sijDynTile(static_cast(valid_len)); TileSijPad sijPadTile; TileVecMxN pijTile; TileVecMxN tmpTile; diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 955eca58..1fad9193 100644 --- a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -95,11 +95,11 @@ extern "C" int orchestration(Runtime* runtime) { int num_head_tiles = (num_heads + q_tile_size - 1) / q_tile_size; // Buffer sizes for per-block intermediates - size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); - size_t pij_size = static_cast(q_tile_size) * block_size * 
sizeof(uint16_t); - size_t mij_size = static_cast(q_tile_size) * sizeof(float); - size_t lij_size = mij_size; - size_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); + uint64_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + uint64_t mij_size = static_cast(q_tile_size) * sizeof(float); + uint64_t lij_size = mij_size; + uint64_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); // Allocate per-block intermediate buffers on device (HBM) int total_buffers = batch * max_num_blocks; @@ -119,9 +119,9 @@ extern "C" int orchestration(Runtime* runtime) { // Per-(batch, head_tile) accumulators int total_accums = batch * num_head_tiles; - size_t mi_size = static_cast(q_tile_size) * sizeof(float); - size_t li_size = mi_size; - size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t mi_size = static_cast(q_tile_size) * sizeof(float); + uint64_t li_size = mi_size; + uint64_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); void** dev_mi_arr = new void*[total_accums]; void** dev_li_arr = new void*[total_accums]; diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index ee9ca815..e7d58b06 100644 --- a/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -60,7 +60,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ uint8_t* sij_raw, float scale using TileScalarDN = Tile; TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijDyn sijDynTile(static_cast(valid_len)); TileSijPad sijPadTile; TileVecMxN pijTile; TileVecMxN tmpTile; diff --git 
a/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 46d26e7d..bb33419b 100644 --- a/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -37,13 +37,13 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) void* host_out = reinterpret_cast(args[5]); int64_t* host_config = reinterpret_cast(args[6]); - size_t query_size = static_cast(args[7]); - size_t key_cache_size = static_cast(args[8]); - size_t value_cache_size = static_cast(args[9]); - size_t block_table_size = static_cast(args[10]); - size_t context_lens_size = static_cast(args[11]); - size_t out_size = static_cast(args[12]); - size_t config_size = static_cast(args[13]); + uint64_t query_size = static_cast(args[7]); + uint64_t key_cache_size = static_cast(args[8]); + uint64_t value_cache_size = static_cast(args[9]); + uint64_t block_table_size = static_cast(args[10]); + uint64_t context_lens_size = static_cast(args[11]); + uint64_t out_size = static_cast(args[12]); + uint64_t config_size = static_cast(args[13]); int batch = static_cast(host_config[0]); int num_heads = static_cast(host_config[1]); @@ -79,11 +79,11 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) runtime->record_tensor_pair(host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size - size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); - size_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); - size_t mij_size = static_cast(q_tile_size) * sizeof(float); - size_t lij_size = mij_size; - size_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t sij_size = static_cast(q_tile_size) * block_size 
* sizeof(float); + uint64_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + uint64_t mij_size = static_cast(q_tile_size) * sizeof(float); + uint64_t lij_size = mij_size; + uint64_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); // Per-batch-per-block intermediate buffers int total_buffers = batch * max_num_blocks; @@ -103,9 +103,9 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) // Per-(batch, head_tile) accumulators int total_accums = batch * num_head_tiles; - size_t mi_size = static_cast(q_tile_size) * sizeof(float); - size_t li_size = mi_size; - size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t mi_size = static_cast(q_tile_size) * sizeof(float); + uint64_t li_size = mi_size; + uint64_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); void** dev_mi_arr = new void*[total_accums]; void** dev_li_arr = new void*[total_accums]; diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index 0980c2d1..1d54c9bd 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -63,7 +63,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, using TileScalarDN = Tile; TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijDyn sijDynTile(valid_len); TileSijPad sijPadTile; TileVecMxN pijTile; TileVecMxN tmpTile; diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 52006506..2d7f2b09 100644 --- 
a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -73,13 +73,13 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim int64_t* host_config = reinterpret_cast(args[6]); // Extract sizes (next 7) - size_t query_size = static_cast(args[7]); - size_t key_cache_size = static_cast(args[8]); - size_t value_cache_size = static_cast(args[9]); - size_t block_table_size = static_cast(args[10]); - size_t context_lens_size = static_cast(args[11]); - size_t out_size = static_cast(args[12]); - size_t config_size = static_cast(args[13]); + uint64_t query_size = static_cast(args[7]); + uint64_t key_cache_size = static_cast(args[8]); + uint64_t value_cache_size = static_cast(args[9]); + uint64_t block_table_size = static_cast(args[10]); + uint64_t context_lens_size = static_cast(args[11]); + uint64_t out_size = static_cast(args[12]); + uint64_t config_size = static_cast(args[13]); // Extract config parameters uint64_t batch = static_cast(static_cast(host_config[0]));