diff --git a/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp index 496ecef9..30da58af 100644 --- a/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -29,7 +29,7 @@ constexpr int GRID_K = 4; constexpr int GRID_N = 4; constexpr int BATCH = 1; -constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float); +constexpr uint64_t TILE_BYTES = TILE * TILE * sizeof(float); constexpr int NUM_P_BUFFERS = BATCH * GRID_M * GRID_N; constexpr int DEV_A = 0; @@ -82,9 +82,9 @@ extern "C" int orchestration(Runtime* runtime) { for (int n_idx = 0; n_idx < GRID_N; n_idx++) { for (int k_idx = 0; k_idx < GRID_K; k_idx++) { // Calculate tile offsets - size_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES; - size_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES; - size_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES; + uint64_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES; + uint64_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES; + uint64_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES; int c_tile_idx = batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx; diff --git a/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp b/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp index 09c41f8f..900a2371 100644 --- a/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp +++ b/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp @@ -63,7 +63,7 @@ extern "C" int orchestration(Runtime* runtime) { // Allocate intermediate tensors on device (HBM, accessible by AIV cores). 
// Note: malloc() on AICPU returns AICPU-local memory which AIV cores cannot access. - size_t bytes = static_cast(size) * sizeof(float); + uint64_t bytes = static_cast(size) * sizeof(float); void* dev_c = api.device_malloc(bytes); void* dev_d = api.device_malloc(bytes); void* dev_e = api.device_malloc(bytes); diff --git a/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp index 5c6254d5..ae831e2d 100644 --- a/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -30,7 +30,7 @@ constexpr int GRID_K = 4; constexpr int GRID_N = 4; constexpr int BATCH = 1; -constexpr size_t TILE_BYTES = TILE * TILE * sizeof(float); +constexpr uint64_t TILE_BYTES = TILE * TILE * sizeof(float); int build_bgemm_graph(Runtime* runtime, uint64_t* args, int arg_count) { if (arg_count < 7) { @@ -41,9 +41,9 @@ int build_bgemm_graph(Runtime* runtime, uint64_t* args, int arg_count) { void* host_A = reinterpret_cast(args[0]); void* host_B = reinterpret_cast(args[1]); void* host_C = reinterpret_cast(args[2]); - size_t size_A = static_cast(args[3]); - size_t size_B = static_cast(args[4]); - size_t size_C = static_cast(args[5]); + uint64_t size_A = static_cast(args[3]); + uint64_t size_B = static_cast(args[4]); + uint64_t size_C = static_cast(args[5]); std::cout << "\n=== build_bgemm_graph ===" << '\n'; std::cout << "Grid: " << GRID_M << " x " << GRID_K << " x " << GRID_N << '\n'; @@ -94,9 +94,9 @@ int build_bgemm_graph(Runtime* runtime, uint64_t* args, int arg_count) { for (int n_idx = 0; n_idx < GRID_N; n_idx++) { for (int k_idx = 0; k_idx < GRID_K; k_idx++) { // Calculate tile offsets - size_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES; - size_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES; - size_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + 
n_idx) * TILE_BYTES; + uint64_t A_offset = (batch * GRID_M * GRID_K + m_idx * GRID_K + k_idx) * TILE_BYTES; + uint64_t B_offset = (batch * GRID_K * GRID_N + k_idx * GRID_N + n_idx) * TILE_BYTES; + uint64_t C_offset = (batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx) * TILE_BYTES; int c_tile_idx = batch * GRID_M * GRID_N + m_idx * GRID_N + n_idx; diff --git a/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp b/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp index 6c0578e5..041b339c 100644 --- a/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp +++ b/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp @@ -37,10 +37,10 @@ int build_matmul_graph(Runtime* runtime, uint64_t* args, int arg_count) { void* host_w1 = reinterpret_cast(args[1]); void* host_w2 = reinterpret_cast(args[2]); void* host_f = reinterpret_cast(args[3]); - size_t size_a = static_cast(args[4]); - size_t size_w1 = static_cast(args[5]); - size_t size_w2 = static_cast(args[6]); - size_t size_f = static_cast(args[7]); + uint64_t size_a = static_cast(args[4]); + uint64_t size_w1 = static_cast(args[5]); + uint64_t size_w2 = static_cast(args[6]); + uint64_t size_f = static_cast(args[7]); int SIZE = static_cast(args[8]); std::cout << "\n=== build_matmul_graph: Creating Task Runtime ===" << '\n'; @@ -92,8 +92,8 @@ int build_matmul_graph(Runtime* runtime, uint64_t* args, int arg_count) { // Allocate intermediate tensors (b, c, d) // dev_b is half precision (output of log_sqrt kernel, input to matmul) // dev_c, dev_d are float precision (output of matmul kernels) - size_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes - size_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes + uint64_t BYTES_HALF = SIZE * sizeof(uint16_t); // half = 2 bytes + uint64_t BYTES_FLOAT = SIZE * sizeof(float); // float = 4 bytes void* dev_b = runtime->host_api.device_malloc(BYTES_HALF); // sqrt(log(A)) - half output void* dev_c = 
runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W1 - float output void* dev_d = runtime->host_api.device_malloc(BYTES_FLOAT); // B @ W2 - float output diff --git a/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 45c6ac75..3bd7ef4c 100644 --- a/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -37,13 +37,13 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) void* host_out = reinterpret_cast(args[5]); int64_t* host_config = reinterpret_cast(args[6]); - size_t query_size = static_cast(args[7]); - size_t key_cache_size = static_cast(args[8]); - size_t value_cache_size = static_cast(args[9]); - size_t block_table_size = static_cast(args[10]); - size_t context_lens_size = static_cast(args[11]); - size_t out_size = static_cast(args[12]); - size_t config_size = static_cast(args[13]); + uint64_t query_size = static_cast(args[7]); + uint64_t key_cache_size = static_cast(args[8]); + uint64_t value_cache_size = static_cast(args[9]); + uint64_t block_table_size = static_cast(args[10]); + uint64_t context_lens_size = static_cast(args[11]); + uint64_t out_size = static_cast(args[12]); + uint64_t config_size = static_cast(args[13]); int batch = static_cast(host_config[0]); int num_heads = static_cast(host_config[1]); @@ -79,11 +79,11 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) runtime->record_tensor_pair(host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size - size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); - size_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); - size_t mij_size = static_cast(q_tile_size) * sizeof(float); - size_t lij_size = mij_size; - size_t 
oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); + uint64_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + uint64_t mij_size = static_cast(q_tile_size) * sizeof(float); + uint64_t lij_size = mij_size; + uint64_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); // Per-batch-per-block intermediate buffers int total_buffers = batch * max_num_blocks; @@ -103,9 +103,9 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) // Per-(batch, head_tile) accumulators int total_accums = batch * num_head_tiles; - size_t mi_size = static_cast(q_tile_size) * sizeof(float); - size_t li_size = mi_size; - size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t mi_size = static_cast(q_tile_size) * sizeof(float); + uint64_t li_size = mi_size; + uint64_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); void** dev_mi_arr = new void*[total_accums]; void** dev_li_arr = new void*[total_accums]; diff --git a/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp b/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp index 26ee6b07..6c649e70 100644 --- a/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp +++ b/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp @@ -29,9 +29,9 @@ int build_example_graph(Runtime* runtime, uint64_t* args, int arg_count) { void* host_a = reinterpret_cast(args[0]); void* host_b = reinterpret_cast(args[1]); void* host_f = reinterpret_cast(args[2]); - size_t size_a = static_cast(args[3]); - size_t size_b = static_cast(args[4]); - size_t size_f = static_cast(args[5]); + uint64_t size_a = static_cast(args[3]); + uint64_t size_b = static_cast(args[4]); + uint64_t size_f = static_cast(args[5]); int SIZE = static_cast(args[6]); std::cout << "\n=== 
build_example_graph: Creating Task Runtime ===" << '\n'; @@ -70,7 +70,7 @@ int build_example_graph(Runtime* runtime, uint64_t* args, int arg_count) { std::cout << "Tensor f (output): " << size_f << " bytes allocated\n"; // Allocate intermediate tensors (c, d, e) - size_t BYTES = SIZE * sizeof(float); + uint64_t BYTES = SIZE * sizeof(float); void* dev_c = runtime->host_api.device_malloc(BYTES); void* dev_d = runtime->host_api.device_malloc(BYTES); void* dev_e = runtime->host_api.device_malloc(BYTES); diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp index 70388d1a..5e78aa39 100644 --- a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -69,9 +69,9 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { void* dev_A = (void*)(uintptr_t)args[ARG_PTR_A]; void* dev_B = (void*)(uintptr_t)args[ARG_PTR_B]; void* dev_C = (void*)(uintptr_t)args[ARG_PTR_C]; - size_t size_A = (size_t)args[ARG_SIZE_A]; - size_t size_B = (size_t)args[ARG_SIZE_B]; - size_t size_C = (size_t)args[ARG_SIZE_C]; + uint64_t size_A = (uint64_t)args[ARG_SIZE_A]; + uint64_t size_B = (uint64_t)args[ARG_SIZE_B]; + uint64_t size_C = (uint64_t)args[ARG_SIZE_C]; printf("[bgemm_orch] Grid: %dx%dx%d, Batch: %d, Tile: %d\n", GRID_M, GRID_K, GRID_N, BATCH, TILE); diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index aed3cb35..549477cd 100644 --- a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -65,7 +65,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, using TileScalarDN = 
Tile; TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijDyn sijDynTile(valid_len); TileSijPad sijPadTile; TileVecMxN pijTile; TileVecMxN tmpTile; diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 50567d0c..40d6f470 100644 --- a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -71,9 +71,9 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { int64_t* host_config = (int64_t*)(uintptr_t)args[6]; // Extract sizes (next 7 args after pointers) - size_t query_size = (size_t)args[7]; - size_t key_cache_size = (size_t)args[8]; - size_t value_cache_size = (size_t)args[9]; + uint64_t query_size = (uint64_t)args[7]; + uint64_t key_cache_size = (uint64_t)args[8]; + uint64_t value_cache_size = (uint64_t)args[9]; // Extract config parameters uint64_t batch = (uint64_t)(int)host_config[0]; diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp b/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp index aa9c8f75..ba830127 100644 --- a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp +++ b/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp @@ -88,14 +88,14 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { void* arg_a_ptr = (void*)(uintptr_t)args[ARG_PTR_A]; void* arg_b_ptr = (void*)(uintptr_t)args[ARG_PTR_B]; void* arg_f_ptr = (void*)(uintptr_t)args[ARG_PTR_F]; - size_t size_a = (size_t)args[ARG_SIZE_A]; - size_t size_b = (size_t)args[ARG_SIZE_B]; - size_t size_f = 
(size_t)args[ARG_SIZE_F]; + uint64_t size_a = (uint64_t)args[ARG_SIZE_A]; + uint64_t size_b = (uint64_t)args[ARG_SIZE_B]; + uint64_t size_f = (uint64_t)args[ARG_SIZE_F]; int SIZE = (int)(args[ARG_SIZE] & 0x7FFFFFFF); printf("===============SIZE=%d\n", SIZE); - size_t BYTES = (size_t)SIZE * sizeof(float); + uint64_t BYTES = (uint64_t)SIZE * sizeof(float); Tensor ext_a = make_tensor_external(arg_a_ptr, size_a); Tensor ext_b = make_tensor_external(arg_b_ptr, size_b); diff --git a/src/platform/a2a3/aicpu/device_malloc.cpp b/src/platform/a2a3/aicpu/device_malloc.cpp index 8dc0a870..5904aa95 100644 --- a/src/platform/a2a3/aicpu/device_malloc.cpp +++ b/src/platform/a2a3/aicpu/device_malloc.cpp @@ -34,7 +34,7 @@ static void resolve_hal_mem_functions() { g_hal_resolved = true; } -void* aicpu_device_malloc(size_t size) { +void* aicpu_device_malloc(uint64_t size) { resolve_hal_mem_functions(); if (g_halMemAlloc == nullptr) { @@ -49,9 +49,9 @@ void* aicpu_device_malloc(size_t size) { // bit14~16: phy mem type (MEM_TYPE_HBM=0x1 << 14) constexpr unsigned long long MEM_TYPE_HBM = 0x1ULL << 14; unsigned long long flag = MEM_TYPE_HBM; - int rc = g_halMemAlloc(&ptr, static_cast(size), flag); + int rc = g_halMemAlloc(&ptr, size, flag); if (rc != 0 || ptr == nullptr) { - DEV_ERROR("halMemAlloc failed: rc=%d size=%zu flag=0x%llx", rc, size, flag); + DEV_ERROR("halMemAlloc failed: rc=%d size=%llu flag=0x%llx", rc, size, flag); return nullptr; } return ptr; diff --git a/src/platform/a2a3/host/device_runner.cpp b/src/platform/a2a3/host/device_runner.cpp index a5ece59e..5e6a91a4 100644 --- a/src/platform/a2a3/host/device_runner.cpp +++ b/src/platform/a2a3/host/device_runner.cpp @@ -19,7 +19,7 @@ namespace { void* g_hal_handle = nullptr; -using HalHostRegisterFn = int (*)(void* dev_ptr, size_t size, unsigned int flags, int device_id, void** host_ptr); +using HalHostRegisterFn = int (*)(void* dev_ptr, uint64_t size, unsigned int flags, int device_id, void** host_ptr); using 
HalHostUnregisterFn = int (*)(void* host_ptr, int device_id); int load_hal_if_needed() { @@ -57,8 +57,8 @@ int KernelArgsHelper::init_device_args(const DeviceArgs& host_device_args, Memor // Allocate device memory for device_args if (args.device_args == nullptr) { - uint64_t device_args_size = sizeof(DeviceArgs); - void* device_args_dev = allocator_->alloc(device_args_size); + uint64_t device_args_size = static_cast(sizeof(DeviceArgs)); + void* device_args_dev = allocator_->alloc(static_cast(device_args_size)); if (device_args_dev == nullptr) { LOG_ERROR("Alloc for device_args failed"); return -1; @@ -90,8 +90,8 @@ int KernelArgsHelper::init_runtime_args(const Runtime& host_runtime, MemoryAlloc allocator_ = &allocator; if (args.runtime_args == nullptr) { - uint64_t runtime_size = sizeof(Runtime); - void* runtime_dev = allocator_->alloc(runtime_size); + uint64_t runtime_size = static_cast(sizeof(Runtime)); + void* runtime_dev = allocator_->alloc(static_cast(runtime_size)); if (runtime_dev == nullptr) { LOG_ERROR("Alloc for runtime_args failed"); return -1; @@ -129,8 +129,8 @@ int AicpuSoInfo::init(const std::vector& aicpu_so_binary, MemoryAllocat return -1; } - size_t file_size = aicpu_so_binary.size(); - void* d_aicpu_data = allocator_->alloc(file_size); + uint64_t file_size = static_cast(aicpu_so_binary.size()); + void* d_aicpu_data = allocator_->alloc(static_cast(file_size)); if (d_aicpu_data == nullptr) { LOG_ERROR("Alloc failed for AICPU SO"); return -1; @@ -256,7 +256,7 @@ int DeviceRunner::ensure_binaries_loaded( return 0; } -void* DeviceRunner::allocate_tensor(size_t bytes) { return mem_alloc_.alloc(bytes); } +void* DeviceRunner::allocate_tensor(uint64_t bytes) { return mem_alloc_.alloc(bytes); } void DeviceRunner::free_tensor(void* dev_ptr) { if (dev_ptr != nullptr) { @@ -264,11 +264,11 @@ void DeviceRunner::free_tensor(void* dev_ptr) { } } -int DeviceRunner::copy_to_device(void* dev_ptr, const void* host_ptr, size_t bytes) { +int 
DeviceRunner::copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t bytes) { return rtMemcpy(dev_ptr, bytes, host_ptr, bytes, RT_MEMCPY_HOST_TO_DEVICE); } -int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, size_t bytes) { +int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t bytes) { return rtMemcpy(host_ptr, bytes, dev_ptr, bytes, RT_MEMCPY_DEVICE_TO_HOST); } @@ -436,7 +436,7 @@ void DeviceRunner::print_handshake_results() { // Allocate temporary buffer to read handshake data from device std::vector workers(worker_count_); - size_t total_size = sizeof(Handshake) * worker_count_; + uint64_t total_size = sizeof(Handshake) * worker_count_; rtMemcpy(workers.data(), total_size, kernel_args_.args.runtime_args->workers, total_size, RT_MEMCPY_DEVICE_TO_HOST); LOG_DEBUG("Handshake results for %d cores:", worker_count_); @@ -537,7 +537,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime* runtime) { return -1; } - size_t bin_size = aicore_kernel_binary_.size(); + uint64_t bin_size = static_cast(aicore_kernel_binary_.size()); const void* bin_data = aicore_kernel_binary_.data(); rtDevBinary_t binary; @@ -579,7 +579,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime* runtime) { // Kernel Binary Upload (returns device address for caller to store in Runtime) // ============================================================================= -uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size) { +uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, uint64_t bin_size) { if (bin_data == nullptr || bin_size == 0) { LOG_ERROR("Invalid kernel binary data"); return 0; @@ -602,7 +602,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data // Allocate device GM memory (size field + binary data) uint64_t alloc_size = sizeof(uint64_t) + bin_size; - void* gm_addr = mem_alloc_.alloc(alloc_size); + 
void* gm_addr = mem_alloc_.alloc(static_cast(alloc_size)); if (gm_addr == nullptr) { LOG_ERROR("Failed to allocate device GM memory for kernel func_id=%d", func_id); return 0; @@ -612,7 +612,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data std::vector host_buf(alloc_size); uint64_t* size_ptr = reinterpret_cast(host_buf.data()); *size_ptr = bin_size; - std::memcpy(host_buf.data() + sizeof(uint64_t), bin_data, bin_size); + std::memcpy(host_buf.data() + sizeof(uint64_t), bin_data, static_cast(bin_size)); // Copy to device int rc = rtMemcpy(gm_addr, alloc_size, host_buf.data(), alloc_size, RT_MEMCPY_HOST_TO_DEVICE); @@ -635,13 +635,13 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data int DeviceRunner::init_performance_profiling(Runtime& runtime, int num_aicore, int device_id) { // Define allocation callback (a2a3: use MemoryAllocator) - auto alloc_cb = [](size_t size, void* user_data) -> void* { + auto alloc_cb = [](uint64_t size, void* user_data) -> void* { auto* allocator = static_cast(user_data); - return allocator->alloc(size); + return allocator->alloc(static_cast(size)); }; // Define registration callback (a2a3: use halHostRegister for shared memory) - auto register_cb = [](void* dev_ptr, size_t size, int device_id, + auto register_cb = [](void* dev_ptr, uint64_t size, int device_id, void* user_data, void** host_ptr) -> int { (void)user_data; // Not needed for registration if (load_hal_if_needed() != 0) { diff --git a/src/platform/a2a3/host/device_runner.h b/src/platform/a2a3/host/device_runner.h index c92bc0dc..14e09322 100644 --- a/src/platform/a2a3/host/device_runner.h +++ b/src/platform/a2a3/host/device_runner.h @@ -159,7 +159,7 @@ class DeviceRunner { * @param bytes Size of tensor in bytes * @return Device pointer on success, nullptr on failure */ - void* allocate_tensor(size_t bytes); + void* allocate_tensor(uint64_t bytes); /** * Free device tensor memory @@ -176,7 +176,7 @@ class 
DeviceRunner { * @param bytes Number of bytes to copy * @return 0 on success, error code on failure */ - int copy_to_device(void* dev_ptr, const void* host_ptr, size_t bytes); + int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t bytes); /** * Copy data from device to host @@ -186,7 +186,7 @@ class DeviceRunner { * @param bytes Number of bytes to copy * @return 0 on success, error code on failure */ - int copy_from_device(void* host_ptr, const void* dev_ptr, size_t bytes); + int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t bytes); /** * Execute a runtime @@ -303,7 +303,7 @@ class DeviceRunner { * @param bin_size Size of binary data in bytes * @return Device GM address of kernel on success, 0 on error */ - uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size); + uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, uint64_t bin_size); /** * Ensure device is set and streams are created (minimal initialization) diff --git a/src/platform/a2a3/host/pto_runtime_c_api.cpp b/src/platform/a2a3/host/pto_runtime_c_api.cpp index 935b5e93..fa62922c 100644 --- a/src/platform/a2a3/host/pto_runtime_c_api.cpp +++ b/src/platform/a2a3/host/pto_runtime_c_api.cpp @@ -20,7 +20,7 @@ extern "C" { */ int init_runtime_impl(Runtime* runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -28,16 +28,16 @@ int init_runtime_impl(Runtime* runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); /* Forward declarations for device memory functions used in init_runtime */ -void* device_malloc(size_t size); +void* device_malloc(uint64_t size); void device_free(void* dev_ptr); -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); -int 
copy_from_device(void* host_ptr, const void* dev_ptr, size_t size); -uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size); +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size); +int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size); +uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, uint64_t bin_size); /* =========================================================================== */ @@ -45,11 +45,11 @@ uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size /* =========================================================================== */ -size_t get_runtime_size(void) { return sizeof(Runtime); } +uint64_t get_runtime_size(void) { return sizeof(Runtime); } int init_runtime(RuntimeHandle runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -57,7 +57,7 @@ int init_runtime(RuntimeHandle runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { if (runtime == NULL) { return -1; @@ -103,7 +103,7 @@ int init_runtime(RuntimeHandle runtime, /* =========================================================================== */ -void* device_malloc(size_t size) { +void* device_malloc(uint64_t size) { try { DeviceRunner& runner = DeviceRunner::get(); return runner.allocate_tensor(size); @@ -124,7 +124,7 @@ void device_free(void* dev_ptr) { } } -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size) { +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size) { if (dev_ptr == NULL || host_ptr == NULL) { return -1; } @@ -136,7 +136,7 @@ int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size) { } } -int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size) { +int 
copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size) { if (host_ptr == NULL || dev_ptr == NULL) { return -1; } @@ -148,7 +148,7 @@ int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size) { } } -uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size) { +uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, uint64_t bin_size) { try { DeviceRunner& runner = DeviceRunner::get(); return runner.upload_kernel_binary(func_id, bin_data, bin_size); @@ -162,9 +162,9 @@ int launch_runtime(RuntimeHandle runtime, int block_dim, int device_id, const uint8_t* aicpu_binary, - size_t aicpu_size, + uint64_t aicpu_size, const uint8_t* aicore_binary, - size_t aicore_size) { + uint64_t aicore_size) { if (runtime == NULL) { return -1; } @@ -218,7 +218,7 @@ int set_device(int device_id) { void record_tensor_pair(RuntimeHandle runtime, void* host_ptr, void* dev_ptr, - size_t size) { + uint64_t size) { if (runtime == NULL) { return; } diff --git a/src/platform/a2a3sim/aicpu/device_malloc.cpp b/src/platform/a2a3sim/aicpu/device_malloc.cpp index 51d6e877..bf6a5cf4 100644 --- a/src/platform/a2a3sim/aicpu/device_malloc.cpp +++ b/src/platform/a2a3sim/aicpu/device_malloc.cpp @@ -10,7 +10,7 @@ #include -void* aicpu_device_malloc(size_t size) { +void* aicpu_device_malloc(uint64_t size) { return malloc(size); } diff --git a/src/platform/a2a3sim/host/device_runner.cpp b/src/platform/a2a3sim/host/device_runner.cpp index 491b33c8..252870cc 100644 --- a/src/platform/a2a3sim/host/device_runner.cpp +++ b/src/platform/a2a3sim/host/device_runner.cpp @@ -100,7 +100,7 @@ int DeviceRunner::ensure_binaries_loaded(const std::vector& aicpu_so_bi return 0; } -void* DeviceRunner::allocate_tensor(size_t bytes) { +void* DeviceRunner::allocate_tensor(uint64_t bytes) { return mem_alloc_.alloc(bytes); } @@ -110,15 +110,15 @@ void DeviceRunner::free_tensor(void* dev_ptr) { } } -int DeviceRunner::copy_to_device(void* dev_ptr, const 
void* host_ptr, size_t bytes) { +int DeviceRunner::copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t bytes) { // In simulation, this is just a memcpy - std::memcpy(dev_ptr, host_ptr, bytes); + std::memcpy(dev_ptr, host_ptr, static_cast(bytes)); return 0; } -int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, size_t bytes) { +int DeviceRunner::copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t bytes) { // In simulation, this is just a memcpy - std::memcpy(host_ptr, dev_ptr, bytes); + std::memcpy(host_ptr, dev_ptr, static_cast(bytes)); return 0; } @@ -353,7 +353,7 @@ int DeviceRunner::finalize() { // Kernel Binary Upload (returns function address for caller to store in Runtime) // ============================================================================= -uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size) { +uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data, uint64_t bin_size) { if (bin_data == nullptr || bin_size == 0) { LOG_ERROR("Invalid kernel data"); return 0; @@ -419,9 +419,9 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t* bin_data int DeviceRunner::init_performance_profiling(Runtime& runtime, int num_aicore, int device_id) { // Define allocation callback (a2a3sim: use malloc) - auto alloc_cb = [](size_t size, void* user_data) -> void* { + auto alloc_cb = [](uint64_t size, void* user_data) -> void* { (void)user_data; // Not needed for malloc - return malloc(size); + return malloc(static_cast(size)); }; // Simulation: no registration needed (pass nullptr) diff --git a/src/platform/a2a3sim/host/device_runner.h b/src/platform/a2a3sim/host/device_runner.h index 3ae8ef34..0db59ecc 100644 --- a/src/platform/a2a3sim/host/device_runner.h +++ b/src/platform/a2a3sim/host/device_runner.h @@ -74,7 +74,7 @@ class DeviceRunner { * @param bytes Size of tensor in bytes * @return Pointer on success, nullptr on failure */ - void* 
allocate_tensor(size_t bytes); + void* allocate_tensor(uint64_t bytes); /** * Free tensor memory @@ -91,7 +91,7 @@ class DeviceRunner { * @param bytes Number of bytes to copy * @return 0 on success */ - int copy_to_device(void* dev_ptr, const void* host_ptr, size_t bytes); + int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t bytes); /** * Copy data (memcpy in simulation) @@ -101,7 +101,7 @@ class DeviceRunner { * @param bytes Number of bytes to copy * @return 0 on success */ - int copy_from_device(void* host_ptr, const void* dev_ptr, size_t bytes); + int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t bytes); /** * Execute a runtime using threads @@ -179,7 +179,7 @@ class DeviceRunner { * @param bin_size Size of binary data in bytes * @return Function pointer address on success, 0 on error */ - uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, size_t bin_size); + uint64_t upload_kernel_binary(int func_id, const uint8_t* bin_data, uint64_t bin_size); private: DeviceRunner() = default; diff --git a/src/platform/a2a3sim/host/pto_runtime_c_api.cpp b/src/platform/a2a3sim/host/pto_runtime_c_api.cpp index dd6052ff..7e402539 100644 --- a/src/platform/a2a3sim/host/pto_runtime_c_api.cpp +++ b/src/platform/a2a3sim/host/pto_runtime_c_api.cpp @@ -23,7 +23,7 @@ extern "C" { */ int init_runtime_impl(Runtime* runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -31,29 +31,29 @@ int init_runtime_impl(Runtime* runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count); int validate_runtime_impl(Runtime* runtime); /* Forward declarations */ -void* device_malloc(size_t size); +void* device_malloc(uint64_t size); void device_free(void* dev_ptr); -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); 
-int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size); -uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size); +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size); +int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size); +uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, uint64_t bin_size); /* =========================================================================== * Runtime API Implementation * =========================================================================== */ -size_t get_runtime_size(void) { +uint64_t get_runtime_size(void) { return sizeof(Runtime); } int init_runtime(RuntimeHandle runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -61,7 +61,7 @@ int init_runtime(RuntimeHandle runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { if (runtime == NULL) { return -1; @@ -102,7 +102,7 @@ int init_runtime(RuntimeHandle runtime, * =========================================================================== */ -void* device_malloc(size_t size) { +void* device_malloc(uint64_t size) { try { DeviceRunner& runner = DeviceRunner::get(); return runner.allocate_tensor(size); @@ -123,7 +123,7 @@ void device_free(void* dev_ptr) { } } -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size) { +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size) { if (dev_ptr == NULL || host_ptr == NULL) { return -1; } @@ -135,7 +135,7 @@ int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size) { } } -int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size) { +int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size) { if (host_ptr == NULL || dev_ptr == NULL) { 
return -1; } @@ -147,7 +147,7 @@ int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size) { } } -uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, size_t bin_size) { +uint64_t upload_kernel_binary_wrapper(int func_id, const uint8_t* bin_data, uint64_t bin_size) { try { DeviceRunner& runner = DeviceRunner::get(); return runner.upload_kernel_binary(func_id, bin_data, bin_size); @@ -161,9 +161,9 @@ int launch_runtime(RuntimeHandle runtime, int block_dim, int device_id, const uint8_t* aicpu_binary, - size_t aicpu_size, + uint64_t aicpu_size, const uint8_t* aicore_binary, - size_t aicore_size) { + uint64_t aicore_size) { if (runtime == NULL) { return -1; } @@ -235,7 +235,7 @@ int enable_runtime_profiling(RuntimeHandle runtime, int enabled) { void record_tensor_pair(RuntimeHandle runtime, void* host_ptr, void* dev_ptr, - size_t size) { + uint64_t size) { if (runtime == NULL) { return; } diff --git a/src/platform/include/aicpu/device_malloc.h b/src/platform/include/aicpu/device_malloc.h index b78e7dfc..cb911ceb 100644 --- a/src/platform/include/aicpu/device_malloc.h +++ b/src/platform/include/aicpu/device_malloc.h @@ -14,7 +14,7 @@ #ifndef PLATFORM_DEVICE_MALLOC_H_ #define PLATFORM_DEVICE_MALLOC_H_ -#include +#include /** * Allocate device memory (HBM on real hardware, heap on simulation). @@ -29,7 +29,7 @@ * @param size Number of bytes to allocate * @return Pointer to allocated memory, or nullptr on failure */ -void* aicpu_device_malloc(size_t size); +void* aicpu_device_malloc(uint64_t size); /** * Free device memory previously allocated by aicpu_device_malloc(). 
diff --git a/src/platform/include/common/perf_profiling.h b/src/platform/include/common/perf_profiling.h index e3fe4a47..0e801989 100644 --- a/src/platform/include/common/perf_profiling.h +++ b/src/platform/include/common/perf_profiling.h @@ -199,7 +199,7 @@ extern "C" { * @param num_cores Number of cores (block_dim × PLATFORM_CORES_PER_BLOCKDIM) * @return Total bytes */ -inline size_t calc_perf_data_size(int num_cores) { +inline uint64_t calc_perf_data_size(int num_cores) { return sizeof(PerfDataHeader) + num_cores * sizeof(DoubleBuffer); } diff --git a/src/platform/include/host/performance_collector.h b/src/platform/include/host/performance_collector.h index a255027c..2fdc4097 100644 --- a/src/platform/include/host/performance_collector.h +++ b/src/platform/include/host/performance_collector.h @@ -30,7 +30,7 @@ * @param user_data User-provided context pointer * @return Allocated device memory pointer, or nullptr on failure */ -using PerfAllocCallback = void* (*)(size_t size, void* user_data); +using PerfAllocCallback = void* (*)(uint64_t size, void* user_data); /** * Memory registration callback (for Host-Device shared memory) @@ -42,7 +42,7 @@ using PerfAllocCallback = void* (*)(size_t size, void* user_data); * @param[out] host_ptr Host-mapped pointer * @return 0 on success, error code on failure */ -using PerfRegisterCallback = int (*)(void* dev_ptr, size_t size, int device_id, +using PerfRegisterCallback = int (*)(void* dev_ptr, uint64_t size, int device_id, void* user_data, void** host_ptr); /** diff --git a/src/platform/include/host/pto_runtime_c_api.h b/src/platform/include/host/pto_runtime_c_api.h index 637aad3a..1b1f6bf8 100644 --- a/src/platform/include/host/pto_runtime_c_api.h +++ b/src/platform/include/host/pto_runtime_c_api.h @@ -65,7 +65,7 @@ typedef void* RuntimeHandle; * * @return Size of Runtime structure in bytes */ -size_t get_runtime_size(void); +uint64_t get_runtime_size(void); /** * Initialize a runtime with dynamic orchestration and kernel 
binaries. @@ -94,7 +94,7 @@ size_t get_runtime_size(void); */ int init_runtime(RuntimeHandle runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -102,7 +102,7 @@ int init_runtime(RuntimeHandle runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count); /* =========================================================================== @@ -116,7 +116,7 @@ int init_runtime(RuntimeHandle runtime, * @param size Size in bytes to allocate * @return Device pointer on success, NULL on failure */ -void* device_malloc(size_t size); +void* device_malloc(uint64_t size); /** * Free device memory. @@ -133,7 +133,7 @@ void device_free(void* dev_ptr); * @param size Size in bytes to copy * @return 0 on success, error code on failure */ -int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); +int copy_to_device(void* dev_ptr, const void* host_ptr, uint64_t size); /** * Copy data from device to host. @@ -143,7 +143,7 @@ int copy_to_device(void* dev_ptr, const void* host_ptr, size_t size); * @param size Size in bytes to copy * @return 0 on success, error code on failure */ -int copy_from_device(void* host_ptr, const void* dev_ptr, size_t size); +int copy_from_device(void* host_ptr, const void* dev_ptr, uint64_t size); /** * Execute a runtime on the device. @@ -167,9 +167,9 @@ int launch_runtime(RuntimeHandle runtime, int block_dim, int device_id, const uint8_t* aicpu_binary, - size_t aicpu_size, + uint64_t aicpu_size, const uint8_t* aicore_binary, - size_t aicore_size); + uint64_t aicore_size); /** * Finalize and cleanup a runtime instance. 
@@ -217,7 +217,7 @@ int set_device(int device_id); void record_tensor_pair(RuntimeHandle runtime, void* host_ptr, void* dev_ptr, - size_t size); + uint64_t size); /** diff --git a/src/platform/src/performance_collector.cpp b/src/platform/src/performance_collector.cpp index a018f19e..f3c23f95 100644 --- a/src/platform/src/performance_collector.cpp +++ b/src/platform/src/performance_collector.cpp @@ -41,10 +41,10 @@ int PerformanceCollector::initialize(Runtime& runtime, device_id_ = device_id; // Step 1: Calculate total memory size - size_t total_size = calc_perf_data_size(num_aicore); - size_t header_size = sizeof(PerfDataHeader); - size_t single_db_size = sizeof(DoubleBuffer); - size_t buffers_size = num_aicore * single_db_size; + uint64_t total_size = calc_perf_data_size(num_aicore); + uint64_t header_size = sizeof(PerfDataHeader); + uint64_t single_db_size = sizeof(DoubleBuffer); + uint64_t buffers_size = num_aicore * single_db_size; LOG_DEBUG("Memory allocation plan:"); LOG_DEBUG(" Number of cores: %d", num_aicore); @@ -297,7 +297,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) { outfile << " \"version\": 1,\n"; outfile << " \"tasks\": [\n"; - for (size_t i = 0; i < sorted_records.size(); ++i) { + for (uint64_t i = 0; i < sorted_records.size(); ++i) { const auto& record = sorted_records[i]; // Convert times to microseconds diff --git a/src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp b/src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp index 0fef4011..7e4d7380 100644 --- a/src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp +++ b/src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp @@ -38,19 +38,19 @@ extern "C" void aicpu_runtime_publish_task(Runtime* runtime, int task_id); namespace { using AicpuBuilderFunc = int (*)(Runtime*); -int write_bytes_to_file(const char* path, const uint8_t* data, size_t size) { +int write_bytes_to_file(const char* path, const uint8_t* data, uint64_t size) { int fd = ::open(path, 
O_WRONLY | O_CREAT | O_TRUNC, 0755); if (fd < 0) { return -1; } - size_t off = 0; + uint64_t off = 0; while (off < size) { - ssize_t n = ::write(fd, data + off, size - off); + ssize_t n = ::write(fd, data + off, static_cast(size - off)); if (n <= 0) { ::close(fd); return -1; } - off += static_cast(n); + off += static_cast(n); } ::close(fd); return 0; @@ -81,7 +81,7 @@ int build_graph_via_aicpu_plugin(Runtime* runtime, int thread_idx) { } const void* so_data_v = runtime->get_aicpu_orch_so_data(); - size_t so_size = runtime->get_aicpu_orch_so_size(); + uint64_t so_size = runtime->get_aicpu_orch_so_size(); if (so_data_v == nullptr || so_size == 0) { DEV_ERROR("Thread %d: AICPU orch plugin not embedded (size=0). Host orchestration must embed plugin bytes.", thread_idx); @@ -116,7 +116,7 @@ int build_graph_via_aicpu_plugin(Runtime* runtime, int thread_idx) { DEV_INFO("Thread %d: Trying AICPU orch plugin path %s (bytes=%lu, sym=%s)", thread_idx, so_path, - static_cast(so_size), + so_size, sym); if (write_bytes_to_file(so_path, so_data, so_size) != 0) { diff --git a/src/runtime/aicpu_build_graph/host/runtime_maker.cpp b/src/runtime/aicpu_build_graph/host/runtime_maker.cpp index 27bc078a..92181c98 100644 --- a/src/runtime/aicpu_build_graph/host/runtime_maker.cpp +++ b/src/runtime/aicpu_build_graph/host/runtime_maker.cpp @@ -105,7 +105,7 @@ extern "C" { */ int init_runtime_impl(Runtime* runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -113,7 +113,7 @@ int init_runtime_impl(Runtime* runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { if (runtime == nullptr) { std::cerr << "Error: Runtime pointer is null\n"; @@ -163,7 +163,7 @@ int init_runtime_impl(Runtime* runtime, } else { // Pointer argument: allocate device memory. 
void* host_ptr = reinterpret_cast(func_args[i]); - size_t nbytes = static_cast(asize); + uint64_t nbytes = asize; void* dev_ptr = runtime->host_api.device_malloc(nbytes); if (dev_ptr == nullptr) { diff --git a/src/runtime/aicpu_build_graph/runtime/runtime.cpp b/src/runtime/aicpu_build_graph/runtime/runtime.cpp index 6388c482..2e2b092f 100644 --- a/src/runtime/aicpu_build_graph/runtime/runtime.cpp +++ b/src/runtime/aicpu_build_graph/runtime/runtime.cpp @@ -49,7 +49,7 @@ Runtime::Runtime() { aicpu_build_api = {}; } -bool Runtime::try_set_aicpu_orch_so(const void* data, size_t size) { +bool Runtime::try_set_aicpu_orch_so(const void* data, uint64_t size) { if (data == nullptr || size == 0) { aicpu_orch_so_size = 0; return false; @@ -63,15 +63,15 @@ bool Runtime::try_set_aicpu_orch_so(const void* data, size_t size) { return false; } memcpy(aicpu_orch_so_storage, data, size); - aicpu_orch_so_size = static_cast(size); + aicpu_orch_so_size = size; return true; } -void Runtime::set_aicpu_orch_so(const void* data, size_t size) { (void)try_set_aicpu_orch_so(data, size); } +void Runtime::set_aicpu_orch_so(const void* data, uint64_t size) { (void)try_set_aicpu_orch_so(data, size); } const void* Runtime::get_aicpu_orch_so_data() const { return aicpu_orch_so_size > 0 ? 
aicpu_orch_so_storage : nullptr; } -size_t Runtime::get_aicpu_orch_so_size() const { return static_cast(aicpu_orch_so_size); } +uint64_t Runtime::get_aicpu_orch_so_size() const { return aicpu_orch_so_size; } // ============================================================================= // Task Management @@ -255,7 +255,7 @@ void Runtime::print_runtime() const { // Tensor Pair Management // ============================================================================= -void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size) { +void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size) { if (tensor_pair_count >= RUNTIME_MAX_TENSOR_PAIRS) { fprintf(stderr, "[Runtime] ERROR: Tensor pairs full (max=%d)\n", RUNTIME_MAX_TENSOR_PAIRS); return; diff --git a/src/runtime/aicpu_build_graph/runtime/runtime.h b/src/runtime/aicpu_build_graph/runtime/runtime.h index 9ccee95d..150970ab 100644 --- a/src/runtime/aicpu_build_graph/runtime/runtime.h +++ b/src/runtime/aicpu_build_graph/runtime/runtime.h @@ -124,7 +124,7 @@ struct Handshake { struct TensorPair { void* host_ptr; void* dev_ptr; - size_t size; + uint64_t size; }; /** @@ -161,7 +161,7 @@ struct AicpuBuildApi { Runtime* runtime, uint64_t* args, int num_args, int func_id, CoreType core_type, uint64_t function_bin_addr); void (*add_successor_conditional)(Runtime* runtime, int from_task, int to_task); void (*publish_task)(Runtime* runtime, int task_id); - void* (*device_malloc)(size_t size); + void* (*device_malloc)(uint64_t size); void (*device_free)(void* ptr); }; @@ -170,11 +170,11 @@ struct AicpuBuildApi { * Allows runtime to use pluggable device memory backends. 
*/ struct HostApi { - void* (*device_malloc)(size_t size); + void* (*device_malloc)(uint64_t size); void (*device_free)(void* dev_ptr); - int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size); - int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size); - uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size); + int (*copy_to_device)(void* dev_ptr, const void* host_ptr, uint64_t size); + int (*copy_from_device)(void* host_ptr, const void* dev_ptr, uint64_t size); + uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, uint64_t bin_size); }; /** @@ -286,16 +286,16 @@ class Runtime { * orchestration plugin `.so` (instead of relinking/reuploading the full runtime). */ uint8_t aicpu_orch_so_storage[RUNTIME_MAX_AICPU_ORCH_SO_SIZE]; - uint32_t aicpu_orch_so_size; + uint64_t aicpu_orch_so_size; char aicpu_orch_func_name[64]; // Attempt to embed AICPU orchestration plugin bytes into Runtime. // Returns false on invalid input or if the plugin is larger than the // built-in storage. - bool try_set_aicpu_orch_so(const void* data, size_t size); - void set_aicpu_orch_so(const void* data, size_t size); + bool try_set_aicpu_orch_so(const void* data, uint64_t size); + void set_aicpu_orch_so(const void* data, uint64_t size); const void* get_aicpu_orch_so_data() const; - size_t get_aicpu_orch_so_size() const; + uint64_t get_aicpu_orch_so_size() const; /** * Build mode: @@ -465,7 +465,7 @@ class Runtime { * @param dev_ptr Device memory pointer (source for copy-back) * @param size Size of tensor in bytes */ - void record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size); + void record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size); /** * Record a device allocation for cleanup during finalize. 
diff --git a/src/runtime/host_build_graph/host/runtime_maker.cpp b/src/runtime/host_build_graph/host/runtime_maker.cpp index 22fda049..de5eb00a 100644 --- a/src/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/runtime/host_build_graph/host/runtime_maker.cpp @@ -61,7 +61,7 @@ extern "C" { */ int init_runtime_impl(Runtime *runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -69,7 +69,7 @@ int init_runtime_impl(Runtime *runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { // Unused parameters for host orchestration (void)arg_types; @@ -111,7 +111,7 @@ int init_runtime_impl(Runtime *runtime, } ssize_t written = write(fd, orch_so_binary, orch_so_size); - if (written < 0 || static_cast(written) != orch_so_size) { + if (written < 0 || static_cast(written) != orch_so_size) { LOG_ERROR("Failed to write orchestration SO to temp file"); close(fd); unlink(fd_path); diff --git a/src/runtime/host_build_graph/runtime/runtime.cpp b/src/runtime/host_build_graph/runtime/runtime.cpp index 1324acbc..efa1d690 100644 --- a/src/runtime/host_build_graph/runtime/runtime.cpp +++ b/src/runtime/host_build_graph/runtime/runtime.cpp @@ -192,7 +192,7 @@ void Runtime::print_runtime() const { // Tensor Pair Management // ============================================================================= -void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size) { +void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size) { if (tensor_pair_count >= RUNTIME_MAX_TENSOR_PAIRS) { LOG_ERROR("[Runtime] Tensor pairs full (max=%d)", RUNTIME_MAX_TENSOR_PAIRS); return; diff --git a/src/runtime/host_build_graph/runtime/runtime.h b/src/runtime/host_build_graph/runtime/runtime.h index 919cada9..6ea62ce9 100644 --- 
a/src/runtime/host_build_graph/runtime/runtime.h +++ b/src/runtime/host_build_graph/runtime/runtime.h @@ -114,7 +114,7 @@ struct Handshake { struct TensorPair { void* host_ptr; void* dev_ptr; - size_t size; + uint64_t size; }; /** @@ -122,11 +122,11 @@ struct TensorPair { * Allows runtime to use pluggable device memory backends. */ struct HostApi { - void* (*device_malloc)(size_t size); + void* (*device_malloc)(uint64_t size); void (*device_free)(void* dev_ptr); - int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size); - int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size); - uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size); + int (*copy_to_device)(void* dev_ptr, const void* host_ptr, uint64_t size); + int (*copy_from_device)(void* host_ptr, const void* dev_ptr, uint64_t size); + uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, uint64_t bin_size); }; /** @@ -287,7 +287,7 @@ class Runtime { * @param dev_ptr Device memory pointer (source for copy-back) * @param size Size of tensor in bytes */ - void record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size); + void record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size); /** * Get pointer to tensor pairs array. 
diff --git a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index e8f5699c..da7ab109 100644 --- a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -408,9 +408,9 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, DEV_INFO("Thread %d: task_descriptors=%p, dep_list_pool=%p", thread_idx, (void*)task_descriptors, (void*)dep_list_pool); - int32_t window_size = header->task_window_size; - if (window_size <= 0 || window_size > PTO2_MAX_SLOTS) window_size = PTO2_MAX_SLOTS; - int32_t window_mask = window_size - 1; + uint64_t window_size = header->task_window_size; + if (window_size == 0 || window_size > PTO2_MAX_SLOTS) window_size = PTO2_MAX_SLOTS; + uint64_t window_mask = window_size - 1; Handshake* hank = static_cast(runtime->workers); DEV_INFO("Thread %d: hank=%p, window_size=%d", @@ -734,7 +734,7 @@ int AicpuExecutor::run(Runtime* runtime) { // Get SO binary from runtime const void* so_data = runtime->get_device_orch_so_data(); - size_t so_size = runtime->get_device_orch_so_size(); + uint64_t so_size = runtime->get_device_orch_so_size(); if (so_data == nullptr || so_size == 0) { DEV_ERROR("Thread 3: Device orchestration SO not set"); @@ -768,7 +768,7 @@ int AicpuExecutor::run(Runtime* runtime) { } ssize_t written = write(fd, so_data, so_size); close(fd); - if (written != static_cast(so_size)) { + if (written < 0 || static_cast(written) != so_size) { DEV_INFO("Thread 3: Cannot write SO to %s (errno=%d), trying next path", so_path, errno); unlink(so_path); @@ -825,9 +825,9 @@ int AicpuExecutor::run(Runtime* runtime) { } // Read config from orchestration SO (or use defaults) - int32_t task_window_size = PTO2_TASK_WINDOW_SIZE; - int32_t dep_list_pool_size = PTO2_DEP_LIST_POOL_SIZE; - int32_t heap_size = PTO2_HEAP_SIZE; + uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE; + uint64_t 
dep_list_pool_size = PTO2_DEP_LIST_POOL_SIZE; + uint64_t heap_size = PTO2_HEAP_SIZE; int expected_arg_count = 0; if (config_func) { PTO2OrchestrationConfig cfg = config_func(args, arg_count); @@ -850,7 +850,7 @@ int AicpuExecutor::run(Runtime* runtime) { void* gm_heap = runtime->get_pto2_gm_heap_ptr(); // Create shared memory handle and runtime (ops table populated inside) - int32_t sm_size = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); + uint64_t sm_size = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); PTO2SharedMemoryHandle* sm_handle = pto2_sm_create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size, dep_list_pool_size); @@ -880,8 +880,8 @@ int AicpuExecutor::run(Runtime* runtime) { } // Set orchestrator's aicpu parallel mode pointers - int32_t ws = header->task_window_size; - if (ws <= 0 || ws > PTO2_MAX_SLOTS) ws = PTO2_MAX_SLOTS; + uint64_t ws = header->task_window_size; + if (ws == 0 || ws > PTO2_MAX_SLOTS) ws = PTO2_MAX_SLOTS; rt->orchestrator.aicpu_fanin_refcount = s_pto2_fanin_refcount; rt->orchestrator.aicpu_task_completed = s_pto2_task_completed; rt->orchestrator.aicpu_window_mask = ws - 1; diff --git a/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 3a5493fe..40886d73 100644 --- a/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -59,7 +59,7 @@ static long long _now_ms() { */ extern "C" int init_runtime_impl(Runtime *runtime, const uint8_t* orch_so_binary, - size_t orch_so_size, + uint64_t orch_so_size, const char* orch_func_name, uint64_t* func_args, int func_args_count, @@ -67,7 +67,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, uint64_t* arg_sizes, const int* kernel_func_ids, const uint8_t* const* kernel_binaries, - const size_t* kernel_sizes, + const uint64_t* kernel_sizes, int kernel_count) { // Suppress unused parameter warning 
(void)orch_func_name; @@ -127,7 +127,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, case ARG_INPUT_PTR: { // Input pointer: allocate device memory, copy data void* host_ptr = reinterpret_cast(func_args[i]); - size_t size = arg_sizes[i]; + uint64_t size = arg_sizes[i]; void* dev_ptr = runtime->host_api.device_malloc(size); if (dev_ptr == nullptr) { @@ -152,7 +152,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, case ARG_OUTPUT_PTR: { // Output pointer: allocate device memory, record for copy-back void* host_ptr = reinterpret_cast(func_args[i]); - size_t size = arg_sizes[i]; + uint64_t size = arg_sizes[i]; void* dev_ptr = runtime->host_api.device_malloc(size); if (dev_ptr == nullptr) { @@ -170,7 +170,7 @@ extern "C" int init_runtime_impl(Runtime *runtime, case ARG_INOUT_PTR: { // Input/output pointer: allocate, copy, record for copy-back void* host_ptr = reinterpret_cast(func_args[i]); - size_t size = arg_sizes[i]; + uint64_t size = arg_sizes[i]; void* dev_ptr = runtime->host_api.device_malloc(size); if (dev_ptr == nullptr) { @@ -233,15 +233,15 @@ extern "C" int init_runtime_impl(Runtime *runtime, // Allocate PTO2 shared memory long long t_sm_start = _now_ms(); - int32_t sm_size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE, PTO2_DEP_LIST_POOL_SIZE); - void* sm_ptr = runtime->host_api.device_malloc(static_cast(sm_size)); + uint64_t sm_size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE, PTO2_DEP_LIST_POOL_SIZE); + void* sm_ptr = runtime->host_api.device_malloc(sm_size); long long t_sm_end = _now_ms(); if (sm_ptr == nullptr) { std::cerr << "Error: Failed to allocate PTO2 shared memory\n"; return -1; } runtime->set_pto2_gm_sm_ptr(sm_ptr); - runtime->record_tensor_pair(nullptr, sm_ptr, static_cast(sm_size)); + runtime->record_tensor_pair(nullptr, sm_ptr, sm_size); // Set up device orchestration state runtime->set_orch_built_on_host(false); @@ -289,7 +289,7 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { // PTO2 (device orchestration): graph 
output may be in packed buffer void* pto2_sm = runtime->get_pto2_gm_sm_ptr(); uint64_t graph_out_ptr = 0; - int32_t graph_out_size = 0; + uint64_t graph_out_size = 0; if (pto2_sm != nullptr) { // Copy header from device to host to read graph_output_ptr/size @@ -324,12 +324,12 @@ extern "C" int validate_runtime_impl(Runtime *runtime) { } void* src_ptr = pair.dev_ptr; - size_t copy_size = pair.size; + uint64_t copy_size = pair.size; // Use graph_output_ptr for the first output tensor if available if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { src_ptr = reinterpret_cast(static_cast(graph_out_ptr)); - copy_size = static_cast(graph_out_size); + copy_size = graph_out_size; std::cout << "Using packed output buffer for tensor " << i << "\n"; first_output_tensor = false; } diff --git a/src/runtime/tensormap_and_ringbuffer/orchestration/tensor_orch.cpp b/src/runtime/tensormap_and_ringbuffer/orchestration/tensor_orch.cpp index 4b1105d9..8ee10dc7 100644 --- a/src/runtime/tensormap_and_ringbuffer/orchestration/tensor_orch.cpp +++ b/src/runtime/tensormap_and_ringbuffer/orchestration/tensor_orch.cpp @@ -187,7 +187,7 @@ uint64_t Tensor::offset_ndim_to_1d(const std::vector& offset_ndims) co } bool Tensor::valid_view(const uint64_t shapes[], const uint64_t offsets[]) const { - for (size_t i = 0; i < ndims; i++) { + for (uint64_t i = 0; i < ndims; i++) { if (shapes[i] + offsets[i] > repeats[i]) { return false; } @@ -199,7 +199,7 @@ Tensor Tensor::view(const uint64_t shapes[], const uint64_t offsets[]) const { debug_assert(valid_view(shapes, offsets)); Tensor result(*this); result.start_offset = start_offset + offset_ndim_to_1d(offsets); - for (size_t i = 0; i < ndims; i++) { + for (uint64_t i = 0; i < ndims; i++) { result.repeats[i] = shapes[i]; } return result; @@ -208,7 +208,7 @@ Tensor Tensor::view(const uint64_t shapes[], const uint64_t offsets[]) const { Tensor Tensor::view(const std::vector& shapes, const std::vector& offsets) const { Tensor 
result(*this); result.start_offset = start_offset + offset_ndim_to_1d(offsets); - for (size_t i = 0; i < ndims; i++) { + for (uint64_t i = 0; i < ndims; i++) { result.repeats[i] = shapes[i]; } return result; @@ -231,11 +231,11 @@ bool Tensor::is_contiguous() const { bool Tensor::valid_reshape(const uint64_t shapes[], uint64_t new_ndims) const { uint64_t x = 1; - for (size_t i = 0; i < ndims; i++) { + for (uint64_t i = 0; i < ndims; i++) { x *= repeats[i]; } uint64_t y = 1; - for (size_t i = 0; i < new_ndims; i++) { + for (uint64_t i = 0; i < new_ndims; i++) { y *= shapes[i]; } return x == y; diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index b4bb1fed..c7270e2f 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -72,7 +72,7 @@ static inline void task_fanout_unlock(PTO2TaskDescriptor* task) { PTO2_STORE_REL // ============================================================================= bool pto2_orchestrator_init( - PTO2OrchestratorState* orch, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, int32_t heap_size) { + PTO2OrchestratorState* orch, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, uint64_t heap_size) { memset(orch, 0, sizeof(PTO2OrchestratorState)); orch->sm_handle = sm_handle; @@ -99,8 +99,8 @@ bool pto2_orchestrator_init( orch->tensormap_last_cleanup = 0; // Initialize scope stack: one flat buffer for task IDs + one array for begin offsets - int32_t max_depth = PTO2_MAX_SCOPE_DEPTH; - int32_t init_cap = PTO2_SCOPE_TASKS_INIT_CAP; + uint64_t max_depth = PTO2_MAX_SCOPE_DEPTH; + uint64_t init_cap = PTO2_SCOPE_TASKS_INIT_CAP; orch->scope_tasks = (int32_t*)malloc(init_cap * sizeof(int32_t)); orch->scope_begins = (int32_t*)malloc(max_depth * sizeof(int32_t)); if (!orch->scope_tasks || !orch->scope_begins) { @@ -163,7 +163,7 @@ void 
pto2_orchestrator_set_scheduler_mode( static void scope_tasks_push(PTO2OrchestratorState* orch, int32_t task_id) { if (orch->scope_tasks_size >= orch->scope_tasks_capacity) { - int32_t new_cap = orch->scope_tasks_capacity * 2; + uint64_t new_cap = orch->scope_tasks_capacity * 2; int32_t* new_buf = (int32_t*)realloc(orch->scope_tasks, new_cap * sizeof(int32_t)); assert(new_buf && "Failed to grow scope task buffer"); orch->scope_tasks = new_buf; @@ -173,7 +173,7 @@ static void scope_tasks_push(PTO2OrchestratorState* orch, int32_t task_id) { } void pto2_scope_begin(PTO2OrchestratorState* orch) { - assert(orch->scope_stack_top < orch->scope_stack_capacity - 1 && "Scope stack overflow"); + assert(orch->scope_stack_top < (int32_t)(orch->scope_stack_capacity - 1) && "Scope stack overflow"); ++orch->scope_stack_top; orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size; @@ -187,7 +187,7 @@ void pto2_scope_end(PTO2OrchestratorState* orch) { #endif int32_t begin = orch->scope_begins[orch->scope_stack_top--]; - int32_t count = orch->scope_tasks_size - begin; + uint64_t count = orch->scope_tasks_size - begin; if (orch->scheduler && count > 0) { pto2_scheduler_on_scope_end(orch->scheduler, &orch->scope_tasks[begin], count); @@ -302,7 +302,7 @@ void pto2_submit_task(PTO2OrchestratorState* orch, scope_tasks_push(orch, task_id); // Temporary storage for collecting output sizes - int32_t total_output_size = 0; + uint64_t total_output_size = 0; // Temporary storage for fanin int32_t fanin_temp[PTO2_MAX_INPUTS]; @@ -504,8 +504,8 @@ void pto2_orchestrator_print_stats(PTO2OrchestratorState* orch) { printf("Bytes allocated: %lld\n", (long long)orch->bytes_allocated); printf("Current scope depth: %d\n", orch->scope_stack_top + 1); printf("Task ring active: %d\n", pto2_task_ring_active_count(&orch->task_ring)); - printf("Heap ring used: %d / %d\n", orch->heap_ring.top, orch->heap_ring.size); - printf("Dep pool used: %d / %d\n", pto2_dep_pool_used(&orch->dep_pool), 
orch->dep_pool.capacity); + printf("Heap ring used: %lu / %lu\n", (unsigned long)orch->heap_ring.top, (unsigned long)orch->heap_ring.size); + printf("Dep pool used: %zu / %zu\n", pto2_dep_pool_used(&orch->dep_pool), orch->dep_pool.capacity); printf("TensorMap valid: %d\n", pto2_tensormap_valid_count(&orch->tensor_map)); printf("===============================\n"); } @@ -516,8 +516,8 @@ void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState* orch) { for (int i = 0; i <= orch->scope_stack_top; i++) { int32_t begin = orch->scope_begins[i]; - int32_t end = (i < orch->scope_stack_top) ? orch->scope_begins[i + 1] : orch->scope_tasks_size; - printf(" [%d] tasks_owned = %d\n", i, end - begin); + uint64_t end = (i < orch->scope_stack_top) ? orch->scope_begins[i + 1] : orch->scope_tasks_size; + printf(" [%d] tasks_owned = %zu\n", i, end - begin); } printf("==================\n"); diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index fbeb3ff0..5fd67b8d 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -52,11 +52,11 @@ struct PTO2OrchestratorState { // scope_begins[i] is the index into scope_tasks where scope i starts. // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size). 
int32_t* scope_tasks; // Flat buffer of task IDs (all scopes concatenated) - int32_t scope_tasks_size; // Number of task IDs currently in the buffer - int32_t scope_tasks_capacity; // Allocated capacity of scope_tasks + uint64_t scope_tasks_size; // Number of task IDs currently in the buffer + uint64_t scope_tasks_capacity; // Allocated capacity of scope_tasks int32_t* scope_begins; // scope_begins[i] = start index of scope i in scope_tasks int32_t scope_stack_top; // Current top of stack (-1 = no scope open) - int32_t scope_stack_capacity; // Max nesting depth (PTO2_MAX_SCOPE_DEPTH) + uint64_t scope_stack_capacity; // Max nesting depth (PTO2_MAX_SCOPE_DEPTH) // === SCHEDULER REFERENCE === // Note: In simulated mode, orchestrator and scheduler share address space @@ -66,7 +66,7 @@ struct PTO2OrchestratorState { // === GM HEAP (for output buffers) === void* gm_heap_base; // Base address of GM heap - int32_t gm_heap_size; // Size of GM heap + uint64_t gm_heap_size; // Size of GM heap // === STATISTICS === int64_t tasks_submitted; @@ -105,7 +105,7 @@ struct PTO2OrchestratorState { * @return true on success */ bool pto2_orchestrator_init( - PTO2OrchestratorState* orch, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, int32_t heap_size); + PTO2OrchestratorState* orch, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, uint64_t heap_size); /** * Destroy orchestrator state and free resources diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp index da9bfcc7..22feb6d8 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp @@ -21,8 +21,8 @@ // Heap Ring Buffer Implementation // ============================================================================= -void pto2_heap_ring_init(PTO2HeapRing* ring, void* base, int32_t size, - volatile int32_t* tail_ptr) { +void 
pto2_heap_ring_init(PTO2HeapRing* ring, void* base, uint64_t size, + volatile uint64_t* tail_ptr) { ring->base = base; ring->size = size; ring->top = 0; @@ -34,14 +34,14 @@ void pto2_heap_ring_init(PTO2HeapRing* ring, void* base, int32_t size, // Heap ring spin limit - after this, report deadlock and exit #define PTO2_HEAP_SPIN_LIMIT 100000 -void* pto2_heap_ring_alloc(PTO2HeapRing* ring, int32_t size) { +void* pto2_heap_ring_alloc(PTO2HeapRing* ring, uint64_t size) { // Align size for DMA efficiency size = PTO2_ALIGN_UP(size, PTO2_ALIGN_SIZE); - + // Spin-wait if insufficient space (back-pressure from Scheduler) int spin_count = 0; bool notified = false; - + while (1) { void* ptr = pto2_heap_ring_try_alloc(ring, size); if (ptr != NULL) { @@ -52,69 +52,69 @@ void* pto2_heap_ring_alloc(PTO2HeapRing* ring, int32_t size) { #endif return ptr; } - + // No space available, spin-wait spin_count++; - + #if PTO2_SPIN_VERBOSE_LOGGING // Periodic block notification if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0 && spin_count < PTO2_HEAP_SPIN_LIMIT) { - int32_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); - int32_t available = pto2_heap_ring_available(ring); - fprintf(stderr, "[HeapRing] BLOCKED: requesting %d bytes, available=%d, " - "top=%d, tail=%d, spins=%d\n", + uint64_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); + uint64_t available = pto2_heap_ring_available(ring); + fprintf(stderr, "[HeapRing] BLOCKED: requesting %lu bytes, available=%lu, " + "top=%lu, tail=%lu, spins=%d\n", size, available, ring->top, tail, spin_count); notified = true; } #endif - + if (spin_count >= PTO2_HEAP_SPIN_LIMIT) { - int32_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); - int32_t available = pto2_heap_ring_available(ring); + uint64_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); + uint64_t available = pto2_heap_ring_available(ring); fprintf(stderr, "\n"); fprintf(stderr, "========================================\n"); fprintf(stderr, "FATAL: Heap Ring Deadlock Detected!\n"); fprintf(stderr, 
"========================================\n"); fprintf(stderr, "Orchestrator blocked waiting for heap space after %d spins.\n", spin_count); - fprintf(stderr, " - Requested: %d bytes\n", size); - fprintf(stderr, " - Available: %d bytes\n", available); - fprintf(stderr, " - Heap top: %d\n", ring->top); - fprintf(stderr, " - Heap tail: %d\n", tail); - fprintf(stderr, " - Heap size: %d\n", ring->size); + fprintf(stderr, " - Requested: %lu bytes\n", size); + fprintf(stderr, " - Available: %lu bytes\n", available); + fprintf(stderr, " - Heap top: %lu\n", ring->top); + fprintf(stderr, " - Heap tail: %lu\n", tail); + fprintf(stderr, " - Heap size: %lu\n", ring->size); fprintf(stderr, "\n"); fprintf(stderr, "Solution: Increase PTO2_HEAP_SIZE (e.g. 256*1024 for 4 x 64KB outputs).\n"); fprintf(stderr, "========================================\n"); fprintf(stderr, "\n"); exit(1); } - + PTO2_SPIN_PAUSE(); } } -void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, int32_t size) { +void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, uint64_t size) { // Align size for DMA efficiency size = PTO2_ALIGN_UP(size, PTO2_ALIGN_SIZE); - + // Read latest tail from shared memory (Scheduler updates this) - int32_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); - int32_t top = ring->top; - + uint64_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); + uint64_t top = ring->top; + if (top >= tail) { // Case 1: top is at or ahead of tail (normal case) // [....tail====top......] 
// ^-- space_at_end = size - top - - int32_t space_at_end = ring->size - top; - + + uint64_t space_at_end = ring->size - top; + if (space_at_end >= size) { // Enough space at end - allocate here void* ptr = (char*)ring->base + top; ring->top = top + size; return ptr; } - + // Not enough space at end - check if we can wrap to beginning // IMPORTANT: Don't split buffer, skip remaining space at end if (tail > size) { @@ -122,35 +122,35 @@ void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, int32_t size) { ring->top = size; return ring->base; } - + // Not enough space anywhere - return NULL return NULL; - + } else { // Case 2: top has wrapped, tail is ahead // [====top....tail=====] // ^-- free space = tail - top - - int32_t gap = tail - top; + + uint64_t gap = tail - top; if (gap >= size) { void* ptr = (char*)ring->base + top; ring->top = top + size; return ptr; } - + // Not enough space - return NULL return NULL; } } -int32_t pto2_heap_ring_available(PTO2HeapRing* ring) { - int32_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); - int32_t top = ring->top; - +uint64_t pto2_heap_ring_available(PTO2HeapRing* ring) { + uint64_t tail = PTO2_LOAD_ACQUIRE(ring->tail_ptr); + uint64_t top = ring->top; + if (top >= tail) { // Space at end + space at beginning (if any) - int32_t at_end = ring->size - top; - int32_t at_begin = tail; + uint64_t at_end = ring->size - top; + uint64_t at_begin = tail; return at_end > at_begin ? 
at_end : at_begin; // Max usable } else { // Contiguous space between top and tail @@ -167,7 +167,7 @@ void pto2_heap_ring_reset(PTO2HeapRing* ring) { // ============================================================================= void pto2_task_ring_init(PTO2TaskRing* ring, PTO2TaskDescriptor* descriptors, - int32_t window_size, volatile int32_t* last_alive_ptr) { + uint64_t window_size, volatile int32_t* last_alive_ptr) { ring->descriptors = descriptors; ring->window_size = window_size; ring->current_index = 0; @@ -204,7 +204,7 @@ int32_t pto2_task_ring_alloc(PTO2TaskRing* ring) { int32_t last_alive = PTO2_LOAD_ACQUIRE(ring->last_alive_ptr); int32_t active_count = ring->current_index - last_alive; fprintf(stderr, "[TaskRing] BLOCKED (Flow Control): current=%d, last_alive=%d, " - "active=%d/%d (%.1f%%), spins=%d\n", + "active=%d/%zu (%.1f%%), spins=%d\n", ring->current_index, last_alive, active_count, ring->window_size, 100.0 * active_count / ring->window_size, spin_count); notified = true; @@ -227,7 +227,7 @@ int32_t pto2_task_ring_alloc(PTO2TaskRing* ring) { fprintf(stderr, " - Current task index: %d\n", ring->current_index); fprintf(stderr, " - Last task alive: %d\n", last_alive); fprintf(stderr, " - Active tasks: %d\n", active_count); - fprintf(stderr, " - Window size: %d\n", ring->window_size); + fprintf(stderr, " - Window size: %zu\n", ring->window_size); fprintf(stderr, " - Window utilization: %.1f%%\n", 100.0 * active_count / ring->window_size); fprintf(stderr, "\n"); @@ -239,7 +239,7 @@ int32_t pto2_task_ring_alloc(PTO2TaskRing* ring) { fprintf(stderr, " This creates a circular dependency (deadlock).\n"); fprintf(stderr, "\n"); fprintf(stderr, "Solution:\n"); - fprintf(stderr, " Current task_window_size: %d\n", ring->window_size); + fprintf(stderr, " Current task_window_size: %zu\n", ring->window_size); fprintf(stderr, " Default PTO2_TASK_WINDOW_SIZE: %d\n", PTO2_TASK_WINDOW_SIZE); fprintf(stderr, " Recommended: %d (at least 2x current active tasks)\n", 
active_count * 2); @@ -268,7 +268,7 @@ int32_t pto2_task_ring_try_alloc(PTO2TaskRing* ring) { // Check if there's room for one more task // Leave at least 1 slot empty to distinguish full from empty - if (active_count < ring->window_size - 1) { + if (active_count < (int32_t)(ring->window_size - 1)) { int32_t task_id = current; int32_t slot = task_id & (ring->window_size - 1); @@ -295,7 +295,7 @@ int32_t pto2_task_ring_active_count(PTO2TaskRing* ring) { bool pto2_task_ring_has_space(PTO2TaskRing* ring) { int32_t active = pto2_task_ring_active_count(ring); - return active < ring->window_size - 1; + return active < (int32_t)(ring->window_size - 1); } void pto2_task_ring_reset(PTO2TaskRing* ring) { @@ -309,7 +309,7 @@ void pto2_task_ring_reset(PTO2TaskRing* ring) { // Dependency List Pool Implementation // ============================================================================= -void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, int32_t capacity) { +void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, uint64_t capacity) { pool->base = base; pool->capacity = capacity; pool->top = 1; // Start from 1, 0 means NULL/empty @@ -324,7 +324,7 @@ int32_t pto2_dep_pool_alloc_one(PTO2DepListPool* pool) { // Wrap around to beginning (old entries reclaimed with task ring) pool->top = 1; // Start from 1, 0 means NULL } - return pool->top++; + return static_cast<int32_t>(pool->top++); } int32_t pto2_dep_list_prepend(PTO2DepListPool* pool, int32_t current_head, int32_t task_id) { @@ -346,8 +346,8 @@ int32_t pto2_dep_list_prepend(PTO2DepListPool* pool, int32_t current_head, int32 void pto2_dep_list_iterate(PTO2DepListPool* pool, int32_t head, void (*callback)(int32_t task_id, void* ctx), void* ctx) { int32_t current = head; - - while (current > 0 && current < pool->capacity) { + + while (current > 0 && current < (int32_t)pool->capacity) { PTO2DepListEntry* entry = &pool->base[current]; callback(entry->task_id, ctx); current = entry->next_offset; @@
-357,8 +357,8 @@ void pto2_dep_list_iterate(PTO2DepListPool* pool, int32_t head, int32_t pto2_dep_list_count(PTO2DepListPool* pool, int32_t head) { int32_t count = 0; int32_t current = head; - - while (current > 0 && current < pool->capacity) { + + while (current > 0 && current < (int32_t)pool->capacity) { count++; current = pool->base[current].next_offset; } @@ -377,10 +377,10 @@ void pto2_dep_pool_reset(PTO2DepListPool* pool) { pool->base[0].next_offset = 0; } -int32_t pto2_dep_pool_used(PTO2DepListPool* pool) { +uint64_t pto2_dep_pool_used(PTO2DepListPool* pool) { return pool->top - 1; // Exclude entry 0 (NULL marker) } -int32_t pto2_dep_pool_available(PTO2DepListPool* pool) { +uint64_t pto2_dep_pool_available(PTO2DepListPool* pool) { return pool->capacity - pool->top; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index bb15b468..500a98db 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -41,12 +41,12 @@ */ typedef struct { void* base; // GM_Heap_Base pointer - int32_t size; // GM_Heap_Size (total heap size in bytes) - int32_t top; // Allocation pointer (local copy) - + uint64_t size; // GM_Heap_Size (total heap size in bytes) + uint64_t top; // Allocation pointer (local copy) + // Reference to shared memory tail (for back-pressure) - volatile int32_t* tail_ptr; // Points to header->heap_tail - + volatile uint64_t* tail_ptr; // Points to header->heap_tail + } PTO2HeapRing; /** @@ -57,35 +57,35 @@ typedef struct { * @param size Total heap size in bytes * @param tail_ptr Pointer to shared memory heap_tail */ -void pto2_heap_ring_init(PTO2HeapRing* ring, void* base, int32_t size, - volatile int32_t* tail_ptr); +void pto2_heap_ring_init(PTO2HeapRing* ring, void* base, uint64_t size, + volatile uint64_t* tail_ptr); /** * Allocate memory from heap ring - * + * * O(1) bump 
allocation with wrap-around. * May STALL (spin-wait) if insufficient space (back-pressure). * Never splits a buffer across the wrap-around boundary. - * + * * @param ring Heap ring * @param size Requested size in bytes * @return Pointer to allocated memory, never NULL (stalls instead) */ -void* pto2_heap_ring_alloc(PTO2HeapRing* ring, int32_t size); +void* pto2_heap_ring_alloc(PTO2HeapRing* ring, uint64_t size); /** * Try to allocate memory without stalling - * + * * @param ring Heap ring * @param size Requested size in bytes * @return Pointer to allocated memory, or NULL if no space */ -void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, int32_t size); +void* pto2_heap_ring_try_alloc(PTO2HeapRing* ring, uint64_t size); /** * Get available space in heap ring */ -int32_t pto2_heap_ring_available(PTO2HeapRing* ring); +uint64_t pto2_heap_ring_available(PTO2HeapRing* ring); /** * Reset heap ring to initial state @@ -104,24 +104,24 @@ void pto2_heap_ring_reset(PTO2HeapRing* ring); */ typedef struct { PTO2TaskDescriptor* descriptors; // Task descriptor array (from shared memory) - int32_t window_size; // Window size (power of 2) + uint64_t window_size; // Window size (power of 2) int32_t current_index; // Next task to allocate (absolute ID) - + // Reference to shared memory last_task_alive (for back-pressure) volatile int32_t* last_alive_ptr; // Points to header->last_task_alive - + } PTO2TaskRing; /** * Initialize task ring buffer - * + * * @param ring Task ring to initialize * @param descriptors Task descriptor array from shared memory * @param window_size Window size (must be power of 2) * @param last_alive_ptr Pointer to shared memory last_task_alive */ void pto2_task_ring_init(PTO2TaskRing* ring, PTO2TaskDescriptor* descriptors, - int32_t window_size, volatile int32_t* last_alive_ptr); + uint64_t window_size, volatile int32_t* last_alive_ptr); /** * Allocate a task slot from task ring @@ -176,19 +176,19 @@ void pto2_task_ring_reset(PTO2TaskRing* ring); */ typedef 
struct { PTO2DepListEntry* base; // Pool base address (from shared memory) - int32_t capacity; // Total number of entries - int32_t top; // Next allocation position (starts from 1, 0=NULL) - + uint64_t capacity; // Total number of entries + uint64_t top; // Next allocation position (starts from 1, 0=NULL) + } PTO2DepListPool; /** * Initialize dependency list pool - * + * * @param pool Pool to initialize * @param base Pool base address from shared memory * @param capacity Total number of entries */ -void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, int32_t capacity); +void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, uint64_t capacity); /** * Allocate a single entry from the pool @@ -243,7 +243,7 @@ void pto2_dep_pool_reset(PTO2DepListPool* pool); /** * Get pool usage statistics */ -int32_t pto2_dep_pool_used(PTO2DepListPool* pool); -int32_t pto2_dep_pool_available(PTO2DepListPool* pool); +uint64_t pto2_dep_pool_used(PTO2DepListPool* pool); +uint64_t pto2_dep_pool_available(PTO2DepListPool* pool); #endif // PTO_RING_BUFFER_H diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 4afa6418..67a9570f 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -53,9 +53,9 @@ PTO2Runtime* pto2_runtime_create(PTO2RuntimeMode mode) { } PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_size) { + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_size) { // Allocate runtime context PTO2Runtime* rt = (PTO2Runtime*)calloc(1, sizeof(PTO2Runtime)); if (!rt) { @@ -116,7 +116,7 @@ PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, PTO2Runtime* pto2_runtime_create_from_sm(PTO2RuntimeMode mode, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, - int32_t 
heap_size) { + uint64_t heap_size) { if (!sm_handle) return NULL; PTO2Runtime* rt = (PTO2Runtime*)calloc(1, sizeof(PTO2Runtime)); diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 959370a5..74cc8bc8 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -80,7 +80,7 @@ struct PTO2Runtime { // GM Heap for output buffers void* gm_heap; - int32_t gm_heap_size; + uint64_t gm_heap_size; bool gm_heap_owned; // True if we allocated it // Mode @@ -112,9 +112,9 @@ PTO2Runtime* pto2_runtime_create(PTO2RuntimeMode mode); * @return Runtime context, or NULL on failure */ PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_size); + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_size); /** * Create runtime from existing shared memory and GM heap (e.g. on device). 
@@ -129,7 +129,7 @@ PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, PTO2Runtime* pto2_runtime_create_from_sm(PTO2RuntimeMode mode, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, - int32_t heap_size); + uint64_t heap_size); /** * Destroy runtime and free all resources @@ -295,9 +295,9 @@ struct PTO2OrchestrationBeginInfo { uint64_t* args; int arg_count; int expected_arg_count; - int32_t task_window_size; - int32_t dep_list_pool_size; - int32_t heap_size; + uint64_t task_window_size; + uint64_t dep_list_pool_size; + uint64_t heap_size; void* gm_heap_ptr = nullptr; }; @@ -329,7 +329,7 @@ class PTO2OrchestrationGuard { } header_ = static_cast(begin_info.sm_ptr); - int32_t sm_size = pto2_sm_calculate_size(begin_info.task_window_size, + uint64_t sm_size = pto2_sm_calculate_size(begin_info.task_window_size, begin_info.dep_list_pool_size); PTO2SharedMemoryHandle* sm_handle = pto2_sm_create_from_buffer(begin_info.sm_ptr, sm_size, @@ -339,13 +339,13 @@ class PTO2OrchestrationGuard { if (!sm_handle) return; void* gm_heap = begin_info.gm_heap_ptr; - int32_t gm_heap_size = begin_info.heap_size; + uint64_t gm_heap_size = begin_info.heap_size; if (begin_info.arg_count >= 2) { uint64_t heap_arg = begin_info.args[begin_info.arg_count - 2]; uint64_t size_arg = begin_info.args[begin_info.arg_count - 1]; if (heap_arg != 0 && size_arg != 0) { gm_heap = reinterpret_cast(static_cast(heap_arg)); - gm_heap_size = static_cast(size_arg & 0x7FFFFFFF); + gm_heap_size = size_arg; } } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index b405f773..b89526b5 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -30,17 +30,17 @@ const char* pto2_task_state_name(PTO2TaskState state) { // Ready Queue Implementation // ============================================================================= -bool 
pto2_ready_queue_init(PTO2ReadyQueue* queue, int32_t capacity) { +bool pto2_ready_queue_init(PTO2ReadyQueue* queue, uint64_t capacity) { queue->task_ids = (int32_t*)malloc(capacity * sizeof(int32_t)); if (!queue->task_ids) { return false; } - + queue->head = 0; queue->tail = 0; queue->capacity = capacity; queue->count = 0; - + return true; } @@ -94,7 +94,7 @@ bool pto2_scheduler_init(PTO2SchedulerState* sched, sched->dep_pool = dep_pool; // Get runtime task_window_size from shared memory header - int32_t window_size = sm_handle->header->task_window_size; + uint64_t window_size = sm_handle->header->task_window_size; sched->task_window_size = window_size; sched->task_window_mask = window_size - 1; // For fast modulo (window_size must be power of 2) @@ -162,10 +162,10 @@ void pto2_scheduler_destroy(PTO2SchedulerState* sched) { void pto2_scheduler_reset(PTO2SchedulerState* sched) { sched->last_task_alive = 0; sched->heap_tail = 0; - - memset(sched->task_state, 0, PTO2_TASK_WINDOW_SIZE * sizeof(PTO2TaskState)); - memset(sched->fanin_refcount, 0, PTO2_TASK_WINDOW_SIZE * sizeof(int32_t)); - memset(sched->fanout_refcount, 0, PTO2_TASK_WINDOW_SIZE * sizeof(int32_t)); + + memset(sched->task_state, 0, sched->task_window_size * sizeof(PTO2TaskState)); + memset(sched->fanin_refcount, 0, sched->task_window_size * sizeof(int32_t)); + memset(sched->fanout_refcount, 0, sched->task_window_size * sizeof(int32_t)); for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { pto2_ready_queue_reset(&sched->ready_queues[i]); @@ -363,7 +363,7 @@ void pto2_scheduler_advance_ring_pointers(PTO2SchedulerState* sched) { // heap_tail = offset of end of last consumed task's buffer // Note: This requires knowing the heap base, which should be passed in // For now, we just track the relative position - sched->heap_tail = (int32_t)(intptr_t)last_consumed->packed_buffer_end; + sched->heap_tail = reinterpret_cast<uint64_t>(last_consumed->packed_buffer_end); } } @@ -403,7 +403,7 @@ bool
pto2_scheduler_is_done(PTO2SchedulerState* sched) { void pto2_scheduler_print_stats(PTO2SchedulerState* sched) { printf("=== Scheduler Statistics ===\n"); printf("last_task_alive: %d\n", sched->last_task_alive); - printf("heap_tail: %d\n", sched->heap_tail); + printf("heap_tail: %lu\n", sched->heap_tail); printf("tasks_completed: %lld\n", (long long)sched->tasks_completed); printf("tasks_consumed: %lld\n", (long long)sched->tasks_consumed); printf("============================\n"); diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index 3dccde69..87556321 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -54,11 +54,11 @@ typedef struct PTO2SchedulerState { // Local copies of ring pointers (written to shared memory after update) int32_t last_task_alive; // Task ring tail - int32_t heap_tail; // Heap ring tail + uint64_t heap_tail; // Heap ring tail // === DYNAMIC CONFIGURATION === - int32_t task_window_size; // Task window size (power of 2) - int32_t task_window_mask; // task_window_size - 1 (for fast modulo) + uint64_t task_window_size; // Task window size (power of 2) + uint64_t task_window_mask; // task_window_size - 1 (for fast modulo) // === PRIVATE DATA (not in shared memory) === @@ -121,7 +121,7 @@ void pto2_scheduler_reset(PTO2SchedulerState* sched); /** * Initialize a ready queue */ -bool pto2_ready_queue_init(PTO2ReadyQueue* queue, int32_t capacity); +bool pto2_ready_queue_init(PTO2ReadyQueue* queue, uint64_t capacity); /** * Destroy ready queue diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp index 38d3411e..08baee6e 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp @@ -16,18 
+16,18 @@ // Size Calculation // ============================================================================= -int32_t pto2_sm_calculate_size(int32_t task_window_size, int32_t dep_list_pool_size) { - int32_t size = 0; - +uint64_t pto2_sm_calculate_size(uint64_t task_window_size, uint64_t dep_list_pool_size) { + uint64_t size = 0; + // Header (aligned to cache line) size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - + // Task descriptors size += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - + // Dependency list pool (entry 0 is reserved as NULL) size += PTO2_ALIGN_UP((dep_list_pool_size + 1) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); - + return size; } @@ -35,18 +35,18 @@ int32_t pto2_sm_calculate_size(int32_t task_window_size, int32_t dep_list_pool_s // Creation and Destruction // ============================================================================= -PTO2SharedMemoryHandle* pto2_sm_create(int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size) { +PTO2SharedMemoryHandle* pto2_sm_create(uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size) { // Allocate handle PTO2SharedMemoryHandle* handle = (PTO2SharedMemoryHandle*)calloc(1, sizeof(PTO2SharedMemoryHandle)); if (!handle) { return NULL; } - + // Calculate total size - int32_t sm_size = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); - + uint64_t sm_size = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); + // Allocate shared memory (aligned for DMA efficiency) #if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L if (posix_memalign(&handle->sm_base, PTO2_ALIGN_SIZE, sm_size) != 0) { @@ -94,13 +94,13 @@ PTO2SharedMemoryHandle* pto2_sm_create_default(void) { } PTO2SharedMemoryHandle* pto2_sm_create_from_buffer(void* sm_base, - int32_t sm_size, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size) { - if (!sm_base || sm_size <= 0) return NULL; + 
uint64_t sm_size, + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size) { + if (!sm_base || sm_size == 0) return NULL; - int32_t required = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); + uint64_t required = pto2_sm_calculate_size(task_window_size, dep_list_pool_size); if (sm_size < required) return NULL; PTO2SharedMemoryHandle* handle = (PTO2SharedMemoryHandle*)calloc(1, sizeof(PTO2SharedMemoryHandle)); @@ -136,9 +136,9 @@ void pto2_sm_destroy(PTO2SharedMemoryHandle* handle) { // ============================================================================= void pto2_sm_init_header(PTO2SharedMemoryHandle* handle, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size) { + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size) { PTO2SharedMemoryHeader* header = handle->header; // Flow control pointers (start at 0) @@ -154,9 +154,9 @@ void pto2_sm_init_header(PTO2SharedMemoryHandle* handle, header->dep_list_pool_size = dep_list_pool_size; // Calculate offsets - int32_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); header->task_descriptors_offset = offset; - + offset += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); header->dep_list_pool_offset = offset; @@ -203,21 +203,21 @@ void pto2_sm_print_layout(PTO2SharedMemoryHandle* handle) { printf("=== PTO2 Shared Memory Layout ===\n"); printf("Base address: %p\n", handle->sm_base); - printf("Total size: %d bytes\n", h->total_size); + printf("Total size: %zu bytes\n", h->total_size); printf("\n"); - printf("Task window size: %d\n", h->task_window_size); - printf("Heap size: %d bytes\n", h->heap_size); - printf("DepList pool size: %d entries\n", h->dep_list_pool_size); + printf("Task window size: %zu\n", h->task_window_size); + printf("Heap size: %zu bytes\n", h->heap_size); + printf("DepList 
pool size: %zu entries\n", h->dep_list_pool_size); printf("\n"); printf("Offsets:\n"); - printf(" TaskDescriptors: %d (0x%x)\n", h->task_descriptors_offset, h->task_descriptors_offset); - printf(" DepListPool: %d (0x%x)\n", h->dep_list_pool_offset, h->dep_list_pool_offset); + printf(" TaskDescriptors: %zu (0x%zx)\n", h->task_descriptors_offset, h->task_descriptors_offset); + printf(" DepListPool: %zu (0x%zx)\n", h->dep_list_pool_offset, h->dep_list_pool_offset); printf("\n"); printf("Flow control:\n"); printf(" current_task_index: %d\n", h->current_task_index); printf(" last_task_alive: %d\n", h->last_task_alive); - printf(" heap_top: %d\n", h->heap_top); - printf(" heap_tail: %d\n", h->heap_tail); + printf(" heap_top: %lu\n", h->heap_top); + printf(" heap_tail: %lu\n", h->heap_tail); printf(" orchestrator_done: %d\n", h->orchestrator_done); printf("================================\n"); } @@ -240,8 +240,8 @@ bool pto2_sm_validate(PTO2SharedMemoryHandle* handle) { // Check flow control pointer sanity if (h->current_task_index < 0) return false; if (h->last_task_alive < 0) return false; - if (h->heap_top < 0 || h->heap_top > h->heap_size) return false; - if (h->heap_tail < 0 || h->heap_tail > h->heap_size) return false; + if (h->heap_top > h->heap_size) return false; + if (h->heap_tail > h->heap_size) return false; return true; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index c758a791..7c689a2f 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -44,32 +44,32 @@ typedef struct { // Written by Orchestrator, Read by Scheduler volatile int32_t current_task_index; // Task ring head (next to allocate) - volatile int32_t heap_top; // Heap ring allocation pointer + volatile uint64_t heap_top; // Heap ring allocation pointer volatile int32_t orchestrator_done; // Flag: 
orchestration complete - + // Written by Scheduler, Read by Orchestrator (for back-pressure) volatile int32_t last_task_alive; // Task ring tail (oldest active task) - volatile int32_t heap_tail; // Heap ring free pointer + volatile uint64_t heap_tail; // Heap ring free pointer // === LAYOUT INFO (set once at init) === - int32_t task_window_size; // PTO2_TASK_WINDOW_SIZE - int32_t heap_size; // Total heap size - int32_t dep_list_pool_size; // Dependency list pool size + uint64_t task_window_size; // PTO2_TASK_WINDOW_SIZE + uint64_t heap_size; // Total heap size + uint64_t dep_list_pool_size; // Dependency list pool size // Offsets into shared memory (relative to SM_Base) - int32_t task_descriptors_offset; // Offset to TaskDescriptor array - int32_t dep_list_pool_offset; // Offset to DepListPool - + uint64_t task_descriptors_offset; // Offset to TaskDescriptor array + uint64_t dep_list_pool_offset; // Offset to DepListPool + // Total shared memory size (for validation) - int32_t total_size; + uint64_t total_size; // Graph output for copy-back (set by orchestrator when using packed buffer) // Host finalize copies from this address instead of dev_ptr when non-zero volatile uint64_t graph_output_ptr; // Address where final output was written (packed buffer) - volatile int32_t graph_output_size; // Size in bytes - - // Padding to cache line - int32_t _padding[2]; + volatile uint64_t graph_output_size; // Size in bytes + + // Padding to cache line (adjusted for uint64_t changes) + uint64_t _padding[1]; } PTO2SharedMemoryHeader; @@ -83,8 +83,8 @@ typedef struct { */ typedef struct { void* sm_base; // Base address of shared memory - int32_t sm_size; // Total size of shared memory - + uint64_t sm_size; // Total size of shared memory + // Quick pointers into shared memory regions PTO2SharedMemoryHeader* header; PTO2TaskDescriptor* task_descriptors; @@ -101,12 +101,12 @@ typedef struct { /** * Calculate required shared memory size - * + * * @param task_window_size Number of 
task slots * @param dep_list_pool_size Number of dependency list entries * @return Total bytes required */ -int32_t pto2_sm_calculate_size(int32_t task_window_size, int32_t dep_list_pool_size); +uint64_t pto2_sm_calculate_size(uint64_t task_window_size, uint64_t dep_list_pool_size); /** * Create shared memory for Orchestrator and Scheduler @@ -119,9 +119,9 @@ int32_t pto2_sm_calculate_size(int32_t task_window_size, int32_t dep_list_pool_s * @param dep_list_pool_size Number of dependency list entries * @return Handle with both views, or NULL on failure */ -PTO2SharedMemoryHandle* pto2_sm_create(int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size); +PTO2SharedMemoryHandle* pto2_sm_create(uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size); /** * Create shared memory with default sizes @@ -140,10 +140,10 @@ PTO2SharedMemoryHandle* pto2_sm_create_default(void); * @return Handle, or NULL on failure */ PTO2SharedMemoryHandle* pto2_sm_create_from_buffer(void* sm_base, - int32_t sm_size, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size); + uint64_t sm_size, + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size); /** * Destroy shared memory and free resources @@ -155,9 +155,9 @@ void pto2_sm_destroy(PTO2SharedMemoryHandle* handle); * Called after memory is allocated */ void pto2_sm_init_header(PTO2SharedMemoryHandle* handle, - int32_t task_window_size, - int32_t heap_size, - int32_t dep_list_pool_size); + uint64_t task_window_size, + uint64_t heap_size, + uint64_t dep_list_pool_size); /** * Reset shared memory to initial state (for reuse) @@ -169,9 +169,9 @@ void pto2_sm_reset(PTO2SharedMemoryHandle* handle); * Get task descriptor by task ID * Uses runtime window_size for ring buffer indexing (not compile-time constant) */ -static inline PTO2TaskDescriptor* pto2_sm_get_task(PTO2SharedMemoryHandle* handle, +static inline PTO2TaskDescriptor* 
pto2_sm_get_task(PTO2SharedMemoryHandle* handle, int32_t task_id) { - int32_t window_mask = handle->header->task_window_size - 1; + uint64_t window_mask = handle->header->task_window_size - 1; return &handle->task_descriptors[task_id & window_mask]; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp index aebace93..ce7e923d 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp @@ -27,7 +27,7 @@ // Initialization and Destruction // ============================================================================= -bool pto2_tensormap_init(PTO2TensorMap* tm, int32_t num_buckets, int32_t pool_size) { +bool pto2_tensormap_init(PTO2TensorMap* tm, uint64_t num_buckets, uint64_t pool_size) { // Validate power of 2 for fast modulo if ((num_buckets & (num_buckets - 1)) != 0) { return false; // num_buckets must be power of 2 @@ -40,7 +40,7 @@ bool pto2_tensormap_init(PTO2TensorMap* tm, int32_t num_buckets, int32_t pool_si } // Initialize all buckets to empty (-1) - for (int32_t i = 0; i < num_buckets; i++) { + for (uint64_t i = 0; i < num_buckets; i++) { tm->buckets[i] = -1; } @@ -58,7 +58,7 @@ bool pto2_tensormap_init(PTO2TensorMap* tm, int32_t num_buckets, int32_t pool_si tm->pool_head = 0; // Initialize all entries as not in bucket - for (int32_t i = 0; i < pool_size; i++) { + for (uint64_t i = 0; i < pool_size; i++) { tm->entry_pool[i].in_bucket = false; tm->entry_pool[i].next_in_bucket = -1; tm->entry_pool[i].prev_in_bucket = -1; @@ -110,12 +110,12 @@ void pto2_tensormap_destroy(PTO2TensorMap* tm) { void pto2_tensormap_reset(PTO2TensorMap* tm) { // Reset all buckets to empty - for (int32_t i = 0; i < tm->num_buckets; i++) { + for (uint64_t i = 0; i < tm->num_buckets; i++) { tm->buckets[i] = -1; } // Reset all entries - for (int32_t i = 0; i < tm->pool_size; i++) { + for (uint64_t i = 0; i < 
tm->pool_size; i++) { tm->entry_pool[i].in_bucket = false; tm->entry_pool[i].next_in_bucket = -1; tm->entry_pool[i].prev_in_bucket = -1; @@ -155,7 +155,7 @@ uint32_t pto2_tensormap_hash(PTO2TensorMap* tm, Tensor* tensor) { // Region A: base=X, offset=0 → bucket 5 // Region B: base=X, offset=128 → bucket 5 (CORRECT! Same bucket) // - uint64_t key = (uint64_t)(uintptr_t)tensor->buffer.addr; + uint64_t key = tensor->buffer.addr; // Improve distribution by mixing bits (pointers often have aligned low bits) key = key ^ (key >> 16); @@ -298,13 +298,13 @@ void pto2_tensormap_lookup(PTO2TensorMap* tm, Tensor* tensor, PTO2LookupResult* void pto2_tensormap_insert(PTO2TensorMap* tm, Tensor* tensor, int32_t producer_task_id, bool with_alloc) { // Allocate entry from ring buffer pool - int32_t entry_offset = tm->pool_head; + uint64_t entry_offset = tm->pool_head; PTO2TensorMapEntry* entry = &tm->entry_pool[entry_offset]; // Advance pool head (wrap around) tm->pool_head = (tm->pool_head + 1) % tm->pool_size; - size_t wait_count = 0; + uint64_t wait_count = 0; while (entry->in_bucket) { pto2_orchestrator_sync_tensormap(tm); always_assert(wait_count++ <= 1000000000UL); @@ -321,9 +321,9 @@ void pto2_tensormap_insert(PTO2TensorMap* tm, Tensor* tensor, int32_t producer_t entry->prev_in_bucket = -1; // New head has no predecessor // Update old head's prev pointer if (entry->next_in_bucket >= 0) { - tm->entry_pool[entry->next_in_bucket].prev_in_bucket = entry_offset; + tm->entry_pool[entry->next_in_bucket].prev_in_bucket = (int32_t)entry_offset; } - tm->buckets[bucket] = entry_offset; + tm->buckets[bucket] = (int32_t)entry_offset; entry->in_bucket = true; // Link to task's entry list (for cleanup) @@ -332,9 +332,9 @@ void pto2_tensormap_insert(PTO2TensorMap* tm, Tensor* tensor, int32_t producer_t entry->prev_in_task = -1; // New head has no predecessor // Update old head's prev pointer if (entry->next_in_task >= 0) { - tm->entry_pool[entry->next_in_task].prev_in_task = entry_offset; + 
tm->entry_pool[entry->next_in_task].prev_in_task = (int32_t)entry_offset; } - tm->task_entry_head[task_slot] = entry_offset; + tm->task_entry_head[task_slot] = (int32_t)entry_offset; } // ============================================================================= @@ -350,7 +350,7 @@ void pto2_tensormap_print_stats(PTO2TensorMap* tm) { int32_t non_empty_buckets = 0; // Count entries - for (int32_t i = 0; i < tm->pool_size; i++) { + for (uint64_t i = 0; i < tm->pool_size; i++) { if (tm->entry_pool[i].in_bucket) { if (pto2_tensormap_entry_valid(tm, &tm->entry_pool[i])) { valid++; @@ -361,7 +361,7 @@ } // Count bucket stats - for (int32_t b = 0; b < tm->num_buckets; b++) { + for (uint64_t b = 0; b < tm->num_buckets; b++) { int32_t chain_len = 0; int32_t offset = tm->buckets[b]; @@ -382,9 +382,9 @@ } printf("=== TensorMap Statistics ===\n"); - printf("Pool size: %d\n", tm->pool_size); - printf("Pool head: %d\n", tm->pool_head); - printf("Num buckets: %d\n", tm->num_buckets); + printf("Pool size: %llu\n", (unsigned long long)tm->pool_size); + printf("Pool head: %llu\n", (unsigned long long)tm->pool_head); + printf("Num buckets: %llu\n", (unsigned long long)tm->num_buckets); printf("Valid entries: %d\n", valid); printf("Stale entries: %d\n", stale); printf("Empty buckets: %d\n", empty_buckets); @@ -397,7 +397,7 @@ int32_t pto2_tensormap_valid_count(PTO2TensorMap* tm) { int32_t count = 0; - for (int32_t i = 0; i < tm->pool_size; i++) { + for (uint64_t i = 0; i < tm->pool_size; i++) { if (tm->entry_pool[i].in_bucket && pto2_tensormap_entry_valid(tm, &tm->entry_pool[i])) { count++; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 6e974cde..ce0091c0 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ 
b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -74,13 +74,13 @@ typedef struct { */ typedef struct { // Hash table buckets (fixed size, power of 2) - int32_t* buckets; // Array of offsets into entry_pool (-1 = empty) - int32_t num_buckets; // Must be power of 2 for fast modulo + int32_t* buckets; // Array of offsets into entry_pool (-1 = empty) + uint64_t num_buckets; // Must be power of 2 for fast modulo // Entry pool as ring buffer PTO2TensorMapEntry* entry_pool; // Ring buffer of entries - int32_t pool_size; // Total pool capacity - int32_t pool_head; // Next allocation position (wraps around) + uint64_t pool_size; // Total pool capacity + uint64_t pool_head; // Next allocation position (wraps around) // Per-task entry tracking (for efficient bucket cleanup) int32_t* task_entry_head; // Per-task head offset (-1 = no entries) @@ -104,7 +104,7 @@ typedef struct { * @param pool_size Size of entry pool * @return true on success, false on allocation failure */ -bool pto2_tensormap_init(PTO2TensorMap* tm, int32_t num_buckets, int32_t pool_size); +bool pto2_tensormap_init(PTO2TensorMap* tm, uint64_t num_buckets, uint64_t pool_size); /** * Initialize TensorMap with default sizes diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp index f1d8be3d..5fe14382 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp @@ -43,7 +43,7 @@ Runtime::Runtime() { // Tensor Pair Management // ============================================================================= -void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size) { +void Runtime::record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size) { if (tensor_pair_count >= RUNTIME_MAX_TENSOR_PAIRS) { fprintf(stderr, "[Runtime] ERROR: Tensor pairs full (max=%d)\n", RUNTIME_MAX_TENSOR_PAIRS); return; @@ -86,7 +86,7 @@ void 
Runtime::set_pto2_gm_heap(void* p) { pto2_gm_heap_ptr_ = p; } void Runtime::set_orch_args(uint64_t* args, int count) { orch_arg_count_ = count <= RUNTIME_MAX_ARGS ? count : RUNTIME_MAX_ARGS; if (args && orch_arg_count_ > 0) { - memcpy(orch_args_storage_, args, (size_t)orch_arg_count_ * sizeof(uint64_t)); + memcpy(orch_args_storage_, args, (uint64_t)orch_arg_count_ * sizeof(uint64_t)); // Note: We no longer store orch_args_ pointer as it would contain host address // get_orch_args() now computes address from embedded storage directly } @@ -94,14 +94,14 @@ void Runtime::set_orch_args(uint64_t* args, int count) { // Device orchestration SO binary (for dlopen on AICPU thread 3) // Copies data to internal storage to avoid lifetime issues with Python ctypes arrays -void Runtime::set_device_orch_so(const void* data, size_t size) { +void Runtime::set_device_orch_so(const void* data, uint64_t size) { if (data == nullptr || size == 0) { device_orch_so_size_ = 0; return; } if (size > RUNTIME_MAX_ORCH_SO_SIZE) { - fprintf(stderr, "[Runtime] ERROR: Orchestration SO too large (%zu > %d)\n", - size, RUNTIME_MAX_ORCH_SO_SIZE); + fprintf(stderr, "[Runtime] ERROR: Orchestration SO too large (%llu > %d)\n", + (unsigned long long)size, RUNTIME_MAX_ORCH_SO_SIZE); device_orch_so_size_ = 0; return; } @@ -113,7 +113,7 @@ const void* Runtime::get_device_orch_so_data() const { return device_orch_so_size_ > 0 ? 
device_orch_so_storage_ : nullptr; } -size_t Runtime::get_device_orch_so_size() const { +uint64_t Runtime::get_device_orch_so_size() const { return device_orch_so_size_; } diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 20e8183b..7902ee63 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -89,7 +89,7 @@ struct Handshake { struct TensorPair { void* host_ptr; void* dev_ptr; - size_t size; + uint64_t size; }; /** @@ -97,11 +97,11 @@ struct TensorPair { * Allows runtime to use pluggable device memory backends. */ struct HostApi { - void* (*device_malloc)(size_t size); + void* (*device_malloc)(uint64_t size); void (*device_free)(void* dev_ptr); - int (*copy_to_device)(void* dev_ptr, const void* host_ptr, size_t size); - int (*copy_from_device)(void* host_ptr, const void* dev_ptr, size_t size); - uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, size_t bin_size); + int (*copy_to_device)(void* dev_ptr, const void* host_ptr, uint64_t size); + int (*copy_from_device)(void* host_ptr, const void* dev_ptr, uint64_t size); + uint64_t (*upload_kernel_binary)(int func_id, const uint8_t* bin_data, uint64_t bin_size); }; /** @@ -160,7 +160,7 @@ class Runtime { // Device orchestration SO binary (for dlopen on AICPU thread 3) // Stored as a copy to avoid lifetime issues with Python ctypes arrays uint8_t device_orch_so_storage_[RUNTIME_MAX_ORCH_SO_SIZE]; - size_t device_orch_so_size_; + uint64_t device_orch_so_size_; public: /** @@ -175,7 +175,7 @@ class Runtime { /** * Record a host-device tensor pair for copy-back during finalize. */ - void record_tensor_pair(void* host_ptr, void* dev_ptr, size_t size); + void record_tensor_pair(void* host_ptr, void* dev_ptr, uint64_t size); /** * Get pointer to tensor pairs array. 
@@ -207,9 +207,9 @@ class Runtime { void set_orch_args(uint64_t* args, int count); // Device orchestration SO binary (for dlopen on AICPU thread 3) - void set_device_orch_so(const void* data, size_t size); + void set_device_orch_so(const void* data, uint64_t size); const void* get_device_orch_so_data() const; - size_t get_device_orch_so_size() const; + uint64_t get_device_orch_so_size() const; uint64_t get_function_bin_addr(int func_id) const; void set_function_bin_addr(int func_id, uint64_t addr); diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index ee9ca815..e7d58b06 100644 --- a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -60,7 +60,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ uint8_t* sij_raw, float scale using TileScalarDN = Tile; TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijDyn sijDynTile(static_cast(valid_len)); TileSijPad sijPadTile; TileVecMxN pijTile; TileVecMxN tmpTile; diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 955eca58..1fad9193 100644 --- a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -95,11 +95,11 @@ extern "C" int orchestration(Runtime* runtime) { int num_head_tiles = (num_heads + q_tile_size - 1) / q_tile_size; // Buffer sizes for per-block intermediates - size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); - size_t pij_size = static_cast(q_tile_size) * block_size * 
sizeof(uint16_t); - size_t mij_size = static_cast(q_tile_size) * sizeof(float); - size_t lij_size = mij_size; - size_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); + uint64_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + uint64_t mij_size = static_cast(q_tile_size) * sizeof(float); + uint64_t lij_size = mij_size; + uint64_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); // Allocate per-block intermediate buffers on device (HBM) int total_buffers = batch * max_num_blocks; @@ -119,9 +119,9 @@ extern "C" int orchestration(Runtime* runtime) { // Per-(batch, head_tile) accumulators int total_accums = batch * num_head_tiles; - size_t mi_size = static_cast(q_tile_size) * sizeof(float); - size_t li_size = mi_size; - size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t mi_size = static_cast(q_tile_size) * sizeof(float); + uint64_t li_size = mi_size; + uint64_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); void** dev_mi_arr = new void*[total_accums]; void** dev_li_arr = new void*[total_accums]; diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index ee9ca815..e7d58b06 100644 --- a/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -60,7 +60,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ uint8_t* sij_raw, float scale using TileScalarDN = Tile; TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijDyn sijDynTile(static_cast(valid_len)); TileSijPad sijPadTile; TileVecMxN pijTile; TileVecMxN tmpTile; diff --git 
a/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 46d26e7d..bb33419b 100644 --- a/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -37,13 +37,13 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) void* host_out = reinterpret_cast(args[5]); int64_t* host_config = reinterpret_cast(args[6]); - size_t query_size = static_cast(args[7]); - size_t key_cache_size = static_cast(args[8]); - size_t value_cache_size = static_cast(args[9]); - size_t block_table_size = static_cast(args[10]); - size_t context_lens_size = static_cast(args[11]); - size_t out_size = static_cast(args[12]); - size_t config_size = static_cast(args[13]); + uint64_t query_size = static_cast(args[7]); + uint64_t key_cache_size = static_cast(args[8]); + uint64_t value_cache_size = static_cast(args[9]); + uint64_t block_table_size = static_cast(args[10]); + uint64_t context_lens_size = static_cast(args[11]); + uint64_t out_size = static_cast(args[12]); + uint64_t config_size = static_cast(args[13]); int batch = static_cast(host_config[0]); int num_heads = static_cast(host_config[1]); @@ -79,11 +79,11 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) runtime->record_tensor_pair(host_out, dev_out, out_size); // Buffer sizes depend on q_tile_size and block_size - size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); - size_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); - size_t mij_size = static_cast(q_tile_size) * sizeof(float); - size_t lij_size = mij_size; - size_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t sij_size = static_cast(q_tile_size) * block_size 
* sizeof(float); + uint64_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + uint64_t mij_size = static_cast(q_tile_size) * sizeof(float); + uint64_t lij_size = mij_size; + uint64_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); // Per-batch-per-block intermediate buffers int total_buffers = batch * max_num_blocks; @@ -103,9 +103,9 @@ int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) // Per-(batch, head_tile) accumulators int total_accums = batch * num_head_tiles; - size_t mi_size = static_cast(q_tile_size) * sizeof(float); - size_t li_size = mi_size; - size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + uint64_t mi_size = static_cast(q_tile_size) * sizeof(float); + uint64_t li_size = mi_size; + uint64_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); void** dev_mi_arr = new void*[total_accums]; void** dev_li_arr = new void*[total_accums]; diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index 0980c2d1..1d54c9bd 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -63,7 +63,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, using TileScalarDN = Tile; TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijDyn sijDynTile(valid_len); TileSijPad sijPadTile; TileVecMxN pijTile; TileVecMxN tmpTile; diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 52006506..2d7f2b09 100644 --- 
a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -73,13 +73,13 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim int64_t* host_config = reinterpret_cast(args[6]); // Extract sizes (next 7) - size_t query_size = static_cast(args[7]); - size_t key_cache_size = static_cast(args[8]); - size_t value_cache_size = static_cast(args[9]); - size_t block_table_size = static_cast(args[10]); - size_t context_lens_size = static_cast(args[11]); - size_t out_size = static_cast(args[12]); - size_t config_size = static_cast(args[13]); + uint64_t query_size = static_cast(args[7]); + uint64_t key_cache_size = static_cast(args[8]); + uint64_t value_cache_size = static_cast(args[9]); + uint64_t block_table_size = static_cast(args[10]); + uint64_t context_lens_size = static_cast(args[11]); + uint64_t out_size = static_cast(args[12]); + uint64_t config_size = static_cast(args[13]); // Extract config parameters uint64_t batch = static_cast(static_cast(host_config[0]));