Skip to content

Commit d25cc44

Browse files
authored
Qualcomm AI Engine Direct - Remove legacy code related to the shared buffer (#16000)
Summary: - Remove PreRegisterMem, as it is legacy code used for registering custom memory before execution. cc: @haowhsu-quic
1 parent 2ebed88 commit d25cc44

File tree

9 files changed

+60
-160
lines changed

9 files changed

+60
-160
lines changed

backends/qualcomm/runtime/QnnExecuTorch.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,6 @@ void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);
6969
/// handle to tensor wrapper during execution
7070
void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem);
7171

72-
/// Add custom mem tensor info. Help to bring forward the memHandle creating
73-
/// time from execution to initialization.
74-
void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info);
75-
7672
/// Free the allocated shared memory.
7773
void QnnExecuTorchFreeCustomMem(void* buffer_ptr);
7874

backends/qualcomm/runtime/QnnManager.cpp

Lines changed: 8 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -124,52 +124,6 @@ Error QnnManager::LoadQnnLibrary() {
124124
return ret;
125125
}
126126

127-
Error QnnManager::PreRegisterMem() {
128-
SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
129-
for (const auto info : shared_buffer_manager.GetCustomMemTensorInfoSet()) {
130-
void* unaligned_custom_mem_base =
131-
shared_buffer_manager.GetUnAlignedAddr(info.custom_mem);
132-
133-
size_t tensor_offset = (static_cast<char*>(info.custom_mem) -
134-
static_cast<char*>(unaligned_custom_mem_base)) +
135-
info.pos;
136-
size_t total_custom_mem_size =
137-
shared_buffer_manager.GetAllocatedSize(info.custom_mem);
138-
139-
int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base);
140-
if (mem_fd == -1) {
141-
QNN_EXECUTORCH_LOG_WARN(
142-
"PreRegisterMem failed to get file descriptor.",
143-
"custom_mem: %p",
144-
"tensor_addr: %p",
145-
"pos: %uz",
146-
"tensor_bytes: %uz",
147-
"shape: %p",
148-
"rank: %zu",
149-
"qnn_dtype: %X",
150-
info.custom_mem,
151-
info.tensor_addr,
152-
info.pos,
153-
info.tensor_bytes,
154-
info.shape,
155-
info.rank,
156-
info.dtype);
157-
return Error::Internal;
158-
}
159-
160-
ET_CHECK_OR_RETURN_ERROR(
161-
backend_params_ptr_->qnn_mem_manager_ptr_->PreRegisterCustomMemHandle(
162-
mem_fd,
163-
unaligned_custom_mem_base,
164-
total_custom_mem_size,
165-
tensor_offset,
166-
info) == Error::Ok,
167-
Internal,
168-
"Fail to register to shared memory.");
169-
}
170-
return Error::Ok;
171-
}
172-
173127
Error QnnManager::RegisterMem(
174128
void* data_ptr,
175129
const std::shared_ptr<TensorWrapper>& tensor_wrapper) {
@@ -256,6 +210,9 @@ Error QnnManager::RegisterCustomMem(
256210

257211
Qnn_MemHandle_t pre_registered_handle =
258212
backend_params_ptr_->qnn_mem_manager_ptr_->GetPreRegisteredHandle(info);
213+
// If this memory block has already been registered, we can use it directly.
214+
// This applies when running llama in lookahead mode with the same AR-N model
215+
// handling both the prompt processor and the token generator.
259216
if (pre_registered_handle != nullptr) {
260217
if (get_option(options_->log_level()) >=
261218
QnnExecuTorchLogLevel::kLogLevelInfo) {
@@ -268,15 +225,15 @@ Error QnnManager::RegisterCustomMem(
268225
}
269226

270227
SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
271-
void* unaligned_custom_mem_base =
272-
shared_buffer_manager.GetUnAlignedAddr(custom_mem_base);
273228

274-
size_t tensor_offset = static_cast<char*>(custom_mem_base) -
275-
static_cast<char*>(unaligned_custom_mem_base) + info.pos;
229+
size_t tensor_offset = info.pos;
276230
size_t total_custom_mem_size =
277231
shared_buffer_manager.GetAllocatedSize(custom_mem_base);
278232

279-
int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base);
233+
int32_t mem_fd = shared_buffer_manager.MemToFd(custom_mem_base);
234+
// Note: If obtaining the file descriptor fails, it may be due to memory not
235+
// being released with QnnExecuTorchFreeCustomMem. In this situation, we could
236+
// consider adding a map to monitor it.
280237
if (mem_fd == -1) {
281238
QNN_EXECUTORCH_LOG_WARN(
282239
"Tensor name %s failed to get file descriptor.",
@@ -289,7 +246,6 @@ Error QnnManager::RegisterCustomMem(
289246
tensor_wrapper,
290247
mem_fd,
291248
data_ptr,
292-
unaligned_custom_mem_base,
293249
total_custom_mem_size,
294250
tensor_offset,
295251
info) == Error::Ok,
@@ -355,13 +311,6 @@ Error QnnManager::Init() {
355311
BackendInitializeState::INITIALIZED;
356312
}
357313

358-
#if defined(__aarch64__)
359-
ET_CHECK_OR_RETURN_ERROR(
360-
PreRegisterMem() == Error::Ok,
361-
Internal,
362-
"Fail to pre register custom memory handle");
363-
#endif
364-
365314
if (IsOnlinePrepare()) {
366315
Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT};
367316
qnn_loaded_backend_.GetQnnInterface().qnn_backend_get_api_version(
@@ -697,8 +646,3 @@ void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem) {
697646
executorch::backends::qnn::SharedBuffer::GetSharedBufferManager()
698647
.AddCusomMemTensorAddr(tensor_addr, custom_mem);
699648
}
700-
701-
void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info) {
702-
executorch::backends::qnn::SharedBuffer::GetSharedBufferManager()
703-
.AddCusomMemTensorInfo(info);
704-
}

backends/qualcomm/runtime/SharedBuffer.cpp

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,6 @@ void* SharedBuffer::GetCustomMemBase(void* buf) {
6969
return it->second;
7070
}
7171

72-
void* SharedBuffer::GetUnAlignedAddr(void* buf) {
73-
auto it = restore_map_.find(buf);
74-
if (it == restore_map_.end()) {
75-
return nullptr;
76-
}
77-
return it->second;
78-
}
79-
8072
size_t SharedBuffer::GetAllocatedSize(void* buf) {
8173
auto it = allocated_size_map_.find(buf);
8274
if (it == allocated_size_map_.end()) {
@@ -123,10 +115,10 @@ void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) {
123115
QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory.");
124116
return nullptr;
125117
}
126-
allocated_size_map_.insert({buf, allocate_bytes});
127118
auto aligned_buf = reinterpret_cast<void*>(
128119
alignTo(alignment, reinterpret_cast<intptr_t>(buf)));
129120
bool status = restore_map_.insert({aligned_buf, buf}).second;
121+
allocated_size_map_.insert({aligned_buf, allocate_bytes});
130122
if (!status) {
131123
QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory.");
132124
rpc_mem_free_(buf);
@@ -152,6 +144,15 @@ void SharedBuffer::FreeMem(void* buf) {
152144
} else {
153145
rpc_mem_free_(restore_map_[buf]);
154146
restore_map_.erase(buf);
147+
allocated_size_map_.erase(buf);
148+
// Unbind the custom memory from tensor address.
149+
auto mit = custom_mem_to_tensor_addr_.find(buf);
150+
if (mit != custom_mem_to_tensor_addr_.end()) {
151+
for (auto it = mit->second.begin(); it != mit->second.end(); ++it) {
152+
tensor_addr_to_custom_mem_.erase(*it);
153+
}
154+
custom_mem_to_tensor_addr_.erase(buf);
155+
}
155156
}
156157
}
157158

@@ -185,14 +186,18 @@ Error SharedBuffer::Load() {
185186
}
186187

187188
void SharedBuffer::AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem) {
188-
tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem});
189+
bool status =
190+
tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem}).second;
191+
if (!status) {
192+
QNN_EXECUTORCH_LOG_WARN(
193+
"Tensor address %p already associated with custom memory %p",
194+
tensor_addr,
195+
custom_mem);
196+
return;
197+
}
198+
custom_mem_to_tensor_addr_[custom_mem].insert(tensor_addr);
189199
};
190200

191-
void SharedBuffer::AddCusomMemTensorInfo(const CustomMemTensorInfo& info) {
192-
custom_mem_tensor_info_set_.insert(info);
193-
tensor_addr_to_custom_mem_.insert({info.tensor_addr, info.custom_mem});
194-
}
195-
196201
Error SharedBuffer::UnLoad() {
197202
if (dlclose(lib_cdsp_rpc_) != 0) {
198203
QNN_EXECUTORCH_LOG_ERROR(

backends/qualcomm/runtime/SharedBuffer.h

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -59,19 +59,10 @@ class SharedBuffer final {
5959
// memory handle is registered during execution
6060
void AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem);
6161

62-
// memory handle can be registered before execution
63-
void AddCusomMemTensorInfo(const CustomMemTensorInfo& info);
64-
6562
size_t GetAllocatedSize(void* buf);
6663

6764
void* GetCustomMemBase(void* buf);
6865

69-
void* GetUnAlignedAddr(void* buf);
70-
71-
const std::unordered_set<CustomMemTensorInfo>& GetCustomMemTensorInfoSet() {
72-
return custom_mem_tensor_info_set_;
73-
};
74-
7566
private:
7667
SharedBuffer() = default;
7768

@@ -93,7 +84,10 @@ class SharedBuffer final {
9384
std::unordered_map<void*, size_t> allocated_size_map_;
9485
// Maps for the custom memory
9586
std::unordered_map<void*, void*> tensor_addr_to_custom_mem_;
96-
std::unordered_set<CustomMemTensorInfo> custom_mem_tensor_info_set_;
87+
// After the custom memory is freed, we will ensure that no tensor addresses
88+
// remain linked to this custom memory.
89+
std::unordered_map<void*, std::unordered_set<void*>>
90+
custom_mem_to_tensor_addr_;
9791
std::atomic_bool initialize_{false};
9892
static std::mutex init_mutex_;
9993
};

backends/qualcomm/runtime/backends/QnnMemManager.cpp

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,10 @@ Error QnnMemManager::RegisterIonMem(
5656
return Error::Ok;
5757
}
5858

59-
// TODO: Find a better way to unify RegisterCustomMem and
60-
// PreRegisterCustomMemHandle
6159
Error QnnMemManager::RegisterCustomMem(
6260
const std::shared_ptr<TensorWrapper>& tensor_wrapper,
6361
int32_t mem_fd,
6462
void* mem_ptr,
65-
void* unaligned_custom_mem_base,
6663
size_t total_custom_mem_size,
6764
size_t tensor_offset,
6865
const CustomMemTensorInfo& info) {
@@ -107,46 +104,6 @@ Error QnnMemManager::RegisterCustomMem(
107104
return Error::Ok;
108105
}
109106

110-
Error QnnMemManager::PreRegisterCustomMemHandle(
111-
int32_t mem_fd,
112-
void* unaligned_custom_mem_base,
113-
size_t total_custom_mem_size,
114-
size_t tensor_offset,
115-
const CustomMemTensorInfo& info) {
116-
const QnnInterface& qnn_interface = implementation_.GetQnnInterface();
117-
Qnn_MemDescriptor_t descriptor = {
118-
{info.rank, info.shape, nullptr},
119-
scalar_type_to_qnn_dtype_[info.dtype],
120-
QNN_MEM_TYPE_CUSTOM,
121-
{{mem_fd}}};
122-
Qnn_MemHandle_t handle = nullptr;
123-
Qnn_ErrorHandle_t error = QNN_SUCCESS;
124-
125-
QnnMemHtp_Descriptor_t htp_descriptor;
126-
htp_descriptor.type = QNN_HTP_MEM_SHARED_BUFFER;
127-
htp_descriptor.size = total_custom_mem_size;
128-
129-
QnnHtpMem_SharedBufferConfig_t htpSharedBuffConfig = {mem_fd, tensor_offset};
130-
htp_descriptor.sharedBufferConfig = htpSharedBuffConfig;
131-
132-
descriptor.customInfo = &htp_descriptor;
133-
134-
error = qnn_interface.qnn_mem_register(
135-
context_->GetHandle(),
136-
&descriptor,
137-
/*numDescriptors=*/1,
138-
&handle);
139-
if (error != QNN_SUCCESS) {
140-
QNN_EXECUTORCH_LOG_WARN(
141-
"PreRegisterCustomMemHandle fail", QNN_GET_ERROR_CODE(error));
142-
return Error::Internal;
143-
}
144-
145-
pre_registered_handles_.insert({info, handle});
146-
registered_map_.insert({handle, nullptr});
147-
return Error::Ok;
148-
}
149-
150107
void* QnnMemManager::GetPreRegisteredHandle(const CustomMemTensorInfo& info) {
151108
auto it = pre_registered_handles_.find(info);
152109
if (it == pre_registered_handles_.end()) {

backends/qualcomm/runtime/backends/QnnMemManager.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,6 @@ class QnnMemManager {
3939
const std::shared_ptr<TensorWrapper>& tensor_wrapper,
4040
int32_t mem_fd,
4141
void* mem_ptr,
42-
void* unaligned_custom_mem_base,
43-
size_t total_custom_mem_size,
44-
size_t tensor_offset,
45-
const CustomMemTensorInfo& info);
46-
47-
// Pre-register custom mem handle from SharedBuffer. Bring forward the
48-
// memHandle creating time from execution to initialization.
49-
executorch::runtime::Error PreRegisterCustomMemHandle(
50-
int32_t mem_fd,
51-
void* unaligned_custom_mem_base,
5242
size_t total_custom_mem_size,
5343
size_t tensor_offset,
5444
const CustomMemTensorInfo& info);

examples/qualcomm/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,28 @@ This section outlines the essential APIs and utilities provided to streamline th
110110

111111
Creates a clean directory for storing model outputs or intermediate results. If the directory already exists, it will be deleted and recreated to ensure a consistent environment for each run.
112112

113+
## Run Inference Using Shared Buffer
114+
This section shows how to use a shared buffer for input/output tensors in QNN ExecuTorch. Placing graph inputs and outputs in shared memory avoids the costly tensor copies between the CPU and the HTP, which can significantly accelerate inference. Users must manage the shared memory resources themselves: the key idea is to use `QnnExecuTorchAllocCustomMem` to allocate a large chunk of memory on the device, then use `QnnExecuTorchFreeCustomMem` to free it after inference.
115+
116+
### Run example scripts with shared buffer
117+
You can specify `--shared_buffer` flag to run example scripts with shared buffer such as:
118+
```
119+
python mobilenet_v2.py -s <device_serial> -m "SM8550" -b path/to/build-android/ -d /path/to/imagenet-mini/val --shared_buffer
120+
```
121+
122+
### Workflow of using shared memory
123+
There are two ways to use shared buffer in QNN ExecuTorch:
124+
1. Use ION buffer (1 tensor to 1 rpc mem)
125+
- For each I/O tensor, the user calls QnnExecuTorchAllocCustomMem to request n bytes of RPC memory
126+
- For each I/O tensor, the user creates a TensorImpl with the memory address allocated above
127+
- Run inference with shared buffer
128+
- For each I/O tensor, the user calls QnnExecuTorchFreeCustomMem to free the RPC memory
129+
2. Use Custom Memory (many tensors to 1 rpc mem)
130+
- Call QnnExecuTorchAllocCustomMem to allocate a large RPC memory block capable of holding all I/O tensors
131+
- For all I/O tensors, create TensorImpl with a sufficient memory block derived from the base RPC memory address, then call QnnExecuTorchAddCustomMemTensorAddr to bind each tensor’s address to the base RPC memory.
132+
- Run inference with shared buffer
133+
- Call QnnExecuTorchFreeCustomMem to free RPC memory
134+
113135
## Additional Dependency
114136
This example requires the following Python packages:
115137
- pandas and scikit-learn: used in the mobilebert multi-class text classification example.

examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -44,20 +44,10 @@ void RpcMem::add_memory_info(
4444
it == io_pos_map_.end()) {
4545
ET_LOG(Error, "Shared buffer pointer %p is not found", data_ptr);
4646
}
47-
size_t pos = io_pos_map_[static_cast<std::byte*>(data_ptr)];
48-
uint32_t* shape = const_cast<uint32_t*>(
49-
reinterpret_cast<const uint32_t*>(tensor_info.sizes().data()));
50-
uint32_t rank = static_cast<uint32_t>(tensor_info.sizes().size());
51-
executorch::aten::ScalarType scalar_type = tensor_info.scalar_type();
52-
CustomMemTensorInfo info = {
53-
shared_buffer_base_ptr_,
54-
data_ptr,
55-
pos,
56-
data_size,
57-
shape,
58-
rank,
59-
scalar_type};
60-
QnnExecuTorchAddCustomMemTensorInfo(info);
47+
if (binded_tensor_addr_set_.find(data_ptr) == binded_tensor_addr_set_.end()) {
48+
QnnExecuTorchAddCustomMemTensorAddr(data_ptr, shared_buffer_base_ptr_);
49+
binded_tensor_addr_set_.insert(data_ptr);
50+
}
6151
};
6252

6353
} // namespace example

examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h>
1111
#include <unordered_map>
12+
#include <unordered_set>
1213

1314
namespace example {
1415
/**
@@ -58,6 +59,7 @@ tensor.
5859
void* shared_buffer_base_ptr_;
5960
size_t calculated_offsets_;
6061
std::unordered_map<std::byte*, size_t> io_pos_map_;
62+
std::unordered_set<void*> binded_tensor_addr_set_;
6163
};
6264

6365
} // namespace example

0 commit comments

Comments
 (0)