From f26794d91c0f0c55caff9b6af95e4ff88cab64d7 Mon Sep 17 00:00:00 2001 From: shewu Date: Tue, 18 Nov 2025 12:48:51 +0800 Subject: [PATCH 1/2] Qualcomm AI Engine Direct - Remove legacy code related to the shared buffer Summary: - Remove PreRegisterMem, as it is legacy code used for registering custom memory before execution. --- backends/qualcomm/runtime/QnnExecuTorch.h | 4 -- backends/qualcomm/runtime/QnnManager.cpp | 72 +++---------------- backends/qualcomm/runtime/SharedBuffer.cpp | 35 +++++---- backends/qualcomm/runtime/SharedBuffer.h | 14 ++-- .../runtime/backends/QnnMemManager.cpp | 43 ----------- .../qualcomm/runtime/backends/QnnMemManager.h | 10 --- .../oss_scripts/llama/runner/rpc_mem.cpp | 18 ++--- .../oss_scripts/llama/runner/rpc_mem.h | 2 + 8 files changed, 38 insertions(+), 160 deletions(-) diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index d8fbade3b3b..ccd02273c4f 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -69,10 +69,6 @@ void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment); /// handle to tensor wrapper during execution void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem); -/// Add custom mem tensor info. Help to bring forward the memHandle creating -/// time from execution to initialization. -void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info); - /// Free the allocated shared memory. 
void QnnExecuTorchFreeCustomMem(void* buffer_ptr); diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 5e3220f25d9..da4c7fbc069 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -124,52 +124,6 @@ Error QnnManager::LoadQnnLibrary() { return ret; } -Error QnnManager::PreRegisterMem() { - SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager(); - for (const auto info : shared_buffer_manager.GetCustomMemTensorInfoSet()) { - void* unaligned_custom_mem_base = - shared_buffer_manager.GetUnAlignedAddr(info.custom_mem); - - size_t tensor_offset = (static_cast(info.custom_mem) - - static_cast(unaligned_custom_mem_base)) + - info.pos; - size_t total_custom_mem_size = - shared_buffer_manager.GetAllocatedSize(info.custom_mem); - - int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base); - if (mem_fd == -1) { - QNN_EXECUTORCH_LOG_WARN( - "PreRegisterMem failed to get file descriptor.", - "custom_mem: %p", - "tensor_addr: %p", - "pos: %uz", - "tensor_bytes: %uz", - "shape: %p", - "rank: %zu", - "qnn_dtype: %X", - info.custom_mem, - info.tensor_addr, - info.pos, - info.tensor_bytes, - info.shape, - info.rank, - info.dtype); - return Error::Internal; - } - - ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_mem_manager_ptr_->PreRegisterCustomMemHandle( - mem_fd, - unaligned_custom_mem_base, - total_custom_mem_size, - tensor_offset, - info) == Error::Ok, - Internal, - "Fail to register to shared memory."); - } - return Error::Ok; -} - Error QnnManager::RegisterMem( void* data_ptr, const std::shared_ptr& tensor_wrapper) { @@ -256,6 +210,9 @@ Error QnnManager::RegisterCustomMem( Qnn_MemHandle_t pre_registered_handle = backend_params_ptr_->qnn_mem_manager_ptr_->GetPreRegisteredHandle(info); + // If this memory block has already been registered, we can use it directly. 
+ // This applies when running llama in lookahead mode with the same AR-N model + // handling both the prompt processor and the token generator. if (pre_registered_handle != nullptr) { if (get_option(options_->log_level()) >= QnnExecuTorchLogLevel::kLogLevelInfo) { @@ -268,15 +225,15 @@ Error QnnManager::RegisterCustomMem( } SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager(); - void* unaligned_custom_mem_base = - shared_buffer_manager.GetUnAlignedAddr(custom_mem_base); - size_t tensor_offset = static_cast(custom_mem_base) - - static_cast(unaligned_custom_mem_base) + info.pos; + size_t tensor_offset = info.pos; size_t total_custom_mem_size = shared_buffer_manager.GetAllocatedSize(custom_mem_base); - int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base); + int32_t mem_fd = shared_buffer_manager.MemToFd(custom_mem_base); + // Note: If obtaining the file descriptor fails, it may be due to memory not + // being released with QnnExecuTorchFreeCustomMem. In this situation, we could + // consider adding a map to monitor it. 
if (mem_fd == -1) { QNN_EXECUTORCH_LOG_WARN( "Tensor name %s failed to get file descriptor.", @@ -289,7 +246,6 @@ Error QnnManager::RegisterCustomMem( tensor_wrapper, mem_fd, data_ptr, - unaligned_custom_mem_base, total_custom_mem_size, tensor_offset, info) == Error::Ok, @@ -355,13 +311,6 @@ Error QnnManager::Init() { BackendInitializeState::INITIALIZED; } -#if defined(__aarch64__) - ET_CHECK_OR_RETURN_ERROR( - PreRegisterMem() == Error::Ok, - Internal, - "Fail to pre register custom memory handle"); -#endif - if (IsOnlinePrepare()) { Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT}; qnn_loaded_backend_.GetQnnInterface().qnn_backend_get_api_version( @@ -697,8 +646,3 @@ void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem) { executorch::backends::qnn::SharedBuffer::GetSharedBufferManager() .AddCusomMemTensorAddr(tensor_addr, custom_mem); } - -void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info) { - executorch::backends::qnn::SharedBuffer::GetSharedBufferManager() - .AddCusomMemTensorInfo(info); -} diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp index 99dee7c9a7b..d79f8041932 100644 --- a/backends/qualcomm/runtime/SharedBuffer.cpp +++ b/backends/qualcomm/runtime/SharedBuffer.cpp @@ -69,14 +69,6 @@ void* SharedBuffer::GetCustomMemBase(void* buf) { return it->second; } -void* SharedBuffer::GetUnAlignedAddr(void* buf) { - auto it = restore_map_.find(buf); - if (it == restore_map_.end()) { - return nullptr; - } - return it->second; -} - size_t SharedBuffer::GetAllocatedSize(void* buf) { auto it = allocated_size_map_.find(buf); if (it == allocated_size_map_.end()) { @@ -123,10 +115,10 @@ void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) { QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory."); return nullptr; } - allocated_size_map_.insert({buf, allocate_bytes}); auto aligned_buf = reinterpret_cast( alignTo(alignment, reinterpret_cast(buf))); bool 
status = restore_map_.insert({aligned_buf, buf}).second; + allocated_size_map_.insert({aligned_buf, allocate_bytes}); if (!status) { QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory."); rpc_mem_free_(buf); @@ -152,6 +144,15 @@ void SharedBuffer::FreeMem(void* buf) { } else { rpc_mem_free_(restore_map_[buf]); restore_map_.erase(buf); + allocated_size_map_.erase(buf); + // Unbind the custom memory from tensor address. + auto mit = custom_mem_to_tensor_addr_.find(buf); + if (mit != custom_mem_to_tensor_addr_.end()) { + for (auto it = mit->second.begin(); it != mit->second.end(); ++it) { + tensor_addr_to_custom_mem_.erase(*it); + } + custom_mem_to_tensor_addr_.erase(buf); + } } } @@ -185,14 +186,18 @@ Error SharedBuffer::Load() { } void SharedBuffer::AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem) { - tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem}); + bool status = + tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem}).second; + if (!status) { + QNN_EXECUTORCH_LOG_WARN( + "Tensor address %p already associated with custom memory %p", + tensor_addr, + custom_mem); + return; + } + custom_mem_to_tensor_addr_[custom_mem].insert(tensor_addr); }; -void SharedBuffer::AddCusomMemTensorInfo(const CustomMemTensorInfo& info) { - custom_mem_tensor_info_set_.insert(info); - tensor_addr_to_custom_mem_.insert({info.tensor_addr, info.custom_mem}); -} - Error SharedBuffer::UnLoad() { if (dlclose(lib_cdsp_rpc_) != 0) { QNN_EXECUTORCH_LOG_ERROR( diff --git a/backends/qualcomm/runtime/SharedBuffer.h b/backends/qualcomm/runtime/SharedBuffer.h index a02ea0e4c25..6bf06a6350b 100644 --- a/backends/qualcomm/runtime/SharedBuffer.h +++ b/backends/qualcomm/runtime/SharedBuffer.h @@ -59,19 +59,10 @@ class SharedBuffer final { // memory handle is registered during execution void AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem); - // memory handle can be registered before execution - void AddCusomMemTensorInfo(const CustomMemTensorInfo& 
info); - size_t GetAllocatedSize(void* buf); void* GetCustomMemBase(void* buf); - void* GetUnAlignedAddr(void* buf); - - const std::unordered_set& GetCustomMemTensorInfoSet() { - return custom_mem_tensor_info_set_; - }; - private: SharedBuffer() = default; @@ -93,7 +84,10 @@ class SharedBuffer final { std::unordered_map allocated_size_map_; // Maps for the custom memory std::unordered_map tensor_addr_to_custom_mem_; - std::unordered_set custom_mem_tensor_info_set_; + // After the custom memory is freed, we will ensure that no tensor addresses + // remain linked to this custom memory. + std::unordered_map> + custom_mem_to_tensor_addr_; std::atomic_bool initialize_{false}; static std::mutex init_mutex_; }; diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.cpp b/backends/qualcomm/runtime/backends/QnnMemManager.cpp index 3b99dd10868..05a819286dd 100644 --- a/backends/qualcomm/runtime/backends/QnnMemManager.cpp +++ b/backends/qualcomm/runtime/backends/QnnMemManager.cpp @@ -56,13 +56,10 @@ Error QnnMemManager::RegisterIonMem( return Error::Ok; } -// TODO: Find a better way to unify RegisterCustomMem and -// PreRegisterCustomMemHandle Error QnnMemManager::RegisterCustomMem( const std::shared_ptr& tensor_wrapper, int32_t mem_fd, void* mem_ptr, - void* unaligned_custom_mem_base, size_t total_custom_mem_size, size_t tensor_offset, const CustomMemTensorInfo& info) { @@ -107,46 +104,6 @@ Error QnnMemManager::RegisterCustomMem( return Error::Ok; } -Error QnnMemManager::PreRegisterCustomMemHandle( - int32_t mem_fd, - void* unaligned_custom_mem_base, - size_t total_custom_mem_size, - size_t tensor_offset, - const CustomMemTensorInfo& info) { - const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); - Qnn_MemDescriptor_t descriptor = { - {info.rank, info.shape, nullptr}, - scalar_type_to_qnn_dtype_[info.dtype], - QNN_MEM_TYPE_CUSTOM, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - QnnMemHtp_Descriptor_t 
htp_descriptor; - htp_descriptor.type = QNN_HTP_MEM_SHARED_BUFFER; - htp_descriptor.size = total_custom_mem_size; - - QnnHtpMem_SharedBufferConfig_t htpSharedBuffConfig = {mem_fd, tensor_offset}; - htp_descriptor.sharedBufferConfig = htpSharedBuffConfig; - - descriptor.customInfo = &htp_descriptor; - - error = qnn_interface.qnn_mem_register( - context_->GetHandle(), - &descriptor, - /*numDescriptors=*/1, - &handle); - if (error != QNN_SUCCESS) { - QNN_EXECUTORCH_LOG_WARN( - "PreRegisterCustomMemHandle fail", QNN_GET_ERROR_CODE(error)); - return Error::Internal; - } - - pre_registered_handles_.insert({info, handle}); - registered_map_.insert({handle, nullptr}); - return Error::Ok; -} - void* QnnMemManager::GetPreRegisteredHandle(const CustomMemTensorInfo& info) { auto it = pre_registered_handles_.find(info); if (it == pre_registered_handles_.end()) { diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.h b/backends/qualcomm/runtime/backends/QnnMemManager.h index 6a7f00b016a..f4d3beda2c2 100644 --- a/backends/qualcomm/runtime/backends/QnnMemManager.h +++ b/backends/qualcomm/runtime/backends/QnnMemManager.h @@ -39,16 +39,6 @@ class QnnMemManager { const std::shared_ptr& tensor_wrapper, int32_t mem_fd, void* mem_ptr, - void* unaligned_custom_mem_base, - size_t total_custom_mem_size, - size_t tensor_offset, - const CustomMemTensorInfo& info); - - // Pre-register custom mem handle from SharedBuffer. Bring forward the - // memHandle creating time from execution to initialization. 
- executorch::runtime::Error PreRegisterCustomMemHandle( - int32_t mem_fd, - void* unaligned_custom_mem_base, size_t total_custom_mem_size, size_t tensor_offset, const CustomMemTensorInfo& info); diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp index f0cc6d9a7a2..67d7ec80aab 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp @@ -44,20 +44,10 @@ void RpcMem::add_memory_info( it == io_pos_map_.end()) { ET_LOG(Error, "Shared buffer pointer %p is not found", data_ptr); } - size_t pos = io_pos_map_[static_cast(data_ptr)]; - uint32_t* shape = const_cast( - reinterpret_cast(tensor_info.sizes().data())); - uint32_t rank = static_cast(tensor_info.sizes().size()); - executorch::aten::ScalarType scalar_type = tensor_info.scalar_type(); - CustomMemTensorInfo info = { - shared_buffer_base_ptr_, - data_ptr, - pos, - data_size, - shape, - rank, - scalar_type}; - QnnExecuTorchAddCustomMemTensorInfo(info); + if (binded_tensor_addr_set_.find(data_ptr) == binded_tensor_addr_set_.end()) { + QnnExecuTorchAddCustomMemTensorAddr(data_ptr, shared_buffer_base_ptr_); + binded_tensor_addr_set_.insert(data_ptr); + } }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h index 08c92741545..99e9cb1dec1 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h +++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h @@ -9,6 +9,7 @@ #pragma once #include #include +#include namespace example { /** @@ -58,6 +59,7 @@ tensor. 
void* shared_buffer_base_ptr_; size_t calculated_offsets_; std::unordered_map io_pos_map_; + std::unordered_set binded_tensor_addr_set_; }; } // namespace example From 90ecc4aaaf4fcb7b03050ac1838162b2dce0f71c Mon Sep 17 00:00:00 2001 From: shewu Date: Tue, 2 Dec 2025 09:11:52 +0800 Subject: [PATCH 2/2] - Add a section to describe how to use shared buffer in QNN ExecuTorch --- examples/qualcomm/README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md index 31443f2d356..e10f506ceb4 100644 --- a/examples/qualcomm/README.md +++ b/examples/qualcomm/README.md @@ -110,6 +110,28 @@ This section outlines the essential APIs and utilities provided to streamline th Creates a clean directory for storing model outputs or intermediate results. If the directory already exists, it will be deleted and recreated to ensure a consistent environment for each run. +## Run Inference Using Shared Buffer +This section shows how to use a shared buffer for input/output tensors in QNN ExecuTorch, by placing graph inputs and outputs in shared memory to avoid the large cost of copying tensors between the CPU and the HTP. This feature can accelerate inference speed. Users need to manage the shared memory resources by themselves. The key idea is to use `QnnExecuTorchAllocCustomMem` to allocate a large chunk of memory on the device, then use `QnnExecuTorchFreeCustomMem` to free it after inference. + +### Run example scripts with shared buffer +You can specify the `--shared_buffer` flag to run example scripts with a shared buffer, such as: +``` +python mobilenet_v2.py -s -m "SM8550" -b path/to/build-android/ -d /path/to/imagenet-mini/val --shared_buffer +``` + +### Workflow of using shared memory +There are two ways to use shared buffer in QNN ExecuTorch: +1. 
Use ION buffer (1 tensor to 1 rpc mem) + - For all I/O tensors, the user calls QnnExecuTorchAllocCustomMem to request n bytes of RPC memory + - For all I/O tensors, the user creates TensorImpl with the above memory address + - Run inference with shared buffer + - For all I/O tensors, the user calls QnnExecuTorchFreeCustomMem to free RPC memory +2. Use Custom Memory (many tensors to 1 rpc mem) + - Call QnnExecuTorchAllocCustomMem to allocate a large RPC memory block capable of holding all I/O tensors + - For all I/O tensors, create TensorImpl with a sufficient memory block derived from the base RPC memory address, then call QnnExecuTorchAddCustomMemTensorAddr to bind each tensor’s address to the base RPC memory. + - Run inference with shared buffer + - Call QnnExecuTorchFreeCustomMem to free RPC memory + ## Additional Dependency This example requires the following Python packages: - pandas and scikit-learn: used in the mobilebert multi-class text classification example.