Skip to content

Commit d25cc44

Browse files
authored
Qualcomm AI Engine Direct - Remove legacy code related to the shared buffer (#16000)
Summary: - Remove PreRegisterMem, as it is legacy code used for registering custom memory before execution. cc: @haowhsu-quic
1 parent 2ebed88 commit d25cc44

File tree

9 files changed

+60
-160
lines changed

9 files changed

+60
-160
lines changed

backends/qualcomm/runtime/QnnExecuTorch.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,6 @@ void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);
6969
/// handle to tensor wrapper during execution
7070
void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem);
7171

72-
/// Add custom mem tensor info. Help to bring forward the memHandle creating
73-
/// time from execution to initialization.
74-
void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info);
75-
7672
/// Free the allocated shared memory.
7773
void QnnExecuTorchFreeCustomMem(void* buffer_ptr);
7874

backends/qualcomm/runtime/QnnManager.cpp

Lines changed: 8 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -124,52 +124,6 @@ Error QnnManager::LoadQnnLibrary() {
124124
return ret;
125125
}
126126

127-
Error QnnManager::PreRegisterMem() {
128-
SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
129-
for (const auto info : shared_buffer_manager.GetCustomMemTensorInfoSet()) {
130-
void* unaligned_custom_mem_base =
131-
shared_buffer_manager.GetUnAlignedAddr(info.custom_mem);
132-
133-
size_t tensor_offset = (static_cast<char*>(info.custom_mem) -
134-
static_cast<char*>(unaligned_custom_mem_base)) +
135-
info.pos;
136-
size_t total_custom_mem_size =
137-
shared_buffer_manager.GetAllocatedSize(info.custom_mem);
138-
139-
int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base);
140-
if (mem_fd == -1) {
141-
QNN_EXECUTORCH_LOG_WARN(
142-
"PreRegisterMem failed to get file descriptor.",
143-
"custom_mem: %p",
144-
"tensor_addr: %p",
145-
"pos: %uz",
146-
"tensor_bytes: %uz",
147-
"shape: %p",
148-
"rank: %zu",
149-
"qnn_dtype: %X",
150-
info.custom_mem,
151-
info.tensor_addr,
152-
info.pos,
153-
info.tensor_bytes,
154-
info.shape,
155-
info.rank,
156-
info.dtype);
157-
return Error::Internal;
158-
}
159-
160-
ET_CHECK_OR_RETURN_ERROR(
161-
backend_params_ptr_->qnn_mem_manager_ptr_->PreRegisterCustomMemHandle(
162-
mem_fd,
163-
unaligned_custom_mem_base,
164-
total_custom_mem_size,
165-
tensor_offset,
166-
info) == Error::Ok,
167-
Internal,
168-
"Fail to register to shared memory.");
169-
}
170-
return Error::Ok;
171-
}
172-
173127
Error QnnManager::RegisterMem(
174128
void* data_ptr,
175129
const std::shared_ptr<TensorWrapper>& tensor_wrapper) {
@@ -256,6 +210,9 @@ Error QnnManager::RegisterCustomMem(
256210

257211
Qnn_MemHandle_t pre_registered_handle =
258212
backend_params_ptr_->qnn_mem_manager_ptr_->GetPreRegisteredHandle(info);
213+
// If this memory block has already been registered, we can use it directly.
214+
// This applies when running llama in lookahead mode with the same AR-N model
215+
// handling both the prompt processor and the token generator.
259216
if (pre_registered_handle != nullptr) {
260217
if (get_option(options_->log_level()) >=
261218
QnnExecuTorchLogLevel::kLogLevelInfo) {
@@ -268,15 +225,15 @@ Error QnnManager::RegisterCustomMem(
268225
}
269226

270227
SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
271-
void* unaligned_custom_mem_base =
272-
shared_buffer_manager.GetUnAlignedAddr(custom_mem_base);
273228

274-
size_t tensor_offset = static_cast<char*>(custom_mem_base) -
275-
static_cast<char*>(unaligned_custom_mem_base) + info.pos;
229+
size_t tensor_offset = info.pos;
276230
size_t total_custom_mem_size =
277231
shared_buffer_manager.GetAllocatedSize(custom_mem_base);
278232

279-
int32_t mem_fd = shared_buffer_manager.MemToFd(unaligned_custom_mem_base);
233+
int32_t mem_fd = shared_buffer_manager.MemToFd(custom_mem_base);
234+
// Note: If obtaining the file descriptor fails, it may be due to memory not
235+
// being released with QnnExecuTorchFreeCustomMem. In this situation, we could
236+
// consider adding a map to monitor it.
280237
if (mem_fd == -1) {
281238
QNN_EXECUTORCH_LOG_WARN(
282239
"Tensor name %s failed to get file descriptor.",
@@ -289,7 +246,6 @@ Error QnnManager::RegisterCustomMem(
289246
tensor_wrapper,
290247
mem_fd,
291248
data_ptr,
292-
unaligned_custom_mem_base,
293249
total_custom_mem_size,
294250
tensor_offset,
295251
info) == Error::Ok,
@@ -355,13 +311,6 @@ Error QnnManager::Init() {
355311
BackendInitializeState::INITIALIZED;
356312
}
357313

358-
#if defined(__aarch64__)
359-
ET_CHECK_OR_RETURN_ERROR(
360-
PreRegisterMem() == Error::Ok,
361-
Internal,
362-
"Fail to pre register custom memory handle");
363-
#endif
364-
365314
if (IsOnlinePrepare()) {
366315
Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT};
367316
qnn_loaded_backend_.GetQnnInterface().qnn_backend_get_api_version(
@@ -697,8 +646,3 @@ void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem) {
697646
executorch::backends::qnn::SharedBuffer::GetSharedBufferManager()
698647
.AddCusomMemTensorAddr(tensor_addr, custom_mem);
699648
}
700-
701-
void QnnExecuTorchAddCustomMemTensorInfo(const CustomMemTensorInfo& info) {
702-
executorch::backends::qnn::SharedBuffer::GetSharedBufferManager()
703-
.AddCusomMemTensorInfo(info);
704-
}

backends/qualcomm/runtime/SharedBuffer.cpp

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,6 @@ void* SharedBuffer::GetCustomMemBase(void* buf) {
6969
return it->second;
7070
}
7171

72-
void* SharedBuffer::GetUnAlignedAddr(void* buf) {
73-
auto it = restore_map_.find(buf);
74-
if (it == restore_map_.end()) {
75-
return nullptr;
76-
}
77-
return it->second;
78-
}
79-
8072
size_t SharedBuffer::GetAllocatedSize(void* buf) {
8173
auto it = allocated_size_map_.find(buf);
8274
if (it == allocated_size_map_.end()) {
@@ -123,10 +115,10 @@ void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) {
123115
QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory.");
124116
return nullptr;
125117
}
126-
allocated_size_map_.insert({buf, allocate_bytes});
127118
auto aligned_buf = reinterpret_cast<void*>(
128119
alignTo(alignment, reinterpret_cast<intptr_t>(buf)));
129120
bool status = restore_map_.insert({aligned_buf, buf}).second;
121+
allocated_size_map_.insert({aligned_buf, allocate_bytes});
130122
if (!status) {
131123
QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory.");
132124
rpc_mem_free_(buf);
@@ -152,6 +144,15 @@ void SharedBuffer::FreeMem(void* buf) {
152144
} else {
153145
rpc_mem_free_(restore_map_[buf]);
154146
restore_map_.erase(buf);
147+
allocated_size_map_.erase(buf);
148+
// Unbind the custom memory from tensor address.
149+
auto mit = custom_mem_to_tensor_addr_.find(buf);
150+
if (mit != custom_mem_to_tensor_addr_.end()) {
151+
for (auto it = mit->second.begin(); it != mit->second.end(); ++it) {
152+
tensor_addr_to_custom_mem_.erase(*it);
153+
}
154+
custom_mem_to_tensor_addr_.erase(buf);
155+
}
155156
}
156157
}
157158

@@ -185,14 +186,18 @@ Error SharedBuffer::Load() {
185186
}
186187

187188
void SharedBuffer::AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem) {
188-
tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem});
189+
bool status =
190+
tensor_addr_to_custom_mem_.insert({tensor_addr, custom_mem}).second;
191+
if (!status) {
192+
QNN_EXECUTORCH_LOG_WARN(
193+
"Tensor address %p already associated with custom memory %p",
194+
tensor_addr,
195+
custom_mem);
196+
return;
197+
}
198+
custom_mem_to_tensor_addr_[custom_mem].insert(tensor_addr);
189199
};
190200

191-
void SharedBuffer::AddCusomMemTensorInfo(const CustomMemTensorInfo& info) {
192-
custom_mem_tensor_info_set_.insert(info);
193-
tensor_addr_to_custom_mem_.insert({info.tensor_addr, info.custom_mem});
194-
}
195-
196201
Error SharedBuffer::UnLoad() {
197202
if (dlclose(lib_cdsp_rpc_) != 0) {
198203
QNN_EXECUTORCH_LOG_ERROR(

backends/qualcomm/runtime/SharedBuffer.h

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -59,19 +59,10 @@ class SharedBuffer final {
5959
// memory handle is registered during execution
6060
void AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem);
6161

62-
// memory handle can be registered before execution
63-
void AddCusomMemTensorInfo(const CustomMemTensorInfo& info);
64-
6562
size_t GetAllocatedSize(void* buf);
6663

6764
void* GetCustomMemBase(void* buf);
6865

69-
void* GetUnAlignedAddr(void* buf);
70-
71-
const std::unordered_set<CustomMemTensorInfo>& GetCustomMemTensorInfoSet() {
72-
return custom_mem_tensor_info_set_;
73-
};
74-
7566
private:
7667
SharedBuffer() = default;
7768

@@ -93,7 +84,10 @@ class SharedBuffer final {
9384
std::unordered_map<void*, size_t> allocated_size_map_;
9485
// Maps for the custom memory
9586
std::unordered_map<void*, void*> tensor_addr_to_custom_mem_;
96-
std::unordered_set<CustomMemTensorInfo> custom_mem_tensor_info_set_;
87+
// After the custom memory is freed, we will ensure that no tensor addresses
88+
// remain linked to this custom memory.
89+
std::unordered_map<void*, std::unordered_set<void*>>
90+
custom_mem_to_tensor_addr_;
9791
std::atomic_bool initialize_{false};
9892
static std::mutex init_mutex_;
9993
};

backends/qualcomm/runtime/backends/QnnMemManager.cpp

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,10 @@ Error QnnMemManager::RegisterIonMem(
5656
return Error::Ok;
5757
}
5858

59-
// TODO: Find a better way to unify RegisterCustomMem and
60-
// PreRegisterCustomMemHandle
6159
Error QnnMemManager::RegisterCustomMem(
6260
const std::shared_ptr<TensorWrapper>& tensor_wrapper,
6361
int32_t mem_fd,
6462
void* mem_ptr,
65-
void* unaligned_custom_mem_base,
6663
size_t total_custom_mem_size,
6764
size_t tensor_offset,
6865
const CustomMemTensorInfo& info) {
@@ -107,46 +104,6 @@ Error QnnMemManager::RegisterCustomMem(
107104
return Error::Ok;
108105
}
109106

110-
Error QnnMemManager::PreRegisterCustomMemHandle(
111-
int32_t mem_fd,
112-
void* unaligned_custom_mem_base,
113-
size_t total_custom_mem_size,
114-
size_t tensor_offset,
115-
const CustomMemTensorInfo& info) {
116-
const QnnInterface& qnn_interface = implementation_.GetQnnInterface();
117-
Qnn_MemDescriptor_t descriptor = {
118-
{info.rank, info.shape, nullptr},
119-
scalar_type_to_qnn_dtype_[info.dtype],
120-
QNN_MEM_TYPE_CUSTOM,
121-
{{mem_fd}}};
122-
Qnn_MemHandle_t handle = nullptr;
123-
Qnn_ErrorHandle_t error = QNN_SUCCESS;
124-
125-
QnnMemHtp_Descriptor_t htp_descriptor;
126-
htp_descriptor.type = QNN_HTP_MEM_SHARED_BUFFER;
127-
htp_descriptor.size = total_custom_mem_size;
128-
129-
QnnHtpMem_SharedBufferConfig_t htpSharedBuffConfig = {mem_fd, tensor_offset};
130-
htp_descriptor.sharedBufferConfig = htpSharedBuffConfig;
131-
132-
descriptor.customInfo = &htp_descriptor;
133-
134-
error = qnn_interface.qnn_mem_register(
135-
context_->GetHandle(),
136-
&descriptor,
137-
/*numDescriptors=*/1,
138-
&handle);
139-
if (error != QNN_SUCCESS) {
140-
QNN_EXECUTORCH_LOG_WARN(
141-
"PreRegisterCustomMemHandle fail", QNN_GET_ERROR_CODE(error));
142-
return Error::Internal;
143-
}
144-
145-
pre_registered_handles_.insert({info, handle});
146-
registered_map_.insert({handle, nullptr});
147-
return Error::Ok;
148-
}
149-
150107
void* QnnMemManager::GetPreRegisteredHandle(const CustomMemTensorInfo& info) {
151108
auto it = pre_registered_handles_.find(info);
152109
if (it == pre_registered_handles_.end()) {

backends/qualcomm/runtime/backends/QnnMemManager.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,6 @@ class QnnMemManager {
3939
const std::shared_ptr<TensorWrapper>& tensor_wrapper,
4040
int32_t mem_fd,
4141
void* mem_ptr,
42-
void* unaligned_custom_mem_base,
43-
size_t total_custom_mem_size,
44-
size_t tensor_offset,
45-
const CustomMemTensorInfo& info);
46-
47-
// Pre-register custom mem handle from SharedBuffer. Bring forward the
48-
// memHandle creating time from execution to initialization.
49-
executorch::runtime::Error PreRegisterCustomMemHandle(
50-
int32_t mem_fd,
51-
void* unaligned_custom_mem_base,
5242
size_t total_custom_mem_size,
5343
size_t tensor_offset,
5444
const CustomMemTensorInfo& info);

examples/qualcomm/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,28 @@ This section outlines the essential APIs and utilities provided to streamline th
110110

111111
Creates a clean directory for storing model outputs or intermediate results. If the directory already exists, it will be deleted and recreated to ensure a consistent environment for each run.
112112

113+
## Run Inference Using Shared Buffer
114+
This section shows how to use a shared buffer for input/output tensors in QNN ExecuTorch. Placing graph inputs and outputs in shared memory avoids the costly tensor copies between the CPU and the HTP, which can significantly accelerate inference. Users must manage the shared memory resources themselves: the key idea is to use `QnnExecuTorchAllocCustomMem` to allocate a large chunk of memory on the device, then use `QnnExecuTorchFreeCustomMem` to free it after inference.
115+
116+
### Run example scripts with shared buffer
117+
You can specify `--shared_buffer` flag to run example scripts with shared buffer such as:
118+
```
119+
python mobilenet_v2.py -s <device_serial> -m "SM8550" -b path/to/build-android/ -d /path/to/imagenet-mini/val --shared_buffer
120+
```
121+
122+
### Workflow of using shared memory
123+
There are two ways to use shared buffer in QNN ExecuTorch:
124+
1. Use ION buffer (1 tensor to 1 rpc mem)
125+
- For each I/O tensor, the user calls QnnExecuTorchAllocCustomMem to request n bytes of RPC memory
126+
- For each I/O tensor, the user creates a TensorImpl with the memory address allocated above
127+
- Run inference with shared buffer
128+
- For each I/O tensor, the user calls QnnExecuTorchFreeCustomMem to free the RPC memory
129+
2. Use Custom Memory (many tensors to 1 rpc mem)
130+
- Call QnnExecuTorchAllocCustomMem to allocate a large RPC memory block capable of holding all I/O tensors
131+
- For all I/O tensors, create TensorImpl with a sufficient memory block derived from the base RPC memory address, then call QnnExecuTorchAddCustomMemTensorAddr to bind each tensor’s address to the base RPC memory.
132+
- Run inference with shared buffer
133+
- Call QnnExecuTorchFreeCustomMem to free RPC memory
134+
113135
## Additional Dependency
114136
This example requires the following Python packages:
115137
- pandas and scikit-learn: used in the mobilebert multi-class text classification example.

examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -44,20 +44,10 @@ void RpcMem::add_memory_info(
4444
it == io_pos_map_.end()) {
4545
ET_LOG(Error, "Shared buffer pointer %p is not found", data_ptr);
4646
}
47-
size_t pos = io_pos_map_[static_cast<std::byte*>(data_ptr)];
48-
uint32_t* shape = const_cast<uint32_t*>(
49-
reinterpret_cast<const uint32_t*>(tensor_info.sizes().data()));
50-
uint32_t rank = static_cast<uint32_t>(tensor_info.sizes().size());
51-
executorch::aten::ScalarType scalar_type = tensor_info.scalar_type();
52-
CustomMemTensorInfo info = {
53-
shared_buffer_base_ptr_,
54-
data_ptr,
55-
pos,
56-
data_size,
57-
shape,
58-
rank,
59-
scalar_type};
60-
QnnExecuTorchAddCustomMemTensorInfo(info);
47+
if (binded_tensor_addr_set_.find(data_ptr) == binded_tensor_addr_set_.end()) {
48+
QnnExecuTorchAddCustomMemTensorAddr(data_ptr, shared_buffer_base_ptr_);
49+
binded_tensor_addr_set_.insert(data_ptr);
50+
}
6151
};
6252

6353
} // namespace example

examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010
#include <executorch/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h>
1111
#include <unordered_map>
12+
#include <unordered_set>
1213

1314
namespace example {
1415
/**
@@ -58,6 +59,7 @@ tensor.
5859
void* shared_buffer_base_ptr_;
5960
size_t calculated_offsets_;
6061
std::unordered_map<std::byte*, size_t> io_pos_map_;
62+
std::unordered_set<void*> binded_tensor_addr_set_;
6163
};
6264

6365
} // namespace example

0 commit comments

Comments
 (0)