Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
2 changes: 1 addition & 1 deletion examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
2 changes: 1 addition & 1 deletion examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
2 changes: 1 addition & 1 deletion examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
4 changes: 0 additions & 4 deletions examples/qwen3_qnn_aot/compile_sha.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,8 @@ MLLM_MAIN({
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

Expand Down
2 changes: 1 addition & 1 deletion examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
6 changes: 2 additions & 4 deletions examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down Expand Up @@ -356,14 +356,12 @@ class Qwen3AttentionSHA final : public nn::Module {
std::string h_str = std::to_string(h);

// K: De-quantize and re-quantize to int8
auto k_h = key_states_per_head[h].to(kFloat32);
k_h = k_h.to(kUInt8PerTensorSym);
auto k_h = key_states_per_head[h].to(kUInt8PerTensorSym);
k_h = ptq::QDQ_KV(this, k_h, "k_cast_to_int8_qdq_h" + h_str);
k_h = k_h.transpose(2, 3); // [B, 1, D, S]

// V: Quantize to int16 then int8
auto v_h = ptq::QDQ(this, value_states_per_head[h], "v_cast_to_int16_qdq_h" + h_str);
v_h = v_h.to(kFloat32);
v_h = v_h.to(kUInt8PerTensorSym);
v_h = ptq::QDQ_KV(this, v_h, "v_cast_to_int8_qdq_h" + h_str);
Comment on lines 358 to 366
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify the K/V quantization patterns across SHA files
echo "=== Qwen3 SHA (this file) ==="
rg -n "key_states_per_head\[h\]\.to\(" examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp

echo ""
echo "=== Qwen2 SHA ==="
rg -n "key_states_per_head\[h\]\.to\(" examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp

echo ""
echo "=== Llama SHA ==="
rg -n "key_states_per_head\[h\]\.to\(" examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp

Repository: UbiquitousLearning/mllm

Length of output: 318


🏁 Script executed:

#!/bin/bash
# Get context around the K quantization in each file
echo "=== Qwen3 SHA - K quantization context (lines 355-370) ==="
sed -n '355,370p' examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp | cat -n

echo ""
echo "=== Qwen2 SHA - K quantization context (lines 343-358) ==="
sed -n '343,358p' examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp | cat -n

echo ""
echo "=== Llama SHA - K quantization context (lines 343-358) ==="
sed -n '343,358p' examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp | cat -n

Repository: UbiquitousLearning/mllm

Length of output: 2573


K/V quantization paths differ from other SHA variants—consider clarifying the design choice.

Both K and V quantization paths in this file skip the intermediate kFloat32 conversion step that is present in Qwen2 and Llama SHA implementations:

Qwen3 (this file):

  • K: key_states_per_head[h].to(kUInt8PerTensorSym) → ptq::QDQ_KV(...)
  • V: ptq::QDQ(...) → .to(kUInt8PerTensorSym) → ptq::QDQ_KV(...)

Qwen2/Llama:

  • K: key_states_per_head[h].to(kFloat32) → .to(kUInt8PerTensorSym) → ptq::QDQ_KV(...)
  • V: ptq::QDQ(...) → .to(kFloat32) → .to(kUInt8PerTensorSym) → ptq::QDQ_KV(...)

If this divergence is intentional (e.g., due to different input data types), add a comment explaining why. Otherwise, align the quantization path with the other models to ensure consistency.

🤖 Prompt for AI Agents
In `@examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp` around lines 358 - 366,
The K/V quantization paths for Qwen3 differ from Qwen2/Llama—K uses
key_states_per_head[h].to(kUInt8PerTensorSym) then ptq::QDQ_KV, and V uses
ptq::QDQ(...) then .to(kUInt8PerTensorSym) then ptq::QDQ_KV—while other SHA
implementations convert via kFloat32 first; either make this explicit or align
the flows: update the K and V flows (the code around key_states_per_head[h],
ptq::QDQ, and ptq::QDQ_KV) to include the intermediate .to(kFloat32) conversion
like the other models, or add a concise comment above these snippets explaining
why Qwen3 intentionally omits kFloat32 (e.g., expected input dtype/performance
reason) so reviewers understand the divergence.


Expand Down
48 changes: 35 additions & 13 deletions mllm/backends/qnn/QNNAllocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,36 @@ namespace mllm::qnn {
#define RPCMEM_DEFAULT_FLAGS 1

QNNAllocator::QNNAllocator() {
void* libCdspHandle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
if (nullptr == libCdspHandle) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }
libCdspHandle_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
if (nullptr == libCdspHandle_) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }

rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle, "rpcmem_alloc");
rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle, "rpcmem_free");
rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle, "rpcmem_to_fd");
rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc");
rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free");
rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd");
}

QNNAllocator::QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context)
: qnnInterface_(qnnInterface), context_(context) {
MLLM_RT_ASSERT(context_ != nullptr);

void* libCdspHandle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
if (nullptr == libCdspHandle) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }
libCdspHandle_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
if (nullptr == libCdspHandle_) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }

rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle, "rpcmem_alloc");
rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle, "rpcmem_free");
rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle, "rpcmem_to_fd");
rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc");
rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free");
rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd");
}
Comment on lines 16 to +35
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

cat -n mllm/backends/qnn/QNNAllocator.cpp | head -80

Repository: UbiquitousLearning/mllm

Length of output: 3359


🏁 Script executed:

rg "MLLM_ERROR_EXIT" mllm/utils/ -A 2 | head -20

Repository: UbiquitousLearning/mllm

Length of output: 1230


🏁 Script executed:

rg "define MLLM_ERROR_EXIT" mllm/utils/Log.hpp -A 5

Repository: UbiquitousLearning/mllm

Length of output: 540


🏁 Script executed:

rg "dlerror" mllm/ -A 2

Repository: UbiquitousLearning/mllm

Length of output: 2315


🏁 Script executed:

rg "MLLM_ERROR_EXIT.*ExitCode" mllm/ -B 1 | head -30

Repository: UbiquitousLearning/mllm

Length of output: 1691


🏁 Script executed:

rg "MLLM_ERROR_EXIT\(1" mllm/ -B 1 -A 1

Repository: UbiquitousLearning/mllm

Length of output: 1988


🏁 Script executed:

cat -n mllm/backends/qnn/QNNAllocator.cpp | sed -n '49,77p'

Repository: UbiquitousLearning/mllm

Length of output: 1224


🏁 Script executed:

rg "rpcmem_alloc|rpcmem_free|rpcmem_to_fd" mllm/backends/qnn/QNNAllocator.hpp -B 1 -A 1

Repository: UbiquitousLearning/mllm

Length of output: 436


🏁 Script executed:

rg "RpcMemAllocFn_t|RpcMemFreeFn_t|RpcMemToFdFn_t" mllm/backends/qnn/QNNAllocator.hpp -B 2 -A 2

Repository: UbiquitousLearning/mllm

Length of output: 443


Add dlsym failure checks before using the rpcmem_* pointers.

The rpcmem_alloc, rpcmem_free, and rpcmem_to_fd function pointers are loaded without validation in both constructors (lines 20–22 and 32–34). If any symbol fails to resolve, dlsym returns nullptr, and subsequent calls crash. Apply the dlopen pattern to dlsym results in both constructors:

🔧 Suggested fix
   rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc");
+  if (!rpcmem_alloc) { MLLM_ERROR_EXIT(1, "dlsym rpcmem_alloc failed: {}", dlerror()); }
   rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free");
+  if (!rpcmem_free) { MLLM_ERROR_EXIT(1, "dlsym rpcmem_free failed: {}", dlerror()); }
   rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd");
+  if (!rpcmem_to_fd) { MLLM_ERROR_EXIT(1, "dlsym rpcmem_to_fd failed: {}", dlerror()); }
🤖 Prompt for AI Agents
In `@mllm/backends/qnn/QNNAllocator.cpp` around lines 16 - 35, The constructors of
QNNAllocator set rpcmem_alloc, rpcmem_free, and rpcmem_to_fd via dlsym without
checking for nullptr; update both QNNAllocator() and
QNNAllocator(QNN_INTERFACE_VER_TYPE, void*) to validate each dlsym result
(rpcmem_alloc, rpcmem_free, rpcmem_to_fd) after assignment, and if any is
nullptr call MLLM_ERROR_EXIT (or MLLM_RT_ASSERT if preferred) with a clear
message that includes dlerror() context; reference the libCdspHandle_ loading
logic and use the same error-handling pattern used for dlopen to fail fast when
symbols are missing.


QNNAllocator::~QNNAllocator() {
// Properly release all resources before unloading the library
// Since we hold libCdspHandle_, the library won't be unloaded until we dlclose it
shutdown();

// Now safe to unload the library
if (libCdspHandle_) {
dlclose(libCdspHandle_);
libCdspHandle_ = nullptr;
}
}

bool QNNAllocator::alloc(Storage* storage) {
Expand All @@ -46,12 +58,22 @@ bool QNNAllocator::alloc(Storage* storage) {
}

void QNNAllocator::free(Storage* storage) {
// Skip if shutdown was called or destructor is running
// During program exit, QNN library resources might be destroyed, so we can't safely call rpcmem_free
if (isShutdown_) { return; }

// Only free memory that was allocated by this allocator and not yet freed
if (!qnnMemPtrSet_.count(storage->ptr_)) {
return; // Not our memory or already freed, skip
}

if (ptrToFdAndMemHandleMap_.count(storage->ptr_)) {
MLLM_RT_ASSERT_EQ(QNN_SUCCESS,
qnnInterface_.memDeRegister(&(ptrToFdAndMemHandleMap_.find(storage->ptr_)->second.second), 1));
qnnInterface_.memDeRegister(&(ptrToFdAndMemHandleMap_.find(storage->ptr_)->second.second), 1);
ptrToFdAndMemHandleMap_.erase(storage->ptr_);
}

rpcmem_free(storage->ptr_);
qnnMemPtrSet_.erase(storage->ptr_);
}

void QNNAllocator::registerQnnTensorToSharedBuffer(void* ptr, Qnn_Tensor_t& qnn_tensor) {
Expand Down Expand Up @@ -99,4 +121,4 @@ void QNNAllocator::deRegisterQnnTensorFromSharedBuffer(void* ptr) {

std::shared_ptr<QNNAllocator> createQNNAllocator() { return std::make_shared<QNNAllocator>(); }

} // namespace mllm::qnn
} // namespace mllm::qnn
44 changes: 37 additions & 7 deletions mllm/backends/qnn/QNNAllocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,37 @@ class QNNAllocator final : public Allocator {
QNNAllocator(); // need to setQNNPointer afterward
QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context);

~QNNAllocator() {
~QNNAllocator();

// Explicitly release all QNN memory resources. Call this for proper cleanup when you
// want to release memory during normal operation (not program exit).
// This is SAFE to call and will properly free all QNN resources.
void shutdown() {
if (isShutdown_) return;
isShutdown_ = true;

// First, deregister all registered memory
for (auto iter = ptrToFdAndMemHandleMap_.begin(); iter != ptrToFdAndMemHandleMap_.end();) {
Qnn_ErrorHandle_t deregisterRet = qnnInterface_.memDeRegister(&iter->second.second, 1);
if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("~QNNAllocator: qnnInterface_.memDeRegister failed"); }
rpcmem_free(iter->first);
if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("QNNAllocator::shutdown: qnnInterface_.memDeRegister failed"); }
iter = ptrToFdAndMemHandleMap_.erase(iter);
}

// Then, free all allocated memory (registered or not)
MLLM_INFO("QNNAllocator::shutdown: freeing all allocated memory");
for (void* ptr : qnnMemPtrSet_) { rpcmem_free(ptr); }
qnnMemPtrSet_.clear();
}

// Legacy name for shutdown() - kept for compatibility
void releaseAllResources() { shutdown(); }

// Mark the allocator as shut down without actually freeing memory.
// Use this in destructors to prevent crashes during program exit when
// QNN library resources might already be destroyed.
// After this is called, all free() calls become no-ops.
void markShutdown() { isShutdown_ = true; }

void setQNNPointer(QNN_INTERFACE_VER_TYPE qnnInterface, void* context) {
this->qnnInterface_ = qnnInterface;
this->context_ = context;
Expand Down Expand Up @@ -83,15 +105,23 @@ class QNNAllocator final : public Allocator {
QNN_INTERFACE_VER_TYPE qnnInterface_;
Qnn_ContextHandle_t context_ = nullptr;

RpcMemAllocFn_t rpcmem_alloc;
RpcMemFreeFn_t rpcmem_free;
RpcMemToFdFn_t rpcmem_to_fd;
// Hold the library handle to control unload order
// libcdsprpc.so will only be unloaded when this allocator is destroyed
void* libCdspHandle_ = nullptr;

RpcMemAllocFn_t rpcmem_alloc = nullptr;
RpcMemFreeFn_t rpcmem_free = nullptr;
RpcMemToFdFn_t rpcmem_to_fd = nullptr;

// to check if the ptr is allocated by rpcmem_alloc
std::set<void*> qnnMemPtrSet_;
std::map<void*, std::pair<int, Qnn_MemHandle_t>> ptrToFdAndMemHandleMap_;

// Flag to indicate shutdown has been called or destructor is running
// When true, free() calls become no-ops to avoid crashes during program exit
bool isShutdown_ = false;
};

std::shared_ptr<QNNAllocator> createQNNAllocator();

} // namespace mllm::qnn
} // namespace mllm::qnn
Loading