diff --git a/examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp b/examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp
index a129cd3bd..3d28e79d6 100644
--- a/examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp
+++ b/examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp
@@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
     case kUInt8PerTensorSym: {
       auto scale = m->getTopParameterFile()->pull(scale_name);
       auto zp = m->getTopParameterFile()->pull(zp_name);
-      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
+      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);
 
       // Is 128! not 127!
       auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
diff --git a/examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp b/examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp
index a26ebef1e..d84a76da3 100644
--- a/examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp
+++ b/examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp
@@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
     case kUInt8PerTensorSym: {
       auto scale = m->getTopParameterFile()->pull(scale_name);
       auto zp = m->getTopParameterFile()->pull(zp_name);
-      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
+      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);
 
       // Is 128! not 127!
       auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
diff --git a/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp b/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp
index 26d57e676..9c389ced6 100644
--- a/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp
+++ b/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp
@@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
     case kUInt8PerTensorSym: {
       auto scale = m->getTopParameterFile()->pull(scale_name);
       auto zp = m->getTopParameterFile()->pull(zp_name);
-      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
+      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);
 
       // Is 128! not 127!
       auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
diff --git a/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp b/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp
index db69c6017..834564cfa 100644
--- a/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp
+++ b/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp
@@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
     case kUInt8PerTensorSym: {
       auto scale = m->getTopParameterFile()->pull(scale_name);
       auto zp = m->getTopParameterFile()->pull(zp_name);
-      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
+      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);
 
       // Is 128! not 127!
       auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
diff --git a/examples/qwen3_qnn_aot/compile_sha.cpp b/examples/qwen3_qnn_aot/compile_sha.cpp
index 8e6dd2323..f6d25894b 100644
--- a/examples/qwen3_qnn_aot/compile_sha.cpp
+++ b/examples/qwen3_qnn_aot/compile_sha.cpp
@@ -166,12 +166,8 @@ MLLM_MAIN({
         CL - N,
     }, mllm::kUInt8PerTensorSym);
     trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);
-    
     trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
-    trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
-
     trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
-    trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
       // clang-format on
     }
 
diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
index ba1cdb227..e41f14c75 100644
--- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
+++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
@@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
     case kUInt8PerTensorSym: {
       auto scale = m->getTopParameterFile()->pull(scale_name);
       auto zp = m->getTopParameterFile()->pull(zp_name);
-      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
+      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);
 
       // Is 128! not 127!
       auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp
index 66a313451..b6a9f403a 100644
--- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp
+++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp
@@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
     case kUInt8PerTensorSym: {
       auto scale = m->getTopParameterFile()->pull(scale_name);
       auto zp = m->getTopParameterFile()->pull(zp_name);
-      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
+      MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);
 
       // Is 128! not 127!
       auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
@@ -356,14 +356,12 @@ class Qwen3AttentionSHA final : public nn::Module {
       std::string h_str = std::to_string(h);
 
-      // K: De-quantize and re-quantize to int8
+      // K: Convert directly to uint8 (no float32 round-trip), then apply the KV QDQ
-      auto k_h = key_states_per_head[h].to(kFloat32);
-      k_h = k_h.to(kUInt8PerTensorSym);
+      auto k_h = key_states_per_head[h].to(kUInt8PerTensorSym);
       k_h = ptq::QDQ_KV(this, k_h, "k_cast_to_int8_qdq_h" + h_str);
       k_h = k_h.transpose(2, 3);  // [B, 1, D, S]
 
       // V: Quantize to int16 then int8
       auto v_h = ptq::QDQ(this, value_states_per_head[h], "v_cast_to_int16_qdq_h" + h_str);
-      v_h = v_h.to(kFloat32);
       v_h = v_h.to(kUInt8PerTensorSym);
       v_h = ptq::QDQ_KV(this, v_h, "v_cast_to_int8_qdq_h" + h_str);
 
diff --git a/mllm/backends/qnn/QNNAllocator.cpp b/mllm/backends/qnn/QNNAllocator.cpp
index bc4a73bfd..dc04b5d0b 100644
--- a/mllm/backends/qnn/QNNAllocator.cpp
+++ b/mllm/backends/qnn/QNNAllocator.cpp
@@ -14,24 +14,36 @@ namespace mllm::qnn {
 #define RPCMEM_DEFAULT_FLAGS 1
 
 QNNAllocator::QNNAllocator() {
-  void* libCdspHandle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
-  if (nullptr == libCdspHandle) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }
+  libCdspHandle_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
+  if (nullptr == libCdspHandle_) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }
 
-  rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle, "rpcmem_alloc");
-  rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle, "rpcmem_free");
-  rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle, "rpcmem_to_fd");
+  rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc");
+  rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free");
+  rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd");
 }
 
 QNNAllocator::QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context)
     : qnnInterface_(qnnInterface), context_(context) {
   MLLM_RT_ASSERT(context_ != nullptr);
 
-  void* libCdspHandle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
-  if (nullptr == libCdspHandle) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }
+  libCdspHandle_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
+  if (nullptr == libCdspHandle_) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }
 
-  rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle, "rpcmem_alloc");
-  rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle, "rpcmem_free");
-  rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle, "rpcmem_to_fd");
+  rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc");
+  rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free");
+  rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd");
+}
+
+QNNAllocator::~QNNAllocator() {
+  // Release every buffer first: shutdown() calls rpcmem_free, which was resolved from
+  // libCdspHandle_, so the library has to stay loaded until shutdown() has returned.
+  shutdown();  // returns immediately if isShutdown_ is already set
+
+  // Nothing below uses the rpcmem_* pointers any more, so drop our dlopen reference.
+  if (libCdspHandle_) {
+    dlclose(libCdspHandle_);
+    libCdspHandle_ = nullptr;  // do not keep a stale handle around
+  }
+}
 
 bool QNNAllocator::alloc(Storage* storage) {
@@ -46,12 +58,22 @@ bool QNNAllocator::alloc(Storage* storage) {
 }
 
 void QNNAllocator::free(Storage* storage) {
+  // After shutdown() everything is already released; rpcmem_free may no longer be safe to call.
+  if (isShutdown_) { return; }
+
+  // Ignore pointers this allocator does not track (not ours, or already freed).
+  if (!qnnMemPtrSet_.count(storage->ptr_)) { return; }
+
   if (ptrToFdAndMemHandleMap_.count(storage->ptr_)) {
-    MLLM_RT_ASSERT_EQ(QNN_SUCCESS,
-                      qnnInterface_.memDeRegister(&(ptrToFdAndMemHandleMap_.find(storage->ptr_)->second.second), 1));
+    // Best effort: report a failed deregistration but still release the buffer below.
+    auto registered = ptrToFdAndMemHandleMap_.find(storage->ptr_);
+    const auto deregisterRet = qnnInterface_.memDeRegister(&registered->second.second, 1);
+    if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("QNNAllocator::free: qnnInterface_.memDeRegister failed"); }
+    ptrToFdAndMemHandleMap_.erase(registered);
   }
 
   rpcmem_free(storage->ptr_);
+  qnnMemPtrSet_.erase(storage->ptr_);
 }
 
 void QNNAllocator::registerQnnTensorToSharedBuffer(void* ptr, Qnn_Tensor_t& qnn_tensor) {
@@ -99,4 +121,4 @@ void QNNAllocator::deRegisterQnnTensorFromSharedBuffer(void* ptr) {
 
 std::shared_ptr<QNNAllocator> createQNNAllocator() { return std::make_shared<QNNAllocator>(); }
 
-}  // namespace mllm::qnn
\ No newline at end of file
+}  // namespace mllm::qnn
diff --git a/mllm/backends/qnn/QNNAllocator.hpp b/mllm/backends/qnn/QNNAllocator.hpp
index eac40a534..38c69716b 100644
--- a/mllm/backends/qnn/QNNAllocator.hpp
+++ b/mllm/backends/qnn/QNNAllocator.hpp
@@ -30,15 +30,37 @@ class QNNAllocator final : public Allocator {
   QNNAllocator();  // need to setQNNPointer afterward
   QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context);
 
-  ~QNNAllocator() {
+  ~QNNAllocator();
+
+  // Deregisters every registered buffer from QNN and frees every pointer tracked in
+  // qnnMemPtrSet_. Idempotent: a second call returns immediately. Once it has run,
+  // free() is a no-op, so call it while the QNN interface and rpcmem_* are still usable.
+  void shutdown() {
+    if (isShutdown_) return;
+    isShutdown_ = true;  // set first so later free() calls skip the buffers released below
+
+    // First, deregister all registered memory; failures are logged and do not stop cleanup
     for (auto iter = ptrToFdAndMemHandleMap_.begin(); iter != ptrToFdAndMemHandleMap_.end();) {
       Qnn_ErrorHandle_t deregisterRet = qnnInterface_.memDeRegister(&iter->second.second, 1);
-      if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("~QNNAllocator: qnnInterface_.memDeRegister failed"); }
-      rpcmem_free(iter->first);
+      if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("QNNAllocator::shutdown: qnnInterface_.memDeRegister failed"); }
       iter = ptrToFdAndMemHandleMap_.erase(iter);
     }
+
+    // Then free every tracked allocation, whether or not it was ever registered
+    MLLM_INFO("QNNAllocator::shutdown: freeing all allocated memory");
+    for (void* ptr : qnnMemPtrSet_) { rpcmem_free(ptr); }
+    qnnMemPtrSet_.clear();
   }
 
+  // Compatibility alias: forwards directly to shutdown()
+  void releaseAllResources() { shutdown(); }
+
+  // Flag the allocator as shut down WITHOUT deregistering or freeing anything.
+  // Meant for teardown paths where QNN library resources might already be destroyed.
+  // Afterwards free() returns early and shutdown() is a no-op, so buffers that are
+  // still tracked at that point are never released by this allocator.
+  void markShutdown() { isShutdown_ = true; }
+
   void setQNNPointer(QNN_INTERFACE_VER_TYPE qnnInterface, void* context) {
     this->qnnInterface_ = qnnInterface;
     this->context_ = context;
@@ -83,15 +105,23 @@ class QNNAllocator final : public Allocator {
   QNN_INTERFACE_VER_TYPE qnnInterface_;
   Qnn_ContextHandle_t context_ = nullptr;
 
-  RpcMemAllocFn_t rpcmem_alloc;
-  RpcMemFreeFn_t rpcmem_free;
-  RpcMemToFdFn_t rpcmem_to_fd;
+  // Hold the library handle to control unload order
+  // libcdsprpc.so will only be unloaded when this allocator is destroyed
+  void* libCdspHandle_ = nullptr;
+
+  RpcMemAllocFn_t rpcmem_alloc = nullptr;
+  RpcMemFreeFn_t rpcmem_free = nullptr;
+  RpcMemToFdFn_t rpcmem_to_fd = nullptr;
 
   // to check if the ptr is allocted by rpcmem_alloc
   std::set<void*> qnnMemPtrSet_;
   std::map<void*, std::pair<int, Qnn_MemHandle_t>> ptrToFdAndMemHandleMap_;
+
+  // Flag to indicate shutdown has been called or destructor is running
+  // When true, free() calls become no-ops to avoid crashes during program exit
+  bool isShutdown_ = false;
 };
 
 std::shared_ptr<QNNAllocator> createQNNAllocator();
 
-}  // namespace mllm::qnn
\ No newline at end of file
+}  // namespace mllm::qnn
diff --git a/mllm/backends/qnn/QNNBackend.cpp b/mllm/backends/qnn/QNNBackend.cpp
index 1a891cb6b..3900afc35 100644
--- a/mllm/backends/qnn/QNNBackend.cpp
+++ b/mllm/backends/qnn/QNNBackend.cpp
@@ -1,12 +1,15 @@
-#include "QNNBackend.hpp"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstring>
+#include <dlfcn.h>
 #include <fstream>
 #include <memory>
-#include "QNNUtils.hpp"
+
 #include "QnnLog.h"
+
+#include "mllm/backends/qnn/QNNBackend.hpp"
+#include "mllm/backends/qnn/QNNUtils.hpp"
 #include "mllm/backends/qnn/QNNAllocator.hpp"
 #include "mllm/backends/qnn/op/QNNCastTypeOp.hpp"
 #include "mllm/backends/qnn/op/QNNElewiseOp.hpp"
@@ -33,17 +36,16 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) {
   profilingLevel_ = ProfilingLevel::OFF;
   debug_ = false;  // when set true, NATIVE tensor will be regared as APP_READ tensor
 
-  if (!loadQNNSymbol()) {
-    MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN symbols");
-  } else {
-    MLLM_INFO("QNN symbols loaded successfully");
-  }
+  // Load QNN libraries and hold handles for lifecycle management
+  auto [qnnSuccess, qnnHandle] = loadQNNSymbol();
+  if (!qnnSuccess) { MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN symbols"); }
+  qnnHtpLibHandle_ = qnnHandle;
+  MLLM_INFO("QNN symbols loaded successfully");
 
-  if (!loadQNNSystemSymbol()) {
-    MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN System symbols");
-  } else {
-    MLLM_INFO("QNN System symbols loaded successfully");
-  }
+  auto [sysSuccess, sysHandle] = loadQNNSystemSymbol();
+  if (!sysSuccess) { MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN System symbols"); }
+  qnnSystemLibHandle_ = sysHandle;
+  MLLM_INFO("QNN System symbols loaded successfully");
 
   runtime_ = QNNRuntime::create(profilingLevel_, qnnLogLevel);
   if (!runtime_) {
@@ -75,24 +77,67 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) {
   MLLM_INFO("QNN Perf created successfully");
 }
 
+QNNBackend::~QNNBackend() {
+  // Cleanup order is critical - we hold all QNN library handles to control unload order:
+  // 1. Allocator shutdown (memDeRegister + rpcmem_free) - needs QNN alive
+  // 2. Clear models - tensor destructors try to free but allocator is shut down
+  // 3. Perf cleanup - needs QNN HTP infrastructure alive
+  // 4. Context free + runtime cleanup - releases the context, then the runtime
+  // 5. Allocator reset - dlcloses libcdsprpc.so (held by allocator)
+  // 6. Close QNN libraries - libQnnSystem.so first, then libQnnHtp.so
+
+  // 1. Properly shutdown allocator while QNN is still alive
+  //    This calls memDeRegister and rpcmem_free safely
+  if (allocator_) {
+    auto* qnnAllocator = dynamic_cast<QNNAllocator*>(allocator_.get());
+    if (qnnAllocator) { qnnAllocator->shutdown(); }
+  }
+
+  // 2. Clear models - tensor destructors will call free() but they're now no-ops
+  qnnModels_.clear();
+  qnnModelIndexMap_.clear();
+
+  // 3. Cleanup perf while QNN HTP infrastructure is still alive
+  if (perf_) { perf_->shutdown(); }
+  perf_.reset();
+
+  // 4. Free the context only if one was actually created/loaded, then drop the runtime
+  if (runtime_ && context_) { runtime_->qnnInterface.contextFree(context_, nullptr); }
+  context_ = nullptr;
+  runtime_.reset();
+
+  // 5. Reset allocator - will dlclose libcdsprpc.so since shutdown() was already called
+  allocator_.reset();
+
+  // 6. Close QNN libraries in reverse order of dependency
+  if (qnnSystemLibHandle_) {
+    dlclose(qnnSystemLibHandle_);
+    qnnSystemLibHandle_ = nullptr;
+  }
+  if (qnnHtpLibHandle_) {
+    dlclose(qnnHtpLibHandle_);
+    qnnHtpLibHandle_ = nullptr;
+  }
+}
+
 QNNPerf::QNNPerf(const QNN_INTERFACE_VER_TYPE* qnnInterface) {
   assert(qnnInterface != nullptr);
-  mQnnInterface = qnnInterface;
+  qnnInterface_ = qnnInterface;
 
   QnnDevice_Infrastructure_t deviceInfra = nullptr;
-  CALL_QNN(mQnnInterface->deviceGetInfrastructure(&deviceInfra));
+  CALL_QNN(qnnInterface_->deviceGetInfrastructure(&deviceInfra));
   QnnHtpDevice_Infrastructure_t* htpInfra = static_cast<QnnHtpDevice_Infrastructure_t*>(deviceInfra);
-  mPerfInfra = htpInfra->perfInfra;
+  perfInfra_ = htpInfra->perfInfra;
 
   uint32_t deviceId = 0;
   uint32_t coreId = 0;
-  CALL_QNN(mPerfInfra.createPowerConfigId(deviceId, coreId, &mPowerConfigId));
+  CALL_QNN(perfInfra_.createPowerConfigId(deviceId, coreId, &powerConfigId_));
 
-  mPowerConfigBurst = {
+  powerConfigBurst_ = {
       .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3,
       .dcvsV3Config =
           {
-              .contextId = mPowerConfigId,  // use the power config id created
+              .contextId = powerConfigId_,  // use the power config id created
               .setDcvsEnable = 1,
               .dcvsEnable = 0,  // 1- To enable Dcvs and consider dcvs power mode, 0- To disable dcvs
               .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE,
@@ -111,11 +156,11 @@ QNNPerf::QNNPerf(const QNN_INTERFACE_VER_TYPE* qnnInterface) {
           },
   };
 
-  mPowerConfigBalanced = {
+  powerConfigBalanced_ = {
       .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3,
       .dcvsV3Config =
           {
-              .contextId = mPowerConfigId,  // use the power config id created
+              .contextId = powerConfigId_,  // use the power config id created
               .setDcvsEnable = 1,
               .dcvsEnable = 1,  // 1- To enable Dcvs and consider dcvs power mode, 0- To disable dcvs
               .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN,
@@ -135,8 +180,17 @@ QNNPerf::QNNPerf(const QNN_INTERFACE_VER_TYPE* qnnInterface) {
   };
 }
 
-// destory power config
-QNNPerf::~QNNPerf() { CALL_QNN(mPerfInfra.destroyPowerConfigId(mPowerConfigId)); }
+void QNNPerf::shutdown() {
+  if (isShutdown_) return;  // idempotent: the power config id is destroyed at most once
+  isShutdown_ = true;
+  CALL_QNN(perfInfra_.destroyPowerConfigId(powerConfigId_));  // id obtained via createPowerConfigId in the ctor
+}
+
+QNNPerf::~QNNPerf() {
+  // Destroy the power config here only if nobody called shutdown() explicitly.
+  // shutdown() re-checks isShutdown_ itself, so this guard is purely defensive.
+  if (!isShutdown_) { shutdown(); }
+}
 
 void QNNPerf::setRpcLatencyAndPolling() {
   // set RPC Control Latency
@@ -146,7 +200,7 @@ void QNNPerf::setRpcLatencyAndPolling() {
   rpcControlLatency.rpcControlLatencyConfig = 100;  // use rpc control latency recommended 100 us, refer hexagon sdk
   const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs1[] = {&rpcControlLatency, nullptr};
 
-  CALL_QNN(mPerfInfra.setPowerConfig(mPowerConfigId, powerConfigs1));  // set RPC latency config on power config ID created
+  CALL_QNN(perfInfra_.setPowerConfig(powerConfigId_, powerConfigs1));  // set RPC latency config on power config ID created
 
   // set RPC Polling
   QnnHtpPerfInfrastructure_PowerConfig_t rpcPollingTime;  // refer QnnHtpPerfInfrastructure.h
@@ -155,17 +209,17 @@ void QNNPerf::setRpcLatencyAndPolling() {
   rpcPollingTime.rpcPollingTimeConfig = 9999;  // use rpc polling time recommended 0-10000 us
   const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs2[] = {&rpcPollingTime, nullptr};
 
-  CALL_QNN(mPerfInfra.setPowerConfig(mPowerConfigId, powerConfigs2));  // set RPC polling config on power config ID created
+  CALL_QNN(perfInfra_.setPowerConfig(powerConfigId_, powerConfigs2));  // set RPC polling config on power config ID created
 }
 
 void QNNPerf::setPowerConfigBurst() {
-  const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs[] = {&mPowerConfigBurst, nullptr};
-  CALL_QNN(mPerfInfra.setPowerConfig(mPowerConfigId, powerConfigs));
+  const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs[] = {&powerConfigBurst_, nullptr};
+  CALL_QNN(perfInfra_.setPowerConfig(powerConfigId_, powerConfigs));  // apply the burst preset filled in by the ctor
 }
 
 void QNNPerf::setPowerConfigBalanced() {
-  const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs[] = {&mPowerConfigBalanced, nullptr};
-  CALL_QNN(mPerfInfra.setPowerConfig(mPowerConfigId, powerConfigs));
+  const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs[] = {&powerConfigBalanced_, nullptr};
+  CALL_QNN(perfInfra_.setPowerConfig(powerConfigId_, powerConfigs));  // apply the balanced preset filled in by the ctor
 }
 
 QNNRuntime::~QNNRuntime() {
diff --git a/mllm/backends/qnn/QNNBackend.hpp b/mllm/backends/qnn/QNNBackend.hpp
index 78953f32d..f439dced9 100644
--- a/mllm/backends/qnn/QNNBackend.hpp
+++ b/mllm/backends/qnn/QNNBackend.hpp
@@ -26,16 +26,21 @@ class QNNPerf {
   }
   explicit QNNPerf(const QNN_INTERFACE_VER_TYPE* qnnInterface);
   ~QNNPerf();
+
+  // Explicitly destroy power config. Call this while QNN HTP infrastructure is still alive.
+  void shutdown();
+
   void setRpcLatencyAndPolling();
   void setPowerConfigBurst();
   void setPowerConfigBalanced();
 
  private:
-  const QNN_INTERFACE_VER_TYPE* mQnnInterface = nullptr;
-  QnnHtpDevice_PerfInfrastructure_t mPerfInfra{};
-  uint32_t mPowerConfigId;
-  QnnHtpPerfInfrastructure_PowerConfig_t mPowerConfigBurst{};
-  QnnHtpPerfInfrastructure_PowerConfig_t mPowerConfigBalanced{};
+  const QNN_INTERFACE_VER_TYPE* qnnInterface_ = nullptr;
+  QnnHtpDevice_PerfInfrastructure_t perfInfra_{};
+  uint32_t powerConfigId_ = 0;
+  QnnHtpPerfInfrastructure_PowerConfig_t powerConfigBurst_{};
+  QnnHtpPerfInfrastructure_PowerConfig_t powerConfigBalanced_{};
+  bool isShutdown_ = false;
 };
 
 class QNNRuntime {
@@ -86,6 +91,7 @@ class QNNRuntime {
 class QNNBackend final : public Backend {
  public:
   QNNBackend();
+  ~QNNBackend();
 
   bool loadContext(const std::string& contextPath);
   bool createContext();
@@ -128,6 +134,11 @@ class QNNBackend final : public Backend {
   std::unique_ptr<QNNRuntime> runtime_;
   std::unique_ptr<QNNPerf> perf_;
 
+  // Hold QNN library handles to control unload order
+  // These libraries will only be unloaded when QNNBackend is destroyed
+  void* qnnHtpLibHandle_ = nullptr;
+  void* qnnSystemLibHandle_ = nullptr;
+
   // Graph management
   std::map<std::string, int> qnnModelIndexMap_;
   std::vector<std::shared_ptr<QNNModel>> qnnModels_;
diff --git a/mllm/backends/qnn/QNNUtils.cpp b/mllm/backends/qnn/QNNUtils.cpp
index 96baeffd1..73d240bbb 100644
--- a/mllm/backends/qnn/QNNUtils.cpp
+++ b/mllm/backends/qnn/QNNUtils.cpp
@@ -21,14 +21,13 @@ namespace mllm::qnn {
 
 QnnInterfaceGetProvidersFn_t QnnInterface_getProviders = nullptr;
 
-bool loadQNNSymbol() {
+std::pair<bool, void*> loadQNNSymbol() {
   MLLM_INFO("QNN Backend Lib: libQnnHtp.so");
-  void* qnnLibHandle = nullptr;
-  qnnLibHandle = dlopen("libQnnHtp.so", RTLD_NOW | RTLD_LOCAL);
+  void* qnnLibHandle = dlopen("libQnnHtp.so", RTLD_NOW | RTLD_LOCAL);
   const char* errorOpen = dlerror();
   if (!qnnLibHandle) {
     MLLM_ERROR("Failed to open QNN libs.");
-    return false;
+    return {false, nullptr};
   }
 
   QnnInterface_getProviders = (QnnInterfaceGetProvidersFn_t)dlsym(qnnLibHandle, "QnnInterface_getProviders");
@@ -36,20 +35,20 @@ bool loadQNNSymbol() {
   if (!QnnInterface_getProviders) {
     MLLM_ERROR("Failed to load symbol <QnnInterface_getProviders>. dlerror returns {}.", errorSym);
     dlclose(qnnLibHandle);
-    return false;
+    return {false, nullptr};
   }
 
-  return true;
+  return {true, qnnLibHandle};
 }
 
 QnnSystemInterfaceGetProvidersFn_t QnnSystemInterface_getProviders = nullptr;
 
-bool loadQNNSystemSymbol() {
+std::pair<bool, void*> loadQNNSystemSymbol() {
   void* systemLibraryHandle = dlopen("libQnnSystem.so", RTLD_NOW | RTLD_LOCAL);
   const char* errorOpen = dlerror();
   if (!systemLibraryHandle) {
     MLLM_ERROR("Failed to open QNN System libs.");
-    return false;
+    return {false, nullptr};
   }
 
   QnnSystemInterface_getProviders =
@@ -58,10 +57,10 @@ bool loadQNNSystemSymbol() {
   if (!QnnSystemInterface_getProviders) {
     MLLM_ERROR("Failed to load symbol <QnnSystemInterface_getProviders>. dlerror returns {}.", errorSym);
     dlclose(systemLibraryHandle);
-    return false;
+    return {false, nullptr};
   }
 
-  return true;
+  return {true, systemLibraryHandle};
 }
 
 // --------------- End of QNN symbols loading ---------------
diff --git a/mllm/backends/qnn/QNNUtils.hpp b/mllm/backends/qnn/QNNUtils.hpp
index e74f27f4a..047a79355 100644
--- a/mllm/backends/qnn/QNNUtils.hpp
+++ b/mllm/backends/qnn/QNNUtils.hpp
@@ -43,8 +43,10 @@ using QnnSystemInterfaceGetProvidersFn_t = Qnn_ErrorHandle_t (*)(const QnnSystem
 extern QnnInterfaceGetProvidersFn_t QnnInterface_getProviders;
 extern QnnSystemInterfaceGetProvidersFn_t QnnSystemInterface_getProviders;
 
-bool loadQNNSymbol();
-bool loadQNNSystemSymbol();
+// Load QNN symbols and return library handle for lifecycle management
+// Returns {success, libHandle} - caller owns the handle and must dlclose it
+std::pair<bool, void*> loadQNNSymbol();
+std::pair<bool, void*> loadQNNSystemSymbol();
 
 // --------------- End of QNN symbols loading ---------------
 
diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
index 0f67bab56..2a2e6010f 100644
--- a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
+++ b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
@@ -138,7 +138,10 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
       if (!cfg->scale) {
         MLLM_ERROR_EXIT(ExitCode::kCoreError, "SymPerTensor quant recipe has no scale. tensor: {}", v->name());
       }
-      ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = 0};
+
+      MLLM_RT_ASSERT_EQ(cfg->quant_to_type, kUInt8);
+
+      ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = -128};
       MLLM_INFO("Configuring SymPerTensor quantization for tensor: {}, scale: {}", v->name(), cfg->scale.item<float>());
       break;
     }
diff --git a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp
index 7e2a63220..8320776b6 100644
--- a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp
+++ b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp
@@ -571,6 +571,14 @@ bool LLMQuantRecipeSlicePattern::isMatch(const mllm::ir::op_ptr_t& op) {
 }
 
 bool LLMQuantRecipeSlicePattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) {
+  auto slice_ir = node->cast_<ir::linalg::SliceOp>();  // NOTE(review): never used below - confirm it can be dropped
+  auto i_0 = *(node->inputs().begin());  // first input of the op; assumes inputs() is non-empty - TODO confirm
+
+  if (!i_0->getAttr("quant_recipe")) {  // input has no recipe yet: generate one so there is a spec to share
+    auto i_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), i_0->cast_<ir::tensor::TensorValue>());
+    i_0->setAttr("quant_recipe", i_0_spec);
+  }
+
   return shareQuantSpecSingleInputToSingleOutputAndSetOpQuantAnnoAttr(writer.getContext(),
                                                                       node->cast_<ir::linalg::LinalgIROp>());
 }
diff --git a/mllm/backends/qnn/aot/visitor/CastType.cpp b/mllm/backends/qnn/aot/visitor/CastType.cpp
index 95dc3997a..82e4194db 100644
--- a/mllm/backends/qnn/aot/visitor/CastType.cpp
+++ b/mllm/backends/qnn/aot/visitor/CastType.cpp
@@ -61,7 +61,10 @@ bool QnnAOTCastTypePattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op
     qnn_op_type = "Quantize";
   } else if (isInt(input_dtype) && isFloat(target_dtype)) {
     qnn_op_type = "Dequantize";
+  } else if (isInt(input_dtype) && isInt(target_dtype)) {
+    qnn_op_type = "Convert";
   } else if (input_dtype == kFloat32 && target_dtype == kFloat16) {
+    // TODO: The usage of Cast here is incorrect; fix it.
     qnn_op_type = "Cast";
   } else {
     MLLM_ERROR("Unsupported CastType for QNN: {} -> {}", (int)input_dtype, (int)target_dtype);
diff --git a/mllm/backends/qnn/aot_rt/KVCacheManager.cpp b/mllm/backends/qnn/aot_rt/KVCacheManager.cpp
index 7df115464..73e2bbc87 100644
--- a/mllm/backends/qnn/aot_rt/KVCacheManager.cpp
+++ b/mllm/backends/qnn/aot_rt/KVCacheManager.cpp
@@ -187,6 +187,7 @@ void KVCacheManager<T>::rearrangeCache(int32_t ar_len_dst) {
 
 template<typename T>
 void KVCacheManager<T>::rearrangeKey(KVCache<T>& k_cache, int32_t ar_len_dst) {
+  // [B, H, D, S] rearrange.
   const int32_t src_cache_num = (cur_ar_len_ == config_.context_len) ? config_.context_len : config_.context_len - cur_ar_len_;
   const int32_t dst_cache_num = config_.context_len - ar_len_dst;
   T* k_cache_in_read_ptr = k_cache.buffer;
@@ -213,6 +214,7 @@ void KVCacheManager<T>::rearrangeKey(KVCache<T>& k_cache, int32_t ar_len_dst) {
 
 template<typename T>
 void KVCacheManager<T>::rearrangeValue(KVCache<T>& v_cache, int32_t ar_len_dst) {
+  // [B, H, S, D] rearrange.
   const int32_t src_cache_num = (cur_ar_len_ == config_.context_len) ? config_.context_len : config_.context_len - cur_ar_len_;
   const int32_t dst_cache_num = config_.context_len - ar_len_dst;
   T* v_cache_in_read_ptr = v_cache.buffer;
diff --git a/mllm/backends/qnn/aot_rt/KVCacheManager.hpp b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp
index 8eddb1a95..e1090278c 100644
--- a/mllm/backends/qnn/aot_rt/KVCacheManager.hpp
+++ b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp
@@ -24,7 +24,20 @@ template<typename T>
 class KVCacheManager {
  public:
   explicit KVCacheManager(QnnAOTConfig config);
-  ~KVCacheManager() = default;
+  ~KVCacheManager() {
+    // Explicitly drop this manager's references to all K/V cache storage.
+    // Intent (per the original note): release them before the QNN backend is destroyed.
+    for (auto& cache : k_cache_) {
+      cache.buffer_storage.reset();
+      cache.output_buffer_storage.reset();
+    }
+    for (auto& cache : v_cache_) {
+      cache.buffer_storage.reset();
+      cache.output_buffer_storage.reset();
+    }
+    k_cache_.clear();  // finally discard the (now storage-less) cache entries themselves
+    v_cache_.clear();
+  }
 
   void initCache(mllm::Allocator* allocator, int32_t ar_len);
   void rearrangeCache(int32_t ar_len_dst);
diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.hpp b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp
index cfe08620b..b77553640 100644
--- a/mllm/backends/qnn/aot_rt/PromptProcessor.hpp
+++ b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp
@@ -17,6 +17,14 @@ class PromptProcessor {
  public:
   PromptProcessor(KVCacheManager<T>* kv_manager, QnnAOTConfig config);
 
+  ~PromptProcessor() {
+    // Detach the module's output tensors first (setOutputTensors({})), then release the
+    // member tensor vectors, to avoid double-free or use-after-free issues on teardown
+    if (module_) { module_->setOutputTensors({}); }
+    output_tensors_.clear();
+    input_tensors_.clear();
+  }
+
   /**
    * Prefill an LLM Module with the given text input.
    * @param prompt_tokens The text prompt tokens to the LLM Module.
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp
index 9b8acc338..f489344e6 100644
--- a/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp
+++ b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp
@@ -12,6 +12,10 @@ namespace mllm::qnn::aot {
 class QnnAOTModule : public mllm::nn::Module, public models::ARGeneration {
  public:
   explicit QnnAOTModule(const std::string& graph_name);
+  ~QnnAOTModule() {
+    // Clear output tensors to ensure proper cleanup order
+    output_tensors_.clear();
+  }
 
   std::vector<mllm::Tensor> forward(const std::vector<mllm::Tensor>& inputs, const std::vector<mllm::AnyValue>& args) override;
 
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp
index 51ce86c70..2754e062e 100644
--- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp
+++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp
@@ -19,7 +19,14 @@ using RunnerConfig = QnnAOTConfig;
 class Runner {
  public:
   explicit Runner(const RunnerConfig& config, mllm::preprocessor::AutoTokenizer* tokenizer);
-  ~Runner() = default;
+  ~Runner() {
+    // Explicit destruction order to avoid use-after-free issues
+    // Destroy the token generator and prompt processor first (they reference kv_manager_)
+    token_generator_.reset();
+    prompt_processor_.reset();
+    // Then destroy kv_manager_
+    kv_manager_.reset();
+  }
 
   bool load();
   void generate(const Tensor& prompt_tokens, int32_t seq_len, const std::function<void(const std::string&)>& token_callback,
diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp
index 4e0884358..8c379c52a 100644
--- a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp
+++ b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp
@@ -1,6 +1,7 @@
 #include "mllm/backends/qnn/aot_rt/TokenGenerator.hpp"
 #include "mllm/preprocessor/tokenizers/Unicode.hpp"
 #include <cstring>
+#include <numeric>
 #include <utility>
 
 namespace mllm::qnn::aot {
@@ -16,7 +17,7 @@ TokenGenerator<T>::TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer,
 
 template<typename T>
 void TokenGenerator<T>::init_io() {
-  input_tensors_.reserve(4 + 2 * config_.num_layers);
+  input_tensors_.reserve(3 + 2 * config_.num_layers);
 
   // 1. Input IDs
   auto input_ids = Tensor::empty({1, 1}, kInt32, kQNN).alloc();
@@ -38,7 +39,8 @@ void TokenGenerator<T>::init_io() {
   const auto& v_caches = kv_manager_->getVCache();
   // K
   for (int l = 0; l < config_.num_layers; ++l) {
-    auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len}, config_.kv_dtype, kQNN);
+    auto k_tensor =
+        Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len - 1}, config_.kv_dtype, kQNN);
     k_tensor.impl()->storage()->ptr_ = k_caches[l].buffer;
     k_tensor.impl()->storage()->mem_type_ = kManual;
     k_tensor.setName("past_key_" + std::to_string(l));
@@ -107,6 +109,11 @@ int64_t TokenGenerator<T>::generate(std::vector<int64_t>& tokens, int64_t start_
   // Ensure KV cache is arranged for decode (1 token)
   kv_manager_->rearrangeCache(1);
 
+  // Initialize attention mask for decode phase
+  std::vector<int32_t> attention_map(1);
+  std::iota(attention_map.begin(), attention_map.end(), -1);
+  kv_manager_->initAttentionMask(input_tensors_[2].ptr<uint16_t>(), attention_map, 1, current_pos);
+
   module_->setOutputTensors(output_tensors_);
 
   for (int i = 0; i < seq_len; ++i) {
@@ -122,6 +129,9 @@ int64_t TokenGenerator<T>::generate(std::vector<int64_t>& tokens, int64_t start_
     int32_t n_update = 1;
     kv_manager_->updateCache(1, current_pos, n_update, {});
 
+    // Update attention mask
+    kv_manager_->updateAttentionMask(input_tensors_[2].ptr<uint16_t>(), 1, current_pos, n_update);
+
     // Get logits
     auto logits = output_tensors_[0].to(kCPU).squeeze(0);
 
diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.hpp b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp
index b40b9725a..4c8968559 100644
--- a/mllm/backends/qnn/aot_rt/TokenGenerator.hpp
+++ b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp
@@ -19,7 +19,13 @@ class TokenGenerator {
   TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer, KVCacheManager<T>* kv_manager,
                  std::unique_ptr<std::unordered_set<uint64_t>>&& eos_ids, QnnAOTConfig config);
 
-  virtual ~TokenGenerator() = default;
+  virtual ~TokenGenerator() {
+    // Clear module's output tensors before member tensors are destroyed
+    // to avoid double-free or use-after-free issues
+    if (module_) { module_->setOutputTensors({}); }
+    output_tensors_.clear();
+    input_tensors_.clear();
+  }
 
   void init_io();
 
diff --git a/mllm/compile/ir/Node.cpp b/mllm/compile/ir/Node.cpp
index 303f99e51..481f60971 100644
--- a/mllm/compile/ir/Node.cpp
+++ b/mllm/compile/ir/Node.cpp
@@ -258,16 +258,16 @@ void IRContext::setDevice(DeviceTypes device_type) { device_type_ = device_type;
 // FIXME: deprecated, context has no device
 DeviceTypes IRContext::getDevice() { return device_type_; }
 
-bool IRContext::isCacheInputOutputTensor(uint32_t uuid) {
+bool IRContext::isCacheInputOutputTensor(size_t uuid) {
   if (cached_inputs_outputs_.count(uuid)) { return true; }
   return false;
 }
 
-void IRContext::cacheInputOutputTensor(uint32_t uuid, const val_ptr_t& tensor_ir) { cached_inputs_outputs_[uuid] = tensor_ir; }
+void IRContext::cacheInputOutputTensor(size_t uuid, const val_ptr_t& tensor_ir) { cached_inputs_outputs_[uuid] = tensor_ir; }
 
-val_ptr_t IRContext::getCacheInputOutputTensor(uint32_t uuid) { return cached_inputs_outputs_[uuid]; }
+val_ptr_t IRContext::getCacheInputOutputTensor(size_t uuid) { return cached_inputs_outputs_[uuid]; }
 
-std::unordered_map<uint32_t, val_ptr_t>& IRContext::getAllCachedInputOutputTensorIRs() { return cached_inputs_outputs_; }
+std::unordered_map<size_t, val_ptr_t>& IRContext::getAllCachedInputOutputTensorIRs() { return cached_inputs_outputs_; }
 
 void IRContext::pushRegion2InsertRegionStackAndSetRegion(const region_ptr_t& region) {
   insert_region_stack_.push(region);
diff --git a/mllm/compile/ir/Node.hpp b/mllm/compile/ir/Node.hpp
index a3eceec58..2506d5e93 100644
--- a/mllm/compile/ir/Node.hpp
+++ b/mllm/compile/ir/Node.hpp
@@ -314,13 +314,13 @@ class IRContext : public std::enable_shared_from_this<IRContext> {
   // FIXME: deprecated, context has no device
   DeviceTypes getDevice();
 
-  bool isCacheInputOutputTensor(uint32_t uuid);
+  bool isCacheInputOutputTensor(size_t uuid);
 
-  void cacheInputOutputTensor(uint32_t uuid, const val_ptr_t& tensor_ir);
+  void cacheInputOutputTensor(size_t uuid, const val_ptr_t& tensor_ir);
 
-  val_ptr_t getCacheInputOutputTensor(uint32_t uuid);
+  val_ptr_t getCacheInputOutputTensor(size_t uuid);
 
-  std::unordered_map<uint32_t, val_ptr_t>& getAllCachedInputOutputTensorIRs();
+  std::unordered_map<size_t, val_ptr_t>& getAllCachedInputOutputTensorIRs();
 
   // A long name that avoid user to use this low level API
   void pushRegion2InsertRegionStackAndSetRegion(const region_ptr_t& region);
@@ -335,7 +335,7 @@ class IRContext : public std::enable_shared_from_this<IRContext> {
   std::unordered_map<val_ptr_t, std::string> value_names_;
   region_ptr_t cur_insert_region_;
   node_ptr_t top_level_op_;
-  std::unordered_map<uint32_t, val_ptr_t> cached_inputs_outputs_;
+  std::unordered_map<size_t, val_ptr_t> cached_inputs_outputs_;
   std::stack<region_ptr_t> insert_region_stack_;
 };
 
diff --git a/mllm/engine/Context.cpp b/mllm/engine/Context.cpp
index c4e178b55..7dddff6bd 100644
--- a/mllm/engine/Context.cpp
+++ b/mllm/engine/Context.cpp
@@ -41,6 +41,11 @@ Backend::ptr_t Context::getBackend(const DeviceTypes& device) {
   return backends_[device];
 }
 
+void Context::shutdownBackend(const DeviceTypes& device) {
+  if (!backends_.has(device)) { return; }
+  backends_.remove(device);
+}
+
 std::vector<Tensor> Context::buildOpAndSubmitTask(OpTypes op_type, const BaseOpOptionsBase& base_options,
                                                   const std::vector<Tensor>& inputs, DeviceTypes special_device) {
   MLLM_TRACY_ZONE_SCOPED;
diff --git a/mllm/engine/Context.hpp b/mllm/engine/Context.hpp
index e9a3506bb..4f1f5ee00 100644
--- a/mllm/engine/Context.hpp
+++ b/mllm/engine/Context.hpp
@@ -30,6 +30,8 @@ class Context {
 
   Backend::ptr_t getBackend(const DeviceTypes& device);
 
+  void shutdownBackend(const DeviceTypes& device);
+
   inline MemoryManager::ptr_t memoryManager() { return memory_manager_; }
 
   inline DispatcherManager::ptr_t dispatcherManager() { return dispatcher_manager_; }
diff --git a/mllm/mllm.cpp b/mllm/mllm.cpp
index 08a45aefd..4775682d3 100644
--- a/mllm/mllm.cpp
+++ b/mllm/mllm.cpp
@@ -53,6 +53,12 @@ void shutdownContext() {
   }
   ::mllm::cleanThisThread();
 
+  // Clean up the QNN backend before the system starts unloading the QNN dynamic libraries
+  if (isQnnAvailable()) {
+    auto& ctx = Context::instance();
+    ctx.shutdownBackend(kQNN);
+  }
+
   // Clean up memory before backend is freed.
   // FIXME:
   // This line is needed for cuda !!!
diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/backends/qualcomm/transformers/core/qdq.py
index c13011a51..813aad3f0 100644
--- a/pymllm/backends/qualcomm/transformers/core/qdq.py
+++ b/pymllm/backends/qualcomm/transformers/core/qdq.py
@@ -27,10 +27,11 @@ def __init__(self, bits=8, qscheme=torch.per_tensor_affine):
 
         # 1. Calculate quantization range based on bits and scheme
         if qscheme in [torch.per_tensor_symmetric, torch.per_channel_symmetric]:
-            # Symmetric: range is [-(2^(bits-1)), 2^(bits-1) - 1]
-            # e.g., 8-bit: -128 to 127
-            self.quant_min = -(2 ** (bits - 1))
-            self.quant_max = 2 ** (bits - 1) - 1
+            # NOTE: When quant_min/quant_max are left as None with uint8 and symmetric quantization, the observer uses [0, 255] as the range and 128 as the zero_point.
+            self.quant_min = None
+            self.quant_max = None
+            assert bits == 8, "Symmetric quantization is only supported for 8-bit"
+            self.dtype = torch.uint8
         else:
             # Asymmetric (Affine): range is [0, 2^bits - 1]
             # e.g., 8-bit: 0 to 255