diff --git a/examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp b/examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp index a129cd3bd..3d28e79d6 100644 --- a/examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp +++ b/examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp @@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) case kUInt8PerTensorSym: { auto scale = m->getTopParameterFile()->pull(scale_name); auto zp = m->getTopParameterFile()->pull(zp_name); - MLLM_RT_ASSERT_EQ(zp.item(), 0); + MLLM_RT_ASSERT_EQ(zp.item(), 128); // Is 128! not 127! auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal); diff --git a/examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp b/examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp index a26ebef1e..d84a76da3 100644 --- a/examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp +++ b/examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp @@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) case kUInt8PerTensorSym: { auto scale = m->getTopParameterFile()->pull(scale_name); auto zp = m->getTopParameterFile()->pull(zp_name); - MLLM_RT_ASSERT_EQ(zp.item(), 0); + MLLM_RT_ASSERT_EQ(zp.item(), 128); // Is 128! not 127! auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal); diff --git a/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp b/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp index 26d57e676..9c389ced6 100644 --- a/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp +++ b/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp @@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) case kUInt8PerTensorSym: { auto scale = m->getTopParameterFile()->pull(scale_name); auto zp = m->getTopParameterFile()->pull(zp_name); - MLLM_RT_ASSERT_EQ(zp.item(), 0); + MLLM_RT_ASSERT_EQ(zp.item(), 128); // Is 128! not 127! 
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal); diff --git a/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp b/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp index db69c6017..834564cfa 100644 --- a/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp +++ b/examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp @@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) case kUInt8PerTensorSym: { auto scale = m->getTopParameterFile()->pull(scale_name); auto zp = m->getTopParameterFile()->pull(zp_name); - MLLM_RT_ASSERT_EQ(zp.item(), 0); + MLLM_RT_ASSERT_EQ(zp.item(), 128); // Is 128! not 127! auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal); diff --git a/examples/qwen3_qnn_aot/compile_sha.cpp b/examples/qwen3_qnn_aot/compile_sha.cpp index 8e6dd2323..f6d25894b 100644 --- a/examples/qwen3_qnn_aot/compile_sha.cpp +++ b/examples/qwen3_qnn_aot/compile_sha.cpp @@ -166,12 +166,8 @@ MLLM_MAIN({ CL - N, }, mllm::kUInt8PerTensorSym); trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym); - trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true); - trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true); - trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true); - trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." 
+ std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true); // clang-format on } diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp index ba1cdb227..e41f14c75 100644 --- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp +++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp @@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) case kUInt8PerTensorSym: { auto scale = m->getTopParameterFile()->pull(scale_name); auto zp = m->getTopParameterFile()->pull(zp_name); - MLLM_RT_ASSERT_EQ(zp.item(), 0); + MLLM_RT_ASSERT_EQ(zp.item(), 128); // Is 128! not 127! auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal); diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp index 66a313451..b6a9f403a 100644 --- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp +++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp @@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) case kUInt8PerTensorSym: { auto scale = m->getTopParameterFile()->pull(scale_name); auto zp = m->getTopParameterFile()->pull(zp_name); - MLLM_RT_ASSERT_EQ(zp.item(), 0); + MLLM_RT_ASSERT_EQ(zp.item(), 128); // Is 128! not 127! 
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal); @@ -356,14 +356,12 @@ class Qwen3AttentionSHA final : public nn::Module { std::string h_str = std::to_string(h); // K: De-quantize and re-quantize to int8 - auto k_h = key_states_per_head[h].to(kFloat32); - k_h = k_h.to(kUInt8PerTensorSym); + auto k_h = key_states_per_head[h].to(kUInt8PerTensorSym); k_h = ptq::QDQ_KV(this, k_h, "k_cast_to_int8_qdq_h" + h_str); k_h = k_h.transpose(2, 3); // [B, 1, D, S] // V: Quantize to int16 then int8 auto v_h = ptq::QDQ(this, value_states_per_head[h], "v_cast_to_int16_qdq_h" + h_str); - v_h = v_h.to(kFloat32); v_h = v_h.to(kUInt8PerTensorSym); v_h = ptq::QDQ_KV(this, v_h, "v_cast_to_int8_qdq_h" + h_str); diff --git a/mllm/backends/qnn/QNNAllocator.cpp b/mllm/backends/qnn/QNNAllocator.cpp index bc4a73bfd..dc04b5d0b 100644 --- a/mllm/backends/qnn/QNNAllocator.cpp +++ b/mllm/backends/qnn/QNNAllocator.cpp @@ -14,24 +14,36 @@ namespace mllm::qnn { #define RPCMEM_DEFAULT_FLAGS 1 QNNAllocator::QNNAllocator() { - void* libCdspHandle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - if (nullptr == libCdspHandle) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); } + libCdspHandle_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == libCdspHandle_) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); } - rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle, "rpcmem_alloc"); - rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle, "rpcmem_free"); - rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle, "rpcmem_to_fd"); + rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc"); + rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free"); + rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd"); } QNNAllocator::QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context) : qnnInterface_(qnnInterface), context_(context) { MLLM_RT_ASSERT(context_ != nullptr); - void* libCdspHandle = dlopen("libcdsprpc.so", 
RTLD_NOW | RTLD_LOCAL); - if (nullptr == libCdspHandle) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); } + libCdspHandle_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == libCdspHandle_) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); } - rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle, "rpcmem_alloc"); - rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle, "rpcmem_free"); - rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle, "rpcmem_to_fd"); + rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc"); + rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free"); + rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd"); +} + +QNNAllocator::~QNNAllocator() { + // Properly release all resources before unloading the library + // Since we hold libCdspHandle_, the library won't be unloaded until we dlclose it + shutdown(); + + // Now safe to unload the library + if (libCdspHandle_) { + dlclose(libCdspHandle_); + libCdspHandle_ = nullptr; + } } bool QNNAllocator::alloc(Storage* storage) { @@ -46,12 +58,22 @@ bool QNNAllocator::alloc(Storage* storage) { } void QNNAllocator::free(Storage* storage) { + // Skip if shutdown was called or destructor is running + // During program exit, QNN library resources might be destroyed, so we can't safely call rpcmem_free + if (isShutdown_) { return; } + + // Only free memory that was allocated by this allocator and not yet freed + if (!qnnMemPtrSet_.count(storage->ptr_)) { + return; // Not our memory or already freed, skip + } + if (ptrToFdAndMemHandleMap_.count(storage->ptr_)) { - MLLM_RT_ASSERT_EQ(QNN_SUCCESS, - qnnInterface_.memDeRegister(&(ptrToFdAndMemHandleMap_.find(storage->ptr_)->second.second), 1)); + qnnInterface_.memDeRegister(&(ptrToFdAndMemHandleMap_.find(storage->ptr_)->second.second), 1); + ptrToFdAndMemHandleMap_.erase(storage->ptr_); } rpcmem_free(storage->ptr_); + qnnMemPtrSet_.erase(storage->ptr_); } void 
QNNAllocator::registerQnnTensorToSharedBuffer(void* ptr, Qnn_Tensor_t& qnn_tensor) { @@ -99,4 +121,4 @@ void QNNAllocator::deRegisterQnnTensorFromSharedBuffer(void* ptr) { std::shared_ptr createQNNAllocator() { return std::make_shared(); } -} // namespace mllm::qnn \ No newline at end of file +} // namespace mllm::qnn diff --git a/mllm/backends/qnn/QNNAllocator.hpp b/mllm/backends/qnn/QNNAllocator.hpp index eac40a534..38c69716b 100644 --- a/mllm/backends/qnn/QNNAllocator.hpp +++ b/mllm/backends/qnn/QNNAllocator.hpp @@ -30,15 +30,37 @@ class QNNAllocator final : public Allocator { QNNAllocator(); // need to setQNNPointer afterward QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context); - ~QNNAllocator() { + ~QNNAllocator(); + + // Explicitly release all QNN memory resources. Call this for proper cleanup when you + // want to release memory during normal operation (not program exit). + // This is SAFE to call and will properly free all QNN resources. + void shutdown() { + if (isShutdown_) return; + isShutdown_ = true; + + // First, deregister all registered memory for (auto iter = ptrToFdAndMemHandleMap_.begin(); iter != ptrToFdAndMemHandleMap_.end();) { Qnn_ErrorHandle_t deregisterRet = qnnInterface_.memDeRegister(&iter->second.second, 1); - if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("~QNNAllocator: qnnInterface_.memDeRegister failed"); } - rpcmem_free(iter->first); + if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("QNNAllocator::shutdown: qnnInterface_.memDeRegister failed"); } iter = ptrToFdAndMemHandleMap_.erase(iter); } + + // Then, free all allocated memory (registered or not) + MLLM_INFO("QNNAllocator::shutdown: freeing all allocated memory"); + for (void* ptr : qnnMemPtrSet_) { rpcmem_free(ptr); } + qnnMemPtrSet_.clear(); } + // Legacy name for shutdown() - kept for compatibility + void releaseAllResources() { shutdown(); } + + // Mark the allocator as shut down without actually freeing memory. 
+ // Use this in destructors to prevent crashes during program exit when + // QNN library resources might already be destroyed. + // After this is called, all free() calls become no-ops. + void markShutdown() { isShutdown_ = true; } + void setQNNPointer(QNN_INTERFACE_VER_TYPE qnnInterface, void* context) { this->qnnInterface_ = qnnInterface; this->context_ = context; @@ -83,15 +105,23 @@ class QNNAllocator final : public Allocator { QNN_INTERFACE_VER_TYPE qnnInterface_; Qnn_ContextHandle_t context_ = nullptr; - RpcMemAllocFn_t rpcmem_alloc; - RpcMemFreeFn_t rpcmem_free; - RpcMemToFdFn_t rpcmem_to_fd; + // Hold the library handle to control unload order + // libcdsprpc.so will only be unloaded when this allocator is destroyed + void* libCdspHandle_ = nullptr; + + RpcMemAllocFn_t rpcmem_alloc = nullptr; + RpcMemFreeFn_t rpcmem_free = nullptr; + RpcMemToFdFn_t rpcmem_to_fd = nullptr; // to check if the ptr is allocted by rpcmem_alloc std::set qnnMemPtrSet_; std::map> ptrToFdAndMemHandleMap_; + + // Flag to indicate shutdown has been called or destructor is running + // When true, free() calls become no-ops to avoid crashes during program exit + bool isShutdown_ = false; }; std::shared_ptr createQNNAllocator(); -} // namespace mllm::qnn \ No newline at end of file +} // namespace mllm::qnn diff --git a/mllm/backends/qnn/QNNBackend.cpp b/mllm/backends/qnn/QNNBackend.cpp index 1a891cb6b..3900afc35 100644 --- a/mllm/backends/qnn/QNNBackend.cpp +++ b/mllm/backends/qnn/QNNBackend.cpp @@ -1,12 +1,15 @@ -#include "QNNBackend.hpp" #include #include #include #include +#include #include #include -#include "QNNUtils.hpp" + #include "QnnLog.h" + +#include "mllm/backends/qnn/QNNBackend.hpp" +#include "mllm/backends/qnn/QNNUtils.hpp" #include "mllm/backends/qnn/QNNAllocator.hpp" #include "mllm/backends/qnn/op/QNNCastTypeOp.hpp" #include "mllm/backends/qnn/op/QNNElewiseOp.hpp" @@ -33,17 +36,16 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) { profilingLevel_ = 
ProfilingLevel::OFF; debug_ = false; // when set true, NATIVE tensor will be regared as APP_READ tensor - if (!loadQNNSymbol()) { - MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN symbols"); - } else { - MLLM_INFO("QNN symbols loaded successfully"); - } + // Load QNN libraries and hold handles for lifecycle management + auto [qnnSuccess, qnnHandle] = loadQNNSymbol(); + if (!qnnSuccess) { MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN symbols"); } + qnnHtpLibHandle_ = qnnHandle; + MLLM_INFO("QNN symbols loaded successfully"); - if (!loadQNNSystemSymbol()) { - MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN System symbols"); - } else { - MLLM_INFO("QNN System symbols loaded successfully"); - } + auto [sysSuccess, sysHandle] = loadQNNSystemSymbol(); + if (!sysSuccess) { MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN System symbols"); } + qnnSystemLibHandle_ = sysHandle; + MLLM_INFO("QNN System symbols loaded successfully"); runtime_ = QNNRuntime::create(profilingLevel_, qnnLogLevel); if (!runtime_) { @@ -75,24 +77,67 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) { MLLM_INFO("QNN Perf created successfully"); } +QNNBackend::~QNNBackend() { + // Cleanup order is critical - we hold all QNN library handles to control unload order: + // 1. Allocator shutdown (memDeRegister + rpcmem_free) - needs QNN alive + // 2. Clear models - tensor destructors try to free but allocator is shut down + // 3. Perf cleanup - needs QNN HTP infrastructure alive + // 4. Runtime cleanup - frees QNN backend/device handles + // 5. Allocator reset - dlcloses libcdsprpc.so (held by allocator) + // 6. Close QNN libraries - libQnnSystem.so first, then libQnnHtp.so + + // 1. Properly shutdown allocator while QNN is still alive + // This calls memDeRegister and rpcmem_free safely + if (allocator_) { + auto* qnnAllocator = dynamic_cast(allocator_.get()); + if (qnnAllocator) { qnnAllocator->shutdown(); } + } + + // 2. 
Clear models - tensor destructors will call free() but they're now no-ops + qnnModels_.clear(); + qnnModelIndexMap_.clear(); + + // 3. Cleanup perf while QNN HTP infrastructure is still alive + if (perf_) { perf_->shutdown(); } + perf_.reset(); + + // 4. Cleanup runtime - frees QNN backend/device handles + runtime_->qnnInterface.contextFree(context_, nullptr); + context_ = nullptr; + runtime_.reset(); + + // 5. Reset allocator - will dlclose libcdsprpc.so since shutdown() was already called + allocator_.reset(); + + // 6. Close QNN libraries in reverse order of dependency + if (qnnSystemLibHandle_) { + dlclose(qnnSystemLibHandle_); + qnnSystemLibHandle_ = nullptr; + } + if (qnnHtpLibHandle_) { + dlclose(qnnHtpLibHandle_); + qnnHtpLibHandle_ = nullptr; + } +} + QNNPerf::QNNPerf(const QNN_INTERFACE_VER_TYPE* qnnInterface) { assert(qnnInterface != nullptr); - mQnnInterface = qnnInterface; + qnnInterface_ = qnnInterface; QnnDevice_Infrastructure_t deviceInfra = nullptr; - CALL_QNN(mQnnInterface->deviceGetInfrastructure(&deviceInfra)); + CALL_QNN(qnnInterface_->deviceGetInfrastructure(&deviceInfra)); QnnHtpDevice_Infrastructure_t* htpInfra = static_cast(deviceInfra); - mPerfInfra = htpInfra->perfInfra; + perfInfra_ = htpInfra->perfInfra; uint32_t deviceId = 0; uint32_t coreId = 0; - CALL_QNN(mPerfInfra.createPowerConfigId(deviceId, coreId, &mPowerConfigId)); + CALL_QNN(perfInfra_.createPowerConfigId(deviceId, coreId, &powerConfigId_)); - mPowerConfigBurst = { + powerConfigBurst_ = { .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, .dcvsV3Config = { - .contextId = mPowerConfigId, // use the power config id created + .contextId = powerConfigId_, // use the power config id created .setDcvsEnable = 1, .dcvsEnable = 0, // 1- To enable Dcvs and consider dcvs power mode, 0- To disable dcvs .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE, @@ -111,11 +156,11 @@ QNNPerf::QNNPerf(const QNN_INTERFACE_VER_TYPE* qnnInterface) { }, }; - 
mPowerConfigBalanced = { + powerConfigBalanced_ = { .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, .dcvsV3Config = { - .contextId = mPowerConfigId, // use the power config id created + .contextId = powerConfigId_, // use the power config id created .setDcvsEnable = 1, .dcvsEnable = 1, // 1- To enable Dcvs and consider dcvs power mode, 0- To disable dcvs .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN, @@ -135,8 +180,17 @@ QNNPerf::QNNPerf(const QNN_INTERFACE_VER_TYPE* qnnInterface) { }; } -// destory power config -QNNPerf::~QNNPerf() { CALL_QNN(mPerfInfra.destroyPowerConfigId(mPowerConfigId)); } +void QNNPerf::shutdown() { + if (isShutdown_) return; + isShutdown_ = true; + CALL_QNN(perfInfra_.destroyPowerConfigId(powerConfigId_)); +} + +QNNPerf::~QNNPerf() { + // If shutdown() was already called, skip cleanup + // This prevents crashes during program exit when QNN HTP infrastructure might be destroyed + if (!isShutdown_) { shutdown(); } +} void QNNPerf::setRpcLatencyAndPolling() { // set RPC Control Latency @@ -146,7 +200,7 @@ void QNNPerf::setRpcLatencyAndPolling() { rpcControlLatency.rpcControlLatencyConfig = 100; // use rpc control latency recommended 100 us, refer hexagon sdk const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs1[] = {&rpcControlLatency, nullptr}; - CALL_QNN(mPerfInfra.setPowerConfig(mPowerConfigId, powerConfigs1)); // set RPC latency config on power config ID created + CALL_QNN(perfInfra_.setPowerConfig(powerConfigId_, powerConfigs1)); // set RPC latency config on power config ID created // set RPC Polling QnnHtpPerfInfrastructure_PowerConfig_t rpcPollingTime; // refer QnnHtpPerfInfrastructure.h @@ -155,17 +209,17 @@ void QNNPerf::setRpcLatencyAndPolling() { rpcPollingTime.rpcPollingTimeConfig = 9999; // use rpc polling time recommended 0-10000 us const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs2[] = {&rpcPollingTime, nullptr}; - CALL_QNN(mPerfInfra.setPowerConfig(mPowerConfigId, 
powerConfigs2)); // set RPC polling config on power config ID created + CALL_QNN(perfInfra_.setPowerConfig(powerConfigId_, powerConfigs2)); // set RPC polling config on power config ID created } void QNNPerf::setPowerConfigBurst() { - const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs[] = {&mPowerConfigBurst, nullptr}; - CALL_QNN(mPerfInfra.setPowerConfig(mPowerConfigId, powerConfigs)); + const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs[] = {&powerConfigBurst_, nullptr}; + CALL_QNN(perfInfra_.setPowerConfig(powerConfigId_, powerConfigs)); } void QNNPerf::setPowerConfigBalanced() { - const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs[] = {&mPowerConfigBalanced, nullptr}; - CALL_QNN(mPerfInfra.setPowerConfig(mPowerConfigId, powerConfigs)); + const QnnHtpPerfInfrastructure_PowerConfig_t* powerConfigs[] = {&powerConfigBalanced_, nullptr}; + CALL_QNN(perfInfra_.setPowerConfig(powerConfigId_, powerConfigs)); } QNNRuntime::~QNNRuntime() { diff --git a/mllm/backends/qnn/QNNBackend.hpp b/mllm/backends/qnn/QNNBackend.hpp index 78953f32d..f439dced9 100644 --- a/mllm/backends/qnn/QNNBackend.hpp +++ b/mllm/backends/qnn/QNNBackend.hpp @@ -26,16 +26,21 @@ class QNNPerf { } explicit QNNPerf(const QNN_INTERFACE_VER_TYPE* qnnInterface); ~QNNPerf(); + + // Explicitly destroy power config. Call this while QNN HTP infrastructure is still alive. 
+ void shutdown(); + void setRpcLatencyAndPolling(); void setPowerConfigBurst(); void setPowerConfigBalanced(); private: - const QNN_INTERFACE_VER_TYPE* mQnnInterface = nullptr; - QnnHtpDevice_PerfInfrastructure_t mPerfInfra{}; - uint32_t mPowerConfigId; - QnnHtpPerfInfrastructure_PowerConfig_t mPowerConfigBurst{}; - QnnHtpPerfInfrastructure_PowerConfig_t mPowerConfigBalanced{}; + const QNN_INTERFACE_VER_TYPE* qnnInterface_ = nullptr; + QnnHtpDevice_PerfInfrastructure_t perfInfra_{}; + uint32_t powerConfigId_ = 0; + QnnHtpPerfInfrastructure_PowerConfig_t powerConfigBurst_{}; + QnnHtpPerfInfrastructure_PowerConfig_t powerConfigBalanced_{}; + bool isShutdown_ = false; }; class QNNRuntime { @@ -86,6 +91,7 @@ class QNNRuntime { class QNNBackend final : public Backend { public: QNNBackend(); + ~QNNBackend(); bool loadContext(const std::string& contextPath); bool createContext(); @@ -128,6 +134,11 @@ class QNNBackend final : public Backend { std::unique_ptr runtime_; std::unique_ptr perf_; + // Hold QNN library handles to control unload order + // These libraries will only be unloaded when QNNBackend is destroyed + void* qnnHtpLibHandle_ = nullptr; + void* qnnSystemLibHandle_ = nullptr; + // Graph management std::map qnnModelIndexMap_; std::vector> qnnModels_; diff --git a/mllm/backends/qnn/QNNUtils.cpp b/mllm/backends/qnn/QNNUtils.cpp index 96baeffd1..73d240bbb 100644 --- a/mllm/backends/qnn/QNNUtils.cpp +++ b/mllm/backends/qnn/QNNUtils.cpp @@ -21,14 +21,13 @@ namespace mllm::qnn { QnnInterfaceGetProvidersFn_t QnnInterface_getProviders = nullptr; -bool loadQNNSymbol() { +std::pair loadQNNSymbol() { MLLM_INFO("QNN Backend Lib: libQnnHtp.so"); - void* qnnLibHandle = nullptr; - qnnLibHandle = dlopen("libQnnHtp.so", RTLD_NOW | RTLD_LOCAL); + void* qnnLibHandle = dlopen("libQnnHtp.so", RTLD_NOW | RTLD_LOCAL); const char* errorOpen = dlerror(); if (!qnnLibHandle) { MLLM_ERROR("Failed to open QNN libs."); - return false; + return {false, nullptr}; } QnnInterface_getProviders = 
(QnnInterfaceGetProvidersFn_t)dlsym(qnnLibHandle, "QnnInterface_getProviders"); @@ -36,20 +35,20 @@ bool loadQNNSymbol() { if (!QnnInterface_getProviders) { MLLM_ERROR("Failed to load symbol . dlerror returns {}.", errorSym); dlclose(qnnLibHandle); - return false; + return {false, nullptr}; } - return true; + return {true, qnnLibHandle}; } QnnSystemInterfaceGetProvidersFn_t QnnSystemInterface_getProviders = nullptr; -bool loadQNNSystemSymbol() { +std::pair loadQNNSystemSymbol() { void* systemLibraryHandle = dlopen("libQnnSystem.so", RTLD_NOW | RTLD_LOCAL); const char* errorOpen = dlerror(); if (!systemLibraryHandle) { MLLM_ERROR("Failed to open QNN System libs."); - return false; + return {false, nullptr}; } QnnSystemInterface_getProviders = @@ -58,10 +57,10 @@ bool loadQNNSystemSymbol() { if (!QnnSystemInterface_getProviders) { MLLM_ERROR("Failed to load symbol . dlerror returns {}.", errorSym); dlclose(systemLibraryHandle); - return false; + return {false, nullptr}; } - return true; + return {true, systemLibraryHandle}; } // --------------- End of QNN symbols loading --------------- diff --git a/mllm/backends/qnn/QNNUtils.hpp b/mllm/backends/qnn/QNNUtils.hpp index e74f27f4a..047a79355 100644 --- a/mllm/backends/qnn/QNNUtils.hpp +++ b/mllm/backends/qnn/QNNUtils.hpp @@ -43,8 +43,10 @@ using QnnSystemInterfaceGetProvidersFn_t = Qnn_ErrorHandle_t (*)(const QnnSystem extern QnnInterfaceGetProvidersFn_t QnnInterface_getProviders; extern QnnSystemInterfaceGetProvidersFn_t QnnSystemInterface_getProviders; -bool loadQNNSymbol(); -bool loadQNNSystemSymbol(); +// Load QNN symbols and return library handle for lifecycle management +// Returns {success, libHandle} - caller owns the handle and must dlclose it +std::pair loadQNNSymbol(); +std::pair loadQNNSystemSymbol(); // --------------- End of QNN symbols loading --------------- diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp index 0f67bab56..2a2e6010f 100644 --- 
a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp +++ b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp @@ -138,7 +138,10 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten if (!cfg->scale) { MLLM_ERROR_EXIT(ExitCode::kCoreError, "SymPerTensor quant recipe has no scale. tensor: {}", v->name()); } - ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item(), .offset = 0}; + + MLLM_RT_ASSERT_EQ(cfg->quant_to_type, kUInt8); + + ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item(), .offset = -128}; MLLM_INFO("Configuring SymPerTensor quantization for tensor: {}, scale: {}", v->name(), cfg->scale.item()); break; } diff --git a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp index 7e2a63220..8320776b6 100644 --- a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp +++ b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp @@ -571,6 +571,14 @@ bool LLMQuantRecipeSlicePattern::isMatch(const mllm::ir::op_ptr_t& op) { } bool LLMQuantRecipeSlicePattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) { + auto slice_ir = node->cast_(); + auto i_0 = *(node->inputs().begin()); + + if (!i_0->getAttr("quant_recipe")) { + auto i_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), i_0->cast_()); + i_0->setAttr("quant_recipe", i_0_spec); + } + return shareQuantSpecSingleInputToSingleOutputAndSetOpQuantAnnoAttr(writer.getContext(), node->cast_()); } diff --git a/mllm/backends/qnn/aot/visitor/CastType.cpp b/mllm/backends/qnn/aot/visitor/CastType.cpp index 95dc3997a..82e4194db 100644 --- a/mllm/backends/qnn/aot/visitor/CastType.cpp +++ b/mllm/backends/qnn/aot/visitor/CastType.cpp @@ -61,7 +61,10 @@ bool QnnAOTCastTypePattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op qnn_op_type = "Quantize"; } else if (isInt(input_dtype) && isFloat(target_dtype)) { qnn_op_type = "Dequantize"; + } else if (isInt(input_dtype) && isInt(target_dtype)) { + qnn_op_type = 
"Convert"; } else if (input_dtype == kFloat32 && target_dtype == kFloat16) { + // TODO, Cast usage is error. qnn_op_type = "Cast"; } else { MLLM_ERROR("Unsupported CastType for QNN: {} -> {}", (int)input_dtype, (int)target_dtype); diff --git a/mllm/backends/qnn/aot_rt/KVCacheManager.cpp b/mllm/backends/qnn/aot_rt/KVCacheManager.cpp index 7df115464..73e2bbc87 100644 --- a/mllm/backends/qnn/aot_rt/KVCacheManager.cpp +++ b/mllm/backends/qnn/aot_rt/KVCacheManager.cpp @@ -187,6 +187,7 @@ void KVCacheManager::rearrangeCache(int32_t ar_len_dst) { template void KVCacheManager::rearrangeKey(KVCache& k_cache, int32_t ar_len_dst) { + // [B, H, D, S] rearrange. const int32_t src_cache_num = (cur_ar_len_ == config_.context_len) ? config_.context_len : config_.context_len - cur_ar_len_; const int32_t dst_cache_num = config_.context_len - ar_len_dst; T* k_cache_in_read_ptr = k_cache.buffer; @@ -213,6 +214,7 @@ void KVCacheManager::rearrangeKey(KVCache& k_cache, int32_t ar_len_dst) { template void KVCacheManager::rearrangeValue(KVCache& v_cache, int32_t ar_len_dst) { + // [B, H, S, D] rearrange. const int32_t src_cache_num = (cur_ar_len_ == config_.context_len) ? 
config_.context_len : config_.context_len - cur_ar_len_; const int32_t dst_cache_num = config_.context_len - ar_len_dst; T* v_cache_in_read_ptr = v_cache.buffer; diff --git a/mllm/backends/qnn/aot_rt/KVCacheManager.hpp b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp index 8eddb1a95..e1090278c 100644 --- a/mllm/backends/qnn/aot_rt/KVCacheManager.hpp +++ b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp @@ -24,7 +24,20 @@ template class KVCacheManager { public: explicit KVCacheManager(QnnAOTConfig config); - ~KVCacheManager() = default; + ~KVCacheManager() { + // Explicitly clear storage to ensure proper cleanup order + // Storage must be released before QNN backend is destroyed + for (auto& cache : k_cache_) { + cache.buffer_storage.reset(); + cache.output_buffer_storage.reset(); + } + for (auto& cache : v_cache_) { + cache.buffer_storage.reset(); + cache.output_buffer_storage.reset(); + } + k_cache_.clear(); + v_cache_.clear(); + } void initCache(mllm::Allocator* allocator, int32_t ar_len); void rearrangeCache(int32_t ar_len_dst); diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.hpp b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp index cfe08620b..b77553640 100644 --- a/mllm/backends/qnn/aot_rt/PromptProcessor.hpp +++ b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp @@ -17,6 +17,14 @@ class PromptProcessor { public: PromptProcessor(KVCacheManager* kv_manager, QnnAOTConfig config); + ~PromptProcessor() { + // Clear module's output tensors before member tensors are destroyed + // to avoid double-free or use-after-free issues + if (module_) { module_->setOutputTensors({}); } + output_tensors_.clear(); + input_tensors_.clear(); + } + /** * Prefill an LLM Module with the given text input. * @param prompt_tokens The text prompt tokens to the LLM Module. 
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp index 9b8acc338..f489344e6 100644 --- a/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp +++ b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp @@ -12,6 +12,10 @@ namespace mllm::qnn::aot { class QnnAOTModule : public mllm::nn::Module, public models::ARGeneration { public: explicit QnnAOTModule(const std::string& graph_name); + ~QnnAOTModule() { + // Clear output tensors to ensure proper cleanup order + output_tensors_.clear(); + } std::vector forward(const std::vector& inputs, const std::vector& args) override; diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp index 51ce86c70..2754e062e 100644 --- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp +++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp @@ -19,7 +19,14 @@ using RunnerConfig = QnnAOTConfig; class Runner { public: explicit Runner(const RunnerConfig& config, mllm::preprocessor::AutoTokenizer* tokenizer); - ~Runner() = default; + ~Runner() { + // Explicit destruction order to avoid use-after-free issues + // Destroy generators first (they reference kv_manager_) + token_generator_.reset(); + prompt_processor_.reset(); + // Then destroy kv_manager_ + kv_manager_.reset(); + } bool load(); void generate(const Tensor& prompt_tokens, int32_t seq_len, const std::function& token_callback, diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp index 4e0884358..8c379c52a 100644 --- a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp +++ b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp @@ -1,6 +1,7 @@ #include "mllm/backends/qnn/aot_rt/TokenGenerator.hpp" #include "mllm/preprocessor/tokenizers/Unicode.hpp" #include +#include #include namespace mllm::qnn::aot { @@ -16,7 +17,7 @@ TokenGenerator::TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer, template void TokenGenerator::init_io() { - input_tensors_.reserve(4 + 2 * config_.num_layers); + 
input_tensors_.reserve(3 + 2 * config_.num_layers); // 1. Input IDs auto input_ids = Tensor::empty({1, 1}, kInt32, kQNN).alloc(); @@ -38,7 +39,8 @@ void TokenGenerator::init_io() { const auto& v_caches = kv_manager_->getVCache(); // K for (int l = 0; l < config_.num_layers; ++l) { - auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len}, config_.kv_dtype, kQNN); + auto k_tensor = + Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len - 1}, config_.kv_dtype, kQNN); k_tensor.impl()->storage()->ptr_ = k_caches[l].buffer; k_tensor.impl()->storage()->mem_type_ = kManual; k_tensor.setName("past_key_" + std::to_string(l)); @@ -107,6 +109,11 @@ int64_t TokenGenerator::generate(std::vector& tokens, int64_t start_ // Ensure KV cache is arranged for decode (1 token) kv_manager_->rearrangeCache(1); + // Initialize attention mask for decode phase + std::vector attention_map(1); + std::iota(attention_map.begin(), attention_map.end(), -1); + kv_manager_->initAttentionMask(input_tensors_[2].ptr(), attention_map, 1, current_pos); + module_->setOutputTensors(output_tensors_); for (int i = 0; i < seq_len; ++i) { @@ -122,6 +129,9 @@ int64_t TokenGenerator::generate(std::vector& tokens, int64_t start_ int32_t n_update = 1; kv_manager_->updateCache(1, current_pos, n_update, {}); + // Update attention mask + kv_manager_->updateAttentionMask(input_tensors_[2].ptr(), 1, current_pos, n_update); + // Get logits auto logits = output_tensors_[0].to(kCPU).squeeze(0); diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.hpp b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp index b40b9725a..4c8968559 100644 --- a/mllm/backends/qnn/aot_rt/TokenGenerator.hpp +++ b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp @@ -19,7 +19,13 @@ class TokenGenerator { TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer, KVCacheManager* kv_manager, std::unique_ptr>&& eos_ids, QnnAOTConfig config); - virtual ~TokenGenerator() = default; + virtual 
~TokenGenerator() { + // Clear module's output tensors before member tensors are destroyed + // to avoid double-free or use-after-free issues + if (module_) { module_->setOutputTensors({}); } + output_tensors_.clear(); + input_tensors_.clear(); + } void init_io(); diff --git a/mllm/compile/ir/Node.cpp b/mllm/compile/ir/Node.cpp index 303f99e51..481f60971 100644 --- a/mllm/compile/ir/Node.cpp +++ b/mllm/compile/ir/Node.cpp @@ -258,16 +258,16 @@ void IRContext::setDevice(DeviceTypes device_type) { device_type_ = device_type; // FIXME: deprecated, context has no device DeviceTypes IRContext::getDevice() { return device_type_; } -bool IRContext::isCacheInputOutputTensor(uint32_t uuid) { +bool IRContext::isCacheInputOutputTensor(size_t uuid) { if (cached_inputs_outputs_.count(uuid)) { return true; } return false; } -void IRContext::cacheInputOutputTensor(uint32_t uuid, const val_ptr_t& tensor_ir) { cached_inputs_outputs_[uuid] = tensor_ir; } +void IRContext::cacheInputOutputTensor(size_t uuid, const val_ptr_t& tensor_ir) { cached_inputs_outputs_[uuid] = tensor_ir; } -val_ptr_t IRContext::getCacheInputOutputTensor(uint32_t uuid) { return cached_inputs_outputs_[uuid]; } +val_ptr_t IRContext::getCacheInputOutputTensor(size_t uuid) { return cached_inputs_outputs_[uuid]; } -std::unordered_map& IRContext::getAllCachedInputOutputTensorIRs() { return cached_inputs_outputs_; } +std::unordered_map& IRContext::getAllCachedInputOutputTensorIRs() { return cached_inputs_outputs_; } void IRContext::pushRegion2InsertRegionStackAndSetRegion(const region_ptr_t& region) { insert_region_stack_.push(region); diff --git a/mllm/compile/ir/Node.hpp b/mllm/compile/ir/Node.hpp index a3eceec58..2506d5e93 100644 --- a/mllm/compile/ir/Node.hpp +++ b/mllm/compile/ir/Node.hpp @@ -314,13 +314,13 @@ class IRContext : public std::enable_shared_from_this { // FIXME: deprecated, context has no device DeviceTypes getDevice(); - bool isCacheInputOutputTensor(uint32_t uuid); + bool 
isCacheInputOutputTensor(size_t uuid); - void cacheInputOutputTensor(uint32_t uuid, const val_ptr_t& tensor_ir); + void cacheInputOutputTensor(size_t uuid, const val_ptr_t& tensor_ir); - val_ptr_t getCacheInputOutputTensor(uint32_t uuid); + val_ptr_t getCacheInputOutputTensor(size_t uuid); - std::unordered_map& getAllCachedInputOutputTensorIRs(); + std::unordered_map& getAllCachedInputOutputTensorIRs(); // A long name that avoid user to use this low level API void pushRegion2InsertRegionStackAndSetRegion(const region_ptr_t& region); @@ -335,7 +335,7 @@ class IRContext : public std::enable_shared_from_this { std::unordered_map value_names_; region_ptr_t cur_insert_region_; node_ptr_t top_level_op_; - std::unordered_map cached_inputs_outputs_; + std::unordered_map cached_inputs_outputs_; std::stack insert_region_stack_; }; diff --git a/mllm/engine/Context.cpp b/mllm/engine/Context.cpp index c4e178b55..7dddff6bd 100644 --- a/mllm/engine/Context.cpp +++ b/mllm/engine/Context.cpp @@ -41,6 +41,11 @@ Backend::ptr_t Context::getBackend(const DeviceTypes& device) { return backends_[device]; } +void Context::shutdownBackend(const DeviceTypes& device) { + if (!backends_.has(device)) { return; } + backends_.remove(device); +} + std::vector Context::buildOpAndSubmitTask(OpTypes op_type, const BaseOpOptionsBase& base_options, const std::vector& inputs, DeviceTypes special_device) { MLLM_TRACY_ZONE_SCOPED; diff --git a/mllm/engine/Context.hpp b/mllm/engine/Context.hpp index e9a3506bb..4f1f5ee00 100644 --- a/mllm/engine/Context.hpp +++ b/mllm/engine/Context.hpp @@ -30,6 +30,8 @@ class Context { Backend::ptr_t getBackend(const DeviceTypes& device); + void shutdownBackend(const DeviceTypes& device); + inline MemoryManager::ptr_t memoryManager() { return memory_manager_; } inline DispatcherManager::ptr_t dispatcherManager() { return dispatcher_manager_; } diff --git a/mllm/mllm.cpp b/mllm/mllm.cpp index 08a45aefd..4775682d3 100644 --- a/mllm/mllm.cpp +++ b/mllm/mllm.cpp @@ -53,6 
+53,12 @@ void shutdownContext() { } ::mllm::cleanThisThread(); + // Clean up the QNN backend before the system starts to unload the QNN dynamic libraries + if (isQnnAvailable()) { + auto& ctx = Context::instance(); + ctx.shutdownBackend(kQNN); + } + // Clean up memory before backend is freed. // FIXME: // This line is needed for cuda !!! diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/backends/qualcomm/transformers/core/qdq.py index c13011a51..813aad3f0 100644 --- a/pymllm/backends/qualcomm/transformers/core/qdq.py +++ b/pymllm/backends/qualcomm/transformers/core/qdq.py @@ -27,10 +27,11 @@ def __init__(self, bits=8, qscheme=torch.per_tensor_affine): # 1. Calculate quantization range based on bits and scheme if qscheme in [torch.per_tensor_symmetric, torch.per_channel_symmetric]: - # Symmetric: range is [-(2^(bits-1)), 2^(bits-1) - 1] - # e.g., 8-bit: -128 to 127 - self.quant_min = -(2 ** (bits - 1)) - self.quant_max = 2 ** (bits - 1) - 1 + # NOTE: If quant_min/quant_max are left as None, then with uint8 and symmetric quantization the observer will use [0, 255] as the range and 128 as the zero_point. + self.quant_min = None + self.quant_max = None + assert bits == 8, "Symmetric quantization is only supported for 8-bit" + self.dtype = torch.uint8 else: # Asymmetric (Affine): range is [0, 2^bits - 1] # e.g., 8-bit: 0 to 255