diff --git a/docs/qnn_backend/setup_env.rst b/docs/qnn_backend/setup_env.rst index 61a06a459..6619a21b3 100644 --- a/docs/qnn_backend/setup_env.rst +++ b/docs/qnn_backend/setup_env.rst @@ -31,7 +31,7 @@ QNN SDK Installation 1. Download the QNN SDK from the `official Qualcomm website `_ 2. Unzip the downloaded file -3. Set the environment variable ``QNN_SDK_ROOT`` to point to the unzipped directory +3. Set the environment variable ``QAIRT_SDK_ROOT`` to point to the unzipped directory Hexagon SDK Installation ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -68,7 +68,7 @@ After setting up the environment, verify that the following environment variable .. code-block:: bash - echo $QNN_SDK_ROOT # Should point to /path/to/your/qnn/sdk + echo $QAIRT_SDK_ROOT # Should point to /path/to/your/qnn/sdk echo $HEXAGON_SDK_ROOT # Should point to /path/to/your/hexagon/sdk .. note:: @@ -84,7 +84,7 @@ Prerequisites for Compilation Ensure the following environment variables are set: -- ``QNN_SDK_ROOT`` +- ``QAIRT_SDK_ROOT`` - ``HEXAGON_SDK_ROOT`` - ``ANDROID_NDK_ROOT`` diff --git a/examples/qwen3_qnn_aot/CMakeLists.txt b/examples/qwen3_qnn_aot/CMakeLists.txt index 75a7375cd..efc9f2db8 100644 --- a/examples/qwen3_qnn_aot/CMakeLists.txt +++ b/examples/qwen3_qnn_aot/CMakeLists.txt @@ -1,8 +1,3 @@ add_executable(mllm-qwen3-aot-c compile.cpp) -target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend) +target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend) target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR}) - -target_include_directories(mllm-qwen3-aot-c PRIVATE - $ENV{QAIRT_SDK_ROOT}/include # QNN SDK include - $ENV{QAIRT_SDK_ROOT}/include/QNN # QNN SDK include -) diff --git a/mllm/CMakeLists.txt b/mllm/CMakeLists.txt index 1759fc21a..9df6b7741 100644 --- a/mllm/CMakeLists.txt +++ b/mllm/CMakeLists.txt @@ -8,9 +8,6 @@ file(GLOB_RECURSE MLLM_RT_PREPROCESSOR_SRC ${CMAKE_CURRENT_LIST_DIR}/preprocesso if(MLLM_BUILD_EXPERIMENTS) file(GLOB_RECURSE MLLM_RT_AUTO_TUNE_SRC ${CMAKE_CURRENT_LIST_DIR}/experiments/auto_tune/*.cpp) endif() -if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE) - file(GLOB_RECURSE MLLM_QUALCOMM_AOT_SRC ${CMAKE_CURRENT_LIST_DIR}/backends/qnn/aot/*.cpp) -endif() file(GLOB WENET_AUDIO_SOURCES ${PROJECT_SOURCE_DIR}/third_party/wenet_audio/*) add_library( @@ -117,19 +114,17 @@ if(MLLM_BUILD_OPENCL_BACKEND) ) endif() +if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND) + add_subdirectory(backends/qnn) +endif() + if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE) - # Build - target_include_directories(MllmRT PRIVATE - $ENV{QAIRT_SDK_ROOT}/include # QNN SDK include - $ENV{QAIRT_SDK_ROOT}/include/QNN # QNN SDK include - ) add_compile_definitions( MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE ) endif() if(MLLM_BUILD_QNN_BACKEND) - add_subdirectory(backends/qnn) add_compile_definitions( MLLM_QNN_BACKEND ) diff --git a/mllm/backends/qnn/CMakeLists.txt b/mllm/backends/qnn/CMakeLists.txt index cb1e4ec7d..0e4203e08 100644 --- a/mllm/backends/qnn/CMakeLists.txt +++ b/mllm/backends/qnn/CMakeLists.txt @@ -13,6 +13,14 @@ file(GLOB MLLM_QNN_SRC ) +if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE) + file(GLOB_RECURSE MLLM_QUALCOMM_AOT_SRC + ${CMAKE_CURRENT_LIST_DIR}/aot/*.hpp + ${CMAKE_CURRENT_LIST_DIR}/aot/*.cpp + ) + list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_SRC}) +endif() + add_library( MllmQNNBackend SHARED @@ -20,11 +28,11 @@ add_library( ) target_include_directories(MllmQNNBackend PUBLIC - $ENV{QNN_SDK_ROOT}/include/QNN # QNN SDK include + $ENV{QAIRT_SDK_ROOT}/include/QNN # QNN SDK include 
${MLLM_INCLUDE_DIR} ) -message(STATUS "QNN SDK root: $ENV{QNN_SDK_ROOT}") +message(STATUS "QNN SDK root: $ENV{QAIRT_SDK_ROOT}") get_property(current_includes DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) message(STATUS "MLLM_QNN INCLUDES: ${current_includes}") #print include directories diff --git a/mllm/backends/qnn/QNNAllocator.cpp b/mllm/backends/qnn/QNNAllocator.cpp index e7c16def2..bc4a73bfd 100644 --- a/mllm/backends/qnn/QNNAllocator.cpp +++ b/mllm/backends/qnn/QNNAllocator.cpp @@ -3,24 +3,12 @@ #include "mllm/backends/qnn/QNNAllocator.hpp" #include "mllm/backends/qnn/QNNTypeMacros.hpp" -#include "mllm/backends/qnn/QNNUtils.hpp" #include "mllm/utils/Common.hpp" #include "mllm/utils/Log.hpp" -#include -#include #include namespace mllm::qnn { -namespace { -constexpr bool kVerboseQnnAllocatorLogs = false; -} // namespace - -#define QNN_ALLOCATOR_VERBOSE(...) \ - do { \ - if constexpr (kVerboseQnnAllocatorLogs) { MLLM_INFO(__VA_ARGS__); } \ - } while (0) - // specified in QNN doc #define RPCMEM_HEAP_ID_SYSTEM 25 #define RPCMEM_DEFAULT_FLAGS 1 @@ -46,24 +34,8 @@ QNNAllocator::QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context) rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle, "rpcmem_to_fd"); } -QNNAllocator::~QNNAllocator() { - for (auto iter = ptrToFdAndMemHandleMap_.begin(); iter != ptrToFdAndMemHandleMap_.end();) { - Qnn_ErrorHandle_t deregisterRet = qnnInterface_.memDeRegister(&iter->second.second, 1); - if (QNN_SUCCESS != deregisterRet) { - MLLM_WARN("~QNNAllocator: memDeRegister failed during shutdown, status=0x{:x}", deregisterRet); - } - qnnMemPtrSet_.erase(iter->first); - rpcmem_free(iter->first); - iter = ptrToFdAndMemHandleMap_.erase(iter); - } - - for (void* ptr : qnnMemPtrSet_) { rpcmem_free(ptr); } - qnnMemPtrSet_.clear(); -} - bool QNNAllocator::alloc(Storage* storage) { - size_t request_bytes = allocSize(storage); - uint8_t* ptr = (uint8_t*)rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, request_bytes); + uint8_t* ptr = (uint8_t*)rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocSize(storage)); MLLM_RT_ASSERT(ptr != nullptr); @@ -73,289 +45,25 @@ bool QNNAllocator::alloc(Storage* storage) { return true; } -/** - * @brief Free a storage buffer and manage QNN memory handle lifecycle - * - * This function handles the complex lifecycle of QNN shared buffers: - * 1. Checks if the buffer is already freed or never allocated - * 2. Detects if multiple pointers share the same mem_handle (aliases) - * 3. Only de-registers mem_handle when it's the last reference - * 4. 
Updates tensor ID/name mappings to point to alternative pointers if needed - * - * Key design considerations: - * - QNN doesn't support re-registering a de-registered buffer (fd may be invalidated) - * - Multiple buffer pointers can share the same mem_handle (common in decode phase) - * - Tensor mappings must be updated when pointers are redirected to aliases - * - * @param storage Pointer to the storage object containing the buffer to free - */ void QNNAllocator::free(Storage* storage) { - auto ptr = storage->ptr_; - - // Early return if ptr is nullptr or not in qnnMemPtrSet_ (already freed or never allocated) - // This is common during decode phase when buffers are reused, so we silently ignore - if (ptr == nullptr) { - // too noisy during decode; silently ignore nullptr frees - return; - } - - if (qnnMemPtrSet_.count(ptr) == 0) { - QNN_ALLOCATOR_VERBOSE("QNNAllocator::free called for ptr={} that is not in qnnMemPtrSet_, ignoring", ptr); - return; - } - - // Check if any other buffer pointer shares the same mem_handle (alias detection) - // This is important because in decode phase, multiple tensor wrappers may reference - // the same underlying buffer through different pointers - void* alternative_ptr = nullptr; // Another ptr using the same mem_handle, if any - - if (ptrToFdAndMemHandleMap_.count(ptr)) { - auto iter = ptrToFdAndMemHandleMap_.find(ptr); - auto mem_handle = iter->second.second; - - // Check if any other ptr is using the same mem_handle - // This handles the case where buffer reuse creates multiple pointers to the same mem_handle - for (const auto& [other_ptr, fd_and_handle] : ptrToFdAndMemHandleMap_) { - if (other_ptr != ptr && fd_and_handle.second == mem_handle) { - alternative_ptr = other_ptr; - break; - } - } - - // Only deRegister if this is the last ptr using this mem_handle - // If there are aliases, we must keep the mem_handle registered - if (alternative_ptr == nullptr) { - // No aliases found, safe to de-register the mem_handle - auto status = qnnInterface_.memDeRegister(&mem_handle, 1); - if (status != QNN_SUCCESS) { - MLLM_WARN("QNNAllocator::free memDeRegister failed, status=0x{:x}, ptr={}, fd={}", status, ptr, iter->second.first); - } - // Remove from ptrToFdAndMemHandleMap_ and ptrToSizeMap_ - // The actual buffer will be freed later in the function - ptrToFdAndMemHandleMap_.erase(iter); - ptrToSizeMap_.erase(ptr); - } else { - // Aliases exist, skip de-registration to avoid breaking other references - QNN_ALLOCATOR_VERBOSE("QNNAllocator::free skipping deRegister for ptr={} because other ptrs use the mem_handle", ptr); - ptrToFdAndMemHandleMap_.erase(iter); - ptrToSizeMap_.erase(ptr); - } - } else { - // ptr is in qnnMemPtrSet_ but not in ptrToFdAndMemHandleMap_ - // This means it was allocated but never registered (e.g., memRegister failed) - // Just free the buffer without deRegister - QNN_ALLOCATOR_VERBOSE("QNNAllocator::free freeing unregistered buffer ptr={}", ptr); - qnnMemPtrSet_.erase(ptr); - rpcmem_free(ptr); - eraseTensorMappingsForPtr(ptr, "free(unregistered buffer)"); - clearLastRegistrationIfMatches(ptr, "free(unregistered buffer)"); - return; + if (ptrToFdAndMemHandleMap_.count(storage->ptr_)) { + MLLM_RT_ASSERT_EQ(QNN_SUCCESS, + qnnInterface_.memDeRegister(&(ptrToFdAndMemHandleMap_.find(storage->ptr_)->second.second), 1)); } - // Update or keep tensor ID and name mappings - // If mem_handle is still in use (alternative_ptr exists), update mappings to point to alternative_ptr - // Otherwise, free the buffer and clear mappings - if 
(alternative_ptr != nullptr) { - // Update mappings to point to alternative_ptr instead of deleting them - // This ensures that future tensor lookups will find the correct buffer - for (auto& entry : tensorIdToPtrMap_) { - if (entry.second == ptr) { entry.second = alternative_ptr; } - } - for (auto& entry : tensorNameToPtrMap_) { - if (entry.second == ptr) { entry.second = alternative_ptr; } - } - // Don't free the buffer here since alternative_ptr is still using it - qnnMemPtrSet_.erase(ptr); - clearLastRegistrationIfMatches(ptr, "free(ptr) -> redirected to alias"); - } else { - // Since QNN doesn't support re-registering a deRegistered buffer (fd may be invalidated), - // we should free the buffer immediately even if there are mappings. - // The decode phase will allocate a new buffer when needed. - qnnMemPtrSet_.erase(ptr); - rpcmem_free(ptr); - eraseTensorMappingsForPtr(ptr, "free(ptr) -> mem_handle released"); - clearLastRegistrationIfMatches(ptr, "free(ptr) -> mem_handle released"); - } - storage->ptr_ = nullptr; + rpcmem_free(storage->ptr_); } -/** - * @brief Register a tensor's buffer to QNN shared memory - * - * This function implements a sophisticated buffer reuse mechanism to avoid duplicate registrations - * of the same tensor across prefill and decode phases. It uses a multi-level fallback strategy: - * - * 1. Check if the buffer is already registered (by ptr) - * 2. Check if a buffer exists for the same tensor ID (primary lookup) - * 3. Check if a buffer exists for the same tensor name (fallback lookup) - * 4. Check if we can reuse the last successfully registered buffer (last resort) - * 5. If all fallbacks fail, attempt new registration - * - * This is critical for decode phase where the same tensor (e.g., KV cache) is used repeatedly, - * and QNN HTP device has limited memory resources (~2.5GB typically). - * - * @param storage Storage object containing the buffer to register - * @param qnn_tensor QNN tensor structure to update with mem_handle - * @return true if registration succeeded, false otherwise - */ -bool QNNAllocator::registerQnnTensorToSharedBuffer(Storage* storage, Qnn_Tensor_t& qnn_tensor) { - MLLM_RT_ASSERT(storage != nullptr); - void* ptr = storage->ptr_; - +void QNNAllocator::registerQnnTensorToSharedBuffer(void* ptr, Qnn_Tensor_t& qnn_tensor) { // Make sure there has a memory that we can register to. - MLLM_RT_ASSERT(ptr != nullptr); MLLM_RT_ASSERT(qnnMemPtrSet_.count(ptr)); - // Save original tensor state in case we need to restore on failure - auto original_mem_type = QNN_TENSOR_GET_MEM_TYPE(qnn_tensor); - Qnn_MemHandle_t original_mem_handle = QNN_TENSOR_GET_MEM_HANDLE(qnn_tensor); - - // Extract tensor identification information - // Tensor ID is the primary identifier (more reliable than name) - uint32_t tensor_id = QNN_TENSOR_GET_ID(qnn_tensor); - const char* tensor_name_cstr = QNN_TENSOR_GET_NAME(qnn_tensor); - std::string tensor_name = tensor_name_cstr ? tensor_name_cstr : "unknown"; - - // Calculate buffer size from tensor dimensions and data type - uint32_t rank = QNN_TENSOR_GET_RANK(qnn_tensor); - uint32_t* dims_ptr = QNN_TENSOR_GET_DIMENSIONS(qnn_tensor); - Qnn_DataType_t data_type = QNN_TENSOR_GET_DATA_TYPE(qnn_tensor); - - size_t element_bytes = 0; - if (auto it = QNNDataTypeToSize.find(data_type); it != QNNDataTypeToSize.end()) { element_bytes = it->second; } - - size_t element_cnt = 1; - std::vector dims; - dims.reserve(rank); - for (uint32_t i = 0; i < rank; ++i) { - uint32_t dim = dims_ptr ? 
dims_ptr[i] : 0; - dims.push_back(dim); - element_cnt *= (dim == 0 ? 1 : dim); - } - size_t total_bytes = element_cnt * element_bytes; - - // Format shape string for error messages - std::string shape_str = "[]"; - if (!dims.empty()) { - shape_str = "["; - for (size_t i = 0; i < dims.size(); ++i) { - shape_str += std::to_string(dims[i]); - if (i + 1 < dims.size()) { shape_str += ", "; } - } - shape_str += "]"; - } - - QNN_ALLOCATOR_VERBOSE("registerQnnTensorToSharedBuffer: ptr={}, tensor_id={}, tensor_name={}, tensorIdToPtrMap_.size()={}", - ptr, tensor_id, tensor_name, tensorIdToPtrMap_.size()); - - /** - * @brief Update tensor ID/name mappings and size tracking - * - * This lambda updates the internal mappings that allow us to find existing buffers - * for the same tensor in future registration attempts. - */ - auto updateMappings = [&](void* mapped_ptr) { - tensorIdToPtrMap_[tensor_id] = mapped_ptr; - if (tensor_name != "unknown") { tensorNameToPtrMap_[tensor_name] = mapped_ptr; } - ptrToSizeMap_[mapped_ptr] = total_bytes; - }; - - /** - * @brief Reuse an existing registered buffer for this tensor - * - * This lambda implements the core buffer reuse logic: - * 1. Verifies the existing buffer is still registered - * 2. Copies data from new buffer to existing buffer if needed - * 3. Updates tensor to use existing mem_handle - * 4. Updates internal mappings - * 5. Frees the new buffer to avoid memory leak - * - * @param existing_ptr Pointer to the existing registered buffer - * @return true if reuse succeeded, false if buffer is no longer registered - */ - auto reuseExistingBuffer = [&](void* existing_ptr) -> bool { - auto fd_handle_iter = ptrToFdAndMemHandleMap_.find(existing_ptr); - if (fd_handle_iter == ptrToFdAndMemHandleMap_.end()) { return false; } - - Qnn_MemHandle_t existing_mem_handle = fd_handle_iter->second.second; - size_t existing_size = ptrToSizeMap_.count(existing_ptr) > 0 ? 
ptrToSizeMap_[existing_ptr] : 0; - - // If pointers differ, copy data from new buffer to existing buffer - // This handles the case where a new buffer was allocated but we want to reuse the old one - if (existing_ptr != ptr) { - size_t bytes_to_copy = total_bytes; - if (existing_size > 0) { bytes_to_copy = std::min(bytes_to_copy, existing_size); } - if (bytes_to_copy > 0) { std::memcpy(existing_ptr, ptr, bytes_to_copy); } - - // Free the new buffer since we're reusing the existing one - if (qnnMemPtrSet_.count(ptr) > 0) { - qnnMemPtrSet_.erase(ptr); - rpcmem_free(ptr); - } - storage->ptr_ = existing_ptr; - } - - // Update tensor to use existing mem_handle - QNN_TENSOR_SET_MEM_TYPE(qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); - QNN_TENSOR_SET_MEM_HANDLE(qnn_tensor, existing_mem_handle); - updateMappings(existing_ptr); - rememberLastRegistration(tensor_id, tensor_name, existing_ptr, existing_mem_handle, total_bytes); - return true; - }; - - // Level 1: Check if this exact buffer pointer is already registered - // This is the fastest path and handles the common case in decode phase + // if already registered, just set the mem handle if (ptrToFdAndMemHandleMap_.count(ptr) > 0) { Qnn_MemHandle_t mem_handle = ptrToFdAndMemHandleMap_[ptr].second; QNN_TENSOR_SET_MEM_TYPE(qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); QNN_TENSOR_SET_MEM_HANDLE(qnn_tensor, mem_handle); - updateMappings(ptr); - rememberLastRegistration(tensor_id, tensor_name, ptr, mem_handle, total_bytes); - return true; - } - - // Level 2: Check if we can reuse an existing buffer for the same tensor ID - // Tensor ID is the primary identifier and is more reliable than name - // This handles decode phase where the same tensor is used repeatedly - if (tensorIdToPtrMap_.count(tensor_id) > 0) { - void* existing_ptr = tensorIdToPtrMap_[tensor_id]; - QNN_ALLOCATOR_VERBOSE("Found existing mapping for tensor_id={}: existing_ptr={}", tensor_id, existing_ptr); - - if (existing_ptr == nullptr) { - // Mapping exists but buffer was freed, clean up and register new buffer - QNN_ALLOCATOR_VERBOSE("Existing mapping for tensor_id={} has nullptr ptr (buffer was freed), will register new buffer", - tensor_id); - tensorIdToPtrMap_.erase(tensor_id); - } else if (reuseExistingBuffer(existing_ptr)) { - return true; - } else { - // Buffer exists but is no longer registered, clean up mapping - MLLM_WARN("Existing ptr {} for tensor_id={} is no longer registered, removing from map", existing_ptr, tensor_id); - tensorIdToPtrMap_.erase(tensor_id); - } - } else { - QNN_ALLOCATOR_VERBOSE("No existing mapping found for tensor_id={}", tensor_id); - } - - // Level 3: Check by tensor name as fallback (in case ID changed or is 0) - // Some tensors may have ID=0, so name becomes the fallback identifier - if (tensor_name != "unknown" && tensorNameToPtrMap_.count(tensor_name) > 0) { - void* existing_ptr = tensorNameToPtrMap_[tensor_name]; - QNN_ALLOCATOR_VERBOSE("Found existing mapping for tensor_name={}: existing_ptr={}", tensor_name, existing_ptr); - - if (existing_ptr == nullptr) { - // Mapping exists but buffer was freed, clean up and register new buffer - QNN_ALLOCATOR_VERBOSE( - "Existing mapping for tensor_name={} has nullptr ptr (mem_handle was deRegistered), will register new buffer", - tensor_name); - tensorNameToPtrMap_.erase(tensor_name); - } else if (reuseExistingBuffer(existing_ptr)) { - return true; - } else { - // Buffer exists but is no longer registered, clean up mapping - MLLM_WARN("Existing ptr {} for tensor_name={} is no longer registered", existing_ptr, 
tensor_name); - tensorNameToPtrMap_.erase(tensor_name); - } + return; } // Get the file id of this memory space. @@ -365,227 +73,30 @@ bool QNNAllocator::registerQnnTensorToSharedBuffer(Storage* storage, Qnn_Tensor_ // Make qnn memory descriptor. Set ION. Qnn_MemDescriptor_t mem_descriptor = QNN_MEM_DESCRIPTOR_INIT; mem_descriptor.memShape = { - .numDim = rank, - .dimSize = dims_ptr, + .numDim = QNN_TENSOR_GET_RANK(qnn_tensor), + .dimSize = QNN_TENSOR_GET_DIMENSIONS(qnn_tensor), .shapeConfig = nullptr, }; - mem_descriptor.dataType = data_type; + mem_descriptor.dataType = QNN_TENSOR_GET_DATA_TYPE(qnn_tensor); mem_descriptor.memType = QNN_MEM_TYPE_ION; mem_descriptor.ionInfo.fd = mem_fd; QNN_TENSOR_SET_MEM_TYPE(qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); // Register to QNN memory Qnn_MemHandle_t mem_handle = QNN_TENSOR_GET_MEM_HANDLE(qnn_tensor); - auto status = qnnInterface_.memRegister(context_, &mem_descriptor, 1u, &mem_handle); - - // Attempt to register the buffer with QNN - // This can fail if: - // 1. QNN HTP device memory is exhausted (typically ~2.5GB limit) - // 2. FastRPC memory mapping fails - // 3. SMMU (System Memory Management Unit) mapping fails - if (status != QNN_SUCCESS) { - auto stats = getRegisteredBufferStats(); - MLLM_ERROR("QNNAllocator::registerQnnTensorToSharedBuffer memRegister failed, status=0x{:x}, ptr={}, fd={}, bytes={}, " - "shape={}, dtype={}, tensor_id={}, tensor_name={}", - status, ptr, mem_fd, total_bytes, shape_str, static_cast(mem_descriptor.dataType), tensor_id, tensor_name); - MLLM_ERROR("Current registered buffers: {} buffers, {} MB", stats.count, stats.total_bytes / (1024 * 1024)); - - // Multi-level fallback strategy when registration fails - // This is critical when QNN device memory is exhausted - bool fallback_success = false; - - // Fallback Level 1: Try to reuse buffer by tensor ID - if (tensorIdToPtrMap_.count(tensor_id) > 0) { - void* existing_ptr = tensorIdToPtrMap_[tensor_id]; - if (existing_ptr != nullptr) { - MLLM_WARN("Fallback: Reusing existing buffer by ID for tensor_id={}, tensor_name={}, old_ptr={}, new_ptr={}", tensor_id, - tensor_name, existing_ptr, ptr); - fallback_success = reuseExistingBuffer(existing_ptr); - } - } - - // Fallback Level 2: Try to reuse buffer by tensor name - if (!fallback_success && tensor_name != "unknown" && tensorNameToPtrMap_.count(tensor_name) > 0) { - void* existing_ptr = tensorNameToPtrMap_[tensor_name]; - if (existing_ptr != nullptr) { - MLLM_WARN("Fallback: Reusing existing buffer by name for tensor_id={}, tensor_name={}, old_ptr={}, new_ptr={}", - tensor_id, tensor_name, existing_ptr, ptr); - fallback_success = reuseExistingBuffer(existing_ptr); - } - } - - // Fallback Level 3: Try to reuse last successfully registered buffer - // This is a last resort when memory is exhausted and we can't find exact matches - if (!fallback_success && hasLastRegistrationInfo_) { - bool same_tensor_id = tensor_id != 0 && tensor_id == lastRegistrationInfo_.tensor_id; - bool same_tensor_name = - tensor_name != "unknown" && !tensor_name.empty() && tensor_name == lastRegistrationInfo_.tensor_name; - bool ptr_still_registered = - lastRegistrationInfo_.ptr != nullptr && ptrToFdAndMemHandleMap_.count(lastRegistrationInfo_.ptr) > 0; - if ((same_tensor_id || same_tensor_name) && ptr_still_registered) { - MLLM_WARN("Fallback: Reusing last successful buffer for tensor_id={}, tensor_name={}, old_ptr={}, new_ptr={}", - tensor_id, tensor_name, lastRegistrationInfo_.ptr, ptr); - fallback_success = 
reuseExistingBuffer(lastRegistrationInfo_.ptr); - } else { - MLLM_WARN("Fallback: Last registration info unusable for tensor_id={}, tensor_name={}, " - "same_tensor_id={}, same_tensor_name={}, ptr_registered={}", - tensor_id, tensor_name, same_tensor_id, same_tensor_name, ptr_still_registered); - } - } - - // If all fallbacks failed, we must free the buffer and return failure - // The caller should handle this gracefully (e.g., by retrying or using CPU fallback) - if (!fallback_success) { - MLLM_ERROR("QNNAllocator::registerQnnTensorToSharedBuffer: memRegister failed and fallback also failed. " - "Buffer ptr={} will be freed, tensor registration cannot proceed.", - ptr); - - if (qnnMemPtrSet_.count(ptr) > 0) { - qnnMemPtrSet_.erase(ptr); - rpcmem_free(ptr); - storage->ptr_ = nullptr; - eraseTensorMappingsForPtr(ptr, "register failure -> freed ptr"); - clearLastRegistrationIfMatches(ptr, "register failure -> freed ptr"); - QNN_ALLOCATOR_VERBOSE("QNNAllocator::registerQnnTensorToSharedBuffer: Freed ptr={} ({} bytes) after failure", ptr, - total_bytes); - } - - // Restore original tensor state - QNN_TENSOR_SET_MEM_HANDLE(qnn_tensor, original_mem_handle); - QNN_TENSOR_SET_MEM_TYPE(qnn_tensor, original_mem_type); - return false; - } - return true; - } else { - // Registration succeeded, log verbose information - QNN_ALLOCATOR_VERBOSE("Register shared buffer ptr={}, fd={}, bytes={}, shape={}, dtype={}, tensor_id={}, tensor_name={}", - ptr, mem_fd, total_bytes, shape_str, static_cast(mem_descriptor.dataType), tensor_id, - tensor_name); - } + MLLM_RT_ASSERT_EQ(QNN_SUCCESS, qnnInterface_.memRegister(context_, &mem_descriptor, 1u, &mem_handle)); QNN_TENSOR_SET_MEM_HANDLE(qnn_tensor, mem_handle); ptrToFdAndMemHandleMap_.insert({ptr, {mem_fd, mem_handle}}); - updateMappings(ptr); - rememberLastRegistration(tensor_id, tensor_name, ptr, mem_handle, total_bytes); - return true; } void QNNAllocator::deRegisterQnnTensorFromSharedBuffer(void* ptr) { - auto iter = ptrToFdAndMemHandleMap_.find(ptr); - if (iter == ptrToFdAndMemHandleMap_.end()) { return; } - - Qnn_ErrorHandle_t status = qnnInterface_.memDeRegister(&(iter->second.second), 1); - if (status != QNN_SUCCESS) { - MLLM_WARN("QNNAllocator::deRegisterQnnTensorFromSharedBuffer memDeRegister failed, status=0x{:x}, ptr={}, fd={}", status, - ptr, iter->second.first); - } - - ptrToFdAndMemHandleMap_.erase(iter); - ptrToSizeMap_.erase(ptr); - eraseTensorMappingsForPtr(ptr, "explicit deRegister"); - clearLastRegistrationIfMatches(ptr, "explicit deRegister"); + MLLM_RT_ASSERT_EQ(ptrToFdAndMemHandleMap_.count(ptr), 1); + MLLM_RT_ASSERT_EQ(QNN_SUCCESS, qnnInterface_.memDeRegister(&(ptrToFdAndMemHandleMap_[ptr].second), 1)); + ptrToFdAndMemHandleMap_.erase(ptr); } -QNNAllocator::BufferStats QNNAllocator::getRegisteredBufferStats() const { - BufferStats stats{}; - stats.count = ptrToFdAndMemHandleMap_.size(); - stats.total_bytes = 0; - - for (const auto& [ptr, size] : ptrToSizeMap_) { stats.total_bytes += size; } - - return stats; -} - -bool QNNAllocator::isRegistered(void* ptr) const { return ptrToFdAndMemHandleMap_.count(ptr) > 0; } - -size_t QNNAllocator::getRegisteredBufferSize(void* ptr) const { - auto it = ptrToSizeMap_.find(ptr); - if (it == ptrToSizeMap_.end()) { return 0; } - return it->second; -} - -/** - * @brief Erase all tensor ID and name mappings that point to a specific buffer pointer - * - * When a buffer is freed or de-registered, we need to clean up all mappings that reference it. - * This ensures that future lookups won't find stale pointers. 
- * - * @param ptr The buffer pointer to remove from mappings - * @param reason Reason for erasure (for debugging/logging purposes) - */ -void QNNAllocator::eraseTensorMappingsForPtr(void* ptr, std::string_view reason) { - if (ptr == nullptr) { return; } - - // Remove all tensor ID mappings that point to this ptr - for (auto it = tensorIdToPtrMap_.begin(); it != tensorIdToPtrMap_.end();) { - if (it->second == ptr) { - it = tensorIdToPtrMap_.erase(it); - } else { - ++it; - } - } - - // Remove all tensor name mappings that point to this ptr - for (auto it = tensorNameToPtrMap_.begin(); it != tensorNameToPtrMap_.end();) { - if (it->second == ptr) { - it = tensorNameToPtrMap_.erase(it); - } else { - ++it; - } - } -} - -/** - * @brief Remember the last successful buffer registration for fallback purposes - * - * This function stores information about the most recent successful registration. - * This information is used as a last-resort fallback when: - * 1. New registration fails (e.g., memory exhausted) - * 2. Exact tensor ID/name matches are not found - * 3. The last registered buffer is still valid and matches the tensor - * - * This is particularly useful in decode phase where memory pressure is high - * and we want to maximize buffer reuse. - * - * @param tensor_id Tensor ID of the registered tensor - * @param tensor_name Tensor name of the registered tensor - * @param ptr Buffer pointer that was successfully registered - * @param mem_handle QNN memory handle from successful registration - * @param total_bytes Size of the registered buffer in bytes - */ -void QNNAllocator::rememberLastRegistration(uint32_t tensor_id, const std::string& tensor_name, void* ptr, - Qnn_MemHandle_t mem_handle, size_t total_bytes) { - if (ptr == nullptr || mem_handle == nullptr) { return; } - lastRegistrationInfo_.tensor_id = tensor_id; - lastRegistrationInfo_.tensor_name = tensor_name; - lastRegistrationInfo_.ptr = ptr; - lastRegistrationInfo_.mem_handle = mem_handle; - lastRegistrationInfo_.bytes = total_bytes; - hasLastRegistrationInfo_ = true; - // Note: Remembered registration info is used as fallback mechanism, logging removed for performance -} - -/** - * @brief Clear the last registration info if it matches the given pointer - * - * When a buffer is freed or de-registered, we should clear the last registration - * info if it references that buffer. This prevents using stale registration info - * in future fallback attempts. 
- * - * @param ptr The buffer pointer to check against - * @param reason Reason for clearing (for debugging/logging purposes) - */ -void QNNAllocator::clearLastRegistrationIfMatches(void* ptr, std::string_view reason) { - if (!hasLastRegistrationInfo_ || ptr == nullptr) { return; } - if (lastRegistrationInfo_.ptr == ptr) { - lastRegistrationInfo_ = {}; - hasLastRegistrationInfo_ = false; - } -} - -#undef QNN_ALLOCATOR_VERBOSE - std::shared_ptr createQNNAllocator() { return std::make_shared(); } } // namespace mllm::qnn \ No newline at end of file diff --git a/mllm/backends/qnn/QNNAllocator.hpp b/mllm/backends/qnn/QNNAllocator.hpp index 9f8df9376..eac40a534 100644 --- a/mllm/backends/qnn/QNNAllocator.hpp +++ b/mllm/backends/qnn/QNNAllocator.hpp @@ -3,9 +3,8 @@ #pragma once -#include #include -#include +#include #include "QnnCommon.h" #include "QnnInterface.h" #include "mllm/backends/base/Allocator.hpp" @@ -31,7 +30,14 @@ class QNNAllocator final : public Allocator { QNNAllocator(); // need to setQNNPointer afterward QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context); - ~QNNAllocator(); + ~QNNAllocator() { + for (auto iter = ptrToFdAndMemHandleMap_.begin(); iter != ptrToFdAndMemHandleMap_.end();) { + Qnn_ErrorHandle_t deregisterRet = qnnInterface_.memDeRegister(&iter->second.second, 1); + if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("~QNNAllocator: qnnInterface_.memDeRegister failed"); } + rpcmem_free(iter->first); + iter = ptrToFdAndMemHandleMap_.erase(iter); + } + } void setQNNPointer(QNN_INTERFACE_VER_TYPE qnnInterface, void* context) { this->qnnInterface_ = qnnInterface; @@ -69,21 +75,10 @@ class QNNAllocator final : public Allocator { // Sharing access in between processing domains in QNN HTP backend. Using shared buffers can // eliminate data copy in between client code on the host CPU and HTP accelerator. 
- bool registerQnnTensorToSharedBuffer(Storage* storage, Qnn_Tensor_t& qnn_tensor); + void registerQnnTensorToSharedBuffer(void* ptr, Qnn_Tensor_t& qnn_tensor); void deRegisterQnnTensorFromSharedBuffer(void* ptr); - // Debug: Get statistics about registered buffers - struct BufferStats { - size_t count; - size_t total_bytes; - }; - [[nodiscard]] BufferStats getRegisteredBufferStats() const; - - // Debug: Check if a ptr is already registered - bool isRegistered(void* ptr) const; - [[nodiscard]] size_t getRegisteredBufferSize(void* ptr) const; - private: QNN_INTERFACE_VER_TYPE qnnInterface_; Qnn_ContextHandle_t context_ = nullptr; @@ -95,63 +90,6 @@ class QNNAllocator final : public Allocator { // to check if the ptr is allocted by rpcmem_alloc std::set qnnMemPtrSet_; std::map> ptrToFdAndMemHandleMap_; - // Track buffer sizes for statistics - std::map ptrToSizeMap_; - // Map tensor name to registered buffer ptr for reuse (fallback identifier) - // Used when tensor ID is 0 or unavailable - std::map tensorNameToPtrMap_; - - // Map tensor ID to registered buffer ptr for reuse (primary identifier) - // Tensor ID is more reliable than name and is used as the primary lookup key - // This enables buffer reuse across prefill and decode phases - std::map tensorIdToPtrMap_; - - /** - * @brief Information about the last successful buffer registration - * - * This structure stores metadata about the most recent successful registration, - * which is used as a last-resort fallback when: - * - New registration fails (e.g., memory exhausted) - * - Exact tensor ID/name matches are not found - * - The last registered buffer is still valid and matches the tensor - * - * This is particularly useful in decode phase where memory pressure is high. - */ - struct LastRegistrationInfo { - uint32_t tensor_id = 0; // Tensor ID of the registered tensor - std::string tensor_name; // Tensor name of the registered tensor - void* ptr = nullptr; // Buffer pointer that was successfully registered - Qnn_MemHandle_t mem_handle = nullptr; // QNN memory handle from successful registration - size_t bytes = 0; // Size of the registered buffer in bytes - }; - - LastRegistrationInfo lastRegistrationInfo_{}; // Last successful registration info - bool hasLastRegistrationInfo_ = false; // Whether last registration info is valid - - /** - * @brief Erase all tensor ID and name mappings that point to a specific buffer pointer - * @param ptr The buffer pointer to remove from mappings - * @param reason Reason for erasure (for debugging/logging purposes) - */ - void eraseTensorMappingsForPtr(void* ptr, std::string_view reason); - - /** - * @brief Remember the last successful buffer registration for fallback purposes - * @param tensor_id Tensor ID of the registered tensor - * @param tensor_name Tensor name of the registered tensor - * @param ptr Buffer pointer that was successfully registered - * @param mem_handle QNN memory handle from successful registration - * @param total_bytes Size of the registered buffer in bytes - */ - void rememberLastRegistration(uint32_t tensor_id, const std::string& tensor_name, void* ptr, Qnn_MemHandle_t mem_handle, - size_t total_bytes); - - /** - * @brief Clear the last registration info if it matches the given pointer - * @param ptr The buffer pointer to check against - * @param reason Reason for clearing (for debugging/logging purposes) - */ - void clearLastRegistrationIfMatches(void* ptr, std::string_view reason); }; std::shared_ptr createQNNAllocator(); diff --git a/mllm/backends/qnn/QNNUtils.cpp 
b/mllm/backends/qnn/QNNUtils.cpp index f4517637f..7a3670624 100644 --- a/mllm/backends/qnn/QNNUtils.cpp +++ b/mllm/backends/qnn/QNNUtils.cpp @@ -298,6 +298,129 @@ bool freeQnnTensors(Qnn_Tensor_t*& tensors, uint32_t numTensors) { } // --------------- QNN Wrapper --------------- + +Qnn_DataType_t mllmDataTypeToQnnDataType(DataTypes dtype) { + Qnn_DataType_t ret = QNN_DATATYPE_UNDEFINED; + switch (dtype) { + case kInt8: { + ret = QNN_DATATYPE_INT_8; + break; + } + case kInt16: { + ret = QNN_DATATYPE_INT_16; + break; + } + case kInt32: { + ret = QNN_DATATYPE_INT_32; + break; + } + case kInt64: { + ret = QNN_DATATYPE_INT_64; + break; + } + case kUInt8: { + ret = QNN_DATATYPE_UINT_8; + break; + } + case kUInt16: { + ret = QNN_DATATYPE_UINT_16; + break; + } + case kUInt32: { + ret = QNN_DATATYPE_UINT_32; + break; + } + case kUInt64: { + ret = QNN_DATATYPE_UINT_64; + break; + } + case kFloat16: { + ret = QNN_DATATYPE_FLOAT_16; + break; + } + case kFloat32: { + ret = QNN_DATATYPE_FLOAT_32; + break; + } + // case kBFloat16: { + // ret = QNN_DATATYPE_BFLOAT_16; + // break; + // } + // FIXME: Maybe error here. + case kInt4: { + ret = QNN_DATATYPE_SFIXED_POINT_4; + break; + } + case kUInt4: { + ret = QNN_DATATYPE_UFIXED_POINT_4; + break; + } + case kInt8PerTensorSym: + case kInt8PerTensorAsy: + case kInt8PerChannelAsy: + case kInt8PerChannelSym: { + ret = QNN_DATATYPE_SFIXED_POINT_8; + break; + } + case kUInt8PerTensorSym: + case kUInt8PerTensorAsy: + case kUInt8PerChannelAsy: + case kUInt8PerChannelSym: { + ret = QNN_DATATYPE_UFIXED_POINT_8; + break; + } + case kInt16PerTensorSym: + case kInt16PerTensorAsy: + case kInt16PerChannelSym: + case kInt16PerChannelAsy: { + ret = QNN_DATATYPE_SFIXED_POINT_16; + break; + } + case kUInt16PerTensorSym: + case kUInt16PerTensorAsy: + case kUInt16PerChannelSym: + case kUInt16PerChannelAsy: { + ret = QNN_DATATYPE_UFIXED_POINT_16; + break; + } + default: { + MLLM_ERROR("Can't parse datatype: {}", nameOfType(dtype)); + ret = QNN_DATATYPE_UNDEFINED; + } + } + return ret; +} + +size_t qnnDataTypeToSize(Qnn_DataType_t dtype) { + switch (dtype) { + case QNN_DATATYPE_INT_8: + case QNN_DATATYPE_UINT_8: + case QNN_DATATYPE_BOOL_8: + case QNN_DATATYPE_SFIXED_POINT_8: + case QNN_DATATYPE_UFIXED_POINT_8: return 1; + + case QNN_DATATYPE_INT_16: + case QNN_DATATYPE_UINT_16: + case QNN_DATATYPE_FLOAT_16: + case QNN_DATATYPE_SFIXED_POINT_16: + case QNN_DATATYPE_UFIXED_POINT_16: return 2; + + case QNN_DATATYPE_INT_32: + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_FLOAT_32: + case QNN_DATATYPE_SFIXED_POINT_32: + case QNN_DATATYPE_UFIXED_POINT_32: return 4; + + case QNN_DATATYPE_INT_64: + case QNN_DATATYPE_UINT_64: return 8; + + default: + MLLM_ERROR("qnnDataTypeToSize: unsupported Qnn_DataType_t {}", static_cast(dtype)); + MLLM_RT_ASSERT(false); + return 0; + } +} + QNNTensorWrapper::QNNTensorWrapper(const std::string& name, Qnn_TensorType_t type, Qnn_DataType_t dataType, const std::vector& dimensions, Qnn_QuantizeParams_t quantize) { name_ = name; @@ -328,16 +451,7 @@ std::shared_ptr QNNTensorWrapper::create(const std::string& na MLLM_RT_ASSERT(!name.empty()); if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); } - Qnn_DataType_t dataType = QNN_DATATYPE_UNDEFINED; - switch (tensor.dtype()) { - case kFloat32: dataType = QNN_DATATYPE_FLOAT_32; break; - case kFloat16: dataType = QNN_DATATYPE_FLOAT_16; break; - case kInt8: dataType = QNN_DATATYPE_SFIXED_POINT_8; break; - case kInt16: dataType = QNN_DATATYPE_SFIXED_POINT_16; break; - case kInt32: dataType = 
QNN_DATATYPE_SFIXED_POINT_32; break; - case kUInt8: dataType = QNN_DATATYPE_UFIXED_POINT_8; break; - default: MLLM_ERROR("Unsupported tensor element type for QNN: {}", (int)tensor.dtype()); break; - } + Qnn_DataType_t dataType = mllmDataTypeToQnnDataType(tensor.dtype()); std::vector dimensions(tensor.rank()); for (int i = 0; i < tensor.rank(); i++) { dimensions[i] = tensor.shape()[i]; } @@ -371,71 +485,45 @@ std::shared_ptr QNNTensorWrapper::createStaticTensor(const std } void QNNTensorWrapper::alloc() { + if (isAlloc_) { + MLLM_WARN("Tensor {} has already been allocated.", name_); + return; + } MLLM_RT_ASSERT(dataContainer_.device() == kQNN); - void* currentPtr = dataContainer_.impl()->ptr(); - if (!currentPtr) { - dataContainer_.alloc(); - currentPtr = dataContainer_.ptr(); - } + // if storage is not allocated, allocate it + // or, register the existing storage to QNN(passing allocated input to QNN) + if (!dataContainer_.impl()->ptr()) { dataContainer_.alloc(); } - auto allocator = std::static_pointer_cast(Context::instance().getBackend(kQNN)->allocator()); - - auto storage = dataContainer_.impl()->storage(); - MLLM_RT_ASSERT(storage != nullptr); - - size_t requiredBytes = dataContainer_.bytes(); - - // Check if we have a previously registered buffer pointer - // This handles the case where tensor dimensions change (e.g., in decode phase) - // and the existing registered buffer is too small - if (registeredPtr_) { - // Verify that the registered buffer is still valid - if (!allocator->isRegistered(registeredPtr_)) { - // Buffer was de-registered, clear the reference - registeredPtr_ = nullptr; - isAlloc_ = false; - } else { - // Check if the registered buffer is large enough for current requirements - // If not, we need to de-register it and allocate a new one - size_t registeredBytes = allocator->getRegisteredBufferSize(registeredPtr_); - if (registeredBytes > 0 && registeredBytes < requiredBytes) { - // Registered buffer is too small, de-register it - // A new buffer will be allocated and registered below - allocator->deRegisterQnnTensorFromSharedBuffer(registeredPtr_); - registeredPtr_ = nullptr; - isAlloc_ = false; - } - } - } + std::static_pointer_cast(Context::instance().getBackend(kQNN)->allocator()) + ->registerQnnTensorToSharedBuffer(dataContainer_.ptr(), qnnTensor_); - if (registeredPtr_ && registeredPtr_ != storage->ptr_) { - if (!allocator->isRegistered(registeredPtr_)) { - registeredPtr_ = nullptr; - } else { - void* freshPtr = storage->ptr_; - size_t bytesToCopy = dataContainer_.bytes(); - if (freshPtr && bytesToCopy > 0) { std::memcpy(registeredPtr_, freshPtr, bytesToCopy); } - if (freshPtr) { allocator->free(storage.get()); } - storage->ptr_ = registeredPtr_; - currentPtr = registeredPtr_; - } - } + isAlloc_ = true; +} + +void QNNTensorWrapper::setScaleOffsetQuantization(const std::vector& scaleOffsets, int32_t axis) { + scaleOffsets_ = scaleOffsets; + qnnTensor_.v2.quantizeParams.encodingDefinition = QNN_DEFINITION_DEFINED; + qnnTensor_.v2.quantizeParams.quantizationEncoding = QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET; + qnnTensor_.v2.quantizeParams.axisScaleOffsetEncoding = Qnn_AxisScaleOffset_t{ + .axis = axis, + .numScaleOffsets = (uint32_t)scaleOffsets_.size(), + .scaleOffset = scaleOffsets_.data(), + }; +} - if (isAlloc_ && registeredPtr_ == currentPtr) { return; } +void QNNTensorWrapper::setBlockwiseQuantization(const Qnn_BlockwiseExpansion_t& blockwise, + const std::vector& scaleOffsets) { + scaleOffsets_ = scaleOffsets; + blockwiseExpansion_ = blockwise; - if 
(!allocator->registerQnnTensorToSharedBuffer(storage.get(), qnnTensor_)) { - MLLM_ERROR("QNNTensorWrapper::alloc failed to register shared buffer for tensor {}", name_); - // Fail fast: prevent executing graph with invalid mem handle - MLLM_RT_ASSERT(false); - } + blockwiseExpansion_.scaleOffsets = scaleOffsets_.data(); - registeredPtr_ = storage->ptr_; - isAlloc_ = true; + qnnTensor_.v2.quantizeParams.encodingDefinition = QNN_DEFINITION_DEFINED; + qnnTensor_.v2.quantizeParams.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION; + qnnTensor_.v2.quantizeParams.blockwiseExpansion = &blockwiseExpansion_; } -void QNNTensorWrapper::resetAlloc() { isAlloc_ = false; } - void QNNTensorWrapper::initFromQnnTensor(Qnn_Tensor_t* qnnTensor) { if (qnnTensor == nullptr) { MLLM_ERROR("QNNTensorWrapper::setQnnTensor() received nullptr"); @@ -493,7 +581,7 @@ QNNParamTensorWrapper::~QNNParamTensorWrapper() { free(QNN_TENSOR_GET_CLIENT_BUF(qnnParam_.tensorParam).data); } void* QNNParamTensorWrapper::alloc() { - uint32_t dataSize = QNNDataTypeToSize.find(QNN_TENSOR_GET_DATA_TYPE(qnnParam_.tensorParam))->second; + uint32_t dataSize = qnnDataTypeToSize(QNN_TENSOR_GET_DATA_TYPE(qnnParam_.tensorParam)); for (int i = 0; i < QNN_TENSOR_GET_RANK(qnnParam_.tensorParam); i++) { dataSize *= qnnParam_.tensorParam.v2.dimensions[i]; } Qnn_ClientBuffer_t clientBuffer = {.data = malloc(dataSize), .dataSize = dataSize}; QNN_TENSOR_SET_CLIENT_BUF(qnnParam_.tensorParam, clientBuffer); diff --git a/mllm/backends/qnn/QNNUtils.hpp b/mllm/backends/qnn/QNNUtils.hpp index 99695e784..a369fcc56 100644 --- a/mllm/backends/qnn/QNNUtils.hpp +++ b/mllm/backends/qnn/QNNUtils.hpp @@ -20,7 +20,7 @@ class TensorValue; * @brief Utility functions for working with QNN tensors and QNN graphInfo structures. * @note It will NOT perform QNN checks, such as tensor version checks, etc. * Currently, QNN tensor v1 and v2 are compatible for common variables. 
- * Future modifications should refer to $QNN_SDK_ROOT/examples/QNN/SampleApp + * Future modifications should refer to $QAIRT_SDK_ROOT/examples/QNN/SampleApp */ namespace mllm::qnn { @@ -49,19 +49,13 @@ bool loadQNNSystemSymbol(); // --------------- End of QNN symbols loading --------------- +Qnn_DataType_t mllmDataTypeToQnnDataType(DataTypes dtype); +size_t qnnDataTypeToSize(Qnn_DataType_t dtype); + #define DEFAULT_QUANTIZE_PARAMS \ (Qnn_QuantizeParams_t{ \ QNN_DEFINITION_UNDEFINED, QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0f, .offset = 0}}}) -const std::map QNNDataTypeToSize = { - {QNN_DATATYPE_INT_8, 1}, {QNN_DATATYPE_INT_16, 2}, {QNN_DATATYPE_INT_32, 4}, - {QNN_DATATYPE_INT_64, 8}, {QNN_DATATYPE_UINT_8, 1}, {QNN_DATATYPE_UINT_16, 2}, - {QNN_DATATYPE_UINT_32, 4}, {QNN_DATATYPE_UINT_64, 8}, {QNN_DATATYPE_FLOAT_16, 2}, - {QNN_DATATYPE_FLOAT_32, 4}, {QNN_DATATYPE_BOOL_8, 1}, {QNN_DATATYPE_SFIXED_POINT_8, 1}, - {QNN_DATATYPE_SFIXED_POINT_16, 2}, {QNN_DATATYPE_SFIXED_POINT_32, 4}, {QNN_DATATYPE_UFIXED_POINT_8, 1}, - {QNN_DATATYPE_UFIXED_POINT_16, 2}, {QNN_DATATYPE_UFIXED_POINT_32, 4}, -}; - // Utils for copying metadata to GraphInfo using GraphInfo_t = struct GraphInfo { Qnn_GraphHandle_t graph; @@ -207,11 +201,13 @@ class QNNTensorWrapper { // alloc graph input/output tensor memory in QNN shared buffer void alloc(); - // reset allocation flag when dataContainer is updated - void resetAlloc(); Tensor& getDataContainer() { return dataContainer_; } const std::vector* getDimension() { return &dimensions_; } + // Helper to set complex quantization params and manage memory + void setScaleOffsetQuantization(const std::vector& scaleOffsets, int32_t axis); + void setBlockwiseQuantization(const Qnn_BlockwiseExpansion_t& blockwise, const std::vector& scaleOffsets); + private: std::string name_; std::vector dimensions_; @@ -219,6 +215,10 @@ class QNNTensorWrapper { Qnn_Tensor_t qnnTensor_; bool isAlloc_ = false; void* registeredPtr_ = nullptr; + + // Storage for quantization parameters to ensure lifetime matches the tensor wrapper + std::vector scaleOffsets_; + Qnn_BlockwiseExpansion_t blockwiseExpansion_; }; class QNNParamTensorWrapper { diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp index 0f29498f5..c7afa9776 100644 --- a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp +++ b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp @@ -2,169 +2,34 @@ // Licensed under the MIT License. 
#include -#include +#include -#include -#include -#include -#include -#include +#include +#include +#include +#include -#include "mllm/utils/Common.hpp" +#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp" #include "mllm/core/DataTypes.hpp" +#include "mllm/utils/Common.hpp" #include "mllm/backends/qnn/QNNTypeMacros.hpp" #include "mllm/compile/ir/linalg/Attribute.hpp" #include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp" #include "mllm/backends/qnn/aot/QnnTargetMachine.hpp" -#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp" +#include "mllm/backends/qnn/QNNUtils.hpp" +#include "mllm/utils/Log.hpp" namespace mllm::qnn::aot { -void __mllmLoggerCallback4QnnLogger(const char* fmt, QnnLog_Level_t level, uint64_t times_tamp, va_list argp) { - const char* level_str = ""; - switch (level) { - case QNN_LOG_LEVEL_ERROR: level_str = "[ERROR] "; break; - case QNN_LOG_LEVEL_WARN: level_str = "[WARN] "; break; - case QNN_LOG_LEVEL_INFO: level_str = "[INFO] "; break; - case QNN_LOG_LEVEL_DEBUG: level_str = "[DEBUG] "; break; - case QNN_LOG_LEVEL_VERBOSE: level_str = "[VERBOSE]"; break; - case QNN_LOG_LEVEL_MAX: level_str = "[UNKNOWN]"; break; - } - - double ms = (double)times_tamp / 1000000.0; - - { - fprintf(stdout, "QnnLogger(%8.1fms, %ld) %s: ", ms, times_tamp, level_str); - vfprintf(stdout, fmt, argp); - } -} - -size_t QnnAOTDataTypeSize(Qnn_DataType_t dtype) { - switch (dtype) { - case QNN_DATATYPE_INT_8: - case QNN_DATATYPE_UINT_8: - case QNN_DATATYPE_BOOL_8: - case QNN_DATATYPE_SFIXED_POINT_8: - case QNN_DATATYPE_UFIXED_POINT_8: return 1; - - case QNN_DATATYPE_INT_16: - case QNN_DATATYPE_UINT_16: - case QNN_DATATYPE_FLOAT_16: - case QNN_DATATYPE_SFIXED_POINT_16: - case QNN_DATATYPE_UFIXED_POINT_16: return 2; - - case QNN_DATATYPE_INT_32: - case QNN_DATATYPE_UINT_32: - case QNN_DATATYPE_FLOAT_32: - case QNN_DATATYPE_SFIXED_POINT_32: - case QNN_DATATYPE_UFIXED_POINT_32: return 4; - - case QNN_DATATYPE_INT_64: - case QNN_DATATYPE_UINT_64: return 8; - - default: - MLLM_ERROR("QnnAOTDataTypeSize: unsupported Qnn_DataType_t {}", static_cast(dtype)); - MLLM_RT_ASSERT(false); - return 0; - } -} - -QnnAOTParamScalar::QnnAOTParamScalar(const std::string& name, bool value) { - name_ = name; - qnn_param_.paramType = QNN_PARAMTYPE_SCALAR; - qnn_param_.name = name_.c_str(); - qnn_param_.scalarParam.dataType = QNN_DATATYPE_BOOL_8; - qnn_param_.scalarParam.bool8Value = static_cast(value); -} - -QnnAOTParamScalar::QnnAOTParamScalar(const std::string& name, uint32_t value) { - name_ = name; - qnn_param_.paramType = QNN_PARAMTYPE_SCALAR; - qnn_param_.name = name_.c_str(); - qnn_param_.scalarParam.dataType = QNN_DATATYPE_UINT_32; - qnn_param_.scalarParam.uint32Value = value; -} - -QnnAOTParamScalar::QnnAOTParamScalar(const std::string& name, float value) { - name_ = name; - qnn_param_.paramType = QNN_PARAMTYPE_SCALAR; - qnn_param_.name = name_.c_str(); - qnn_param_.scalarParam.dataType = QNN_DATATYPE_FLOAT_32; - qnn_param_.scalarParam.floatValue = value; -} - -Qnn_Param_t* QnnAOTParamScalar::getQnnParam() { return &(qnn_param_); } - -QnnAOTParamTensor::QnnAOTParamTensor(const std::string& param_name, const std::string& tensor_name, Qnn_DataType_t data_type, - const std::vector& dimensions) { - param_name_ = param_name; - tensor_name_ = tensor_name; - dimensions_ = dimensions; - // Fix parameters. 
- qnn_param_.paramType = QNN_PARAMTYPE_TENSOR; - qnn_param_.tensorParam.version = QNN_TENSOR_VERSION_2; - qnn_param_.tensorParam.v2 = QNN_TENSOR_V2_INIT; - qnn_param_.tensorParam.v2.type = QNN_TENSOR_TYPE_STATIC; - qnn_param_.tensorParam.v2.dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; - qnn_param_.tensorParam.v2.quantizeParams = Qnn_QuantizeParams_t{ - QNN_DEFINITION_UNDEFINED, QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0f, .offset = 0}}}; - qnn_param_.tensorParam.v2.memType = QNN_TENSORMEMTYPE_RAW; - // Custom parameters. - qnn_param_.name = param_name_.c_str(); - qnn_param_.tensorParam.v2.name = tensor_name_.c_str(); - qnn_param_.tensorParam.v2.dataType = data_type; - qnn_param_.tensorParam.v2.rank = dimensions_.size(); - qnn_param_.tensorParam.v2.dimensions = dimensions_.data(); - qnn_param_.tensorParam.v2.clientBuf = {.data = nullptr, .dataSize = 0}; -} - -QnnAOTParamTensor::~QnnAOTParamTensor() { - auto data = QNN_TENSOR_GET_CLIENT_BUF(qnn_param_.tensorParam).data; - MLLM_RT_ASSERT(data != nullptr); - if (data) { free(data); } -} - -void* QnnAOTParamTensor::alloc() { - uint32_t data_size = QnnAOTDataTypeSize(QNN_TENSOR_GET_DATA_TYPE(qnn_param_.tensorParam)); - for (int i = 0; i < QNN_TENSOR_GET_RANK(qnn_param_.tensorParam); i++) { - data_size *= qnn_param_.tensorParam.v2.dimensions[i]; - } - Qnn_ClientBuffer_t clientBuffer = {.data = malloc(data_size), .dataSize = data_size}; - QNN_TENSOR_SET_CLIENT_BUF(qnn_param_.tensorParam, clientBuffer); - MLLM_RT_ASSERT(QNN_TENSOR_GET_CLIENT_BUF(qnn_param_.tensorParam).data != nullptr); - return QNN_TENSOR_GET_CLIENT_BUF(qnn_param_.tensorParam).data; -} - -Qnn_Param_t* QnnAOTParamTensor::getQnnParam() { return &qnn_param_; } - -Qnn_Tensor_t* QnnAOTParamTensor::getQnnTensor() { return &qnn_param_.tensorParam; } - QnnAOTNodeTensor::QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight) { - name_ = v->name(); - mllm_tensor_ = v->tensor_; - quant_spec_ = v->getAttr("quant_recipe")->cast_()->spec_; - for (auto s : v->tensor_.shape()) { shape_.emplace_back(s); } - - qnn_tensor_.version = QNN_TENSOR_VERSION_2; - qnn_tensor_.v2 = QNN_TENSOR_V2_INIT; - qnn_tensor_.v2.id = v->tensor_.uuid(); - qnn_tensor_.v2.name = name_.c_str(); - qnn_tensor_.v2.type = parseQnnTensorTypeFromIR(v); - qnn_tensor_.v2.dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; - qnn_tensor_.v2.dataType = parseQnnDataTypeFromIR(v); - qnn_tensor_.v2.quantizeParams = parseQnnQuantizeParamFromIR(v); - qnn_tensor_.v2.rank = (uint32_t)v->tensor_.rank(); - qnn_tensor_.v2.dimensions = shape_.data(); - qnn_tensor_.v2.isDynamicDimensions = nullptr; - qnn_tensor_.v2.sparseParams = QNN_SPARSE_PARAMS_INIT; - qnn_tensor_.v2.isProduced = 0u; - - if (force_static_weight) { - qnn_tensor_.v2.memType = QNN_TENSORMEMTYPE_RAW; - qnn_tensor_.v2.clientBuf = { - .data = (void*)mllm_tensor_.ptr(), - .dataSize = (uint32_t)mllm_tensor_.bytes(), - }; + auto type = parseQnnTensorTypeFromIR(v); + auto name = v->name(); + auto quant = parseQnnQuantizeParamFromIR(v); + + if (force_static_weight || type == QNN_TENSOR_TYPE_STATIC) { + tensor_wrapper_ = mllm::qnn::QNNTensorWrapper::createStaticTensor(name, v->tensor_, quant); + } else { + tensor_wrapper_ = mllm::qnn::QNNTensorWrapper::create(name, type, v->tensor_, quant); } } @@ -232,95 +97,7 @@ Qnn_TensorType_t QnnAOTNodeTensor::parseQnnTensorTypeFromIR(const ir::tensor::Te } Qnn_DataType_t QnnAOTNodeTensor::parseQnnDataTypeFromIR(const ir::tensor::TensorValue::ptr_t& v) { - Qnn_DataType_t ret = 
QNN_DATATYPE_UNDEFINED; - switch (v->tensor_.dtype()) { - case kInt8: { - ret = QNN_DATATYPE_INT_8; - break; - } - case kInt16: { - ret = QNN_DATATYPE_INT_16; - break; - } - case kInt32: { - ret = QNN_DATATYPE_INT_32; - break; - } - case kInt64: { - ret = QNN_DATATYPE_INT_64; - break; - } - case kUInt8: { - ret = QNN_DATATYPE_UINT_8; - break; - } - case kUInt16: { - ret = QNN_DATATYPE_UINT_16; - break; - } - case kUInt32: { - ret = QNN_DATATYPE_UINT_32; - break; - } - case kUInt64: { - ret = QNN_DATATYPE_UINT_64; - break; - } - case kFloat16: { - ret = QNN_DATATYPE_FLOAT_16; - break; - } - case kFloat32: { - ret = QNN_DATATYPE_FLOAT_32; - break; - } - case kBFloat16: { - ret = QNN_DATATYPE_BFLOAT_16; - break; - } - // FIXME: Maybe error here. - case kInt4: { - ret = QNN_DATATYPE_SFIXED_POINT_4; - break; - } - case kUInt4: { - ret = QNN_DATATYPE_UFIXED_POINT_4; - break; - } - case kInt8PerTensorSym: - case kInt8PerTensorAsy: - case kInt8PerChannelAsy: - case kInt8PerChannelSym: { - ret = QNN_DATATYPE_SFIXED_POINT_8; - break; - } - case kUInt8PerTensorSym: - case kUInt8PerTensorAsy: - case kUInt8PerChannelAsy: - case kUInt8PerChannelSym: { - ret = QNN_DATATYPE_UFIXED_POINT_8; - break; - } - case kInt16PerTensorSym: - case kInt16PerTensorAsy: - case kInt16PerChannelSym: - case kInt16PerChannelAsy: { - ret = QNN_DATATYPE_SFIXED_POINT_16; - break; - } - case kUInt16PerTensorSym: - case kUInt16PerTensorAsy: - case kUInt16PerChannelSym: - case kUInt16PerChannelAsy: { - ret = QNN_DATATYPE_UFIXED_POINT_16; - break; - } - default: { - MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't parse datatype: {}", nameOfType(v->tensor_.dtype())); - ret = QNN_DATATYPE_UNDEFINED; - } - } - return ret; + return mllm::qnn::mllmDataTypeToQnnDataType(v->tensor_.dtype()); } std::string QnnAOTNodeTensor::parseQnnTensorNameFromIR(const ir::tensor::TensorValue::ptr_t& v) { return v->name(); } @@ -347,22 +124,15 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten // Prepare data auto num_scale_offsets = (uint32_t)v->tensor_.size(cfg->ch_axis); - Qnn_ScaleOffset_t* scale_array = (Qnn_ScaleOffset_t*)malloc(sizeof(Qnn_ScaleOffset_t) * num_scale_offsets); + std::vector scale_offsets(num_scale_offsets); MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale.size(0)); MLLM_RT_ASSERT_EQ(cfg->scale.dtype(), kFloat32); for (int i = 0; i < num_scale_offsets; ++i) { - scale_array[i].scale = cfg->scale.at({i}); - scale_array[i].offset = 0; + scale_offsets[i].scale = cfg->scale.at({i}); + scale_offsets[i].offset = 0; } - unreachable_handle_.emplace_back(scale_array); - ret.encodingDefinition = QNN_DEFINITION_DEFINED; - ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET; - ret.axisScaleOffsetEncoding = Qnn_AxisScaleOffset_t{ - .axis = cfg->ch_axis, - .numScaleOffsets = num_scale_offsets, - .scaleOffset = scale_array, - }; + tensor_wrapper_->setScaleOffsetQuantization(scale_offsets, cfg->ch_axis); break; } case ir::linalg::QuantizationSpecType::kSymPerBlock: @@ -376,27 +146,24 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten // Prepare data auto num_scale_offsets = (uint32_t)v->tensor_.size(cfg->ch_axis); - Qnn_ScaleOffset_t* scale_array = (Qnn_ScaleOffset_t*)malloc(sizeof(Qnn_ScaleOffset_t) * num_scale_offsets); + std::vector scale_offsets(num_scale_offsets); MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale_level_1_fp.size(0)); MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.dtype(), kUInt8); for (int i = 0; i < num_scale_offsets; ++i) { - scale_array[i].scale = 
cfg->scale_level_1_fp.at({i});
-        scale_array[i].offset = 0;
+        scale_offsets[i].scale = cfg->scale_level_1_fp.at({i});
+        scale_offsets[i].offset = 0;
       }
-      unreachable_handle_.emplace_back(scale_array);
-      auto block_scale_array = (Qnn_BlockwiseExpansion_t*)malloc(sizeof(Qnn_BlockwiseExpansion_t));
-      unreachable_handle_.emplace_back(block_scale_array);
-      block_scale_array[0].axis = cfg->ch_axis;
-      block_scale_array[0].scaleOffsets = scale_array;
-      block_scale_array[0].numBlocksPerAxis = v->tensor_.size(cfg->ch_axis) / cfg->block_size;
-      block_scale_array[0].blockScaleBitwidth = 12;  // 12 bits for 4 to 16 expansion
-      block_scale_array[0].blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8;
-      block_scale_array[0].blocksScale8 = cfg->scale_level_0_int.ptr();
+      Qnn_BlockwiseExpansion_t blockwise_expansion;
+      // Expansion runs along the quantization channel axis.
+      blockwise_expansion.axis = cfg->ch_axis;
+      blockwise_expansion.scaleOffsets = nullptr;  // Will be set by setBlockwiseQuantization
+      blockwise_expansion.numBlocksPerAxis = v->tensor_.size(cfg->ch_axis) / cfg->block_size;
+      blockwise_expansion.blockScaleBitwidth = 12;  // 12 bits for 4 to 16 expansion
+      blockwise_expansion.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8;
+      blockwise_expansion.blocksScale8 = cfg->scale_level_0_int.ptr();
-      ret.encodingDefinition = QNN_DEFINITION_DEFINED;
-      ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION;
-      ret.blockwiseExpansion = block_scale_array;
+      tensor_wrapper_->setBlockwiseQuantization(blockwise_expansion, scale_offsets);
       break;
     }
     default: {
@@ -428,22 +195,26 @@ QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceOutput(const QnnAOTNodeTe
   return shared_from_this();
 }
 
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addParamScalar(const std::vector& params) {
+QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addParamScalar(
+    const std::vector>& params) {
   param_scalar.insert(param_scalar.end(), params.begin(), params.end());
   return shared_from_this();
 }
 
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceParamScalar(const QnnAOTParamScalar::ptr_t& param) {
+QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceParamScalar(
+    const std::shared_ptr& param) {
   param_scalar.push_back(param);
   return shared_from_this();
 }
 
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addParamTensor(const std::vector& params) {
+QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addParamTensor(
+    const std::vector>& params) {
   param_tensor.insert(param_tensor.end(), params.begin(), params.end());
   return shared_from_this();
 }
 
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceParamTensor(const QnnAOTParamTensor::ptr_t& param) {
+QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceParamTensor(
+    const std::shared_ptr& param) {
   param_tensor.push_back(param);
   return shared_from_this();
 }
 
@@ -465,77 +236,33 @@ QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::setPackageName(const std::string
   return shared_from_this();
 }
 
-QnnAOTGraph::QnnAOTGraph(const std::string& g_name, const std::shared_ptr& context)
-    : graph_name_(g_name), qnn_context_(context) {
-  belongs_context_name_ = context->name_;
-
-  auto env = AOTCompileContext::getInstance().getEnv();
-  auto qnn_interface = env->getFuncSymbol().qnn_interface_;
-
-  auto ok = qnn_interface.graphCreate(context->qnn_ctx_handle_, g_name.c_str(), nullptr /*graph_config*/, &qnn_graph_handle_);
-  MLLM_RT_ASSERT_EQ(ok, QNN_SUCCESS);
+QnnAOTGraph::QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle_t
@@ -465,77 +236,33 @@ QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::setPackageName(const std::string
   return shared_from_this();
 }
 
-QnnAOTGraph::QnnAOTGraph(const std::string& g_name, const std::shared_ptr<QnnDeviceAndContext>& context)
-    : graph_name_(g_name), qnn_context_(context) {
-  belongs_context_name_ = context->name_;
-
-  auto env = AOTCompileContext::getInstance().getEnv();
-  auto qnn_interface = env->getFuncSymbol().qnn_interface_;
-
-  auto ok = qnn_interface.graphCreate(context->qnn_ctx_handle_, g_name.c_str(), nullptr /*graph_config*/, &qnn_graph_handle_);
-  MLLM_RT_ASSERT_EQ(ok, QNN_SUCCESS);
+QnnAOTGraph::QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle_t backendHandle,
+                         Qnn_ContextHandle_t contextHandle, const std::string& graphName) {
+  qnn_model_ = std::make_shared<QNNModel>(qnnInterface, backendHandle);
+  qnn_model_->initialize(contextHandle, graphName.c_str(), false);
 }
 
 void QnnAOTGraph::addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op) {
-  auto env = AOTCompileContext::getInstance().getEnv();
-  auto qnn_interface = env->getFuncSymbol().qnn_interface_;
-
-  Qnn_OpConfig_t qnn_op_config = QNN_OPCONFIG_INIT;
-  qnn_op_config.version = QNN_OPCONFIG_VERSION_1;
-  qnn_op_config.v1 = QNN_OPCONFIG_V1_INIT;
-  qnn_op_config.v1.name = qnn_op->name_.c_str();
-  qnn_op_config.v1.packageName = qnn_op->package_name_.c_str();
-  qnn_op_config.v1.typeName = qnn_op->op_name_.c_str();
-
-  // Params
-  uint32_t param_counter = 0;
-  size_t total_param_size = qnn_op->param_scalar.size() + qnn_op->param_tensor.size();
-  Qnn_Param_t* qnn_param_array = (Qnn_Param_t*)malloc(total_param_size * sizeof(Qnn_Param_t));
-  qnn_op->unreachable_handle_.emplace_back(qnn_param_array);
-  {
-    // Tensor Param
-    for (const auto& p : qnn_op->param_tensor) {
-      auto ok = qnn_interface.tensorCreateGraphTensor(qnn_graph_handle_, p->getQnnTensor());
-      MLLM_RT_ASSERT_EQ(ok, QNN_SUCCESS);
-      qnn_param_array[param_counter++] = *p->getQnnParam();
-    }
-    for (const auto& p : qnn_op->param_scalar) { qnn_param_array[param_counter++] = *p->getQnnParam(); }
-  }
+  std::vector<std::string> inputNames;
+  for (auto& in : qnn_op->inputs) inputNames.push_back(in->getWrapper()->getName());
 
-  // Inputs
-  Qnn_Tensor_t* qnn_inputs_array = (Qnn_Tensor_t*)malloc(qnn_op->inputs.size() * sizeof(Qnn_Tensor_t));
-  qnn_op->unreachable_handle_.emplace_back(qnn_inputs_array);
-  for (int i = 0; i < qnn_op->inputs.size(); ++i) { qnn_inputs_array[i] = *qnn_op->inputs[i]->getQnnTensor(); }
+  std::vector<std::string> outputNames;
+  for (auto& out : qnn_op->outputs) outputNames.push_back(out->getWrapper()->getName());
 
-  // Outputs
-  Qnn_Tensor_t* qnn_outputs_array = (Qnn_Tensor_t*)malloc(qnn_op->outputs.size() * sizeof(Qnn_Tensor_t));
-  qnn_op->unreachable_handle_.emplace_back(qnn_outputs_array);
-  for (int i = 0; i < qnn_op->outputs.size(); ++i) { qnn_outputs_array[i] = *qnn_op->outputs[i]->getQnnTensor(); }
+  for (auto& in : qnn_op->inputs) qnn_model_->addTensorWrapper(in->getWrapper());
+  for (auto& out : qnn_op->outputs) qnn_model_->addTensorWrapper(out->getWrapper());
 
-  qnn_op_config.v1.params = qnn_param_array;
-  qnn_op_config.v1.numOfParams = total_param_size;
-  qnn_op_config.v1.inputTensors = qnn_inputs_array;
-  qnn_op_config.v1.numOfInputs = qnn_op->inputs.size();
-  qnn_op_config.v1.outputTensors = qnn_outputs_array;
-  qnn_op_config.v1.numOfOutputs = qnn_op->outputs.size();
-
-  auto ok = qnn_interface.backendValidateOpConfig(env->getContext(belongs_context_name_)->bk_handle_, qnn_op_config);
-  MLLM_RT_ASSERT_EQ(ok, QNN_SUCCESS);
-  ok = qnn_interface.graphAddNode(qnn_graph_handle_, qnn_op_config);
-  MLLM_RT_ASSERT_EQ(ok, QNN_SUCCESS);
+  qnn_model_->addNode(QNN_OPCONFIG_VERSION_1, qnn_op->name_, qnn_op->package_name_, qnn_op->op_name_, qnn_op->param_tensor,
+                      qnn_op->param_scalar, inputNames, outputNames);
 
   op_node_.insert({qnn_op->getName(), qnn_op});
 }
 
 bool QnnAOTGraph::compile() {
   if (is_compiled_) { return true; }
-
-  auto env = AOTCompileContext::getInstance().getEnv();
-  auto qnn_interface = env->getFuncSymbol().qnn_interface_;
-  qnn_interface.graphFinalize(qnn_graph_handle_, env->getContext(belongs_context_name_)->profile_bk_handle_, nullptr);
-
+  bool ret = qnn_model_->finalizeGraph(nullptr, nullptr) == mllm::qnn::MODEL_NO_ERROR;
   is_compiled_ = true;
-  return true;
+  return ret;
 }
 
 const std::vector<std::string> QnnDynSymbolLoader::possible_qnn_dyn_lib_paths_{
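With creation, node insertion, and finalization all routed through the `QNNModel` wrapper, driving a graph reduces to roughly the sequence below (a sketch using only signatures visible in this patch; the handle variables are assumed to come from `QnnAOTEnv`, and error handling is elided):

    // Sketch of the new calling sequence; qnn_interface, bk_handle, and
    // ctx_handle are assumed to be obtained from QnnAOTEnv.
    auto graph = std::make_shared<QnnAOTGraph>(qnn_interface, bk_handle, ctx_handle, "my_graph");
    // op: a fully populated QnnAOTNodeOperation::ptr_t (construction elided).
    graph->addOperation(op);     // registers the op's tensor wrappers, then adds the node
    bool ok = graph->compile();  // finalizeGraph(); now reports failure instead of asserting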
@@ -687,7 +414,7 @@ std::shared_ptr QnnAOTEnv::createContext(const std::string&
 
   // 1. create logger and register callback.
   // clang-format off
-  MLLM_RT_ASSERT_EQ(qnn_htp_func_symbols_.qnn_interface_.logCreate(__mllmLoggerCallback4QnnLogger,QNN_LOG_LEVEL_VERBOSE, &context->log_), QNN_SUCCESS)
+  MLLM_RT_ASSERT_EQ(qnn_htp_func_symbols_.qnn_interface_.logCreate(__mllmQnnLoggerCallback,QNN_LOG_LEVEL_VERBOSE, &context->log_), QNN_SUCCESS)
   MLLM_RT_ASSERT_EQ(QNN_BACKEND_NO_ERROR, qnn_htp_func_symbols_.qnn_interface_.backendCreate(context->log_, (const QnnBackend_Config_t**)context->bk_cfg_, &context->bk_handle_))
   // clang-format on
@@ -819,11 +546,16 @@ std::vector QnnAOTEnv::createContextCustomConfig(bool
 }
 
 QnnAOTGraph::ptr_t QnnAOTEnv::captureAOTGraph(const std::string& qnn_context_name, const std::string& g_name) {
-  MLLM_RT_ASSERT(contexts_.count(qnn_context_name) == 1);
-  auto ret = QnnAOTGraph::create(g_name, contexts_[qnn_context_name]);
-  ret->belongs_context_name_ = qnn_context_name;
-  contexts_[qnn_context_name]->graphs_.insert({g_name, ret});
-  return ret;
+  if (contexts_.find(qnn_context_name) == contexts_.end()) {
+    MLLM_ERROR("Context {} not found", qnn_context_name);
+    return nullptr;
+  }
+  auto& ctx = contexts_[qnn_context_name];
+  if (ctx->graphs_.find(g_name) == ctx->graphs_.end()) {
+    ctx->graphs_[g_name] =
+        std::make_shared<QnnAOTGraph>(qnn_htp_func_symbols_.qnn_interface_, ctx->bk_handle_, ctx->qnn_ctx_handle_, g_name);
+  }
+  return ctx->graphs_[g_name];
 }
 
 void QnnAOTEnv::captureAOTNodeOp(const std::string& qnn_context_name, const std::string& graph_name,
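`captureAOTGraph` now returns `nullptr` for an unknown context instead of tripping `MLLM_RT_ASSERT`, and repeated captures of the same graph name return the cached instance rather than creating a duplicate. Callers should therefore check the result; a hypothetical caller (the context and graph names below are illustrative):

    // Hypothetical caller; "ctx0" and "prefill" are illustrative names.
    auto graph = env->captureAOTGraph("ctx0", "prefill");
    if (!graph) { MLLM_ERROR_EXIT(ExitCode::kCoreError, "QNN context ctx0 was never created"); }
    graph->compile();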
diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.hpp b/mllm/backends/qnn/aot/QnnWrappersAPI.hpp
index 718c0219b..1c5189d55 100644
--- a/mllm/backends/qnn/aot/QnnWrappersAPI.hpp
+++ b/mllm/backends/qnn/aot/QnnWrappersAPI.hpp
@@ -11,25 +11,25 @@
 #include <...>
 #include <...>
-#include <...>
-#include <...>
-#include <...>
-#include <...>
-#include <...>
-#include <...>
-#include <...>
+#include <...>
+#include <...>
+#include <...>
+#include <...>
+#include <...>
+#include <...>
+#include <...>
 
 #include "mllm/utils/Common.hpp"
 #include "mllm/compile/ir/tensor/Value.hpp"
 #include "mllm/compile/ir/linalg/Attribute.hpp"
 #include "mllm/backends/qnn/aot/QnnTargetMachine.hpp"
+#include "mllm/backends/qnn/QNNModel.hpp"
+#include "mllm/backends/qnn/QNNUtils.hpp"
 
 namespace mllm::qnn::aot {
 
 void __mllmLoggerCallback4QnnLogger(const char* fmt, QnnLog_Level_t level, uint64_t times_tamp, va_list argp);
 
-size_t QnnAOTDataTypeSize(Qnn_DataType_t dtype);
-
 // Collection of symbols that we need to load from qnn dyn lib.
 struct QnnFuncSymbols {
   using QnnInterfaceGetProvidersFuncType = Qnn_ErrorHandle_t(const QnnInterface_t*** providerList, uint32_t* numProviders);
@@ -40,73 +40,17 @@ struct QnnFuncSymbols {
   QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface_;
 };
 
-class QnnAOTParamScalar {
- public:
-  using ptr_t = std::shared_ptr<QnnAOTParamScalar>;
-
-  template<typename T>
-  static inline ptr_t create(const std::string& name, T value) {
-    return std::make_shared<QnnAOTParamScalar>(name, value);
-  };
-
-  QnnAOTParamScalar(const std::string& name, bool value);
-
-  QnnAOTParamScalar(const std::string& name, uint32_t value);
-
-  QnnAOTParamScalar(const std::string& name, float value);
-
-  Qnn_Param_t* getQnnParam();
-
- private:
-  std::string name_;
-  Qnn_Param_t qnn_param_ = QNN_PARAM_INIT;
-};
-
-class QnnAOTParamTensor {
- public:
-  using ptr_t = std::shared_ptr<QnnAOTParamTensor>;
-
-  static inline ptr_t create(const std::string& param_name, const std::string& tensor_name, Qnn_DataType_t data_type,
-                             const std::vector<int32_t>& dimensions) {
-    std::vector<uint32_t> vec(dimensions.size());
-    for (int i = 0; i < dimensions.size(); i++) { vec[i] = (uint32_t)dimensions[i]; }
-    return std::make_shared<QnnAOTParamTensor>(param_name, tensor_name, data_type, vec);
-  }
-
-  static inline ptr_t create(const std::string& param_name, const std::string& tensor_name, Qnn_DataType_t data_type,
-                             const std::vector<uint32_t>& dimensions) {
-    return std::make_shared<QnnAOTParamTensor>(param_name, tensor_name, data_type, dimensions);
-  }
-
-  QnnAOTParamTensor(const std::string& param_name, const std::string& tensor_name, Qnn_DataType_t data_type,
-                    const std::vector<uint32_t>& dimensions);
-
-  ~QnnAOTParamTensor();
-
-  void* alloc();
-
-  Qnn_Param_t* getQnnParam();
-
-  Qnn_Tensor_t* getQnnTensor();
-
- private:
-  std::string param_name_;
-  std::string tensor_name_;
-  std::vector<uint32_t> dimensions_;
-  Qnn_Param_t qnn_param_ = QNN_PARAM_INIT;
-};
-
 class QnnAOTNodeTensor : public std::enable_shared_from_this<QnnAOTNodeTensor> {
  public:
   using ptr_t = std::shared_ptr<QnnAOTNodeTensor>;
 
   static inline ptr_t create(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight = false) {
-    return std::make_shared<QnnAOTNodeTensor>(v);
+    return std::make_shared<QnnAOTNodeTensor>(v, force_static_weight);
   }
 
   explicit QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight = false);
 
-  inline Qnn_Tensor_t* getQnnTensor() { return &qnn_tensor_; }
+  std::shared_ptr<...> getWrapper() { return tensor_wrapper_; }
 
  private:
   Qnn_TensorType_t parseQnnTensorTypeFromIR(const ir::tensor::TensorValue::ptr_t& v);
@@ -117,14 +61,7 @@ class QnnAOTNodeTensor : public std::enable_shared_from_this<QnnAOTNodeTensor> {
 
   Qnn_QuantizeParams_t parseQnnQuantizeParamFromIR(const ir::tensor::TensorValue::ptr_t& v);
 
-  std::string name_;
-  Tensor mllm_tensor_;
-  std::vector<int32_t> shape_;
-  Qnn_Tensor_t qnn_tensor_ = QNN_TENSOR_INIT;
-  ir::linalg::QuantizationSpec::ptr_t quant_spec_ = nullptr;
-
-  // To handle Qnn stuff
-  std::vector<void*> unreachable_handle_;
+  std::shared_ptr<...> tensor_wrapper_;
 };
 
 class QnnAOTNodeOperation : public std::enable_shared_from_this<QnnAOTNodeOperation> {
@@ -145,13 +82,13 @@ class QnnAOTNodeOperation : public std::enable_shared_from_this<QnnAOTNodeOpera
 
-  QnnAOTNodeOperation::ptr_t addParamScalar(const std::vector<QnnAOTParamScalar::ptr_t>& params);
+  QnnAOTNodeOperation::ptr_t addParamScalar(const std::vector<std::shared_ptr<...>>& params);
 
-  QnnAOTNodeOperation::ptr_t emplaceParamScalar(const QnnAOTParamScalar::ptr_t& param);
+  QnnAOTNodeOperation::ptr_t emplaceParamScalar(const std::shared_ptr<...>& param);
 
-  QnnAOTNodeOperation::ptr_t addParamTensor(const std::vector<QnnAOTParamTensor::ptr_t>& params);
+  QnnAOTNodeOperation::ptr_t addParamTensor(const std::vector<std::shared_ptr<...>>& params);
 
-  QnnAOTNodeOperation::ptr_t emplaceParamTensor(const QnnAOTParamTensor::ptr_t& param);
+  QnnAOTNodeOperation::ptr_t emplaceParamTensor(const std::shared_ptr<...>& param);
 
   QnnAOTNodeOperation::ptr_t setOpName(const std::string& op_name);
 
@@ -164,13 +101,10 @@ class QnnAOTNodeOperation : public std::enable_shared_from_this<QnnAOTNodeOpera
 
-  std::vector<QnnAOTParamScalar::ptr_t> param_scalar;
-  std::vector<QnnAOTParamTensor::ptr_t> param_tensor;
+  std::vector<std::shared_ptr<...>> param_scalar;
+  std::vector<std::shared_ptr<...>> param_tensor;
   std::vector<QnnAOTNodeTensor::ptr_t> inputs;
   std::vector<QnnAOTNodeTensor::ptr_t> outputs;
-
-  // To handle Qnn stuff
-  std::vector<void*> unreachable_handle_;
 };
 
 struct QnnDeviceAndContext;
 
@@ -178,12 +112,8 @@ class QnnAOTGraph : public std::enable_shared_from_this<QnnAOTGraph> {
  public:
   using ptr_t = std::shared_ptr<QnnAOTGraph>;
 
-  QnnAOTGraph(const std::string& g_name, const std::shared_ptr<QnnDeviceAndContext>& context);
-
-  static inline ptr_t create(const std::string& g_name, const std::shared_ptr<QnnDeviceAndContext>& context) {
-    auto ret = std::make_shared<QnnAOTGraph>(g_name, context);
-    return ret;
-  }
+  QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle_t backendHandle, Qnn_ContextHandle_t contextHandle,
+              const std::string& graphName);
 
   void addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op);
 
@@ -193,10 +123,8 @@ class QnnAOTGraph : public std::enable_shared_from_this<QnnAOTGraph> {
   std::unordered_map<std::string, QnnAOTNodeOperation::ptr_t> op_node_;
   std::unordered_map<std::string, QnnAOTNodeTensor::ptr_t> all_tensors_;
 
-  std::string graph_name_;
-  std::string belongs_context_name_;
-  Qnn_GraphHandle_t qnn_graph_handle_ = nullptr;
-  std::shared_ptr<QnnDeviceAndContext> qnn_context_ = nullptr;
+ private:
+  std::shared_ptr<QNNModel> qnn_model_;
 };
 
 struct QnnDeviceAndContext {
diff --git a/mllm/backends/qnn/aot/visitor/Base.hpp b/mllm/backends/qnn/aot/visitor/Base.hpp
index 08b735b33..34e3c4fd3 100644
--- a/mllm/backends/qnn/aot/visitor/Base.hpp
+++ b/mllm/backends/qnn/aot/visitor/Base.hpp
@@ -16,7 +16,7 @@ class QnnAOTBasePattern : public ir::Pattern {
 
   bool isMatch(const mllm::ir::op_ptr_t& op) override { return false; }
 
-  bool rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) override { return false; }
+  bool rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) override { return compile(writer, node); }
 
   virtual bool compile(ir::IRWriter& writer, const ir::op_ptr_t& op) = 0;
 };
diff --git a/mllm/backends/qnn/custom-op-package/LLaMAPackage/Makefile b/mllm/backends/qnn/custom-op-package/LLaMAPackage/Makefile
index e1ba91c26..d23dd519f 100644
--- a/mllm/backends/qnn/custom-op-package/LLaMAPackage/Makefile
+++ b/mllm/backends/qnn/custom-op-package/LLaMAPackage/Makefile
@@ -1,15 +1,15 @@
 # check all setup prerequisites if the command goal is not clean
 ifneq ($(MAKECMDGOALS),clean)
 ifndef QNN_INCLUDE
-$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
-QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN
+$(info "INFO: Qnn include not explicitly defined, attempting to use QAIRT_SDK_ROOT if it is valid")
+QNN_INCLUDE := $(QAIRT_SDK_ROOT)/include/QNN
 endif
 ifeq ($(wildcard $(QNN_INCLUDE)),)
 $(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package")
 endif
 ifndef QNN_TARGET_LIB
-$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
-QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android
+$(info "INFO: Qnn target not explicitly defined, attempting to use QAIRT_SDK_ROOT if it is valid")
+QNN_TARGET_LIB := $(QAIRT_SDK_ROOT)/lib/aarch64-android
 endif
 ifeq ($(wildcard $(QNN_TARGET_LIB)),)
 ifeq ($(MAKECMDGOALS),htp_aarch64)