Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/llama_qnn_aot/modeling_llama_qnn_aot.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
2 changes: 1 addition & 1 deletion examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
2 changes: 1 addition & 1 deletion examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
2 changes: 1 addition & 1 deletion examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
4 changes: 0 additions & 4 deletions examples/qwen3_qnn_aot/compile_sha.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,8 @@ MLLM_MAIN({
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

Expand Down
2 changes: 1 addition & 1 deletion examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down
6 changes: 2 additions & 4 deletions examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch)
case kUInt8PerTensorSym: {
auto scale = m->getTopParameterFile()->pull(scale_name);
auto zp = m->getTopParameterFile()->pull(zp_name);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 0);
MLLM_RT_ASSERT_EQ(zp.item<mllm_int32_t>(), 128);

// Is 128! not 127!
auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal);
Expand Down Expand Up @@ -356,14 +356,12 @@ class Qwen3AttentionSHA final : public nn::Module {
std::string h_str = std::to_string(h);

// K: De-quantize and re-quantize to int8
auto k_h = key_states_per_head[h].to(kFloat32);
k_h = k_h.to(kUInt8PerTensorSym);
auto k_h = key_states_per_head[h].to(kUInt8PerTensorSym);
k_h = ptq::QDQ_KV(this, k_h, "k_cast_to_int8_qdq_h" + h_str);
k_h = k_h.transpose(2, 3); // [B, 1, D, S]

// V: Quantize to int16 then int8
auto v_h = ptq::QDQ(this, value_states_per_head[h], "v_cast_to_int16_qdq_h" + h_str);
v_h = v_h.to(kFloat32);
v_h = v_h.to(kUInt8PerTensorSym);
v_h = ptq::QDQ_KV(this, v_h, "v_cast_to_int8_qdq_h" + h_str);
Comment on lines 358 to 366
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify the K/V quantization patterns across SHA files
echo "=== Qwen3 SHA (this file) ==="
rg -n "key_states_per_head\[h\]\.to\(" examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp

echo ""
echo "=== Qwen2 SHA ==="
rg -n "key_states_per_head\[h\]\.to\(" examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp

echo ""
echo "=== Llama SHA ==="
rg -n "key_states_per_head\[h\]\.to\(" examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp

Repository: UbiquitousLearning/mllm

Length of output: 318


🏁 Script executed:

#!/bin/bash
# Get context around the K quantization in each file
echo "=== Qwen3 SHA - K quantization context (lines 355-370) ==="
sed -n '355,370p' examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp | cat -n

echo ""
echo "=== Qwen2 SHA - K quantization context (lines 343-358) ==="
sed -n '343,358p' examples/qwen2_qnn_aot/modeling_qwen2_qnn_aot_sha.hpp | cat -n

echo ""
echo "=== Llama SHA - K quantization context (lines 343-358) ==="
sed -n '343,358p' examples/llama_qnn_aot/modeling_llama_qnn_aot_sha.hpp | cat -n

Repository: UbiquitousLearning/mllm

Length of output: 2573


K/V quantization paths differ from other SHA variants—consider clarifying the design choice.

Both K and V quantization paths in this file skip the intermediate kFloat32 conversion step that is present in Qwen2 and Llama SHA implementations:

Qwen3 (this file):

  • K: key_states_per_head[h].to(kUInt8PerTensorSym) → ptq::QDQ_KV(...)
  • V: ptq::QDQ(...) → .to(kUInt8PerTensorSym) → ptq::QDQ_KV(...)

Qwen2/Llama:

  • K: key_states_per_head[h].to(kFloat32) → .to(kUInt8PerTensorSym) → ptq::QDQ_KV(...)
  • V: ptq::QDQ(...) → .to(kFloat32) → .to(kUInt8PerTensorSym) → ptq::QDQ_KV(...)

If this divergence is intentional (e.g., due to different input data types), add a comment explaining why. Otherwise, align the quantization path with the other models to ensure consistency.

🤖 Prompt for AI Agents
In `@examples/qwen3_qnn_aot/modeling_qwen_qnn_aot_sha.hpp` around lines 358 - 366,
The K/V quantization paths for Qwen3 differ from Qwen2/Llama—K uses
key_states_per_head[h].to(kUInt8PerTensorSym) then ptq::QDQ_KV, and V uses
ptq::QDQ(...) then .to(kUInt8PerTensorSym) then ptq::QDQ_KV—while other SHA
implementations convert via kFloat32 first; either make this explicit or align
the flows: update the K and V flows (the code around key_states_per_head[h],
ptq::QDQ, and ptq::QDQ_KV) to include the intermediate .to(kFloat32) conversion
like the other models, or add a concise comment above these snippets explaining
why Qwen3 intentionally omits kFloat32 (e.g., expected input dtype/performance
reason) so reviewers understand the divergence.


Expand Down
48 changes: 35 additions & 13 deletions mllm/backends/qnn/QNNAllocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,36 @@ namespace mllm::qnn {
#define RPCMEM_DEFAULT_FLAGS 1

QNNAllocator::QNNAllocator() {
void* libCdspHandle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
if (nullptr == libCdspHandle) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }
libCdspHandle_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
if (nullptr == libCdspHandle_) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }

rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle, "rpcmem_alloc");
rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle, "rpcmem_free");
rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle, "rpcmem_to_fd");
rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc");
rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free");
rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd");
}

QNNAllocator::QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context)
: qnnInterface_(qnnInterface), context_(context) {
MLLM_RT_ASSERT(context_ != nullptr);

void* libCdspHandle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
if (nullptr == libCdspHandle) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }
libCdspHandle_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
if (nullptr == libCdspHandle_) { MLLM_ERROR_EXIT(1, "dlopen libcdsprpc.so failed"); }

rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle, "rpcmem_alloc");
rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle, "rpcmem_free");
rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle, "rpcmem_to_fd");
rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc");
rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free");
rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd");
}
Comment on lines 16 to +35
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

cat -n mllm/backends/qnn/QNNAllocator.cpp | head -80

Repository: UbiquitousLearning/mllm

Length of output: 3359


🏁 Script executed:

rg "MLLM_ERROR_EXIT" mllm/utils/ -A 2 | head -20

Repository: UbiquitousLearning/mllm

Length of output: 1230


🏁 Script executed:

rg "define MLLM_ERROR_EXIT" mllm/utils/Log.hpp -A 5

Repository: UbiquitousLearning/mllm

Length of output: 540


🏁 Script executed:

rg "dlerror" mllm/ -A 2

Repository: UbiquitousLearning/mllm

Length of output: 2315


🏁 Script executed:

rg "MLLM_ERROR_EXIT.*ExitCode" mllm/ -B 1 | head -30

Repository: UbiquitousLearning/mllm

Length of output: 1691


🏁 Script executed:

rg "MLLM_ERROR_EXIT\(1" mllm/ -B 1 -A 1

Repository: UbiquitousLearning/mllm

Length of output: 1988


🏁 Script executed:

cat -n mllm/backends/qnn/QNNAllocator.cpp | sed -n '49,77p'

Repository: UbiquitousLearning/mllm

Length of output: 1224


🏁 Script executed:

rg "rpcmem_alloc|rpcmem_free|rpcmem_to_fd" mllm/backends/qnn/QNNAllocator.hpp -B 1 -A 1

Repository: UbiquitousLearning/mllm

Length of output: 436


🏁 Script executed:

rg "RpcMemAllocFn_t|RpcMemFreeFn_t|RpcMemToFdFn_t" mllm/backends/qnn/QNNAllocator.hpp -B 2 -A 2

Repository: UbiquitousLearning/mllm

Length of output: 443


Add dlsym failure checks before using the rpcmem_* pointers.

The rpcmem_alloc, rpcmem_free, and rpcmem_to_fd function pointers are loaded without validation in both constructors (lines 20–22 and 32–34). If any symbol fails to resolve, dlsym returns nullptr, and subsequent calls crash. Apply the dlopen pattern to dlsym results in both constructors:

🔧 Suggested fix
   rpcmem_alloc = (RpcMemAllocFn_t)dlsym(libCdspHandle_, "rpcmem_alloc");
+  if (!rpcmem_alloc) { MLLM_ERROR_EXIT(1, "dlsym rpcmem_alloc failed: {}", dlerror()); }
   rpcmem_free = (RpcMemFreeFn_t)dlsym(libCdspHandle_, "rpcmem_free");
+  if (!rpcmem_free) { MLLM_ERROR_EXIT(1, "dlsym rpcmem_free failed: {}", dlerror()); }
   rpcmem_to_fd = (RpcMemToFdFn_t)dlsym(libCdspHandle_, "rpcmem_to_fd");
+  if (!rpcmem_to_fd) { MLLM_ERROR_EXIT(1, "dlsym rpcmem_to_fd failed: {}", dlerror()); }
🤖 Prompt for AI Agents
In `@mllm/backends/qnn/QNNAllocator.cpp` around lines 16 - 35, The constructors of
QNNAllocator set rpcmem_alloc, rpcmem_free, and rpcmem_to_fd via dlsym without
checking for nullptr; update both QNNAllocator() and
QNNAllocator(QNN_INTERFACE_VER_TYPE, void*) to validate each dlsym result
(rpcmem_alloc, rpcmem_free, rpcmem_to_fd) after assignment, and if any is
nullptr call MLLM_ERROR_EXIT (or MLLM_RT_ASSERT if preferred) with a clear
message that includes dlerror() context; reference the libCdspHandle_ loading
logic and use the same error-handling pattern used for dlopen to fail fast when
symbols are missing.


QNNAllocator::~QNNAllocator() {
// Properly release all resources before unloading the library
// Since we hold libCdspHandle_, the library won't be unloaded until we dlclose it
shutdown();

// Now safe to unload the library
if (libCdspHandle_) {
dlclose(libCdspHandle_);
libCdspHandle_ = nullptr;
}
}

bool QNNAllocator::alloc(Storage* storage) {
Expand All @@ -46,12 +58,22 @@ bool QNNAllocator::alloc(Storage* storage) {
}

void QNNAllocator::free(Storage* storage) {
// Skip if shutdown was called or destructor is running
// During program exit, QNN library resources might be destroyed, so we can't safely call rpcmem_free
if (isShutdown_) { return; }

// Only free memory that was allocated by this allocator and not yet freed
if (!qnnMemPtrSet_.count(storage->ptr_)) {
return; // Not our memory or already freed, skip
}

if (ptrToFdAndMemHandleMap_.count(storage->ptr_)) {
MLLM_RT_ASSERT_EQ(QNN_SUCCESS,
qnnInterface_.memDeRegister(&(ptrToFdAndMemHandleMap_.find(storage->ptr_)->second.second), 1));
qnnInterface_.memDeRegister(&(ptrToFdAndMemHandleMap_.find(storage->ptr_)->second.second), 1);
ptrToFdAndMemHandleMap_.erase(storage->ptr_);
}

rpcmem_free(storage->ptr_);
qnnMemPtrSet_.erase(storage->ptr_);
}

void QNNAllocator::registerQnnTensorToSharedBuffer(void* ptr, Qnn_Tensor_t& qnn_tensor) {
Expand Down Expand Up @@ -99,4 +121,4 @@ void QNNAllocator::deRegisterQnnTensorFromSharedBuffer(void* ptr) {

std::shared_ptr<QNNAllocator> createQNNAllocator() { return std::make_shared<QNNAllocator>(); }

} // namespace mllm::qnn
} // namespace mllm::qnn
44 changes: 37 additions & 7 deletions mllm/backends/qnn/QNNAllocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,37 @@ class QNNAllocator final : public Allocator {
QNNAllocator(); // need to setQNNPointer afterward
QNNAllocator(QNN_INTERFACE_VER_TYPE qnnInterface, void* context);

~QNNAllocator() {
~QNNAllocator();

// Explicitly release all QNN memory resources. Call this for proper cleanup when you
// want to release memory during normal operation (not program exit).
// This is SAFE to call and will properly free all QNN resources.
void shutdown() {
if (isShutdown_) return;
isShutdown_ = true;

// First, deregister all registered memory
for (auto iter = ptrToFdAndMemHandleMap_.begin(); iter != ptrToFdAndMemHandleMap_.end();) {
Qnn_ErrorHandle_t deregisterRet = qnnInterface_.memDeRegister(&iter->second.second, 1);
if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("~QNNAllocator: qnnInterface_.memDeRegister failed"); }
rpcmem_free(iter->first);
if (QNN_SUCCESS != deregisterRet) { MLLM_ERROR("QNNAllocator::shutdown: qnnInterface_.memDeRegister failed"); }
iter = ptrToFdAndMemHandleMap_.erase(iter);
}

// Then, free all allocated memory (registered or not)
MLLM_INFO("QNNAllocator::shutdown: freeing all allocated memory");
for (void* ptr : qnnMemPtrSet_) { rpcmem_free(ptr); }
qnnMemPtrSet_.clear();
}

// Legacy name for shutdown() - kept for compatibility
void releaseAllResources() { shutdown(); }

// Mark the allocator as shut down without actually freeing memory.
// Use this in destructors to prevent crashes during program exit when
// QNN library resources might already be destroyed.
// After this is called, all free() calls become no-ops.
void markShutdown() { isShutdown_ = true; }

void setQNNPointer(QNN_INTERFACE_VER_TYPE qnnInterface, void* context) {
this->qnnInterface_ = qnnInterface;
this->context_ = context;
Expand Down Expand Up @@ -83,15 +105,23 @@ class QNNAllocator final : public Allocator {
QNN_INTERFACE_VER_TYPE qnnInterface_;
Qnn_ContextHandle_t context_ = nullptr;

RpcMemAllocFn_t rpcmem_alloc;
RpcMemFreeFn_t rpcmem_free;
RpcMemToFdFn_t rpcmem_to_fd;
// Hold the library handle to control unload order
// libcdsprpc.so will only be unloaded when this allocator is destroyed
void* libCdspHandle_ = nullptr;

RpcMemAllocFn_t rpcmem_alloc = nullptr;
RpcMemFreeFn_t rpcmem_free = nullptr;
RpcMemToFdFn_t rpcmem_to_fd = nullptr;

// to check if the ptr is allocated by rpcmem_alloc
std::set<void*> qnnMemPtrSet_;
std::map<void*, std::pair<int, Qnn_MemHandle_t>> ptrToFdAndMemHandleMap_;

// Flag to indicate shutdown has been called or destructor is running
// When true, free() calls become no-ops to avoid crashes during program exit
bool isShutdown_ = false;
};

std::shared_ptr<QNNAllocator> createQNNAllocator();

} // namespace mllm::qnn
} // namespace mllm::qnn
Loading