diff --git a/CMakeLists.txt b/CMakeLists.txt index a19e80df3..1a5e95900 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,9 +59,6 @@ option(MLLM_KERNEL_THREADS_VENDOR_APPLE_GCD "Enable Apple GCD Threads" OFF) option(MLLM_PERFETTO_ENABLE "Enable perfetto" OFF) option(MLLM_TRACY_ENABLE "Enable Tracy. A more advanced profiler" OFF) -# NPU AOT things -option(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE "Enable Qualcomm NPU AOT on X86 devices" OFF) - # Platform Hints option(MLLM_ANDROID_BURST_PERFORMANCE_HINTS "If MLLM need use APerformanceHintManager to tell android we need best performance" OFF) diff --git a/examples/ascend_add_demo/README.md b/examples/ascend_add_demo/README.md new file mode 100644 index 000000000..3c799332f --- /dev/null +++ b/examples/ascend_add_demo/README.md @@ -0,0 +1,81 @@ +# Ascend Add Op Demo + +这是一个简单的 demo,用于测试 Ascend 后端的 Add 算子实现。 + +## 功能 + +- 初始化 Ascend 后端和内存池 +- 创建两个输入张量(shape: [2, 3]) +- 在 Ascend NPU 上执行 Add 操作 +- 验证计算结果是否正确 + +## 编译和运行 + +### 方法 1: 使用自动化脚本(推荐) + +```bash +cd examples/ascend_add_demo # 在项目根目录下执行 +./build_and_run.sh +``` + +脚本会自动: +- 检查环境变量 +- 配置 CMake +- 编译项目 +- 运行 demo + +### 方法 2: 手动编译 + +确保已经设置了必要的环境变量: +- `ASCEND_HOME_PATH`: Ascend SDK 路径(例如: `/usr/local/Ascend/ascend-toolkit/latest`) +- `ATB_HOME_PATH`: ATB 库路径(例如: `/usr/local/Ascend/nnal/nnal/atb/latest/atb/cxx_abi_0`) + +在项目根目录下: + +```bash +# 1. 创建构建目录 +mkdir -p build-ascend-demo && cd build-ascend-demo + +# 2. 配置 CMake +cmake .. \ + -DMLLM_BUILD_ASCEND_BACKEND=ON \ + -DMLLM_ENABLE_EXAMPLE=ON \ + -DCMAKE_BUILD_TYPE=Release + +# 3. 编译 +make ascend_add_demo -j$(nproc) + +# 4. 运行 +./examples/ascend_add_demo/ascend_add_demo +``` + +## 预期输出 + +``` +=== Ascend Add Op Demo === +1. Initializing Ascend backend... + ✓ Ascend backend initialized + +2. Creating input tensors... + Input x shape: [2, 3] + Input y shape: [2, 3] + +3. Transferring tensors to Ascend device... + ✓ Tensors transferred to Ascend + +4. Executing Add operation on Ascend... 
+ ✓ Add operation completed + +5. Transferring result back to CPU and verifying... + Expected result: [11, 22, 33, 44, 55, 66] + Actual result: [11, 22, 33, 44, 55, 66] + +✓ Test PASSED! All values match expected results. +``` + +## 注意事项 + +- 当前实现使用 float16 数据类型 +- 需要 Ascend NPU 设备可用 +- 确保已正确安装 Ascend SDK 和 ATB 库 + diff --git a/examples/ascend_add_demo/build_and_run.sh b/examples/ascend_add_demo/build_and_run.sh new file mode 100755 index 000000000..94e3563df --- /dev/null +++ b/examples/ascend_add_demo/build_and_run.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# Build-and-run script for the Ascend Add demo + +set -e # Exit immediately when a command fails + +# ANSI color codes for terminal output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +echo -e "${GREEN}=== Ascend Add Demo 编译和运行脚本 ===${NC}\n" + +# Check the required environment variables +echo -e "${YELLOW}检查环境变量...${NC}" +if [ -z "$ASCEND_HOME_PATH" ]; then + echo -e "${RED}错误: ASCEND_HOME_PATH 未设置${NC}" + exit 1 +fi +if [ -z "$ATB_HOME_PATH" ]; then + echo -e "${RED}错误: ATB_HOME_PATH 未设置${NC}" + exit 1 +fi +echo -e "${GREEN}✓ ASCEND_HOME_PATH: $ASCEND_HOME_PATH${NC}" +echo -e "${GREEN}✓ ATB_HOME_PATH: $ATB_HOME_PATH${NC}\n" + +# Resolve the project root from this script's own location +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +BUILD_DIR="$PROJECT_ROOT/build-ascend-demo" + +echo -e "${YELLOW}项目根目录: $PROJECT_ROOT${NC}" +echo -e "${YELLOW}构建目录: $BUILD_DIR${NC}\n" + +# Create the build directory if it does not exist yet +if [ ! -d "$BUILD_DIR" ]; then + echo -e "${YELLOW}创建构建目录...${NC}" + mkdir -p "$BUILD_DIR" +fi + +cd "$BUILD_DIR" + +# Configure CMake +echo -e "\n${YELLOW}配置 CMake...${NC}" +cmake "$PROJECT_ROOT" \ + -DMLLM_BUILD_ASCEND_BACKEND=ON \ + -DMLLM_ENABLE_EXAMPLE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + +# Build +echo -e "\n${YELLOW}开始编译...${NC}" + +# Under `set -e` a bare failing command exits the script before any `$?` test runs, +# so make is used as the `if` condition to keep the failure branch reachable. +if make ascend_add_demo -j"$(nproc)"; then + echo -e "\n${GREEN}✓ 编译成功!${NC}\n" + + # Run the demo; `|| demo_status=$?` records a failure without tripping `set -e` + echo -e "${YELLOW}运行 demo...${NC}\n" + demo_status=0 + ./examples/ascend_add_demo/ascend_add_demo || demo_status=$? + if [ "$demo_status" 
-eq 0 ]; then + echo -e "\n${GREEN}✓ Demo 运行成功!${NC}" + else + echo -e "\n${RED}✗ Demo 运行失败${NC}" + exit 1 + fi +else + echo -e "\n${RED}✗ 编译失败${NC}" + exit 1 +fi + diff --git a/mllm/CMakeLists.txt b/mllm/CMakeLists.txt index 06fa5aab2..615643afc 100644 --- a/mllm/CMakeLists.txt +++ b/mllm/CMakeLists.txt @@ -24,7 +24,6 @@ add_library( ${MLLM_RT_MODELS_SRC} ${MLLM_RT_COMPILE_SRC} ${MLLM_RT_AUTO_TUNE_SRC} - ${MLLM_QUALCOMM_AOT_SRC} ${WENET_AUDIO_SOURCES} ) diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp index 5ec76413a..7bd12d6e1 100644 --- a/mllm/backends/ascend/AscendBackend.cpp +++ b/mllm/backends/ascend/AscendBackend.cpp @@ -8,12 +8,18 @@ #include "mllm/backends/ascend/ops/AscendElewiseOps.hpp" #include "mllm/backends/ascend/ops/AscendX2XOp.hpp" +#include "mllm/backends/ascend/ops/AscendSiLUOp.hpp" +#include "mllm/backends/ascend/ops/AscendLinearOp.hpp" +#include "mllm/backends/ascend/ops/AscendRMSNormOp.hpp" +#include "mllm/backends/ascend/ops/AscendViewOp.hpp" +#include "mllm/backends/ascend/ops/AscendMatMulOp.hpp" +#include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp" namespace mllm::ascend { AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) { - regOpFactory(); - regOpFactory(); + regOpFactory(); auto& devices = AscendDeviceMetaInfo::instance().devices; for (const auto& device : devices) { const auto bytes_to_mb = [](size_t bytes) { return bytes / (1024.0 * 1024.0); }; diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index 140a5a31e..a1ada40cf 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -207,6 +207,13 @@ void syncGlobalAtbStream() { } void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) { + // Validate that the tensor is FP16 + if (t.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "fillAtbTensorDesc: Tensor must be FP16, but got dtype={}", + 
static_cast(t.dtype())); + } + desc.dtype = ACL_FLOAT16; // Currently hardcoded as per demo, can be expanded later desc.format = ACL_FORMAT_ND; @@ -217,6 +224,13 @@ void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) { } } +void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor) { + fillAtbTensorDesc(t, atb_tensor.desc); + atb_tensor.deviceData = reinterpret_cast(t.ptr()); + // Use MLLM tensor's actual bytes as dataSize to match allocated memory + atb_tensor.dataSize = t.bytes(); +} + AscendDeviceMetaInfo::AscendDeviceMetaInfo() { #ifndef ASCENDC_CPU_DEBUG // Initialize ACL to query devices @@ -231,7 +245,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() { ret = aclrtGetDeviceCount(&device_count); if (ret != ACL_SUCCESS) { MLLM_ERROR("Failed to get Ascend device count: {}", ret); - aclFinalize(); return; } @@ -265,9 +278,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() { devices.push_back(info); } - - // Finalize ACL after enumeration - aclFinalize(); #else // In CPU debug mode, add a dummy device AscendDeviceInfo info; diff --git a/mllm/backends/ascend/AscendCommon.hpp b/mllm/backends/ascend/AscendCommon.hpp index 8d74c8707..5a2b69dc8 100644 --- a/mllm/backends/ascend/AscendCommon.hpp +++ b/mllm/backends/ascend/AscendCommon.hpp @@ -41,6 +41,9 @@ void syncGlobalAtbStream(); // Convert MLLM Tensor metadata to ATB TensorDesc void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc); +// Fill an ATB Tensor from an MLLM Tensor: desc via fillAtbTensorDesc, deviceData from t.ptr(), dataSize from t.bytes() +void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor); + // Ascend device information structure struct AscendDeviceInfo { std::string name; diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.cpp b/mllm/backends/ascend/ops/AscendElewiseOps.cpp index 762ef1dfe..38bc4b139 100644 --- a/mllm/backends/ascend/ops/AscendElewiseOps.cpp +++ b/mllm/backends/ascend/ops/AscendElewiseOps.cpp @@ -34,9 +34,9 @@ void AscendAddOp::forward(const std::vector& inputs, std::vector if (x.dtype() != y.dtype() || 
x.dtype() != z.dtype()) { NYI("AscendAddOp currently requires x/y/z have same dtype"); } - if (x.numel() != y.numel() || x.numel() != z.numel()) { - NYI("AscendAddOp demo only supports no-broadcast case (numel equal)"); - } + + // ATB ELEWISE_ADD supports broadcasting automatically + // No need to check numel equality atb::infer::ElewiseParam addParam; addParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_ADD; @@ -53,6 +53,88 @@ void AscendAddOp::forward(const std::vector& inputs, std::vector atb::Tensor atb_y; atb::Tensor atb_z; + fillAtbTensor(x, atb_x); + fillAtbTensor(y, atb_y); + fillAtbTensor(z, atb_z); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + inTensors.push_back(atb_y); + outTensors.push_back(atb_z); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + { + ASCEND_TIME_SCOPE("AscendAddOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Execute failed, status={}", static_cast(st)); + } + + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +AscendSubOp::AscendSubOp(const aops::SubOpOptions& options) : aops::SubOp(options) {} + +void AscendSubOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void 
AscendSubOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 2); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + const auto& y = inputs[1]; + auto& z = outputs[0]; + + if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) { + NYI("AscendSubOp currently requires x/y/z have same dtype"); + } + + // ATB ELEWISE_SUB supports broadcasting automatically + // No need to check numel equality + + atb::infer::ElewiseParam subParam; + subParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_SUB; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(subParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_SUB) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_y; + atb::Tensor atb_z; + fillAtbTensorDesc(x, atb_x.desc); fillAtbTensorDesc(y, atb_y.desc); fillAtbTensorDesc(z, atb_z.desc); @@ -77,7 +159,7 @@ void AscendAddOp::forward(const std::vector& inputs, std::vector uint64_t workspaceSize = 0; st = op->Setup(vp, workspaceSize, atb_ctx); if (st != atb::NO_ERROR) { - MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Setup failed, status={}", static_cast(st)); + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Setup failed, status={}", static_cast(st)); } void* workspace = nullptr; @@ -88,13 +170,100 @@ void AscendAddOp::forward(const std::vector& inputs, std::vector mem_mgr.getBlockPtr(workspace_block_id, workspace); } { - ASCEND_TIME_SCOPE("AscendAddOp::forward"); + ASCEND_TIME_SCOPE("AscendSubOp::forward"); st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); } if (st != atb::NO_ERROR) { - MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Execute failed, status={}", static_cast(st)); + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Execute failed, status={}", static_cast(st)); } + + syncGlobalAtbStream(); + 
+ if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +AscendMulOp::AscendMulOp(const aops::MulOpOptions& options) : aops::MulOp(options) {} + +void AscendMulOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendMulOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 2); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + const auto& y = inputs[1]; + auto& z = outputs[0]; + + if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) { + NYI("AscendMulOp currently requires x/y/z have same dtype"); + } + + // ATB ELEWISE_MUL supports broadcasting automatically + // No need to check numel equality + + atb::infer::ElewiseParam mulParam; + mulParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_MUL; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(mulParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_MUL) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_y; + atb::Tensor atb_z; + + fillAtbTensorDesc(x, atb_x.desc); + fillAtbTensorDesc(y, atb_y.desc); + fillAtbTensorDesc(z, atb_z.desc); + + atb_x.deviceData = reinterpret_cast(x.ptr()); + atb_x.dataSize = x.bytes(); + atb_y.deviceData = reinterpret_cast(y.ptr()); + atb_y.dataSize = y.bytes(); + atb_z.deviceData = reinterpret_cast(z.ptr()); + atb_z.dataSize = z.bytes(); + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + inTensors.push_back(atb_y); + outTensors.push_back(atb_z); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + 
MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + { + ASCEND_TIME_SCOPE("AscendMulOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Execute failed, status={}", static_cast(st)); + } syncGlobalAtbStream(); diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.hpp b/mllm/backends/ascend/ops/AscendElewiseOps.hpp index 26117cbc2..9122e20cb 100644 --- a/mllm/backends/ascend/ops/AscendElewiseOps.hpp +++ b/mllm/backends/ascend/ops/AscendElewiseOps.hpp @@ -24,4 +24,34 @@ class AscendAddOpFactory final : public TypedOpFactory& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendSubOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::SubOpOptions& options) override { + return std::make_shared(options); + } +}; + +class AscendMulOp final : public aops::MulOp { + public: + explicit AscendMulOp(const aops::MulOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendMulOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::MulOpOptions& options) override { + return std::make_shared(options); + } +}; + } // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendLinearOp.cpp b/mllm/backends/ascend/ops/AscendLinearOp.cpp new file mode 100644 index 000000000..41040cf74 --- /dev/null +++ 
b/mllm/backends/ascend/ops/AscendLinearOp.cpp @@ -0,0 +1,167 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/ascend/ops/AscendLinearOp.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendLinearOp::AscendLinearOp(const aops::LinearOpOptions& options) : aops::LinearOp(options) {} + +void AscendLinearOp::reshape(const std::vector& inputs, std::vector& outputs) { + if (options().isRedirect()) { + const auto& input = inputs[0]; + const auto& weight = inputs[1]; + auto out_shape = input.shape(); + out_shape[out_shape.size() - 1] = weight.shape()[0]; // out_channels + outputs.emplace_back(Tensor::empty(out_shape, input.dtype(), input.device())); + return; + } + aops::LinearOp::reshape(inputs, outputs); +} + +void AscendLinearOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendLinearOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT(inputs.size() >= 1 && inputs.size() <= 3); + + const Tensor* weight_ptr = nullptr; + const Tensor* bias_ptr = nullptr; + + if (inputs.size() == 1) { + weight_ptr = &weight(); + if (options().bias) { bias_ptr = &bias(); } + } else if (inputs.size() == 2) { + weight_ptr = &inputs[1]; + } else if (inputs.size() == 3) { + weight_ptr = &inputs[1]; + bias_ptr = &inputs[2]; + } + + const auto& x = inputs[0]; + auto& y = outputs[0]; + + // Validate that input tensors are FP16 + if (x.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Input tensor must be FP16, but got dtype={}", + static_cast(x.dtype())); + } + if (weight_ptr->dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Weight 
tensor must be FP16, but got dtype={}", + static_cast(weight_ptr->dtype())); + } + if (bias_ptr != nullptr && bias_ptr->dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Bias tensor must be FP16, but got dtype={}", + static_cast(bias_ptr->dtype())); + } + + // Validate bias dimensions: ATB Linear requires bias to be 2D [1, out_channels] + if (bias_ptr != nullptr) { + const auto& bias_shape = bias_ptr->shape(); + if (bias_shape.size() == 1) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Bias tensor must be 2D [1, out_channels], but got 1D shape with size={}. " + "Please reshape the bias tensor before passing to AscendLinearOp.", + bias_shape[0]); + } + if (bias_shape.size() != 2 || bias_shape[0] != 1) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Bias tensor must be 2D with shape [1, out_channels], but got shape=[{}, {}]", + bias_shape.size() >= 1 ? bias_shape[0] : 0, + bias_shape.size() >= 2 ? bias_shape[1] : 0); + } + } + + + atb::infer::LinearParam linearParam; + linearParam.transposeA = false; + linearParam.transposeB = true; // Set to true because weight is [out_channels, in_channels] + linearParam.hasBias = (bias_ptr != nullptr); + linearParam.outDataType = ACL_DT_UNDEFINED; + linearParam.enAccum = false; + linearParam.matmulType = atb::infer::LinearParam::MATMUL_UNDEFINED; + linearParam.quantMode = atb::infer::LinearParam::QUANT_UNDEFINED; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(linearParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(Linear) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_weight; + atb::Tensor atb_y; + atb::Tensor atb_bias; + + fillAtbTensor(x, atb_x); + fillAtbTensor(*weight_ptr, atb_weight); + fillAtbTensor(y, atb_y); + + atb::SVector inTensors; + atb::SVector outTensors; + 
inTensors.push_back(atb_x); + inTensors.push_back(atb_weight); + + if (bias_ptr != nullptr) { + fillAtbTensor(*bias_ptr, atb_bias); + inTensors.push_back(atb_bias); + } + + outTensors.push_back(atb_y); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB LinearOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + + { + ASCEND_TIME_SCOPE("AscendLinearOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB LinearOp Execute failed, status={}", static_cast(st)); + } + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendLinearOp.hpp b/mllm/backends/ascend/ops/AscendLinearOp.hpp new file mode 100644 index 000000000..a0b78de4c --- /dev/null +++ b/mllm/backends/ascend/ops/AscendLinearOp.hpp @@ -0,0 +1,28 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/LinearOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendLinearOp final : public aops::LinearOp { + public: + explicit AscendLinearOp(const aops::LinearOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; + void reshape(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendLinearOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::LinearOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendMatMulOp.cpp b/mllm/backends/ascend/ops/AscendMatMulOp.cpp new file mode 100644 index 000000000..9e2013695 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendMatMulOp.cpp @@ -0,0 +1,147 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/ops/AscendMatMulOp.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendMatMulOp::AscendMatMulOp(const aops::MatMulOpOptions& options) : aops::MatMulOp(options) {} + +void AscendMatMulOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +namespace { + +// Helper to fill ATB tensor with custom shape (for reshape without copy) +void fillAtbTensorWithShape(const Tensor& t, atb::Tensor& atb_tensor, const std::vector& shape) { + atb::TensorDesc desc; + desc.dtype = ACL_FLOAT16; // Ascend uses FP16 + desc.format = ACL_FORMAT_ND; + + desc.shape.dimNum = shape.size(); + for (size_t i = 0; i < shape.size(); ++i) { + desc.shape.dims[i] = shape[i]; + } + + atb_tensor.desc = desc; + atb_tensor.dataSize = atb::Utils::GetTensorSize(atb_tensor); + atb_tensor.deviceData = reinterpret_cast(t.ptr()); +} + +} // namespace + +void AscendMatMulOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 2); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& A = inputs[0]; + const auto& B = inputs[1]; + auto& C = outputs[0]; + + // ATB Linear/MatMul only supports 2D/3D tensors. + // For 4D tensors [B, H, S, D], we reshape to 3D [B*H, S, D], compute, then reshape back. 
+ const auto& a_shape = A.shape(); + const auto& b_shape = B.shape(); + const auto& c_shape = C.shape(); + // Flatten to 3D only when A, B and C are all 4D: indexing dims [2]/[3] of a lower-rank B or C would read out of bounds. + const bool is_4d = (a_shape.size() == 4) && (b_shape.size() == 4) && (c_shape.size() == 4); + + // Prepare shapes for ATB + std::vector atb_a_shape, atb_b_shape, atb_c_shape; + + if (is_4d) { + // Reshape [B, H, S, D] -> [B*H, S, D] + int64_t batch_heads_a = static_cast(a_shape[0]) * static_cast(a_shape[1]); + int64_t batch_heads_b = static_cast(b_shape[0]) * static_cast(b_shape[1]); + int64_t batch_heads_c = static_cast(c_shape[0]) * static_cast(c_shape[1]); + + atb_a_shape = {batch_heads_a, static_cast(a_shape[2]), static_cast(a_shape[3])}; + atb_b_shape = {batch_heads_b, static_cast(b_shape[2]), static_cast(b_shape[3])}; + atb_c_shape = {batch_heads_c, static_cast(c_shape[2]), static_cast(c_shape[3])}; + } else { + // Any other rank combination: pass the original shapes through unchanged + for (auto dim : a_shape) atb_a_shape.push_back(static_cast(dim)); + for (auto dim : b_shape) atb_b_shape.push_back(static_cast(dim)); + for (auto dim : c_shape) atb_c_shape.push_back(static_cast(dim)); + } + + // Create LinearParam for ATB (used for MatMul) + atb::infer::LinearParam linearParam; + linearParam.transposeA = options_.transpose_a; + linearParam.transposeB = options_.transpose_b; + linearParam.hasBias = false; + linearParam.outDataType = ACL_DT_UNDEFINED; + linearParam.enAccum = false; + linearParam.matmulType = atb::infer::LinearParam::MATMUL_UNDEFINED; + linearParam.quantMode = atb::infer::LinearParam::QUANT_UNDEFINED; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(linearParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(MatMul) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_A, atb_B, atb_C; + fillAtbTensorWithShape(A, atb_A, atb_a_shape); + fillAtbTensorWithShape(B, atb_B, atb_b_shape); + fillAtbTensorWithShape(C, atb_C, atb_c_shape); + + atb::SVector inTensors; + atb::SVector outTensors; + 
inTensors.push_back(atb_A); + inTensors.push_back(atb_B); + outTensors.push_back(atb_C); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MatMulOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + + { + ASCEND_TIME_SCOPE("AscendMatMulOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MatMulOp Execute failed, status={}", static_cast(st)); + } + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendMatMulOp.hpp b/mllm/backends/ascend/ops/AscendMatMulOp.hpp new file mode 100644 index 000000000..059464b25 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendMatMulOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/MatMulOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendMatMulOp final : public aops::MatMulOp { + public: + explicit AscendMatMulOp(const aops::MatMulOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendMatMulOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::MatMulOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.cpp b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp new file mode 100644 index 000000000..54b3eeda2 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp @@ -0,0 +1,106 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/ascend/ops/AscendRMSNormOp.hpp" + +#include +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendRMSNormOp::AscendRMSNormOp(const aops::RMSNormOpOptions& options) : aops::RMSNormOp(options) {} + +void AscendRMSNormOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendRMSNormOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT(inputs.size() == 1 || inputs.size() == 2); // 1 input: weight comes from weight_; 2 inputs: explicit weight in inputs[1] + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + const auto& weight = (inputs.size() == 2) ? 
inputs[1] : weight_; + auto& y = outputs[0]; + + const Tensor& weight_for_atb = weight; + + if (x.dtype() != y.dtype()) { + NYI("AscendRMSNormOp currently requires x/y have same dtype"); + } + if (x.numel() != y.numel()) { + NYI("AscendRMSNormOp requires x/y have same numel"); + } + + atb::infer::RmsNormParam rmsNormParam; + rmsNormParam.layerType = atb::infer::RmsNormParam::RmsNormType::RMS_NORM_NORM; + rmsNormParam.normParam.quantType = atb::infer::QuantType::QUANT_UNQUANT; + rmsNormParam.normParam.epsilon = options_.epsilon; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(rmsNormParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(RMS_NORM) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_weight; + atb::Tensor atb_y; + + fillAtbTensor(x, atb_x); + fillAtbTensor(weight_for_atb, atb_weight); + fillAtbTensor(y, atb_y); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + inTensors.push_back(atb_weight); + outTensors.push_back(atb_y); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB RMSNormOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + { + ASCEND_TIME_SCOPE("AscendRMSNormOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB RMSNormOp Execute failed, status={}", static_cast(st)); + } + + 
syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.hpp b/mllm/backends/ascend/ops/AscendRMSNormOp.hpp new file mode 100644 index 000000000..65b899509 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendRMSNormOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/RMSNormOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendRMSNormOp final : public aops::RMSNormOp { + public: + explicit AscendRMSNormOp(const aops::RMSNormOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendRMSNormOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::RMSNormOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendSiLUOp.cpp b/mllm/backends/ascend/ops/AscendSiLUOp.cpp new file mode 100644 index 000000000..3a2299bf2 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendSiLUOp.cpp @@ -0,0 +1,115 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/ops/AscendSiLUOp.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendSiLUOp::AscendSiLUOp(const aops::SiLUOpOptions& options) : aops::SiLUOp(options) {} + +void AscendSiLUOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendSiLUOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 1); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + auto& y = outputs[0]; + + // Validate that input tensors are FP16 + if (x.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendSiLUOp: Input tensor must be FP16, but got dtype={}", + static_cast(x.dtype())); + } + if (y.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendSiLUOp: Output tensor must be FP16, but got dtype={}", + static_cast(y.dtype())); + } + + if (x.dtype() != y.dtype()) { + NYI("AscendSiLUOp currently requires x/y have same dtype"); + } + if (x.numel() != y.numel()) { + NYI("AscendSiLUOp requires x/y have same numel"); + } + + atb::infer::ActivationParam siluParam; + siluParam.activationType = atb::infer::ACTIVATION_SWISH; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(siluParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ACTIVATION_SWISH) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_y; + + fillAtbTensorDesc(x, atb_x.desc); + fillAtbTensorDesc(y, atb_y.desc); + + atb_x.deviceData = reinterpret_cast(x.ptr()); + atb_x.dataSize = x.bytes(); + atb_y.deviceData = 
reinterpret_cast(y.ptr()); + atb_y.dataSize = y.bytes(); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + outTensors.push_back(atb_y); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SiLUOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + { + ASCEND_TIME_SCOPE("AscendSiLUOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SiLUOp Execute failed, status={}", static_cast(st)); + } + + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendSiLUOp.hpp b/mllm/backends/ascend/ops/AscendSiLUOp.hpp new file mode 100644 index 000000000..421dd49d3 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendSiLUOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/SiLUOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendSiLUOp final : public aops::SiLUOp { + public: + explicit AscendSiLUOp(const aops::SiLUOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendSiLUOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::SiLUOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp new file mode 100644 index 000000000..db0ef47ea --- /dev/null +++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp @@ -0,0 +1,135 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendSoftmaxOp::AscendSoftmaxOp(const aops::SoftmaxOpOptions& options) : aops::SoftmaxOp(options) {} + +void AscendSoftmaxOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendSoftmaxOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 1); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + auto& y = outputs[0]; + + // Validate that input tensors are FP16 + if (x.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendSoftmaxOp: Input tensor must be FP16, but got dtype={}", + static_cast(x.dtype())); + } + if (y.dtype() != 
MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendSoftmaxOp: Output tensor must be FP16, but got dtype={}", + static_cast(y.dtype())); + } + + if (x.dtype() != y.dtype()) { + NYI("AscendSoftmaxOp currently requires x/y have same dtype"); + } + if (x.numel() != y.numel()) { + NYI("AscendSoftmaxOp requires x/y have same numel"); + } + + // Configure Softmax parameters + atb::infer::SoftmaxParam softmaxParam; + + // Convert axis to positive index if negative + int axis = options_.axis; + if (axis < 0) { + axis = static_cast(x.rank()) + axis; + } + + // ATB expects axes as SVector + softmaxParam.axes.push_back(static_cast(axis)); + + // Create ATB operation + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(softmaxParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "ATB CreateOperation(Softmax) failed, status={}", + static_cast(st)); + } + + // Get global ATB context + atb::Context* atb_ctx = getGlobalAtbContext(); + + // Prepare ATB tensors + atb::Tensor atb_x; + atb::Tensor atb_y; + + fillAtbTensor(x, atb_x); + fillAtbTensor(y, atb_y); + + // Setup input/output tensors + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + outTensors.push_back(atb_y); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + // Setup operation (calculate required workspace size) + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "ATB SoftmaxOp Setup failed, status={}", + static_cast(st)); + } + + // Allocate workspace if needed + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + + // Execute operation + { + 
ASCEND_TIME_SCOPE("AscendSoftmaxOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "ATB SoftmaxOp Execute failed, status={}", + static_cast(st)); + } + + // Synchronize stream + syncGlobalAtbStream(); + + // Free workspace + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + // Destroy operation + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp new file mode 100644 index 000000000..c52cb85e0 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/SoftmaxOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendSoftmaxOp final : public aops::SoftmaxOp { + public: + explicit AscendSoftmaxOp(const aops::SoftmaxOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendSoftmaxOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::SoftmaxOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendViewOp.cpp b/mllm/backends/ascend/ops/AscendViewOp.cpp new file mode 100644 index 000000000..e7780cab2 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendViewOp.cpp @@ -0,0 +1,16 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/ops/AscendViewOp.hpp" + +namespace mllm::ascend { + +AscendViewOp::AscendViewOp(const aops::ViewOpOptions& options) : aops::ViewOp(options) {} + +void AscendViewOp::forward(const std::vector& inputs, std::vector& outputs) { + // View operation only changes metadata (shape), not actual data + // Just call the base class implementation which is empty + aops::ViewOp::forward(inputs, outputs); +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscendViewOp.hpp b/mllm/backends/ascend/ops/AscendViewOp.hpp new file mode 100644 index 000000000..50918dcf9 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendViewOp.hpp @@ -0,0 +1,25 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/ViewOp.hpp" + +namespace mllm::ascend { + +class AscendViewOp final : public aops::ViewOp { + public: + explicit AscendViewOp(const aops::ViewOpOptions& options); + + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendViewOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::ViewOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp deleted file mode 100644 index 0f67bab56..000000000 --- a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp +++ /dev/null @@ -1,726 +0,0 @@ -// Copyright (c) MLLM Team. -// Licensed under the MIT License. 
-#include -#include - -#include - -#include -#include -#include -#include -#include - -#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp" -#include "mllm/core/DataTypes.hpp" -#include "mllm/utils/Common.hpp" -#include "mllm/backends/qnn/QNNTypeMacros.hpp" -#include "mllm/compile/ir/linalg/Attribute.hpp" -#include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp" -#include "mllm/backends/qnn/aot/QnnTargetMachine.hpp" -#include "mllm/backends/qnn/QNNUtils.hpp" -#include "mllm/utils/Log.hpp" - -namespace mllm::qnn::aot { - -QnnAOTNodeTensor::QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight) { - auto type = parseQnnTensorTypeFromIR(v); - auto name = v->name(); - auto quant = parseQnnQuantizeParamFromIR(v); - - if (force_static_weight || type == QNN_TENSOR_TYPE_STATIC) { - tensor_wrapper_ = mllm::qnn::QNNTensorWrapper::createStaticTensor(name, v->tensor_, quant); - } else { - tensor_wrapper_ = mllm::qnn::QNNTensorWrapper::create(name, type, v->tensor_, quant); - } - setupComplexTensorQuantization(v); // per-channel and LPBQ cases -} - -Qnn_TensorType_t QnnAOTNodeTensor::parseQnnTensorTypeFromIR(const ir::tensor::TensorValue::ptr_t& v) { - auto type = v->tensor_.memType(); - Qnn_TensorType_t ret_qnn_tensor_type = QNN_TENSOR_TYPE_UNDEFINED; - switch (type) { - case kTensorMemTypes_Start: { - break; - } - - // For MLLM Frame work to use - case kNormal: { - ret_qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE; - break; - } - case kExtraInput: { - ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - break; - } - case kExtraOutput: { - ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - break; - } - case kManual: { - ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE; - break; - } - case kGlobal: { - ret_qnn_tensor_type = QNN_TENSOR_TYPE_STATIC; - break; - } - - // Framework need to judge if this tensor is mmap from disk. 
- case kParams_Start: - case kParamsMMAP: - case kParamsNormal: - case kParams_End: { - ret_qnn_tensor_type = QNN_TENSOR_TYPE_STATIC; - break; - } - - // For QNN Backend to use. - case kQnnAppRead: { - ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - break; - } - case kQnnAppWrite: { - ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - break; - } - case kQnnAppReadWrite: { - ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE; - break; - } - case kTensorMemTypes_End: break; - } - - // Check Attribute. The Attribute priority is higher than tensor type - if (v->getAttr("qnn_graph_outputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - if (v->getAttr("qnn_graph_inputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; } - if (v->getAttr("constant")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_STATIC; } - - return ret_qnn_tensor_type; -} - -Qnn_DataType_t QnnAOTNodeTensor::parseQnnDataTypeFromIR(const ir::tensor::TensorValue::ptr_t& v) { - return mllm::qnn::mllmDataTypeToQnnDataType(v->tensor_.dtype()); -} - -std::string QnnAOTNodeTensor::parseQnnTensorNameFromIR(const ir::tensor::TensorValue::ptr_t& v) { return v->name(); } - -Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::tensor::TensorValue::ptr_t& v) { - Qnn_QuantizeParams_t ret = QNN_QUANTIZE_PARAMS_INIT; - - MLLM_RT_ASSERT(v); - MLLM_RT_ASSERT(v->getAttr("quant_recipe")); - auto quant_spec = v->getAttr("quant_recipe")->cast_()->spec_; - - switch (quant_spec->type) { - case ir::linalg::QuantizationSpecType::kRaw: - case ir::linalg::QuantizationSpecType::kSymPerChannel: - case ir::linalg::QuantizationSpecType::kLPBQ: { - break; - } - case ir::linalg::QuantizationSpecType::kAsymPerTensor: { - auto cfg = std::static_pointer_cast(quant_spec); - ret.encodingDefinition = QNN_DEFINITION_DEFINED; - ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; - if (!cfg->scale || !cfg->zero_point) { - MLLM_ERROR_EXIT(ExitCode::kCoreError, "AsymPerTensor quant recipe has no scale 
or zero point. tensor: {}", v->name()); - } - ret.scaleOffsetEncoding = - Qnn_ScaleOffset_t{.scale = cfg->scale.item(), .offset = -cfg->zero_point.item()}; - MLLM_INFO("Configuring AsymPerTensor quantization for tensor: {}, scale: {}, zero_point: {}", v->name(), - cfg->scale.item(), cfg->zero_point.item()); - break; - } - case ir::linalg::QuantizationSpecType::kSymPerTensor: { - auto cfg = std::static_pointer_cast(quant_spec); - ret.encodingDefinition = QNN_DEFINITION_DEFINED; - ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; - if (!cfg->scale) { - MLLM_ERROR_EXIT(ExitCode::kCoreError, "SymPerTensor quant recipe has no scale. tensor: {}", v->name()); - } - ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item(), .offset = 0}; - MLLM_INFO("Configuring SymPerTensor quantization for tensor: {}, scale: {}", v->name(), cfg->scale.item()); - break; - } - default: { - MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle kNone type"); - } - } - - return ret; -} - -void QnnAOTNodeTensor::setupComplexTensorQuantization(const ir::tensor::TensorValue::ptr_t& v) { - MLLM_RT_ASSERT(v->getAttr("quant_recipe")); - auto quant_spec = v->getAttr("quant_recipe")->cast_()->spec_; - - switch (quant_spec->type) { - case ir::linalg::QuantizationSpecType::kSymPerChannel: { - auto cfg = std::static_pointer_cast(quant_spec); - - // Prepare data - auto num_scale_offsets = (uint32_t)v->tensor_.size(cfg->ch_axis); - std::vector scale_offsets(num_scale_offsets); - MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale.size(0)); - MLLM_RT_ASSERT_EQ(cfg->scale.dtype(), kFloat32); - for (int i = 0; i < num_scale_offsets; ++i) { - scale_offsets[i].scale = cfg->scale.at({i}); - scale_offsets[i].offset = 0; - } - - tensor_wrapper_->setScaleOffsetQuantization(scale_offsets, cfg->ch_axis); - break; - } - case ir::linalg::QuantizationSpecType::kLPBQ: { - MLLM_INFO("Solving LPBQ quantization for tensor: {}", v->tensor_.name()); - // This LPBQ Type is for Conv2D Only !!! 
Linear has diff layout cmp with conv2d - - auto cfg = std::static_pointer_cast(quant_spec); - - // Prepare data - auto num_scale_offsets = (uint32_t)v->tensor_.size(-1); - std::vector scale_offsets(num_scale_offsets); - MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale_level_1_fp.size(-1)); - MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.dtype(), kUInt8); - MLLM_RT_ASSERT_EQ(cfg->scale_level_1_fp.dtype(), kFloat32); - MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.rank(), 1); - MLLM_RT_ASSERT_EQ(cfg->scale_level_1_fp.rank(), 1); - for (int i = 0; i < num_scale_offsets; ++i) { - scale_offsets[i].scale = cfg->scale_level_1_fp.at({i}); - scale_offsets[i].offset = 0; - } - - Qnn_BlockwiseExpansion_t blockwise_expansion; - blockwise_expansion.axis = v->tensor_.rank() - 1; - blockwise_expansion.scaleOffsets = nullptr; // Will be set by setBlockwiseQuantization - blockwise_expansion.numBlocksPerAxis = v->tensor_.size(-2) / cfg->block_size; - blockwise_expansion.blockScaleBitwidth = 4; // 4 bits for uint4 scale - blockwise_expansion.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8; - blockwise_expansion.blocksScale8 = cfg->scale_level_0_int.ptr(); - - tensor_wrapper_->setBlockwiseQuantization(blockwise_expansion, scale_offsets); - break; - } - default: break; - } -} - -// QnnAOTNodeOperation implementations -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addInputs(const std::vector& ins) { - inputs.insert(inputs.end(), ins.begin(), ins.end()); - return shared_from_this(); -} - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addOutputs(const std::vector& ous) { - outputs.insert(outputs.end(), ous.begin(), ous.end()); - return shared_from_this(); -} - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceInput(const QnnAOTNodeTensor::ptr_t& input) { - inputs.push_back(input); - return shared_from_this(); -} - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceOutput(const QnnAOTNodeTensor::ptr_t& output) { - outputs.push_back(output); - return 
shared_from_this(); -} - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addParamScalar( - const std::vector>& params) { - param_scalar.insert(param_scalar.end(), params.begin(), params.end()); - return shared_from_this(); -} - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceParamScalar( - const std::shared_ptr& param) { - param_scalar.push_back(param); - return shared_from_this(); -} - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addParamTensor( - const std::vector>& params) { - param_tensor.insert(param_tensor.end(), params.begin(), params.end()); - return shared_from_this(); -} - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceParamTensor( - const std::shared_ptr& param) { - param_tensor.push_back(param); - return shared_from_this(); -} - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::setOpName(const std::string& op_name) { - op_name_ = op_name; - return shared_from_this(); -} - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::setName(const std::string& name) { - name_ = name; - return shared_from_this(); -} - -std::string QnnAOTNodeOperation::getName() { return name_; } - -QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::setPackageName(const std::string& package_name) { - package_name_ = package_name; - return shared_from_this(); -} - -QnnAOTGraph::QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle_t backendHandle, - Qnn_ContextHandle_t contextHandle, const std::string& graphName) { - qnn_model_ = std::make_shared(qnnInterface, backendHandle); - - // Short Depth Conv On HMX Off - QnnHtpGraph_CustomConfig_t* p_custom_config = nullptr; - // FIXME: @chenghuaWang The code below will make llm inference slow!!! 
- // p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t)); - // p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF; - // p_custom_config->shortDepthConvOnHmxOff = true; - // htp_graph_configs.push_back(static_cast(p_custom_config)); - - // Fold Relu Activation Into Conv Off - p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t)); - p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF; - p_custom_config->foldReluActivationIntoConvOff = true; - htp_graph_configs.push_back(static_cast(p_custom_config)); - - // FIXME: If need or not - p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t)); - p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; - p_custom_config->precision = QNN_PRECISION_FLOAT16; - htp_graph_configs.push_back(static_cast(p_custom_config)); - - // Optimization level - p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t)); - p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - p_custom_config->optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - p_custom_config->optimizationOption.floatValue = 3; - htp_graph_configs.push_back(static_cast(p_custom_config)); - - // VTCM Size - p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t)); - p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - p_custom_config->vtcmSizeInMB = 8; - htp_graph_configs.push_back(static_cast(p_custom_config)); - - qnn_graph_configs.resize(htp_graph_configs.size()); - qnn_graph_configs.reserve(htp_graph_configs.size() + 1); - for (int i = 0; i < htp_graph_configs.size(); ++i) { - qnn_graph_configs[i].option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - qnn_graph_configs[i].customConfig = htp_graph_configs[i]; - qnn_graph_config_pass_in_.push_back(&qnn_graph_configs[i]); - } - - 
qnn_graph_config_pass_in_.push_back(nullptr); - - qnn_model_->initialize(contextHandle, graphName.c_str(), false, 1, qnn_graph_config_pass_in_.data()); -} - -void QnnAOTGraph::addTensor(const QnnAOTNodeTensor::ptr_t& tensor) { - qnn_model_->addTensorWrapper(tensor->getWrapper()); - all_tensors_.insert({tensor->getWrapper()->getName(), tensor}); -} - -void QnnAOTGraph::addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op) { - std::vector inputNames; - for (auto& in : qnn_op->inputs) inputNames.push_back(in->getWrapper()->getName()); - - std::vector outputNames; - for (auto& out : qnn_op->outputs) outputNames.push_back(out->getWrapper()->getName()); - - for (auto& in : qnn_op->inputs) qnn_model_->addTensorWrapper(in->getWrapper()); - for (auto& out : qnn_op->outputs) qnn_model_->addTensorWrapper(out->getWrapper()); - - qnn_model_->addNode(QNN_OPCONFIG_VERSION_1, qnn_op->name_, qnn_op->package_name_, qnn_op->op_name_, qnn_op->param_tensor, - qnn_op->param_scalar, inputNames, outputNames); - - op_node_.insert({qnn_op->getName(), qnn_op}); -} - -bool QnnAOTGraph::compile() { - if (is_compiled_) { return true; } - bool ret = qnn_model_->finalizeGraph(nullptr, nullptr) == mllm::qnn::MODEL_NO_ERROR; - is_compiled_ = true; - return ret; -} - -const std::vector QnnDynSymbolLoader::possible_qnn_dyn_lib_paths_{ - "/opt/qcom/aistack/qairt/2.41.0.251128/lib/x86_64-linux-clang/", -}; - -QnnDynSymbolLoader::~QnnDynSymbolLoader() { - for (auto& item : libs_) { - if (item.second.handle_) { dlclose(item.second.handle_); } - } -} - -bool QnnDynSymbolLoader::loadQnnDynLib(const std::string& lib_name, int flag) { - for (auto const& path : possible_qnn_dyn_lib_paths_) { - auto real_path = path + lib_name; - auto handle = dlopen(real_path.c_str(), flag); - if (handle) { - auto descriptor = QnnDynLibDescriptor{.lib_name_ = lib_name, .lib_path_ = path, .handle_ = handle}; - libs_.insert({lib_name, descriptor}); - MLLM_INFO("QnnDynSymbolLoader::loadQnnDynLib {} success.", real_path); - 
return true; - } else { - char* error = dlerror(); - MLLM_ERROR("QnnDynSymbolLoader::loadQnnDynLib try for {} failed: {}", real_path, error ? error : "Unknown error"); - } - } - MLLM_ERROR("QnnDynSymbolLoader::loadQnnDynLib {} failed.", lib_name); - return false; -} - -bool QnnDynSymbolLoader::loadQnnDynLibAtPath(const std::string& path, const std::string& lib_name, int flag) { - auto real_path = path + lib_name; - auto handle = dlopen(real_path.c_str(), flag); - if (handle) { - auto descriptor = QnnDynLibDescriptor{.lib_name_ = lib_name, .lib_path_ = path, .handle_ = handle}; - libs_.insert({lib_name, descriptor}); - MLLM_INFO("QnnDynSymbolLoader::loadQnnDynLib {} success.", real_path); - return true; - } else { - char* error = dlerror(); - MLLM_ERROR("QnnDynSymbolLoader::loadQnnDynLib try for {} failed: {}", real_path, error ? error : "Unknown error"); - } - MLLM_ERROR("QnnDynSymbolLoader::loadQnnDynLib {} failed.", lib_name); - return false; -} - -QnnAOTEnv::QnnAOTEnv(const QcomTargetMachine& target_machine) : target_machine_(target_machine) { _setup(); } - -QnnAOTEnv::QnnAOTEnv(const std::string& lib_path, const QcomTargetMachine& target_machine) : target_machine_(target_machine) { - _setup(lib_path); -} - -void QnnAOTEnv::_setup(const std::string& path) { - auto& loader = QnnDynSymbolLoader::instance(); - std::string htp_backend_lib_name = "libQnnHtp.so"; - // GLOBAL Load - if (path.empty()) { - if (!loader.loadQnnDynLib(htp_backend_lib_name, - QnnDynSymbolLoader::DynFlag::kRTLD_NOW | QnnDynSymbolLoader::DynFlag::kRTLD_GLOBAL)) { - MLLM_ERROR("QnnAOTEnv::QnnAOTEnv {} failed.", htp_backend_lib_name); - exit(1); - } - } else { - if (!loader.loadQnnDynLibAtPath(path, htp_backend_lib_name, - QnnDynSymbolLoader::DynFlag::kRTLD_NOW | QnnDynSymbolLoader::DynFlag::kRTLD_GLOBAL)) { - MLLM_ERROR("QnnAOTEnv::QnnAOTEnv {} failed.", htp_backend_lib_name); - exit(1); - } - } - - auto qnn_interface_get_providers_func = - 
loader(htp_backend_lib_name).func("QnnInterface_getProviders"); - - QnnInterface_t** interface_providers = nullptr; - uint32_t num_providers = 0; - - MLLM_RT_ASSERT_EQ(qnn_interface_get_providers_func((const QnnInterface_t***)&interface_providers, &num_providers), - QNN_SUCCESS); - MLLM_RT_ASSERT(interface_providers != nullptr); - MLLM_RT_ASSERT(num_providers != 0); - - MLLM_INFO("QnnAOTEnv::QnnAOTEnv get HTP num_providers: {}", num_providers); - - bool found_valid_interface = false; - // Get correct provider - for (size_t provider_id = 0; provider_id < num_providers; provider_id++) { - if (QNN_API_VERSION_MAJOR == interface_providers[provider_id]->apiVersion.coreApiVersion.major - && QNN_API_VERSION_MINOR <= interface_providers[provider_id]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_htp_func_symbols_.qnn_interface_ = interface_providers[provider_id]->QNN_INTERFACE_VER_NAME; - break; - } - } - MLLM_RT_ASSERT_EQ(found_valid_interface, true); - - // Check if this HTP Backend has specific property - if (nullptr != qnn_htp_func_symbols_.qnn_interface_.propertyHasCapability) { - auto status = qnn_htp_func_symbols_.qnn_interface_.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (status == QNN_PROPERTY_NOT_SUPPORTED) { MLLM_WARN("Device property is not supported"); } - - MLLM_RT_ASSERT(status != QNN_PROPERTY_ERROR_UNKNOWN_KEY); - } - - // Try to config this target machine - { - auto device_custom_config = createDecideCustomConfigInfo(); - QnnHtpDevice_CustomConfig_t* p_custom_config = nullptr; - - switch (target_machine_.soc_htp_security_pd_session) { - case QcomSecurityPDSession::kHtpSignedPd: { - p_custom_config = (QnnHtpDevice_CustomConfig_t*)malloc(sizeof(QnnHtpDevice_CustomConfig_t)); - unreachable_handle_.push_back(p_custom_config); - p_custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_SIGNEDPD; - p_custom_config->useSignedProcessDomain.useSignedProcessDomain = true; - p_custom_config->useSignedProcessDomain.deviceId = 0; - 
device_custom_config.push_back(static_cast(p_custom_config)); - break; - } - case QcomSecurityPDSession::kHtpUnsignedPd: - default: break; - } - - const std::vector device_platform_info = createDevicePlatformInfo(); - uint32_t num_custom_configs = device_platform_info.size() + device_custom_config.size(); - target_machine_qnn_config_.resize(num_custom_configs); - - for (std::size_t i = 0; i < device_custom_config.size(); ++i) { - target_machine_qnn_config_[i].option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - target_machine_qnn_config_[i].customConfig = device_custom_config[i]; - target_machine_qnn_config_ptrs_.push_back(&target_machine_qnn_config_[i]); - } - - if (!device_platform_info.empty()) { - // The length of platform info can only be 1. - MLLM_RT_ASSERT_EQ(device_platform_info.size(), 1u); - target_machine_qnn_config_[device_custom_config.size()].option = QNN_DEVICE_CONFIG_OPTION_PLATFORM_INFO; - target_machine_qnn_config_[device_custom_config.size()].hardwareInfo = device_platform_info.back(); - target_machine_qnn_config_ptrs_.push_back(&target_machine_qnn_config_[device_custom_config.size()]); - } - - // null terminated - target_machine_qnn_config_ptrs_.push_back(nullptr); - } -} - -std::shared_ptr QnnAOTEnv::createContext(const std::string& name, bool weights_sharing) { - // Check if context with this name already exists - if (contexts_.count(name) > 0) { - MLLM_WARN("Context '{}' already exists, reusing the existing context", name); - return contexts_[name]; - } - - std::shared_ptr context = std::make_shared(); - context->name_ = name; - - // 1. create logger and register callback. - // clang-format off - MLLM_RT_ASSERT_EQ(qnn_htp_func_symbols_.qnn_interface_.logCreate(__mllmQnnLoggerCallback,QNN_LOG_LEVEL_VERBOSE, &context->log_), QNN_SUCCESS) - MLLM_RT_ASSERT_EQ(QNN_BACKEND_NO_ERROR, qnn_htp_func_symbols_.qnn_interface_.backendCreate(context->log_, (const QnnBackend_Config_t**)context->bk_cfg_, &context->bk_handle_)) - // clang-format on - - // 2. 
Create HTP Device - // clang-format off - if (nullptr != qnn_htp_func_symbols_.qnn_interface_.deviceCreate) { - auto status = qnn_htp_func_symbols_.qnn_interface_.deviceCreate(context->log_, target_machine_qnn_config_ptrs_.data(), &context->device_handle_); - MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS); - } - // clang-format on - - // 3. Create Profile - { - auto status = qnn_htp_func_symbols_.qnn_interface_.profileCreate(context->bk_handle_, QNN_PROFILE_LEVEL_DETAILED, - &context->profile_bk_handle_); - MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS); - } - - // 4. Create Context - { - auto cfgs = createContextCustomConfig(weights_sharing); - if (cfgs.size()) { - context->qnn_context_config_ = (QnnContext_Config_t**)malloc(sizeof(QnnContext_Config_t*) * (cfgs.size() + 1)); - unreachable_handle_.emplace_back(context->qnn_context_config_); - } - for (int i = 0; i < cfgs.size(); ++i) { - context->qnn_context_config_[i] = (QnnContext_Config_t*)malloc(sizeof(QnnContext_Config_t)); - context->qnn_context_config_[i]->option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM; - context->qnn_context_config_[i]->customConfig = cfgs[i]; - unreachable_handle_.emplace_back(context->qnn_context_config_[i]); - } - if (cfgs.size()) { context->qnn_context_config_[cfgs.size()] = nullptr; } - auto status = qnn_htp_func_symbols_.qnn_interface_.contextCreate(context->bk_handle_, context->device_handle_, - (const QnnContext_Config_t**)context->qnn_context_config_, - &context->qnn_ctx_handle_); - MLLM_RT_ASSERT_EQ(QNN_CONTEXT_NO_ERROR, status); - } - - // 5. Register MLLM's Qnn Opset - // clang-format off - { - // FIXME(wch): we need to register our own opset of qnn. 
- } - // clang-format on - - MLLM_RT_ASSERT_EQ(contexts_.count(name), 0); - contexts_[name] = context; - return context; -} - -void QnnAOTEnv::saveContext(const std::string& name, const std::string& path) { - if (contexts_.find(name) == contexts_.end()) { - MLLM_ERROR("QnnAOTEnv::saveContext Context {} not found", name); - return; - } - auto context = contexts_[name]; - - uint64_t binarySize = 0; - uint64_t writtenSize = 0; - - auto status = qnn_htp_func_symbols_.qnn_interface_.contextGetBinarySize(context->qnn_ctx_handle_, &binarySize); - MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS); - - std::vector binaryBuffer(binarySize); - - status = qnn_htp_func_symbols_.qnn_interface_.contextGetBinary( - context->qnn_ctx_handle_, reinterpret_cast(binaryBuffer.data()), binarySize, &writtenSize); - MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS); - - if (binarySize < writtenSize) { - MLLM_ERROR("QNN context binary size mismatch: expected {} bytes, but wrote {} bytes.", binarySize, writtenSize); - } - - std::ofstream file(path, std::ios::binary); - if (!file.is_open()) { - MLLM_ERROR("Failed to open file {} for writing QNN context.", path); - return; - } - file.write(reinterpret_cast(binaryBuffer.data()), writtenSize); - file.close(); - - MLLM_INFO("QNN context {} saved to {} written {}", name, path, writtenSize); -} - -void QnnAOTEnv::destroyContext(const std::string& name) { - // TODO -} - -std::vector QnnAOTEnv::createDevicePlatformInfo() { - std::vector ret; - QnnDevice_PlatformInfo_t* p_platform_info = nullptr; - QnnDevice_HardwareDeviceInfo_t* p_hw_device_info = nullptr; - QnnHtpDevice_DeviceInfoExtension_t* p_device_info_extension = nullptr; - QnnDevice_CoreInfo_t* p_core_info = nullptr; - - p_platform_info = (QnnDevice_PlatformInfo_t*)malloc(sizeof(QnnDevice_PlatformInfo_t)); - unreachable_handle_.push_back(p_platform_info); - p_platform_info->version = QNN_DEVICE_PLATFORM_INFO_VERSION_1; - p_platform_info->v1.numHwDevices = 1; - - p_hw_device_info = 
(QnnDevice_HardwareDeviceInfo_t*)malloc(sizeof(QnnDevice_HardwareDeviceInfo_t)); - unreachable_handle_.push_back(p_hw_device_info); - p_hw_device_info->version = QNN_DEVICE_HARDWARE_DEVICE_INFO_VERSION_1; - p_hw_device_info->v1.deviceId = 0; - p_hw_device_info->v1.deviceType = 0; - p_hw_device_info->v1.numCores = 1; - - p_device_info_extension = (QnnHtpDevice_DeviceInfoExtension_t*)malloc(sizeof(QnnHtpDevice_DeviceInfoExtension_t)); - unreachable_handle_.push_back(p_device_info_extension); - // clang-format off - p_device_info_extension->devType = QNN_HTP_DEVICE_TYPE_ON_CHIP; - p_device_info_extension->onChipDevice.vtcmSize = target_machine_.soc_htp_vtcm_total_memory_size; // in MB - p_device_info_extension->onChipDevice.signedPdSupport = target_machine_.soc_htp_security_pd_session == QcomSecurityPDSession::kHtpSignedPd; - p_device_info_extension->onChipDevice.socModel = static_cast(target_machine_.soc_htp_chipset); - p_device_info_extension->onChipDevice.arch = static_cast(target_machine_.soc_htp_arch); - p_device_info_extension->onChipDevice.dlbcSupport = true; - p_hw_device_info->v1.deviceInfoExtension = p_device_info_extension; - // clang-format on - - p_core_info = (QnnDevice_CoreInfo_t*)malloc(sizeof(QnnDevice_CoreInfo_t)); - unreachable_handle_.push_back(p_core_info); - p_core_info->version = QNN_DEVICE_CORE_INFO_VERSION_1; - p_core_info->v1.coreId = 0; - p_core_info->v1.coreType = 0; - p_core_info->v1.coreInfoExtension = nullptr; - p_hw_device_info->v1.cores = p_core_info; - - p_platform_info->v1.hwDevices = p_hw_device_info; - ret.push_back(p_platform_info); - - return ret; -} - -std::vector QnnAOTEnv::createDecideCustomConfigInfo() { - std::vector ret; - - QnnHtpDevice_CustomConfig_t* p_custom_config = (QnnHtpDevice_CustomConfig_t*)malloc(sizeof(QnnHtpDevice_CustomConfig_t)); - unreachable_handle_.push_back(p_custom_config); - p_custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - p_custom_config->socModel = 
static_cast(target_machine_.soc_htp_chipset); - ret.push_back(static_cast(p_custom_config)); - - return ret; -} - -std::vector QnnAOTEnv::createContextCustomConfig(bool weights_sharing) { - std::vector ret; - QnnHtpContext_CustomConfig_t* p_custom_config = nullptr; - - if (weights_sharing) { - p_custom_config = (QnnHtpContext_CustomConfig_t*)malloc(sizeof(QnnHtpContext_CustomConfig_t)); - unreachable_handle_.push_back(p_custom_config); - p_custom_config->option = QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED; - p_custom_config->weightSharingEnabled = true; - ret.push_back(static_cast(p_custom_config)); - } - - return ret; -} - -QnnAOTGraph::ptr_t QnnAOTEnv::captureAOTGraph(const std::string& qnn_context_name, const std::string& g_name) { - if (contexts_.find(qnn_context_name) == contexts_.end()) { - MLLM_ERROR("Context {} not found", qnn_context_name); - return nullptr; - } - auto& ctx = contexts_[qnn_context_name]; - if (ctx->graphs_.find(g_name) == ctx->graphs_.end()) { - ctx->graphs_[g_name] = - std::make_shared(qnn_htp_func_symbols_.qnn_interface_, ctx->bk_handle_, ctx->qnn_ctx_handle_, g_name); - } - return ctx->graphs_[g_name]; -} - -void QnnAOTEnv::captureAOTNodeOp(const std::string& qnn_context_name, const std::string& graph_name, - const QnnAOTNodeOperation::ptr_t& op) { - MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1); - MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1); - contexts_[qnn_context_name]->graphs_[graph_name]->addOperation(op); -} - -QnnAOTNodeTensor::ptr_t QnnAOTEnv::captureQnnAOTNodeTensor(const std::string& qnn_context_name, const std::string& graph_name, - const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight) { - auto __qnn_tensor_name = v->name(); - - bool __qnn_enable_static_weight = force_static_weight; - - // Check if this value want static qnn weight. The static qnn weight will be shared through one context in diff graphs! 
- if (v->tensor_.memType() == kGlobal || (v->tensor_.memType() <= kParams_End && v->tensor_.memType() >= kParams_Start) - || v->getAttr("constant")) { - __qnn_enable_static_weight = true; - } - - MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1); - MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1); - auto graph = contexts_[qnn_context_name]->graphs_[graph_name]; - - // If normal weight is cached, we return it directly - if (graph->all_tensors_.count(__qnn_tensor_name)) { return graph->all_tensors_[__qnn_tensor_name]; } - - QnnAOTNodeTensor::ptr_t ret = nullptr; - - // If static weight is cached, we return it directly. - if (__qnn_enable_static_weight) { - if (contexts_[qnn_context_name]->static_tensor_.count(__qnn_tensor_name)) { - ret = contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name]; - } - } - - // There has no Tensor in the cache. - if (ret == nullptr) { - ret = QnnAOTNodeTensor::create(v, __qnn_enable_static_weight); - - if (__qnn_enable_static_weight) { contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name] = ret; } - } - - graph->addTensor(ret); - - return ret; -} - -std::shared_ptr QnnAOTEnv::getContext(const std::string& name) { return contexts_[name]; } - -} // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.hpp b/mllm/backends/qnn/aot/QnnWrappersAPI.hpp deleted file mode 100644 index 6cb424bc6..000000000 --- a/mllm/backends/qnn/aot/QnnWrappersAPI.hpp +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright (c) MLLM Team. -// Licensed under the MIT License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "mllm/utils/Common.hpp" -#include "mllm/compile/ir/tensor/Value.hpp" -#include "mllm/compile/ir/linalg/Attribute.hpp" -#include "mllm/backends/qnn/aot/QnnTargetMachine.hpp" -#include "mllm/backends/qnn/QNNModel.hpp" -#include "mllm/backends/qnn/QNNUtils.hpp" - -namespace mllm::qnn::aot { - -void __mllmLoggerCallback4QnnLogger(const char* fmt, QnnLog_Level_t level, uint64_t times_tamp, va_list argp); - -// Collection of symbols that we need to load from qnn dyn lib. -struct QnnFuncSymbols { - using QnnInterfaceGetProvidersFuncType = Qnn_ErrorHandle_t(const QnnInterface_t*** providerList, uint32_t* numProviders); - using QnnSystemInterfaceGetProvidersFuncType = Qnn_ErrorHandle_t(const QnnSystemInterface_t*** providerList, - uint32_t* numProviders); - - QNN_INTERFACE_VER_TYPE qnn_interface_; - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface_; -}; - -class QnnAOTNodeTensor : public std::enable_shared_from_this { - public: - using ptr_t = std::shared_ptr; - - static inline ptr_t create(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight = false) { - return std::make_shared(v, force_static_weight); - } - - explicit QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight = false); - - std::shared_ptr getWrapper() { return tensor_wrapper_; } - - private: - Qnn_TensorType_t parseQnnTensorTypeFromIR(const ir::tensor::TensorValue::ptr_t& v); - - Qnn_DataType_t parseQnnDataTypeFromIR(const ir::tensor::TensorValue::ptr_t& v); - - std::string parseQnnTensorNameFromIR(const ir::tensor::TensorValue::ptr_t& v); - - Qnn_QuantizeParams_t parseQnnQuantizeParamFromIR(const ir::tensor::TensorValue::ptr_t& v); - - // intend for per-channel and LPBQ quantization - void setupComplexTensorQuantization(const ir::tensor::TensorValue::ptr_t& v); - - std::shared_ptr tensor_wrapper_; 
-}; - -class QnnAOTNodeOperation : public std::enable_shared_from_this { - public: - using ptr_t = std::shared_ptr; - - static inline ptr_t create(const std::string& op_name) { - auto ret = std::make_shared(); - ret->op_name_ = op_name; - return ret; - } - - QnnAOTNodeOperation::ptr_t addInputs(const std::vector& ins); - - QnnAOTNodeOperation::ptr_t addOutputs(const std::vector& ous); - - QnnAOTNodeOperation::ptr_t emplaceInput(const QnnAOTNodeTensor::ptr_t& input); - - QnnAOTNodeOperation::ptr_t emplaceOutput(const QnnAOTNodeTensor::ptr_t& output); - - QnnAOTNodeOperation::ptr_t addParamScalar(const std::vector>& params); - - QnnAOTNodeOperation::ptr_t emplaceParamScalar(const std::shared_ptr& param); - - QnnAOTNodeOperation::ptr_t addParamTensor(const std::vector>& params); - - QnnAOTNodeOperation::ptr_t emplaceParamTensor(const std::shared_ptr& param); - - QnnAOTNodeOperation::ptr_t setOpName(const std::string& op_name); - - QnnAOTNodeOperation::ptr_t setName(const std::string& name); - - std::string getName(); - - QnnAOTNodeOperation::ptr_t setPackageName(const std::string& package_name); - - std::string name_; - std::string op_name_; - std::string package_name_ = "qti.aisw"; - std::vector> param_scalar; - std::vector> param_tensor; - std::vector inputs; - std::vector outputs; -}; - -struct QnnDeviceAndContext; -class QnnAOTGraph : public std::enable_shared_from_this { - public: - using ptr_t = std::shared_ptr; - - QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle_t backendHandle, Qnn_ContextHandle_t contextHandle, - const std::string& graphName); - - void addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op); - - void addTensor(const QnnAOTNodeTensor::ptr_t& tensor); - - bool compile(); - - bool is_compiled_ = false; - std::unordered_map op_node_; - std::unordered_map all_tensors_; - - private: - std::shared_ptr qnn_model_; - std::vector qnn_graph_configs; - std::vector htp_graph_configs; - std::vector qnn_graph_config_pass_in_; -}; - 
-struct QnnDeviceAndContext { - using ptr_t = std::shared_ptr; - - std::string name_; - Qnn_LogHandle_t log_ = nullptr; - Qnn_BackendHandle_t bk_handle_ = nullptr; - Qnn_DeviceHandle_t device_handle_ = nullptr; - QnnBackend_Config_t** bk_cfg_ = nullptr; - QnnContext_Config_t** qnn_context_config_ = nullptr; - Qnn_ProfileHandle_t profile_bk_handle_ = nullptr; - Qnn_ContextHandle_t qnn_ctx_handle_; - - std::unordered_map graphs_; //< for persistence keep graphs. - std::unordered_map static_tensor_; //< for weight sharing. -}; - -struct QnnDynLibDescriptor { - std::string lib_name_; - std::string lib_path_; - void* handle_ = nullptr; - - template - std::function func(const std::string& symbol_name) { - if (handle_ == nullptr) { MLLM_ERROR_EXIT(ExitCode::kCoreError, "QnnDynSymbolLoader: handle is nullptr."); } - auto func_ptr = dlsym(handle_, symbol_name.c_str()); - MLLM_RT_ASSERT(func_ptr != nullptr); - return (FuncType*)(func_ptr); - }; -}; - -class QnnDynSymbolLoader { - public: - enum DynFlag : int { // NOLINT performance-enum-size - kRTLD_NOW = RTLD_NOW, - kRTLD_LOCAL = RTLD_LOCAL, - kRTLD_GLOBAL = RTLD_GLOBAL, - }; - - static QnnDynSymbolLoader& instance() { - static QnnDynSymbolLoader instance; - return instance; - } - - ~QnnDynSymbolLoader(); - - QnnDynSymbolLoader() = default; - - QnnDynSymbolLoader(const QnnDynSymbolLoader&) = delete; - - QnnDynSymbolLoader& operator=(const QnnDynSymbolLoader&) = delete; - - bool loadQnnDynLib(const std::string& lib_name, int flag); - - bool loadQnnDynLibAtPath(const std::string& path, const std::string& lib_name, int flag); - - inline QnnDynLibDescriptor& operator()(const std::string& lib_name) { return libs_.at(lib_name); } - - private: - std::unordered_map libs_; - static const std::vector possible_qnn_dyn_lib_paths_; -}; - -// Device and Dynamic Lib included -class QnnAOTEnv { - public: - using ptr_t = std::shared_ptr; - - explicit QnnAOTEnv(const QcomTargetMachine& target_machine); - - QnnAOTEnv(const std::string& 
lib_path, const QcomTargetMachine& target_machine); - - std::shared_ptr createContext(const std::string& name, bool weights_sharing = false); - - void saveContext(const std::string& name, const std::string& path); - - void destroyContext(const std::string& name); - - // This is for All PUs, such as CPU, GPU, NPU - std::vector createDevicePlatformInfo(); - - // This function is for NPU only. - std::vector createDecideCustomConfigInfo(); - - std::vector createContextCustomConfig(bool weights_sharing); - - // Functions for build qnn graphs - QnnAOTGraph::ptr_t captureAOTGraph(const std::string& qnn_context_name, const std::string& g_name); - - void captureAOTNodeOp(const std::string& qnn_context_name, const std::string& graph_name, - const QnnAOTNodeOperation::ptr_t& op); - - QnnAOTNodeTensor::ptr_t captureQnnAOTNodeTensor(const std::string& qnn_context_name, const std::string& graph_name, - const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight = false); - - inline QnnFuncSymbols& getFuncSymbol() { return qnn_htp_func_symbols_; } - - std::shared_ptr getContext(const std::string& name); - - private: - void _setup(const std::string& path = ""); - - QcomTargetMachine target_machine_; - QnnFuncSymbols qnn_htp_func_symbols_; - std::unordered_map> contexts_; - - // device config for all to use - std::vector target_machine_qnn_config_; - std::vector target_machine_qnn_config_ptrs_; - - // void* handle that should be freed when QnnAOTEnv end - std::vector unreachable_handle_; -}; - -} // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot/README.md b/mllm/backends/qnn/aot/README.md deleted file mode 100644 index d2d28d1d4..000000000 --- a/mllm/backends/qnn/aot/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Qnn AOT - -This is the Qnn AOT API for X86 platform to build executable qnn model. This is not depends on QNNBackend target. 
diff --git a/mllm/backends/qnn/aot_rt/README.md b/mllm/backends/qnn/aot_rt/README.md deleted file mode 100644 index f7930caee..000000000 --- a/mllm/backends/qnn/aot_rt/README.md +++ /dev/null @@ -1 +0,0 @@ -# Runtime of AOT Models diff --git a/mllm/ffi/CMakeLists.txt b/mllm/ffi/CMakeLists.txt index 549d0a68f..c46d15af5 100644 --- a/mllm/ffi/CMakeLists.txt +++ b/mllm/ffi/CMakeLists.txt @@ -13,24 +13,11 @@ add_library(MllmFFIExtension SHARED ${CMAKE_CURRENT_LIST_DIR}/ModelService.cc ${CMAKE_CURRENT_LIST_DIR}/Nn.cc ${CMAKE_CURRENT_LIST_DIR}/Compile.cc - ${CMAKE_CURRENT_LIST_DIR}/qualcomm/QnnAOT.cc ) target_link_libraries(MllmFFIExtension PUBLIC tvm_ffi_header) target_link_libraries(MllmFFIExtension PUBLIC tvm_ffi_shared MllmRT MllmCPUBackend) set_target_properties(MllmFFIExtension PROPERTIES PREFIX "") -if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE) - # Build - target_include_directories(MllmFFIExtension PRIVATE - $ENV{QAIRT_SDK_ROOT}/include # QNN SDK include - $ENV{QAIRT_SDK_ROOT}/include/QNN # QNN SDK include - ) - add_compile_definitions( - MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE - ) -endif() - - # Set the depend search path. Windows do not need this, it will search dlls in the same directory first. if(APPLE) set_target_properties(MllmFFIExtension PROPERTIES diff --git a/mllm/ffi/Extension.cc b/mllm/ffi/Extension.cc index cb999191d..6dfb51f99 100644 --- a/mllm/ffi/Extension.cc +++ b/mllm/ffi/Extension.cc @@ -370,10 +370,6 @@ TVM_FFI_STATIC_INIT_BLOCK() { }); } -//===----------------------------------------------------------------------===// -// REGISTER: _Context Functions. -//===----------------------------------------------------------------------===// - //===----------------------------------------------------------------------===// // REGISTER: Quantize && Packing Functions. 
//===----------------------------------------------------------------------===// diff --git a/mllm/ffi/qualcomm/QnnAOT.cc b/mllm/ffi/qualcomm/QnnAOT.cc deleted file mode 100644 index e36cad641..000000000 --- a/mllm/ffi/qualcomm/QnnAOT.cc +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright (c) MLLM Team. -// Licensed under the MIT License. - -#include -#include -#include -#include -#include -#include - -#include "mllm/backends/qnn/aot/QnnTargetMachine.hpp" -#include "mllm/ffi/qualcomm/QnnAOT.hh" - -#ifdef MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE - -TVM_FFI_STATIC_INIT_BLOCK() { - namespace refl = tvm::ffi::reflection; - - refl::ObjectDef<::mllm::ffi::QcomHTPArchObj>(); - - refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.NONE", []() { - auto ret = mllm::qnn::aot::QcomHTPArch::NONE; - return mllm::ffi::QcomHTPArch(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V68", []() { - auto ret = mllm::qnn::aot::QcomHTPArch::V68; - return mllm::ffi::QcomHTPArch(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V69", []() { - auto ret = mllm::qnn::aot::QcomHTPArch::V69; - return mllm::ffi::QcomHTPArch(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V73", []() { - auto ret = mllm::qnn::aot::QcomHTPArch::V73; - return mllm::ffi::QcomHTPArch(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V75", []() { - auto ret = mllm::qnn::aot::QcomHTPArch::V75; - return mllm::ffi::QcomHTPArch(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V79", []() { - auto ret = mllm::qnn::aot::QcomHTPArch::V79; - return mllm::ffi::QcomHTPArch(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V81", []() { - auto ret = mllm::qnn::aot::QcomHTPArch::V81; - return mllm::ffi::QcomHTPArch(ret); - }); - - refl::ObjectDef<::mllm::ffi::QcomChipsetObj>(); - - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.UNKNOWN_SM", []() { - auto ret = mllm::qnn::aot::QcomChipset::UNKNOWN_SM; - return mllm::ffi::QcomChipset(ret); - }); - 
refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SA8295", []() { - auto ret = mllm::qnn::aot::QcomChipset::SA8295; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8350", []() { - auto ret = mllm::qnn::aot::QcomChipset::SM8350; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8450", []() { - auto ret = mllm::qnn::aot::QcomChipset::SM8450; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8475", []() { - auto ret = mllm::qnn::aot::QcomChipset::SM8475; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8550", []() { - auto ret = mllm::qnn::aot::QcomChipset::SM8550; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8650", []() { - auto ret = mllm::qnn::aot::QcomChipset::SM8650; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8750", []() { - auto ret = mllm::qnn::aot::QcomChipset::SM8750; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8850", []() { - auto ret = mllm::qnn::aot::QcomChipset::SM8850; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SSG2115P", []() { - auto ret = mllm::qnn::aot::QcomChipset::SSG2115P; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SSG2125P", []() { - auto ret = mllm::qnn::aot::QcomChipset::SSG2125P; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SXR1230P", []() { - auto ret = mllm::qnn::aot::QcomChipset::SXR1230P; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SXR2230P", []() { - auto ret = mllm::qnn::aot::QcomChipset::SXR2230P; - return mllm::ffi::QcomChipset(ret); - }); - 
refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SXR2330P", []() { - auto ret = mllm::qnn::aot::QcomChipset::SXR2330P; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.QCS9100", []() { - auto ret = mllm::qnn::aot::QcomChipset::QCS9100; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SAR2230P", []() { - auto ret = mllm::qnn::aot::QcomChipset::SAR2230P; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SA8255", []() { - auto ret = mllm::qnn::aot::QcomChipset::SA8255; - return mllm::ffi::QcomChipset(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SW6100", []() { - auto ret = mllm::qnn::aot::QcomChipset::SW6100; - return mllm::ffi::QcomChipset(ret); - }); - - refl::ObjectDef<::mllm::ffi::QcomTryBestPerformanceObj>(); - - refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpDefault", []() { - auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpDefault; - return mllm::ffi::QcomTryBestPerformance(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpSustainedHighPerformance", []() { - auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpSustainedHighPerformance; - return mllm::ffi::QcomTryBestPerformance(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpBurst", []() { - auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpBurst; - return mllm::ffi::QcomTryBestPerformance(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpHighPerformance", []() { - auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpHighPerformance; - return mllm::ffi::QcomTryBestPerformance(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpPowerSaver", []() { - auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpPowerSaver; - return mllm::ffi::QcomTryBestPerformance(ret); - }); - 
refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpLowPowerSaver", []() { - auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpLowPowerSaver; - return mllm::ffi::QcomTryBestPerformance(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpHighPowerSaver", []() { - auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpHighPowerSaver; - return mllm::ffi::QcomTryBestPerformance(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpLowBalanced", []() { - auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpLowBalanced; - return mllm::ffi::QcomTryBestPerformance(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpBalanced", []() { - auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpBalanced; - return mllm::ffi::QcomTryBestPerformance(ret); - }); - - refl::ObjectDef<::mllm::ffi::QcomSecurityPDSessionObj>(); - - refl::GlobalDef().def("mllm.qualcomm.QcomSecurityPDSession.HtpUnsignedPd", []() { - auto ret = mllm::qnn::aot::QcomSecurityPDSession::kHtpUnsignedPd; - return mllm::ffi::QcomSecurityPDSession(ret); - }); - refl::GlobalDef().def("mllm.qualcomm.QcomSecurityPDSession.HtpSignedPd", []() { - auto ret = mllm::qnn::aot::QcomSecurityPDSession::kHtpSignedPd; - return mllm::ffi::QcomSecurityPDSession(ret); - }); - - refl::ObjectDef().def_static( - "__create__", - [](const mllm::ffi::QcomChipset& chipset, const mllm::ffi::QcomHTPArch& arch, - const mllm::ffi::QcomTryBestPerformance& perf, const mllm::ffi::QcomSecurityPDSession& pd_session, uint32_t htp_vtcm) { - auto tm = mllm::qnn::aot::QcomTargetMachine{ - .soc_htp_chipset = chipset.get()->chipset_, - .soc_htp_arch = arch.get()->htp_arch_, - .soc_htp_performance = perf.get()->perf_, - .soc_htp_security_pd_session = pd_session.get()->pd_, - .soc_htp_vtcm_total_memory_size = htp_vtcm, - }; - return ::mllm::ffi::QcomTargetMachine(tm); - }); - - refl::ObjectDef().def_static( - "__create__", [](const mllm::ffi::QcomTargetMachine& 
machine, const std::string& path) -> mllm::ffi::QnnAOTEnv { - if (path.empty()) { - auto tm = machine.get()->target_machine_; - auto s = std::make_shared<::mllm::qnn::aot::QnnAOTEnv>(tm); - return ::mllm::ffi::QnnAOTEnv(s); - } else { - auto tm = machine.get()->target_machine_; - auto s = std::make_shared<::mllm::qnn::aot::QnnAOTEnv>(path, tm); - return ::mllm::ffi::QnnAOTEnv(s); - } - }); - - refl::ObjectDef<::mllm::ffi::QnnDeviceAndContextObj>(); - - refl::GlobalDef().def("mllm.qualcomm.QnnAOTEnv.createContext", - [](const mllm::ffi::QnnAOTEnv& self, const std::string& name, bool weights_sharing) { - auto s = self.get()->qnn_aot_env_ptr_->createContext(name, weights_sharing); - return mllm::ffi::QnnDeviceAndContext(s); - }); -} - -#endif diff --git a/mllm/ffi/qualcomm/QnnAOT.hh b/mllm/ffi/qualcomm/QnnAOT.hh deleted file mode 100644 index f0feb46f3..000000000 --- a/mllm/ffi/qualcomm/QnnAOT.hh +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) MLLM Team. -// Licensed under the MIT License. 
- -#pragma once - -#include -#include -#include -#include - -#ifdef MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE -#include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp" -#endif - -namespace mllm::ffi { - -#ifdef MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE - -//===----------------------------------------------------------------------===// -// MLLM Parameter File Define -//===----------------------------------------------------------------------===// -class QnnAOTEnvObj : public tvm::ffi::Object { - public: - ::mllm::qnn::aot::QnnAOTEnv::ptr_t qnn_aot_env_ptr_ = nullptr; - - explicit QnnAOTEnvObj(const ::mllm::qnn::aot::QnnAOTEnv::ptr_t& ptr) : qnn_aot_env_ptr_(ptr) { MLLM_EMPTY_SCOPE; } - - TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QnnAOTEnv", QnnAOTEnvObj, tvm::ffi::Object); -}; - -class QnnAOTEnv : public tvm::ffi::ObjectRef { - public: - explicit QnnAOTEnv(::mllm::qnn::aot::QnnAOTEnv::ptr_t& ptr) { data_ = tvm::ffi::make_object(ptr); } - - TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QnnAOTEnv, tvm::ffi::ObjectRef, QnnAOTEnvObj); // NOLINT -}; - -//===----------------------------------------------------------------------===// -// MLLM QnnDeviceAndContext Define -//===----------------------------------------------------------------------===// -class QnnDeviceAndContextObj : public tvm::ffi::Object { - public: - std::shared_ptr<::mllm::qnn::aot::QnnDeviceAndContext> qnn_device_and_context_ptr_ = nullptr; - - explicit QnnDeviceAndContextObj(const std::shared_ptr<::mllm::qnn::aot::QnnDeviceAndContext>& ptr) - : qnn_device_and_context_ptr_(ptr) { - MLLM_EMPTY_SCOPE; - } - - TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QnnDeviceAndContext", QnnDeviceAndContextObj, tvm::ffi::Object); -}; - -class QnnDeviceAndContext : public tvm::ffi::ObjectRef { - public: - explicit QnnDeviceAndContext(std::shared_ptr<::mllm::qnn::aot::QnnDeviceAndContext>& ptr) { - data_ = tvm::ffi::make_object(ptr); - } - - TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QnnDeviceAndContext, tvm::ffi::ObjectRef, 
QnnDeviceAndContextObj); // NOLINT -}; - -//===----------------------------------------------------------------------===// -// MLLM QcomHTPArch Define -//===----------------------------------------------------------------------===// -class QcomHTPArchObj : public tvm::ffi::Object { - public: - mllm::qnn::aot::QcomHTPArch htp_arch_; - - explicit QcomHTPArchObj(const mllm::qnn::aot::QcomHTPArch& obj) : htp_arch_(obj) { MLLM_EMPTY_SCOPE; } - - TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomHTPArch", QcomHTPArchObj, tvm::ffi::Object); -}; - -class QcomHTPArch : public tvm::ffi::ObjectRef { - public: - explicit QcomHTPArch(mllm::qnn::aot::QcomHTPArch& ptr) { data_ = tvm::ffi::make_object(ptr); } - - TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomHTPArch, tvm::ffi::ObjectRef, QcomHTPArchObj); // NOLINT -}; - -//===----------------------------------------------------------------------===// -// MLLM QcomChipset Define -//===----------------------------------------------------------------------===// -class QcomChipsetObj : public tvm::ffi::Object { - public: - mllm::qnn::aot::QcomChipset chipset_; - - explicit QcomChipsetObj(const mllm::qnn::aot::QcomChipset& obj) : chipset_(obj) { MLLM_EMPTY_SCOPE; } - - TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomChipset", QcomChipsetObj, tvm::ffi::Object); -}; - -class QcomChipset : public tvm::ffi::ObjectRef { - public: - explicit QcomChipset(mllm::qnn::aot::QcomChipset& ptr) { data_ = tvm::ffi::make_object(ptr); } - - TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomChipset, tvm::ffi::ObjectRef, QcomChipsetObj); // NOLINT -}; - -//===----------------------------------------------------------------------===// -// MLLM QcomTryBestPerformance Define -//===----------------------------------------------------------------------===// -class QcomTryBestPerformanceObj : public tvm::ffi::Object { - public: - mllm::qnn::aot::QcomTryBestPerformance perf_; - - explicit QcomTryBestPerformanceObj(const 
mllm::qnn::aot::QcomTryBestPerformance& obj) : perf_(obj) { MLLM_EMPTY_SCOPE; } - - TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomTryBestPerformance", QcomTryBestPerformanceObj, tvm::ffi::Object); -}; - -class QcomTryBestPerformance : public tvm::ffi::ObjectRef { - public: - explicit QcomTryBestPerformance(mllm::qnn::aot::QcomTryBestPerformance& ptr) { - data_ = tvm::ffi::make_object(ptr); - } - - TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomTryBestPerformance, tvm::ffi::ObjectRef, QcomTryBestPerformanceObj); // NOLINT -}; - -//===----------------------------------------------------------------------===// -// MLLM QcomSecurityPDSession Define -//===----------------------------------------------------------------------===// -class QcomSecurityPDSessionObj : public tvm::ffi::Object { - public: - mllm::qnn::aot::QcomSecurityPDSession pd_; - - explicit QcomSecurityPDSessionObj(const mllm::qnn::aot::QcomSecurityPDSession& obj) : pd_(obj) { MLLM_EMPTY_SCOPE; } - - TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomSecurityPDSession", QcomSecurityPDSessionObj, tvm::ffi::Object); -}; - -class QcomSecurityPDSession : public tvm::ffi::ObjectRef { - public: - explicit QcomSecurityPDSession(mllm::qnn::aot::QcomSecurityPDSession& ptr) { - data_ = tvm::ffi::make_object(ptr); - } - - TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomSecurityPDSession, tvm::ffi::ObjectRef, QcomSecurityPDSessionObj); // NOLINT -}; - -//===----------------------------------------------------------------------===// -// MLLM QcomTargetMachine Define -//===----------------------------------------------------------------------===// -class QcomTargetMachineObj : public tvm::ffi::Object { - public: - mllm::qnn::aot::QcomTargetMachine target_machine_; - - explicit QcomTargetMachineObj(const mllm::qnn::aot::QcomTargetMachine& obj) : target_machine_(obj) { MLLM_EMPTY_SCOPE; } - - TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomTargetMachine", QcomTargetMachineObj, tvm::ffi::Object); -}; - -class 
QcomTargetMachine : public tvm::ffi::ObjectRef { - public: - explicit QcomTargetMachine(mllm::qnn::aot::QcomTargetMachine& ptr) { - data_ = tvm::ffi::make_object(ptr); - } - - TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomTargetMachine, tvm::ffi::ObjectRef, QcomTargetMachineObj); // NOLINT -}; - -#endif - -} // namespace mllm::ffi diff --git a/mllm/models/smollm3_3B/modeling_smollm3.hpp b/mllm/models/smollm3_3B/modeling_smollm3.hpp index 3d7ee4a2f..90798387d 100644 --- a/mllm/models/smollm3_3B/modeling_smollm3.hpp +++ b/mllm/models/smollm3_3B/modeling_smollm3.hpp @@ -157,7 +157,7 @@ class Smollm3Attention final : public nn::Module { auto [key_states_result, value_states_result] = past_kv_cache->updateKVCache(layer_idx_, key_states, value_states); key_states = std::move(key_states_result); value_states = std::move(value_states_result); - + Tensor attn; if (key_states.dtype() == kFloat32) { attn = nn::functional::matmul(query_states, key_states, false, true) * (1.f / sqrtf(head_dim_)); diff --git a/mllm/nn/Functional.cpp b/mllm/nn/Functional.cpp index 4e70b092a..e1e015432 100644 --- a/mllm/nn/Functional.cpp +++ b/mllm/nn/Functional.cpp @@ -7,6 +7,7 @@ #include "mllm/core/aops/FlashAttention2Op.hpp" #include "mllm/core/aops/GatherOp.hpp" #include "mllm/core/aops/MatMulOp.hpp" +#include "mllm/core/aops/LinearOp.hpp" #include "mllm/core/aops/ReduceOps.hpp" #include "mllm/core/aops/Scatter2ShardsOp.hpp" #include "mllm/core/aops/SigmoidOp.hpp" @@ -16,6 +17,7 @@ #include "mllm/core/aops/ViewOp.hpp" #include "mllm/core/aops/TopKOp.hpp" #include "mllm/core/aops/SiLUOp.hpp" +#include "mllm/core/aops/RMSNormOp.hpp" #include "mllm/core/aops/PadOp.hpp" #include "mllm/core/aops/MaskedScatterOp.hpp" #include "mllm/core/aops/InterpolateOp.hpp" @@ -33,6 +35,16 @@ Tensor matmul(const Tensor& A, const Tensor& B, bool transpose_A, bool transpose {A, B})[0]; } +Tensor linear(const Tensor& x, const Tensor& weight, const Tensor& bias) { + aops::LinearOpOptions opts{}; + 
opts.setRedirect(true); + if (bias.isNil()) { + return Context::instance().buildOpAndSubmitTask(OpTypes::kLinear, opts, {x, weight})[0]; + } else { + return Context::instance().buildOpAndSubmitTask(OpTypes::kLinear, opts, {x, weight, bias})[0]; + } +} + Tensor view(const Tensor& x, const std::vector& shape) { return Context::instance().buildOpAndSubmitTask(OpTypes::kView, aops::ViewOpOptions{.to_shape = shape}, {x})[0]; } @@ -126,6 +138,11 @@ Tensor silu_(const Tensor& x) { return Context::instance().buildOpAndSubmitTask(OpTypes::kSiLU, opt, {x})[0]; } +Tensor rmsNorm(const Tensor& x, const Tensor& weight, float epsilon, bool add_unit_offset) { + return Context::instance().buildOpAndSubmitTask( + OpTypes::kRMSNorm, aops::RMSNormOpOptions{.epsilon = epsilon, .add_unit_offset = add_unit_offset}, {x, weight})[0]; +} + void scatter2Shards(const Tensor& src, const Tensor& shards_pointer, int32_t dim) { Context::instance().buildOpAndSubmitTask(OpTypes::kScatter2Shards, aops::Scatter2ShardsOpOptions{.dim = dim}, {src, shards_pointer}); diff --git a/mllm/nn/Functional.hpp b/mllm/nn/Functional.hpp index 31a57812c..c85b716e9 100644 --- a/mllm/nn/Functional.hpp +++ b/mllm/nn/Functional.hpp @@ -20,6 +20,8 @@ namespace mllm::nn::functional { Tensor matmul(const Tensor& A, const Tensor& B, bool transpose_A = false, bool transpose_B = false, aops::MatMulOpType type = aops::MatMulOpType::kDefault); +Tensor linear(const Tensor& x, const Tensor& weight, const Tensor& bias = Tensor()); + Tensor view(const Tensor& x, const std::vector& shape); std::vector split(const Tensor& x, int32_t split_size_or_sections, int32_t dim); @@ -131,6 +133,8 @@ Tensor mean(const Tensor& x, int32_t dim = std::numeric_limits::max(), Tensor silu(const Tensor& x); Tensor silu_(const Tensor& x); +Tensor rmsNorm(const Tensor& x, const Tensor& weight, float epsilon = 1e-5f, bool add_unit_offset = false); + void scatter2Shards(const Tensor& src, const Tensor& shards_pointer, int32_t dim); // If you want causal 
mask attention. Use Flash attention instead. diff --git a/pymllm/backends/__init__.py b/pymllm/backends/__init__.py deleted file mode 100644 index 5e926d580..000000000 --- a/pymllm/backends/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) MLLM Team. -# Licensed under the MIT License. - -from . import cuda, qualcomm diff --git a/pymllm/backends/cuda/__init__.py b/pymllm/backends/cuda/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pymllm/backends/qualcomm/README.md b/pymllm/backends/qualcomm/README.md deleted file mode 100644 index 27122dbc2..000000000 --- a/pymllm/backends/qualcomm/README.md +++ /dev/null @@ -1 +0,0 @@ -# Qualcomm Qnn AOT API diff --git a/pymllm/backends/qualcomm/__init__.py b/pymllm/backends/qualcomm/__init__.py deleted file mode 100644 index bcd9c95de..000000000 --- a/pymllm/backends/qualcomm/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) MLLM Team. -# Licensed under the MIT License. - -from . import qnn_aot_env -from . 
import transformers diff --git a/pymllm/backends/qualcomm/nn.py b/pymllm/backends/qualcomm/nn.py deleted file mode 100644 index 0ba9aef55..000000000 --- a/pymllm/backends/qualcomm/nn.py +++ /dev/null @@ -1,11 +0,0 @@ -from pymllm.nn._layers import Softmax, RoPE - - -class QnnSoftmax(Softmax): - def __init__(self): - super().__init__() - - -class QnnRoPE(RoPE): - def __init__(self): - super().__init__() diff --git a/pymllm/backends/qualcomm/qnn_aot_env.py b/pymllm/backends/qualcomm/qnn_aot_env.py deleted file mode 100644 index 8b0c0d2e1..000000000 --- a/pymllm/backends/qualcomm/qnn_aot_env.py +++ /dev/null @@ -1,21 +0,0 @@ -from pymllm.ffi import is_qnn_aot_on_x86_enabled - -if is_qnn_aot_on_x86_enabled(): - from pymllm.ffi import ( - QnnDeviceAndContext, - QnnAOTEnv, - QcomChipset, - QcomHTPArch, - QcomSecurityPDSession, - QcomTargetMachine, - QcomTryBestPerformance, - ) -else: - # Define placeholder classes when QNN AOT is not enabled - QnnDeviceAndContext = None - QnnAOTEnv = None - QcomChipset = None - QcomHTPArch = None - QcomSecurityPDSession = None - QcomTargetMachine = None - QcomTryBestPerformance = None \ No newline at end of file diff --git a/pymllm/compile/mllm_ir/trace.py b/pymllm/compile/mllm_ir/trace.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pymllm/nn/_layers.py b/pymllm/nn/_layers.py index 5adc79cbf..cf5ec37ea 100644 --- a/pymllm/nn/_layers.py +++ b/pymllm/nn/_layers.py @@ -5,7 +5,7 @@ from .. import ffi -class _Layer: +class Linear: def __init__(self): self.device: ffi.Device = ffi.cpu_() self.this_layer_name: str = None diff --git a/pymllm/nn/_module.py b/pymllm/nn/_module.py index a4f28e17c..f18c97460 100644 --- a/pymllm/nn/_module.py +++ b/pymllm/nn/_module.py @@ -2,7 +2,6 @@ # Licensed under the MIT License. from .. 
import ffi -from ._layers import _Layer class Module: diff --git a/pymllm/tests/qualcomm/test_context_create.py b/pymllm/tests/qualcomm/test_context_create.py deleted file mode 100644 index 18983daa7..000000000 --- a/pymllm/tests/qualcomm/test_context_create.py +++ /dev/null @@ -1,28 +0,0 @@ -import pymllm as mllm -from pymllm.backends.qualcomm.qnn_aot_env import ( - QnnAOTEnv, - QnnDeviceAndContext, - QcomTryBestPerformance, - QcomSecurityPDSession, - QcomTargetMachine, - QcomChipset, - QcomHTPArch, -) - - -qnn_aot_env: QnnAOTEnv = QnnAOTEnv( - machine=QcomTargetMachine( - soc_htp_chipset=QcomChipset.SM8850(), - soc_htp_arch=QcomHTPArch.V81(), - soc_htp_performance=QcomTryBestPerformance.HtpBurst(), - soc_htp_security_pd_session=QcomSecurityPDSession.HtpUnsignedPd(), - soc_htp_vtcm=8, # in MB - ), - path="/opt/qcom/aistack/qairt/2.41.0.251128/lib/x86_64-linux-clang/", -) - -if __name__ == "__main__": - mllm.echo("Testing tvm-ffi compatibility") - qnn_context: QnnDeviceAndContext = qnn_aot_env.create_context( - "context.0", weights_sharing=False - ) diff --git a/pyproject.toml b/pyproject.toml index 703d4456a..50a8eee99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] requires = [ - "scikit-build-core>=0.11.0", "apache-tvm-ffi" + "scikit-build-core==0.10.0", "apache-tvm-ffi" ] build-backend = "scikit_build_core.build" @@ -56,8 +56,6 @@ cmake.args = [ "-DCMAKE_BUILD_TYPE=Release", "-DMLLM_ENABLE_PY_MLLM=on" ] -sdist.exclude = [".*", ".*/*"] -wheel.exclude = [".*", ".*/*"] minimum-version = "build-system.requires" # Build configuration diff --git a/tasks/build_android.yaml b/tasks/build_android.yaml index 2378738e9..e7c251ebe 100644 --- a/tasks/build_android.yaml +++ b/tasks/build_android.yaml @@ -2,10 +2,10 @@ Tasks: - CMakeConfigTask: cmake_cfg_path: "build-android-arm64-v8a" cmake_build_type: "Release" - cmake_toolchain_file: "$ANDROID_NDK_PATH/build/cmake/android.toolchain.cmake" cmake_extra_args: - "-DMLLM_CROSS_COMPILE=ON" - 
"-DMLLM_BUILD_ARM_BACKEND=ON" + - "-DMLLM_BUILD_ASCEND_BACKEND=ON" - "-DANDROID_PLATFORM=android-28" - "-DANDROID_ABI=arm64-v8a" - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=armv8.2-a+fp16+fp16fml+dotprod+i8mm;-ffast-math;-Wno-nan-infinity-disabled"' diff --git a/tasks/build_x86.yaml b/tasks/build_x86.yaml index 617f05f9c..a2b60952d 100644 --- a/tasks/build_x86.yaml +++ b/tasks/build_x86.yaml @@ -11,7 +11,6 @@ Tasks: - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=native"' - "-DMLLM_KERNEL_USE_THREADS=ON" - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=ON" - - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF" - CMakeBuildTask: cmake_cfg_path: "build" diff --git a/tasks/build_x86_qnn_aot.yaml b/tasks/build_x86_qnn_aot.yaml deleted file mode 100644 index d4809943f..000000000 --- a/tasks/build_x86_qnn_aot.yaml +++ /dev/null @@ -1,18 +0,0 @@ -Tasks: - - CMakeConfigTask: - cmake_cfg_path: "build-qnn-aot" - cmake_build_type: "RelWithDebInfo" - cmake_extra_args: - # Optional, If use Highway - - "-DHWY_ENABLE_TESTS=OFF" - - "-DHWY_ENABLE_EXAMPLES=OFF" - - "-DHWY_ENABLE_CONTRIB=OFF" - # Optional - - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=native"' - - "-DMLLM_KERNEL_USE_THREADS=ON" - - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=ON" - - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF" - - "-DMLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE=ON" - - - CMakeBuildTask: - cmake_cfg_path: "build-qnn-aot" diff --git a/tests/ascend/AscendAttentionKernelTest.hpp b/tests/ascend/AscendAttentionKernelTest.hpp new file mode 100644 index 000000000..37a9b93c3 --- /dev/null +++ b/tests/ascend/AscendAttentionKernelTest.hpp @@ -0,0 +1,576 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/nn/Functional.hpp" +#include "KernelTestHelper.hpp" +#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp" +#include +#include +#include + +class AscendAttentionKernelTest : public KernelTest { + public: + AscendAttentionKernelTest() = default; + ~AscendAttentionKernelTest() override = default; + + // Test Scaled Dot-Product Attention using existing operators + // Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V + bool ScaledDotProductAttentionFloat16Test(const std::vector>& test_cases) { + using namespace mllm; // NOLINT + + for (const auto& [q_shape, k_shape, v_shape] : test_cases) { + // Validate shapes: Q=[B, S_q, D], K=[B, S_kv, D], V=[B, S_kv, D] + MLLM_RT_ASSERT_EQ(q_shape.size(), 3); + MLLM_RT_ASSERT_EQ(k_shape.size(), 3); + MLLM_RT_ASSERT_EQ(v_shape.size(), 3); + MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]); // Same batch size + MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]); + MLLM_RT_ASSERT_EQ(q_shape[2], k_shape[2]); // Same D dimension + MLLM_RT_ASSERT_EQ(k_shape[1], v_shape[1]); // K and V have same sequence length + + int32_t B = static_cast(q_shape[0]); + int32_t S_q = static_cast(q_shape[1]); + int32_t S_kv = static_cast(k_shape[1]); + int32_t D = static_cast(q_shape[2]); + + // 1. Create random FP16 inputs on CPU + Tensor Q_cpu = Tensor::random(q_shape, -1.0f, 1.0f, kFloat16, kCPU); + Tensor K_cpu = Tensor::random(k_shape, -1.0f, 1.0f, kFloat16, kCPU); + Tensor V_cpu = Tensor::random(v_shape, -1.0f, 1.0f, kFloat16, kCPU); + + // 2. 
Compute reference result on CPU using FP32 for better precision + Tensor Q_cpu_fp32 = Tensor::zeros(q_shape, kFloat32, kCPU); + Tensor K_cpu_fp32 = Tensor::zeros(k_shape, kFloat32, kCPU); + Tensor V_cpu_fp32 = Tensor::zeros(v_shape, kFloat32, kCPU); + + // Convert FP16 to FP32 + { + auto* q_fp16 = Q_cpu.ptr(); + auto* k_fp16 = K_cpu.ptr(); + auto* v_fp16 = V_cpu.ptr(); + auto* q_fp32 = Q_cpu_fp32.ptr(); + auto* k_fp32 = K_cpu_fp32.ptr(); + auto* v_fp32 = V_cpu_fp32.ptr(); + + for (size_t i = 0; i < Q_cpu.numel(); ++i) { + q_fp32[i] = MLLM_FP16_TO_FP32(q_fp16[i]); + } + for (size_t i = 0; i < K_cpu.numel(); ++i) { + k_fp32[i] = MLLM_FP16_TO_FP32(k_fp16[i]); + } + for (size_t i = 0; i < V_cpu.numel(); ++i) { + v_fp32[i] = MLLM_FP16_TO_FP32(v_fp16[i]); + } + } + + // Compute reference attention on CPU (FP32) + Tensor ref_cpu_fp32 = Tensor::zeros({B, S_q, D}, kFloat32, kCPU); + { + auto* q_ptr = Q_cpu_fp32.ptr(); + auto* k_ptr = K_cpu_fp32.ptr(); + auto* v_ptr = V_cpu_fp32.ptr(); + auto* out_ptr = ref_cpu_fp32.ptr(); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + for (int32_t b = 0; b < B; ++b) { + // Compute Q @ K^T for this batch + std::vector scores(S_q * S_kv, 0.0f); + + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t j = 0; j < S_kv; ++j) { + float sum = 0.0f; + for (int32_t k = 0; k < D; ++k) { + float q_val = q_ptr[b * S_q * D + i * D + k]; + float k_val = k_ptr[b * S_kv * D + j * D + k]; + sum += q_val * k_val; + } + scores[i * S_kv + j] = sum * scale; + } + } + + // Apply softmax along the last dimension (S_kv) + std::vector attn_weights(S_q * S_kv); + for (int32_t i = 0; i < S_q; ++i) { + // Find max for numerical stability + float max_val = -std::numeric_limits::infinity(); + for (int32_t j = 0; j < S_kv; ++j) { + max_val = std::max(max_val, scores[i * S_kv + j]); + } + + // Compute exp and sum + float sum_exp = 0.0f; + for (int32_t j = 0; j < S_kv; ++j) { + float exp_val = std::exp(scores[i * S_kv + j] - max_val); + attn_weights[i * S_kv + j] = 
exp_val; + sum_exp += exp_val; + } + + // Normalize + for (int32_t j = 0; j < S_kv; ++j) { + attn_weights[i * S_kv + j] /= sum_exp; + } + } + + // Compute output: attn_weights @ V + // out[S_q, D] = attn_weights[S_q, S_kv] @ V[S_kv, D] + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t k = 0; k < D; ++k) { + float sum = 0.0f; + for (int32_t j = 0; j < S_kv; ++j) { + float attn_val = attn_weights[i * S_kv + j]; + float v_val = v_ptr[b * S_kv * D + j * D + k]; + sum += attn_val * v_val; + } + out_ptr[b * S_q * D + i * D + k] = sum; + } + } + } + } + + // Convert reference back to FP16 + Tensor ref_cpu = Tensor::zeros({B, S_q, D}, kFloat16, kCPU); + { + auto* ref_fp32 = ref_cpu_fp32.ptr(); + auto* ref_fp16 = ref_cpu.ptr(); + for (size_t i = 0; i < ref_cpu.numel(); ++i) { + ref_fp16[i] = MLLM_FP32_TO_FP16(ref_fp32[i]); + } + } + + // 3. Move inputs to Ascend and compute attention using existing operators + auto Q_ascend = Q_cpu.to(kAscend); + auto K_ascend = K_cpu.to(kAscend); + auto V_ascend = V_cpu.to(kAscend); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + // Step 1: Q @ K^T (transpose_b=true) + auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true); + + // Step 2: Scale by 1/sqrt(d_k) + auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU) * scale; + auto scale_tensor = scale_tensor_cpu.to(kAscend); + auto scaled_scores = scores * scale_tensor; + + // Step 3: Softmax along last dimension + auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1); + + // Step 4: attn_weights @ V + auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false); + + // 4. 
Move result back to CPU and compare + auto output_cpu = output_ascend.to(kCPU); + + auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f); + if (!result.is_close) { + MLLM_ERROR("Attention test failed for shape Q=[{},{},{}], K=[{},{},{}], V=[{},{},{}]", + q_shape[0], q_shape[1], q_shape[2], + k_shape[0], k_shape[1], k_shape[2], + v_shape[0], v_shape[1], v_shape[2]); + MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}", + result.max_absolute_diff, result.max_relative_diff); + return false; + } + } + return true; + } + + //===----------------------------------------------------------------------===// + // Multi-Head Attention with optional Causal Mask + // + // Input shapes: Q=[B, H, S_q, D], K=[B, H, S_kv, D], V=[B, H, S_kv, D] + // where H = num_heads, D = head_dim + // Mask shape: [1, 1, S_q, S_kv] (broadcastable to [B, H, S_q, S_kv]) + // + // Attention(Q, K, V, mask) = softmax(Q @ K^T / sqrt(d_k) + mask) @ V + //===----------------------------------------------------------------------===// + bool MultiHeadAttentionFloat16Test( + const std::vector>& test_cases) { + using namespace mllm; // NOLINT + + for (const auto& [q_shape, k_shape, v_shape, use_mask] : test_cases) { + // Validate shapes: Q=[B, H, S_q, D], K=[B, H, S_kv, D], V=[B, H, S_kv, D] + MLLM_RT_ASSERT_EQ(q_shape.size(), 4); + MLLM_RT_ASSERT_EQ(k_shape.size(), 4); + MLLM_RT_ASSERT_EQ(v_shape.size(), 4); + MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]); // Same batch size + MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]); + MLLM_RT_ASSERT_EQ(q_shape[1], k_shape[1]); // Same num_heads + MLLM_RT_ASSERT_EQ(q_shape[1], v_shape[1]); + MLLM_RT_ASSERT_EQ(q_shape[3], k_shape[3]); // Same head_dim + MLLM_RT_ASSERT_EQ(k_shape[2], v_shape[2]); // K and V have same sequence length + + int32_t B = static_cast(q_shape[0]); + int32_t H = static_cast(q_shape[1]); // num_heads + int32_t S_q = static_cast(q_shape[2]); + int32_t S_kv = static_cast(k_shape[2]); + int32_t D = static_cast(q_shape[3]); // head_dim 
+ + // 1. Create random FP16 inputs on CPU + Tensor Q_cpu = Tensor::random(q_shape, -0.5f, 0.5f, kFloat16, kCPU); + Tensor K_cpu = Tensor::random(k_shape, -0.5f, 0.5f, kFloat16, kCPU); + Tensor V_cpu = Tensor::random(v_shape, -0.5f, 0.5f, kFloat16, kCPU); + + // 2. Create causal mask if needed + // Causal mask: mask[i, j] = 0 if j <= i, else -inf (large negative value) + Tensor mask_cpu; + if (use_mask) { + mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU); + auto* mask_ptr = mask_cpu.ptr(); + + // Fill causal mask: upper triangular part is masked (-inf) + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t j = 0; j < S_kv; ++j) { + int32_t offset = S_kv - S_q; + if (j > i + offset) { + mask_ptr[i * S_kv + j] = MLLM_FP32_TO_FP16(-10000.0f); + } + } + } + } + + // 3. Compute reference result on CPU using FP32 for better precision + Tensor ref_cpu = computeMultiHeadAttentionCPU(Q_cpu, K_cpu, V_cpu, mask_cpu, use_mask); + + // 4. Move inputs to Ascend and compute attention + auto Q_ascend = Q_cpu.to(kAscend); + auto K_ascend = K_cpu.to(kAscend); + auto V_ascend = V_cpu.to(kAscend); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + // Step 1: Q @ K^T (transpose_b=true) + auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true); + + // Step 2: Scale by 1/sqrt(d_k) + auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU); + { + auto* scale_ptr = scale_tensor_cpu.ptr(); + scale_ptr[0] = MLLM_FP32_TO_FP16(scale); + } + auto scale_tensor = scale_tensor_cpu.to(kAscend); + auto scaled_scores = scores * scale_tensor; + + // Step 3: Add mask if needed (broadcasting: [1, 1, S_q, S_kv] -> [B, H, S_q, S_kv]) + if (use_mask) { + auto mask_ascend = mask_cpu.to(kAscend); + scaled_scores = scaled_scores + mask_ascend; + } + + // Step 4: Softmax along last dimension + auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1); + + // Step 5: attn_weights @ V + // [B, H, S_q, S_kv] @ [B, H, S_kv, D] -> [B, H, S_q, D] + auto output_ascend = 
mllm::nn::functional::matmul(attn_weights, V_ascend, false, false); + + // 5. Move result back to CPU and compare + auto output_cpu = output_ascend.to(kCPU); + + auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f); + if (!result.is_close) { + MLLM_ERROR("Multi-head attention test failed for shape Q=[{},{},{},{}], K=[{},{},{},{}], V=[{},{},{},{}], mask={}", + q_shape[0], q_shape[1], q_shape[2], q_shape[3], + k_shape[0], k_shape[1], k_shape[2], k_shape[3], + v_shape[0], v_shape[1], v_shape[2], v_shape[3], + use_mask ? "true" : "false"); + MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}", + result.max_absolute_diff, result.max_relative_diff); + return false; + } + + MLLM_INFO("Multi-head attention test passed: B={}, H={}, S_q={}, S_kv={}, D={}, mask={}", + B, H, S_q, S_kv, D, use_mask ? "true" : "false"); + } + return true; + } + + //===----------------------------------------------------------------------===// + // Multi-Head Attention with Grouped Query Attention (GQA) support + // + // GQA: num_q_heads > num_kv_heads, each KV head is shared by multiple Q heads + // Input shapes: Q=[B, H_q, S_q, D], K=[B, H_kv, S_kv, D], V=[B, H_kv, S_kv, D] + //===----------------------------------------------------------------------===// + bool GroupedQueryAttentionFloat16Test( + const std::vector>& test_cases) { + using namespace mllm; // NOLINT + + for (const auto& [q_shape, k_shape, v_shape, use_mask] : test_cases) { + // Validate shapes + MLLM_RT_ASSERT_EQ(q_shape.size(), 4); + MLLM_RT_ASSERT_EQ(k_shape.size(), 4); + MLLM_RT_ASSERT_EQ(v_shape.size(), 4); + MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]); // Same batch size + MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]); + MLLM_RT_ASSERT_EQ(k_shape[1], v_shape[1]); // KV have same num_heads + MLLM_RT_ASSERT_EQ(q_shape[3], k_shape[3]); // Same head_dim + MLLM_RT_ASSERT_EQ(k_shape[2], v_shape[2]); // K and V have same sequence length + + int32_t B = static_cast(q_shape[0]); + int32_t H_q = 
static_cast(q_shape[1]); // num query heads + int32_t H_kv = static_cast(k_shape[1]); // num KV heads + int32_t S_q = static_cast(q_shape[2]); + int32_t S_kv = static_cast(k_shape[2]); + int32_t D = static_cast(q_shape[3]); + + MLLM_RT_ASSERT_EQ(H_q % H_kv, 0); + int32_t num_groups = H_q / H_kv; + + // 1. Create random FP16 inputs on CPU + Tensor Q_cpu = Tensor::random(q_shape, -0.5f, 0.5f, kFloat16, kCPU); + Tensor K_cpu = Tensor::random(k_shape, -0.5f, 0.5f, kFloat16, kCPU); + Tensor V_cpu = Tensor::random(v_shape, -0.5f, 0.5f, kFloat16, kCPU); + + // 2. Create causal mask if needed + Tensor mask_cpu; + if (use_mask) { + mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU); + auto* mask_ptr = mask_cpu.ptr(); + int32_t offset = S_kv - S_q; + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t j = 0; j < S_kv; ++j) { + if (j > i + offset) { + mask_ptr[i * S_kv + j] = MLLM_FP32_TO_FP16(-10000.0f); + } + } + } + } + + // 3. Compute reference on CPU + Tensor ref_cpu = computeGQACPU(Q_cpu, K_cpu, V_cpu, mask_cpu, use_mask, num_groups); + + // 4. 
Compute on Ascend + auto Q_ascend = Q_cpu.to(kAscend); + auto K_cpu_expanded = repeatKVHeads(K_cpu, num_groups); + auto V_cpu_expanded = repeatKVHeads(V_cpu, num_groups); + auto K_ascend = K_cpu_expanded.to(kAscend); + auto V_ascend = V_cpu_expanded.to(kAscend); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + // Q @ K^T + auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true); + + // Scale + auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU); + { + auto* scale_ptr = scale_tensor_cpu.ptr(); + scale_ptr[0] = MLLM_FP32_TO_FP16(scale); + } + auto scaled_scores = scores * scale_tensor_cpu.to(kAscend); + + // Add mask + if (use_mask) { + scaled_scores = scaled_scores + mask_cpu.to(kAscend); + } + + // Softmax + auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1); + + // attn_weights @ V + auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false); + + // 5. Compare + auto output_cpu = output_ascend.to(kCPU); + auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f); + if (!result.is_close) { + MLLM_ERROR("GQA test failed: B={}, H_q={}, H_kv={}, S_q={}, S_kv={}, D={}, mask={}", + B, H_q, H_kv, S_q, S_kv, D, use_mask ? "true" : "false"); + MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}", + result.max_absolute_diff, result.max_relative_diff); + return false; + } + + MLLM_INFO("GQA test passed: B={}, H_q={}, H_kv={}, S_q={}, S_kv={}, D={}, mask={}", + B, H_q, H_kv, S_q, S_kv, D, use_mask ? 
"true" : "false"); + } + return true; + } + + private: + //===----------------------------------------------------------------------===// + // Helper: Compute Multi-Head Attention reference on CPU (FP32) + //===----------------------------------------------------------------------===// + mllm::Tensor computeMultiHeadAttentionCPU( + const mllm::Tensor& Q_cpu, + const mllm::Tensor& K_cpu, + const mllm::Tensor& V_cpu, + const mllm::Tensor& mask_cpu, + bool use_mask) { + using namespace mllm; // NOLINT + + int32_t B = static_cast(Q_cpu.shape()[0]); + int32_t H = static_cast(Q_cpu.shape()[1]); + int32_t S_q = static_cast(Q_cpu.shape()[2]); + int32_t S_kv = static_cast(K_cpu.shape()[2]); + int32_t D = static_cast(Q_cpu.shape()[3]); + + // Convert inputs to FP32 + Tensor Q_fp32 = Tensor::zeros({B, H, S_q, D}, kFloat32, kCPU); + Tensor K_fp32 = Tensor::zeros({B, H, S_kv, D}, kFloat32, kCPU); + Tensor V_fp32 = Tensor::zeros({B, H, S_kv, D}, kFloat32, kCPU); + + auto* q_fp16 = Q_cpu.ptr(); + auto* k_fp16 = K_cpu.ptr(); + auto* v_fp16 = V_cpu.ptr(); + auto* q_fp32 = Q_fp32.ptr(); + auto* k_fp32 = K_fp32.ptr(); + auto* v_fp32 = V_fp32.ptr(); + + for (size_t i = 0; i < Q_cpu.numel(); ++i) { + q_fp32[i] = MLLM_FP16_TO_FP32(q_fp16[i]); + } + for (size_t i = 0; i < K_cpu.numel(); ++i) { + k_fp32[i] = MLLM_FP16_TO_FP32(k_fp16[i]); + } + for (size_t i = 0; i < V_cpu.numel(); ++i) { + v_fp32[i] = MLLM_FP16_TO_FP32(v_fp16[i]); + } + + // Convert mask to FP32 if needed + const mllm_fp16_t* mask_fp16 = nullptr; + if (use_mask) { + mask_fp16 = mask_cpu.ptr(); + } + + Tensor output_fp32 = Tensor::zeros({B, H, S_q, D}, kFloat32, kCPU); + auto* out_ptr = output_fp32.ptr(); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + for (int32_t b = 0; b < B; ++b) { + for (int32_t h = 0; h < H; ++h) { + std::vector scores(S_q * S_kv, 0.0f); + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t j = 0; j < S_kv; ++j) { + float sum = 0.0f; + for (int32_t k = 0; k < D; ++k) { + float q_val = 
q_fp32[((b * H + h) * S_q + i) * D + k]; + float k_val = k_fp32[((b * H + h) * S_kv + j) * D + k]; + sum += q_val * k_val; + } + scores[i * S_kv + j] = sum * scale; + + // Add mask (mask is broadcastable: [1, 1, S_q, S_kv]) + if (use_mask) { + float mask_val = MLLM_FP16_TO_FP32(mask_fp16[i * S_kv + j]); + scores[i * S_kv + j] += mask_val; + } + } + } + + // Softmax along last dimension + std::vector attn_weights(S_q * S_kv); + for (int32_t i = 0; i < S_q; ++i) { + float max_val = -std::numeric_limits::infinity(); + for (int32_t j = 0; j < S_kv; ++j) { + max_val = std::max(max_val, scores[i * S_kv + j]); + } + + float sum_exp = 0.0f; + for (int32_t j = 0; j < S_kv; ++j) { + float exp_val = std::exp(scores[i * S_kv + j] - max_val); + attn_weights[i * S_kv + j] = exp_val; + sum_exp += exp_val; + } + + for (int32_t j = 0; j < S_kv; ++j) { + attn_weights[i * S_kv + j] /= sum_exp; + } + } + + // Compute output: attn_weights @ V + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t k = 0; k < D; ++k) { + float sum = 0.0f; + for (int32_t j = 0; j < S_kv; ++j) { + float attn_val = attn_weights[i * S_kv + j]; + float v_val = v_fp32[((b * H + h) * S_kv + j) * D + k]; + sum += attn_val * v_val; + } + out_ptr[((b * H + h) * S_q + i) * D + k] = sum; + } + } + } + } + + // Convert output back to FP16 + Tensor output_fp16 = Tensor::zeros({B, H, S_q, D}, kFloat16, kCPU); + auto* out_fp16 = output_fp16.ptr(); + for (size_t i = 0; i < output_fp16.numel(); ++i) { + out_fp16[i] = MLLM_FP32_TO_FP16(out_ptr[i]); + } + + return output_fp16; + } + + //===----------------------------------------------------------------------===// + // Helper: Repeat KV heads for GQA + // [B, H_kv, S, D] -> [B, H_q, S, D] where H_q = H_kv * num_groups + //===----------------------------------------------------------------------===// + mllm::Tensor repeatKVHeads(const mllm::Tensor& kv, int32_t num_groups) { + using namespace mllm; // NOLINT + + if (num_groups == 1) { + return kv; + } + + int32_t B = 
static_cast(kv.shape()[0]); + int32_t H_kv = static_cast(kv.shape()[1]); + int32_t S = static_cast(kv.shape()[2]); + int32_t D = static_cast(kv.shape()[3]); + int32_t H_q = H_kv * num_groups; + + Tensor expanded = Tensor::zeros({B, H_q, S, D}, kv.dtype(), kCPU); + auto* src = kv.ptr(); + auto* dst = expanded.ptr(); + + for (int32_t b = 0; b < B; ++b) { + for (int32_t h_kv = 0; h_kv < H_kv; ++h_kv) { + for (int32_t g = 0; g < num_groups; ++g) { + int32_t h_q = h_kv * num_groups + g; + for (int32_t s = 0; s < S; ++s) { + for (int32_t d = 0; d < D; ++d) { + size_t src_idx = ((b * H_kv + h_kv) * S + s) * D + d; + size_t dst_idx = ((b * H_q + h_q) * S + s) * D + d; + dst[dst_idx] = src[src_idx]; + } + } + } + } + } + + return expanded; + } + + //===----------------------------------------------------------------------===// + // Helper: Compute GQA reference on CPU + //===----------------------------------------------------------------------===// + mllm::Tensor computeGQACPU( + const mllm::Tensor& Q_cpu, + const mllm::Tensor& K_cpu, + const mllm::Tensor& V_cpu, + const mllm::Tensor& mask_cpu, + bool use_mask, + int32_t num_groups) { + // Expand KV heads and compute standard MHA + auto K_expanded = repeatKVHeads(K_cpu, num_groups); + auto V_expanded = repeatKVHeads(V_cpu, num_groups); + return computeMultiHeadAttentionCPU(Q_cpu, K_expanded, V_expanded, mask_cpu, use_mask); + } +}; diff --git a/tests/ascend/AscendKernelTest.hpp b/tests/ascend/AscendKernelTest.hpp index 138ee5ae8..a01028906 100644 --- a/tests/ascend/AscendKernelTest.hpp +++ b/tests/ascend/AscendKernelTest.hpp @@ -48,5 +48,75 @@ class AscendKernelTest : public KernelTest { } return true; } + + // Test Sub operation with different shapes + bool SubFloat16Test(const std::vector& shapes) { + using namespace mllm; // NOLINT + for (auto& shape : shapes) { + // 1. 
Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + + // 2. Compute reference result (FP16) on CPU + Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* y_ptr = y_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + auto num_elements = x_cpu.numel(); + for (size_t i = 0; i < num_elements; ++i) { + r_ptr[i] = x_ptr[i] - y_ptr[i]; + } + } + + // 3. Move inputs to Ascend and run Sub (z = x - y) + auto x_ascend = x_cpu.to(kAscend); + auto y_ascend = y_cpu.to(kAscend); + auto z_ascend = x_ascend - y_ascend; + + // 4. Move result back to CPU and compare with reference using allClose + auto z_cpu = z_ascend.to(kCPU); + auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + return false; + } + } + return true; + } + + // Test Mul operation with different shapes + bool MulFloat16Test(const std::vector& shapes) { + using namespace mllm; // NOLINT + for (auto& shape : shapes) { + // 1. Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + + // 2. Compute reference result (FP16) on CPU + Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* y_ptr = y_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + auto num_elements = x_cpu.numel(); + for (size_t i = 0; i < num_elements; ++i) { + r_ptr[i] = x_ptr[i] * y_ptr[i]; + } + } + + // 3. Move inputs to Ascend and run Mul (z = x * y) + auto x_ascend = x_cpu.to(kAscend); + auto y_ascend = y_cpu.to(kAscend); + auto z_ascend = x_ascend * y_ascend; + + // 4. 
Move result back to CPU and compare with reference using allClose + auto z_cpu = z_ascend.to(kCPU); + auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + return false; + } + } + return true; + } }; diff --git a/tests/ascend/AscendLinearKernelTest.hpp b/tests/ascend/AscendLinearKernelTest.hpp new file mode 100644 index 000000000..b7fca56fa --- /dev/null +++ b/tests/ascend/AscendLinearKernelTest.hpp @@ -0,0 +1,164 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/core/OpTypes.hpp" +#include "mllm/core/aops/LinearOp.hpp" +#include "mllm/engine/Context.hpp" +#include "mllm/nn/Functional.hpp" +#include "KernelTestHelper.hpp" +#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp" +#include +#include +#include + + +class AscendLinearKernelTest : public KernelTest { + public: + AscendLinearKernelTest() = default; + ~AscendLinearKernelTest() override = default; + + bool LinearFloat16Test(const std::vector>& test_cases) { + using namespace mllm; // NOLINT + for (auto& test_case : test_cases) { + auto input_shape = std::get<0>(test_case); + int in_channels = std::get<1>(test_case); + int out_channels = std::get<2>(test_case); + + std::cout << "[LinearTest] Testing shape=["; + for (size_t i = 0; i < input_shape.size(); ++i) { + std::cout << input_shape[i] << (i < input_shape.size() - 1 ? ", " : ""); + } + std::cout << "], in=" << in_channels << ", out=" << out_channels << std::endl; + + // 1. Construct random FP16 inputs on CPU + // x: [M, K] where K = in_channels + Tensor x_cpu = Tensor::random(input_shape, -1, 1, kFloat16, kCPU); + + // Weight shape for ATB: [K, N] where K=in_channels, N=out_channels + Tensor weight_cpu = Tensor::random({in_channels, out_channels}, -0.5, 0.5, kFloat16, kCPU); + + // 2. 
Compute reference result on CPU + // y = x @ weight, where x is [M, K], weight is [K, N], output is [M, N] + auto output_shape = input_shape; + output_shape[output_shape.size() - 1] = out_channels; + Tensor ref_cpu = Tensor::zeros(output_shape, kFloat16, kCPU); + + { + auto* x_ptr = x_cpu.ptr(); + auto* w_ptr = weight_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + + size_t batch_size = 1; + for (size_t i = 0; i < input_shape.size() - 1; ++i) { + batch_size *= input_shape[i]; + } + + for (size_t b = 0; b < batch_size; ++b) { + for (int o = 0; o < out_channels; ++o) { + float sum = 0.0f; + for (int i = 0; i < in_channels; ++i) { + float x_val = MLLM_FP16_TO_FP32(x_ptr[b * in_channels + i]); + float w_val = MLLM_FP16_TO_FP32(w_ptr[i * out_channels + o]); // weight is [K, N] + sum += x_val * w_val; + } + r_ptr[b * out_channels + o] = MLLM_FP32_TO_FP16(sum); + } + } + } + + // 3. Move inputs to Ascend and run Linear via matmul + auto x_ascend = x_cpu.to(kAscend); + auto weight_ascend = weight_cpu.to(kAscend); + + // Use matmul: y = x @ weight + auto y_ascend = nn::functional::matmul(x_ascend, weight_ascend, false, false); + + // 4. Move result back to CPU and compare with reference + auto y_cpu = y_ascend.to(kCPU); + auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + std::cout << "[LinearTest] FAILED!" << std::endl; + return false; + } + std::cout << "[LinearTest] PASSED" << std::endl; + } + return true; + } + + + bool LinearWithBiasFloat16Test(const std::vector>& test_cases) { + using namespace mllm; // NOLINT + for (auto& test_case : test_cases) { + auto input_shape = std::get<0>(test_case); + int in_channels = std::get<1>(test_case); + int out_channels = std::get<2>(test_case); + + std::cout << "[LinearWithBiasTest] Testing shape=["; + for (size_t i = 0; i < input_shape.size(); ++i) { + std::cout << input_shape[i] << (i < input_shape.size() - 1 ? 
", " : ""); + } + std::cout << "], in=" << in_channels << ", out=" << out_channels << std::endl; + + // 1. Create random input, weight and bias on CPU + Tensor x_cpu = Tensor::random(input_shape, -1, 1, kFloat16, kCPU); + // Weight shape: [out_channels, in_channels] + Tensor weight_cpu = Tensor::random({out_channels, in_channels}, -0.5, 0.5, kFloat16, kCPU); + // Bias shape: [1, out_channels] for ATB Linear (2D tensor required) + Tensor bias_cpu = Tensor::random({1, out_channels}, -0.1, 0.1, kFloat16, kCPU); + + // 2. Compute reference result on CPU + auto output_shape = input_shape; + output_shape[output_shape.size() - 1] = out_channels; + Tensor ref_cpu = Tensor::zeros(output_shape, kFloat16, kCPU); + + { + auto* x_ptr = x_cpu.ptr(); + auto* w_ptr = weight_cpu.ptr(); + auto* b_ptr = bias_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + + size_t batch_size = 1; + for (size_t i = 0; i < input_shape.size() - 1; ++i) { + batch_size *= input_shape[i]; + } + + // y = x @ W^T + b, where W is [out_channels, in_channels] + for (size_t b = 0; b < batch_size; ++b) { + for (int o = 0; o < out_channels; ++o) { + float sum = 0.0f; + for (int i = 0; i < in_channels; ++i) { + float x_val = MLLM_FP16_TO_FP32(x_ptr[b * in_channels + i]); + float w_val = MLLM_FP16_TO_FP32(w_ptr[o * in_channels + i]); + sum += x_val * w_val; + } + float bias_val = MLLM_FP16_TO_FP32(b_ptr[o]); + sum += bias_val; + r_ptr[b * out_channels + o] = MLLM_FP32_TO_FP16(sum); + } + } + } + + // 3. Move tensors to Ascend and run linear + auto x_ascend = x_cpu.to(kAscend); + auto weight_ascend = weight_cpu.to(kAscend); + auto bias_ascend = bias_cpu.to(kAscend); + + // Use nn::functional::linear directly + auto y_ascend = nn::functional::linear(x_ascend, weight_ascend, bias_ascend); + + // 4. Compare result with reference + auto y_cpu = y_ascend.to(kCPU); + auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + std::cout << "[LinearWithBiasTest] FAILED!" 
<< std::endl;
+        return false;
+      }
+      std::cout << "[LinearWithBiasTest] PASSED" << std::endl;
+    }
+    return true;
+  }
+};
diff --git a/tests/ascend/AscendRMSNormKernelTest.hpp b/tests/ascend/AscendRMSNormKernelTest.hpp
new file mode 100644
index 000000000..0af879c8f
--- /dev/null
+++ b/tests/ascend/AscendRMSNormKernelTest.hpp
@@ -0,0 +1,93 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/layers/RMSNorm.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include
+#include
+
+class AscendRMSNormKernelTest : public KernelTest {
+ public:
+  AscendRMSNormKernelTest() = default;
+  ~AscendRMSNormKernelTest() override = default;
+
+  // Checks Ascend RMSNorm against a CPU reference whose arithmetic is done in float.
+  // Each test case is {input_shape, norm_size, epsilon}. Returns false on the first
+  // invalid test case or the first result that allClose rejects.
+  // NOTE(review): this copy of the patch has lost some angle-bracket contents (the two
+  // bare #include targets, the element type of test_cases, the ptr() template argument);
+  // restore them from the repository before building.
+  bool RMSNormFloat16Test(const std::vector>& test_cases) {
+    using namespace mllm;  // NOLINT
+    for (auto& test_case : test_cases) {
+      auto input_shape = std::get<0>(test_case);
+      int norm_size = std::get<1>(test_case);
+      float epsilon = std::get<2>(test_case);
+
+      // norm_size must equal the last dimension of input_shape. Checked at run time
+      // rather than with assert(), which compiles away under NDEBUG and would let a
+      // malformed case index past the row buffers below.
+      if (input_shape.empty() || norm_size != static_cast<int>(input_shape.back())) {
+        return false;
+      }
+
+      // 1. Construct random FP16 inputs on CPU
+      Tensor x_cpu = Tensor::random(input_shape, -2, 2, kFloat16, kCPU);
+
+      // Weight shape: [norm_size]
+      Tensor weight_cpu = Tensor::random({norm_size}, 0.5, 1.5, kFloat16, kCPU);
+
+      // 2. Compute reference result (FP16) on CPU
+      // RMSNorm: y = x * weight / sqrt(mean(x^2) + epsilon)
+      Tensor ref_cpu = Tensor::zeros(input_shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr();
+        auto* w_ptr = weight_cpu.ptr();
+        auto* r_ptr = ref_cpu.ptr();
+
+        size_t batch_size = 1;
+        for (size_t i = 0; i < input_shape.size() - 1; ++i) {
+          batch_size *= input_shape[i];
+        }
+
+        // Perform RMSNorm for each batch
+        for (size_t b = 0; b < batch_size; ++b) {
+          float sum_squares = 0.0f;
+          for (int i = 0; i < norm_size; ++i) {
+            float x_val = MLLM_FP16_TO_FP32(x_ptr[b * norm_size + i]);
+            sum_squares += x_val * x_val;
+          }
+          float rms = std::sqrt(sum_squares / norm_size + epsilon);
+
+          // Normalize and scale by weight
+          for (int i = 0; i < norm_size; ++i) {
+            float x_val = MLLM_FP16_TO_FP32(x_ptr[b * norm_size + i]);
+            float w_val = MLLM_FP16_TO_FP32(w_ptr[i]);
+            float result = (x_val / rms) * w_val;
+            r_ptr[b * norm_size + i] = MLLM_FP32_TO_FP16(result);
+          }
+        }
+      }
+
+      // 3. Move inputs to Ascend and run RMSNorm
+      auto x_ascend = x_cpu.to(kAscend);
+      auto weight_ascend = weight_cpu.to(kAscend);
+
+      // Use functional API - one line to execute the operator
+      auto y_ascend = nn::functional::rmsNorm(x_ascend, weight_ascend, epsilon);
+
+      // 4. Move result back to CPU and compare with reference using allClose
+      auto y_cpu = y_ascend.to(kCPU);
+      auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
diff --git a/tests/ascend/AscendSiLUKernelTest.hpp b/tests/ascend/AscendSiLUKernelTest.hpp
new file mode 100644
index 000000000..aaa798f69
--- /dev/null
+++ b/tests/ascend/AscendSiLUKernelTest.hpp
@@ -0,0 +1,67 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+ +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/nn/Functional.hpp" +#include "KernelTestHelper.hpp" +#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp" +#include +#include + +class AscendSiLUKernelTest : public KernelTest { + public: + AscendSiLUKernelTest() = default; + ~AscendSiLUKernelTest() override = default; + + // Test SiLU operation with different shapes + bool SiLUFloat16Test(const std::vector& shapes) { + using namespace mllm; // NOLINT + for (auto& shape : shapes) { + // 1. Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(shape, -5, 5, kFloat16, kCPU); + + // 2. Compute reference result (FP16) on CPU + // SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x)) + Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + auto num_elements = x_cpu.numel(); + for (size_t i = 0; i < num_elements; ++i) { + // Convert FP16 to FP32 for computation + float x_val = MLLM_FP16_TO_FP32(x_ptr[i]); + + // Compute sigmoid(x) = 1 / (1 + exp(-x)) + float sigmoid_x; + if (x_val >= 0) { + sigmoid_x = 1.0f / (1.0f + std::exp(-x_val)); + } else { + float exp_x = std::exp(x_val); + sigmoid_x = exp_x / (1.0f + exp_x); + } + + // SiLU(x) = x * sigmoid(x) + float result = x_val * sigmoid_x; + + // Convert back to FP16 + r_ptr[i] = MLLM_FP32_TO_FP16(result); + } + } + + // 3. Move inputs to Ascend and run SiLU + auto x_ascend = x_cpu.to(kAscend); + auto y_ascend = mllm::nn::functional::silu(x_ascend); + + // 4. 
Move result back to CPU and compare with reference using allClose
+      auto y_cpu = y_ascend.to(kCPU);
+      auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
diff --git a/tests/ascend/AscendSoftmaxKernelTest.hpp b/tests/ascend/AscendSoftmaxKernelTest.hpp
new file mode 100644
index 000000000..2cc6d7b73
--- /dev/null
+++ b/tests/ascend/AscendSoftmaxKernelTest.hpp
@@ -0,0 +1,114 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include
+#include
+
+class AscendSoftmaxKernelTest : public KernelTest {
+ public:
+  AscendSoftmaxKernelTest() = default;
+  ~AscendSoftmaxKernelTest() override = default;
+
+  // Checks Ascend softmax against a CPU reference computed in float, for every
+  // (shape, axis) pair. A negative axis counts back from the last dimension.
+  // Returns false on an out-of-range axis or the first result that allClose rejects.
+  // NOTE(review): this copy of the patch has lost some angle-bracket contents (the two
+  // bare #include targets, the element types of shapes/axes, the ptr() template
+  // argument); restore them from the repository before building.
+  bool SoftmaxFloat16Test(const std::vector& shapes, const std::vector& axes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      for (auto axis : axes) {
+        // 1. Construct random FP16 inputs on CPU
+        Tensor x_cpu = Tensor::random(shape, -5, 5, kFloat16, kCPU);
+
+        // 2. Compute reference result (FP16) on CPU
+        // Softmax(x_i) = exp(x_i - max(x)) / sum(exp(x_j - max(x)))
+        Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+        {
+          auto* x_ptr = x_cpu.ptr();
+          auto* r_ptr = ref_cpu.ptr();
+
+          // Convert axis to positive index
+          int ndim = static_cast<int>(shape.size());
+          int pos_axis = axis;
+          if (pos_axis < 0) {
+            pos_axis = ndim + pos_axis;
+          }
+          // A malformed test case must fail here rather than index past the shape.
+          if (pos_axis < 0 || pos_axis >= ndim) {
+            return false;
+          }
+
+          // View the tensor as [outer_size, axis_size, inner_size] around the softmax axis.
+          size_t outer_size = 1;
+          for (int i = 0; i < pos_axis; ++i) {
+            outer_size *= shape[i];
+          }
+
+          size_t axis_size = shape[pos_axis];
+
+          size_t inner_size = 1;
+          for (int i = pos_axis + 1; i < ndim; ++i) {
+            inner_size *= shape[i];
+          }
+
+          // Row-major offset of element (outer, i, inner) in that view. One formula covers
+          // every axis position, including a middle axis of a rank >= 3 shape, so the
+          // three passes below cannot disagree about where an element lives.
+          auto flat_index = [axis_size, inner_size](size_t outer, size_t i, size_t inner) {
+            return (outer * axis_size + i) * inner_size + inner;
+          };
+
+          // Scratch for the exponentials of one slice; fully overwritten for each slice.
+          std::vector<float> exp_vals(axis_size);
+
+          // Compute softmax for each slice along the axis
+          for (size_t outer = 0; outer < outer_size; ++outer) {
+            for (size_t inner = 0; inner < inner_size; ++inner) {
+              // Find max value for numerical stability
+              float max_val = -std::numeric_limits<float>::infinity();
+              for (size_t i = 0; i < axis_size; ++i) {
+                float val = MLLM_FP16_TO_FP32(x_ptr[flat_index(outer, i, inner)]);
+                max_val = std::max(max_val, val);
+              }
+
+              // Compute exp(x - max) and sum
+              float sum_exp = 0.0f;
+              for (size_t i = 0; i < axis_size; ++i) {
+                float val = MLLM_FP16_TO_FP32(x_ptr[flat_index(outer, i, inner)]);
+                exp_vals[i] = std::exp(val - max_val);
+                sum_exp += exp_vals[i];
+              }
+
+              // Compute softmax and store result
+              for (size_t i = 0; i < axis_size; ++i) {
+                float result = exp_vals[i] / sum_exp;
+                r_ptr[flat_index(outer, i, inner)] = MLLM_FP32_TO_FP16(result);
+              }
+            }
+          }
+        }
+
+        // 3. Move inputs to Ascend and run Softmax
+        auto x_ascend = x_cpu.to(kAscend);
+        auto y_ascend = mllm::nn::functional::softmax(x_ascend, axis);
+
+        // 4. Move result back to CPU and compare with reference using allClose
+        auto y_cpu = y_ascend.to(kCPU);
+        auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);
+        if (!result.is_close) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+};
diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp
index b0489f545..bccb7a154 100644
--- a/tests/ascend/KernelTest.cpp
+++ b/tests/ascend/KernelTest.cpp
@@ -25,6 +25,205 @@ TEST_F(AscendKernelTest, AddFloat16) {
             true);
 }
 
+//===----------------------------------------------------------------------===//
+// Element wise SUB.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, SubFloat16) {
+  const bool passed = SubFloat16Test({
+      {2, 3},
+      {1, 1},
+      {4, 4},
+      {8, 8},
+      {16, 16},
+      {32, 32},
+  });
+  EXPECT_TRUE(passed);
+}
+
+//===----------------------------------------------------------------------===//
+// Element wise MUL.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, MulFloat16) {
+  const bool passed = MulFloat16Test({
+      {2, 3},
+      {1, 1},
+      {4, 4},
+      {8, 8},
+      {16, 16},
+      {32, 32},
+  });
+  EXPECT_TRUE(passed);
+}
+
+//===----------------------------------------------------------------------===//
+// SiLU activation function.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendSiLUKernelTest.hpp"
+TEST_F(AscendSiLUKernelTest, SiLUFloat16) {
+  const bool passed = SiLUFloat16Test({
+      {2, 3},
+      {1, 1},
+      {4, 4},
+      {8, 8},
+      {16, 16},
+      {32, 32},
+      {1, 1024},
+      {128, 128},
+  });
+  EXPECT_TRUE(passed);
+}
+
+//===----------------------------------------------------------------------===//
+// Linear layer (MatMul based test).
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendLinearKernelTest.hpp"
+TEST_F(AscendLinearKernelTest, LinearFloat16) {
+  const bool passed = LinearFloat16Test({
+      // {input_shape, in_channels, out_channels}
+      {{2, 3}, 3, 4},
+      {{1, 8}, 8, 16},
+      {{4, 16}, 16, 32},
+      {{8, 32}, 32, 64},
+      {{1, 1024}, 1024, 512},
+  });
+  EXPECT_TRUE(passed);
+}
+
+TEST_F(AscendLinearKernelTest, LinearWithBiasFloat16) {
+  const bool passed = LinearWithBiasFloat16Test({
+      // {input_shape, in_channels, out_channels}
+      {{2, 3}, 3, 4},
+      {{1, 8}, 8, 16},
+      {{4, 16}, 16, 32},
+  });
+  EXPECT_TRUE(passed);
+}
+
+//===----------------------------------------------------------------------===//
+// RMSNorm layer.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendRMSNormKernelTest.hpp"
+TEST_F(AscendRMSNormKernelTest, RMSNormFloat16) {
+  const bool passed = RMSNormFloat16Test({
+      // {input_shape, norm_size, epsilon}
+      // Note: ATB RMSNorm requires last dim to be multiple of 16 (FP16 alignment)
+      {{2, 16}, 16, 1e-5f},
+      {{1, 32}, 32, 1e-5f},
+      {{4, 64}, 64, 1e-6f},
+      {{8, 128}, 128, 1e-5f},
+      {{1, 1024}, 1024, 1e-5f},
+      {{128, 256}, 256, 1e-5f},
+  });
+  EXPECT_TRUE(passed);
+}
+
+//===----------------------------------------------------------------------===//
+// Softmax activation function.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendSoftmaxKernelTest.hpp"
+TEST_F(AscendSoftmaxKernelTest, SoftmaxFloat16) {
+  const bool passed = SoftmaxFloat16Test({
+          {2, 3},
+          {1, 8},
+          {4, 4},
+          {8, 8},
+          {16, 16},
+          {1, 1024},
+          {128, 128},
+      },
+      {-1, 0, 1}  // Test different axes
+  );
+  EXPECT_TRUE(passed);
+}
+
+//===----------------------------------------------------------------------===//
+// Scaled Dot-Product Attention (using existing operators).
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendAttentionKernelTest.hpp"
+TEST_F(AscendAttentionKernelTest, ScaledDotProductAttentionFloat16) {
+  const bool passed = ScaledDotProductAttentionFloat16Test({
+      // {Q_shape, K_shape, V_shape}
+      // Format: [B, S, D]
+      {{1, 4, 8}, {1, 4, 8}, {1, 4, 8}},        // Small: B=1, S=4, D=8
+      {{1, 8, 16}, {1, 8, 16}, {1, 8, 16}},     // Medium: B=1, S=8, D=16
+      {{2, 4, 8}, {2, 4, 8}, {2, 4, 8}},        // Batch=2
+      {{1, 16, 32}, {1, 16, 32}, {1, 16, 32}},  // Larger: B=1, S=16, D=32
+      {{1, 8, 64}, {1, 8, 64}, {1, 8, 64}},     // D=64 (common head dim)
+  });
+  EXPECT_TRUE(passed);
+}
+
+//===----------------------------------------------------------------------===//
+// Multi-Head Attention with Causal Mask.
+//
+// FP16 (Ascend currently uses FP16)
+// Input format: [B, H, S, D] where H = num_heads, D = head_dim
+//===----------------------------------------------------------------------===//
+TEST_F(AscendAttentionKernelTest, MultiHeadAttentionFloat16) {
+  const bool passed = MultiHeadAttentionFloat16Test({
+      // {Q_shape, K_shape, V_shape, use_causal_mask}
+      // Format: [B, H, S, D]
+
+      // Without mask
+      {{1, 1, 4, 8}, {1, 1, 4, 8}, {1, 1, 4, 8}, false},        // Single head, no mask
+      {{1, 4, 8, 16}, {1, 4, 8, 16}, {1, 4, 8, 16}, false},     // 4 heads, no mask
+      {{1, 8, 16, 64}, {1, 8, 16, 64}, {1, 8, 16, 64}, false},  // 8 heads, D=64
+
+      // With causal mask
+      {{1, 1, 4, 8}, {1, 1, 4, 8}, {1, 1, 4, 8}, true},        // Single head, with mask
+      {{1, 4, 8, 16}, {1, 4, 8, 16}, {1, 4, 8, 16}, true},     // 4 heads, with mask
+      {{1, 8, 16, 64}, {1, 8, 16, 64}, {1, 8, 16, 64}, true},  // 8 heads, with mask
+      {{2, 4, 8, 32}, {2, 4, 8, 32}, {2, 4, 8, 32}, true},     // Batch=2, with mask
+
+      // Different S_q and S_kv (useful for KV cache scenarios)
+      {{1, 4, 1, 32}, {1, 4, 8, 32}, {1, 4, 8, 32}, true},    // S_q=1, S_kv=8 (decode)
+      {{1, 4, 4, 32}, {1, 4, 16, 32}, {1, 4, 16, 32}, true},  // S_q < S_kv
+  });
+  EXPECT_TRUE(passed);
+}
+
+//===----------------------------------------------------------------------===//
+// Grouped Query Attention (GQA).
+// +// FP16 (Ascend currently uses FP16) +// GQA: num_q_heads > num_kv_heads, each KV head is shared by multiple Q heads +//===----------------------------------------------------------------------===// +TEST_F(AscendAttentionKernelTest, GroupedQueryAttentionFloat16) { + EXPECT_EQ(GroupedQueryAttentionFloat16Test({ + // {Q_shape [B, H_q, S_q, D], K_shape [B, H_kv, S_kv, D], V_shape, use_mask} + + // GQA with 2 groups (H_q = 4, H_kv = 2) + {{1, 4, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, false}, + {{1, 4, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, true}, + + // GQA with 4 groups (H_q = 8, H_kv = 2) + {{1, 8, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, false}, + {{1, 8, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, true}, + + // MQA (Multi-Query Attention): H_kv = 1 + {{1, 4, 8, 32}, {1, 1, 8, 32}, {1, 1, 8, 32}, true}, + {{1, 8, 16, 64}, {1, 1, 16, 64}, {1, 1, 16, 64}, true}, + + // Batch > 1 + {{2, 8, 8, 32}, {2, 2, 8, 32}, {2, 2, 8, 32}, true}, + }), + true); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv);