From 9bef2618343c06b9b86d1f8cde815a12c35ee088 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 30 Jan 2026 23:23:47 +0800 Subject: [PATCH 1/4] feat(ascend): create some new ops(SiLU,RMSNorm,Linear and so on) --- mllm/backends/ascend/ops/AscendElewiseOps.cpp | 173 +++++++++++++++++- mllm/backends/ascend/ops/AscendElewiseOps.hpp | 30 +++ mllm/backends/ascend/ops/AscendLinearOp.cpp | 167 +++++++++++++++++ mllm/backends/ascend/ops/AscendLinearOp.hpp | 28 +++ mllm/backends/ascend/ops/AscendMatMulOp.cpp | 147 +++++++++++++++ mllm/backends/ascend/ops/AscendMatMulOp.hpp | 27 +++ mllm/backends/ascend/ops/AscendRMSNormOp.cpp | 106 +++++++++++ mllm/backends/ascend/ops/AscendRMSNormOp.hpp | 27 +++ mllm/backends/ascend/ops/AscendSiLUOp.cpp | 115 ++++++++++++ mllm/backends/ascend/ops/AscendSiLUOp.hpp | 27 +++ mllm/backends/ascend/ops/AscendSoftmaxOp.cpp | 135 ++++++++++++++ mllm/backends/ascend/ops/AscendSoftmaxOp.hpp | 27 +++ mllm/backends/ascend/ops/AscendViewOp.cpp | 16 ++ mllm/backends/ascend/ops/AscendViewOp.hpp | 25 +++ 14 files changed, 1047 insertions(+), 3 deletions(-) create mode 100644 mllm/backends/ascend/ops/AscendLinearOp.cpp create mode 100644 mllm/backends/ascend/ops/AscendLinearOp.hpp create mode 100644 mllm/backends/ascend/ops/AscendMatMulOp.cpp create mode 100644 mllm/backends/ascend/ops/AscendMatMulOp.hpp create mode 100644 mllm/backends/ascend/ops/AscendRMSNormOp.cpp create mode 100644 mllm/backends/ascend/ops/AscendRMSNormOp.hpp create mode 100644 mllm/backends/ascend/ops/AscendSiLUOp.cpp create mode 100644 mllm/backends/ascend/ops/AscendSiLUOp.hpp create mode 100644 mllm/backends/ascend/ops/AscendSoftmaxOp.cpp create mode 100644 mllm/backends/ascend/ops/AscendSoftmaxOp.hpp create mode 100644 mllm/backends/ascend/ops/AscendViewOp.cpp create mode 100644 mllm/backends/ascend/ops/AscendViewOp.hpp diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.cpp b/mllm/backends/ascend/ops/AscendElewiseOps.cpp index 762ef1dfe..be1e1b671 100644 --- 
a/mllm/backends/ascend/ops/AscendElewiseOps.cpp +++ b/mllm/backends/ascend/ops/AscendElewiseOps.cpp @@ -34,9 +34,6 @@ void AscendAddOp::forward(const std::vector& inputs, std::vector if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) { NYI("AscendAddOp currently requires x/y/z have same dtype"); } - if (x.numel() != y.numel() || x.numel() != z.numel()) { - NYI("AscendAddOp demo only supports no-broadcast case (numel equal)"); - } atb::infer::ElewiseParam addParam; addParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_ADD; @@ -106,4 +103,174 @@ void AscendAddOp::forward(const std::vector& inputs, std::vector atb::DestroyOperation(op); } +AscendSubOp::AscendSubOp(const aops::SubOpOptions& options) : aops::SubOp(options) {} + +void AscendSubOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendSubOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 2); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + const auto& y = inputs[1]; + auto& z = outputs[0]; + + if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) { + NYI("AscendSubOp currently requires x/y/z have same dtype"); + } + + atb::infer::ElewiseParam subParam; + subParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_SUB; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(subParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_SUB) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_y; + atb::Tensor atb_z; + + fillAtbTensorDesc(x, atb_x.desc); + fillAtbTensorDesc(y, atb_y.desc); + fillAtbTensorDesc(z, atb_z.desc); + + atb_x.deviceData = reinterpret_cast(x.ptr()); + atb_x.dataSize = x.bytes(); + atb_y.deviceData = reinterpret_cast(y.ptr()); + atb_y.dataSize = y.bytes(); + atb_z.deviceData = 
reinterpret_cast(z.ptr()); + atb_z.dataSize = z.bytes(); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + inTensors.push_back(atb_y); + outTensors.push_back(atb_z); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + { + ASCEND_TIME_SCOPE("AscendSubOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Execute failed, status={}", static_cast(st)); + } + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +AscendMulOp::AscendMulOp(const aops::MulOpOptions& options) : aops::MulOp(options) {} + +void AscendMulOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendMulOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 2); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + const auto& y = inputs[1]; + auto& z = outputs[0]; + + if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) { + NYI("AscendMulOp currently requires x/y/z have same dtype"); + } + + atb::infer::ElewiseParam mulParam; + mulParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_MUL; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(mulParam, &op); + if (st != atb::NO_ERROR || op 
== nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_MUL) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_y; + atb::Tensor atb_z; + + fillAtbTensorDesc(x, atb_x.desc); + fillAtbTensorDesc(y, atb_y.desc); + fillAtbTensorDesc(z, atb_z.desc); + + atb_x.deviceData = reinterpret_cast(x.ptr()); + atb_x.dataSize = x.bytes(); + atb_y.deviceData = reinterpret_cast(y.ptr()); + atb_y.dataSize = y.bytes(); + atb_z.deviceData = reinterpret_cast(z.ptr()); + atb_z.dataSize = z.bytes(); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + inTensors.push_back(atb_y); + outTensors.push_back(atb_z); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + { + ASCEND_TIME_SCOPE("AscendMulOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Execute failed, status={}", static_cast(st)); + } + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + } // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.hpp b/mllm/backends/ascend/ops/AscendElewiseOps.hpp index 26117cbc2..9122e20cb 100644 --- a/mllm/backends/ascend/ops/AscendElewiseOps.hpp +++ 
b/mllm/backends/ascend/ops/AscendElewiseOps.hpp @@ -24,4 +24,34 @@ class AscendAddOpFactory final : public TypedOpFactory& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendSubOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::SubOpOptions& options) override { + return std::make_shared(options); + } +}; + +class AscendMulOp final : public aops::MulOp { + public: + explicit AscendMulOp(const aops::MulOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendMulOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::MulOpOptions& options) override { + return std::make_shared(options); + } +}; + } // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendLinearOp.cpp b/mllm/backends/ascend/ops/AscendLinearOp.cpp new file mode 100644 index 000000000..a8b986984 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendLinearOp.cpp @@ -0,0 +1,167 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/ops/AscendLinearOp.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendLinearOp::AscendLinearOp(const aops::LinearOpOptions& options) : aops::LinearOp(options) {} + +void AscendLinearOp::reshape(const std::vector& inputs, std::vector& outputs) { + if (options().isRedirect()) { + const auto& input = inputs[0]; + const auto& weight = inputs[1]; + auto out_shape = input.shape(); + out_shape[out_shape.size() - 1] = weight.shape()[0]; // out_channels + outputs.emplace_back(Tensor::empty(out_shape, input.dtype(), input.device())); + return; + } + aops::LinearOp::reshape(inputs, outputs); +} + +void AscendLinearOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendLinearOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT(inputs.size() >= 1 && inputs.size() <= 3); + + const Tensor* weight_ptr = nullptr; + const Tensor* bias_ptr = nullptr; + + if (inputs.size() == 1) { + weight_ptr = &weight(); + if (options().bias) { bias_ptr = &bias(); } + } else if (inputs.size() == 2) { + weight_ptr = &inputs[1]; + } else if (inputs.size() == 3) { + weight_ptr = &inputs[1]; + bias_ptr = &inputs[2]; + } + + const auto& x = inputs[0]; + auto& y = outputs[0]; + + // Validate that input tensors are FP16 + if (x.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Input tensor must be FP16, but got dtype={}", + static_cast(x.dtype())); + } + if (weight_ptr->dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Weight tensor must be FP16, but got dtype={}", + static_cast(weight_ptr->dtype())); + } + if (bias_ptr != nullptr && bias_ptr->dtype() != 
MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Bias tensor must be FP16, but got dtype={}", + static_cast(bias_ptr->dtype())); + } + + // Validate bias dimensions: ATB Linear requires bias to be 2D [1, out_channels] + if (bias_ptr != nullptr) { + const auto& bias_shape = bias_ptr->shape(); + if (bias_shape.size() == 1) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Bias tensor must be 2D [1, out_channels], but got 1D shape with size={}. " + "Please reshape the bias tensor before passing to AscendLinearOp.", + bias_shape[0]); + } + if (bias_shape.size() != 2 || bias_shape[0] != 1) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendLinearOp: Bias tensor must be 2D with shape [1, out_channels], but got shape=[{}, {}]", + bias_shape.size() >= 1 ? bias_shape[0] : 0, + bias_shape.size() >= 2 ? bias_shape[1] : 0); + } + } + + + atb::infer::LinearParam linearParam; + linearParam.transposeA = false; + linearParam.transposeB = true; // Set to true because weight is [out_channels, in_channels] + linearParam.hasBias = (bias_ptr != nullptr); + linearParam.outDataType = ACL_DT_UNDEFINED; + linearParam.enAccum = false; + linearParam.matmulType = atb::infer::LinearParam::MATMUL_UNDEFINED; + linearParam.quantMode = atb::infer::LinearParam::QUANT_UNDEFINED; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(linearParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(Linear) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_weight; + atb::Tensor atb_y; + atb::Tensor atb_bias; + + fillAtbTensor(x, atb_x); + fillAtbTensor(*weight_ptr, atb_weight); + fillAtbTensor(y, atb_y); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + inTensors.push_back(atb_weight); + + if (bias_ptr != nullptr) { + fillAtbTensor(*bias_ptr, atb_bias); + 
inTensors.push_back(atb_bias); + } + + outTensors.push_back(atb_y); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB LinearOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + + { + ASCEND_TIME_SCOPE("AscendLinearOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB LinearOp Execute failed, status={}", static_cast(st)); + } + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendLinearOp.hpp b/mllm/backends/ascend/ops/AscendLinearOp.hpp new file mode 100644 index 000000000..c1b490c70 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendLinearOp.hpp @@ -0,0 +1,28 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/LinearOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendLinearOp final : public aops::LinearOp { + public: + explicit AscendLinearOp(const aops::LinearOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; + void reshape(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendLinearOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::LinearOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendMatMulOp.cpp b/mllm/backends/ascend/ops/AscendMatMulOp.cpp new file mode 100644 index 000000000..b08cc77fb --- /dev/null +++ b/mllm/backends/ascend/ops/AscendMatMulOp.cpp @@ -0,0 +1,147 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/ops/AscendMatMulOp.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendMatMulOp::AscendMatMulOp(const aops::MatMulOpOptions& options) : aops::MatMulOp(options) {} + +void AscendMatMulOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +namespace { + +// Helper to fill ATB tensor with custom shape (for reshape without copy) +void fillAtbTensorWithShape(const Tensor& t, atb::Tensor& atb_tensor, const std::vector& shape) { + atb::TensorDesc desc; + desc.dtype = ACL_FLOAT16; // Ascend uses FP16 + desc.format = ACL_FORMAT_ND; + + desc.shape.dimNum = shape.size(); + for (size_t i = 0; i < shape.size(); ++i) { + desc.shape.dims[i] = shape[i]; + } + + atb_tensor.desc = desc; + atb_tensor.dataSize = atb::Utils::GetTensorSize(atb_tensor); + atb_tensor.deviceData = reinterpret_cast(t.ptr()); +} + +} // namespace + +void AscendMatMulOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 2); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& A = inputs[0]; + const auto& B = inputs[1]; + auto& C = outputs[0]; + + // ATB Linear/MatMul only supports 2D/3D tensors. + // For 4D tensors [B, H, S, D], we reshape to 3D [B*H, S, D], compute, then reshape back. 
+ const auto& a_shape = A.shape(); + const auto& b_shape = B.shape(); + const auto& c_shape = C.shape(); + + bool is_4d = (a_shape.size() == 4); + + // Prepare shapes for ATB + std::vector atb_a_shape, atb_b_shape, atb_c_shape; + + if (is_4d) { + // Reshape [B, H, S, D] -> [B*H, S, D] + int64_t batch_heads_a = static_cast(a_shape[0]) * static_cast(a_shape[1]); + int64_t batch_heads_b = static_cast(b_shape[0]) * static_cast(b_shape[1]); + int64_t batch_heads_c = static_cast(c_shape[0]) * static_cast(c_shape[1]); + + atb_a_shape = {batch_heads_a, static_cast(a_shape[2]), static_cast(a_shape[3])}; + atb_b_shape = {batch_heads_b, static_cast(b_shape[2]), static_cast(b_shape[3])}; + atb_c_shape = {batch_heads_c, static_cast(c_shape[2]), static_cast(c_shape[3])}; + } else { + // 2D or 3D: use original shapes + for (auto dim : a_shape) atb_a_shape.push_back(static_cast(dim)); + for (auto dim : b_shape) atb_b_shape.push_back(static_cast(dim)); + for (auto dim : c_shape) atb_c_shape.push_back(static_cast(dim)); + } + + // Create LinearParam for ATB (used for MatMul) + atb::infer::LinearParam linearParam; + linearParam.transposeA = options_.transpose_a; + linearParam.transposeB = options_.transpose_b; + linearParam.hasBias = false; + linearParam.outDataType = ACL_DT_UNDEFINED; + linearParam.enAccum = false; + linearParam.matmulType = atb::infer::LinearParam::MATMUL_UNDEFINED; + linearParam.quantMode = atb::infer::LinearParam::QUANT_UNDEFINED; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(linearParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(MatMul) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_A, atb_B, atb_C; + fillAtbTensorWithShape(A, atb_A, atb_a_shape); + fillAtbTensorWithShape(B, atb_B, atb_b_shape); + fillAtbTensorWithShape(C, atb_C, atb_c_shape); + + atb::SVector inTensors; + atb::SVector outTensors; + 
inTensors.push_back(atb_A); + inTensors.push_back(atb_B); + outTensors.push_back(atb_C); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MatMulOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + + { + ASCEND_TIME_SCOPE("AscendMatMulOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MatMulOp Execute failed, status={}", static_cast(st)); + } + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendMatMulOp.hpp b/mllm/backends/ascend/ops/AscendMatMulOp.hpp new file mode 100644 index 000000000..5c10a4525 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendMatMulOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/MatMulOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendMatMulOp final : public aops::MatMulOp { + public: + explicit AscendMatMulOp(const aops::MatMulOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendMatMulOpFactory : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::MatMulOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.cpp b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp new file mode 100644 index 000000000..639639e22 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp @@ -0,0 +1,106 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/ascend/ops/AscendRMSNormOp.hpp" + +#include +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendRMSNormOp::AscendRMSNormOp(const aops::RMSNormOpOptions& options) : aops::RMSNormOp(options) {} + +void AscendRMSNormOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendRMSNormOp::forward(const std::vector& inputs, std::vector& outputs) { + //MLLM_RT_ASSERT(inputs.size() == 1 || inputs.size() == 2, "AscendRMSNormOp expects 1 or 2 inputs"); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + const auto& weight = (inputs.size() == 2) ? 
inputs[1] : weight_; + auto& y = outputs[0]; + + const Tensor& weight_for_atb = weight; + + if (x.dtype() != y.dtype()) { + NYI("AscendRMSNormOp currently requires x/y have same dtype"); + } + if (x.numel() != y.numel()) { + NYI("AscendRMSNormOp requires x/y have same numel"); + } + + atb::infer::RmsNormParam rmsNormParam; + rmsNormParam.layerType = atb::infer::RmsNormParam::RmsNormType::RMS_NORM_NORM; + rmsNormParam.normParam.quantType = atb::infer::QuantType::QUANT_UNQUANT; + rmsNormParam.normParam.epsilon = options_.epsilon; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(rmsNormParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(RMS_NORM) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_weight; + atb::Tensor atb_y; + + fillAtbTensor(x, atb_x); + fillAtbTensor(weight_for_atb, atb_weight); + fillAtbTensor(y, atb_y); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + inTensors.push_back(atb_weight); + outTensors.push_back(atb_y); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB RMSNormOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + { + ASCEND_TIME_SCOPE("AscendRMSNormOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB RMSNormOp Execute failed, status={}", static_cast(st)); + } + + 
syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.hpp b/mllm/backends/ascend/ops/AscendRMSNormOp.hpp new file mode 100644 index 000000000..2bfd7db16 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendRMSNormOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/RMSNormOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendRMSNormOp final : public aops::RMSNormOp { + public: + explicit AscendRMSNormOp(const aops::RMSNormOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendRMSNormOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::RMSNormOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendSiLUOp.cpp b/mllm/backends/ascend/ops/AscendSiLUOp.cpp new file mode 100644 index 000000000..8c6ec3e69 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendSiLUOp.cpp @@ -0,0 +1,115 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/ops/AscendSiLUOp.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendSiLUOp::AscendSiLUOp(const aops::SiLUOpOptions& options) : aops::SiLUOp(options) {} + +void AscendSiLUOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendSiLUOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 1); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + auto& y = outputs[0]; + + // Validate that input tensors are FP16 + if (x.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendSiLUOp: Input tensor must be FP16, but got dtype={}", + static_cast(x.dtype())); + } + if (y.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendSiLUOp: Output tensor must be FP16, but got dtype={}", + static_cast(y.dtype())); + } + + if (x.dtype() != y.dtype()) { + NYI("AscendSiLUOp currently requires x/y have same dtype"); + } + if (x.numel() != y.numel()) { + NYI("AscendSiLUOp requires x/y have same numel"); + } + + atb::infer::ActivationParam siluParam; + siluParam.activationType = atb::infer::ACTIVATION_SWISH; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(siluParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ACTIVATION_SWISH) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_y; + + fillAtbTensorDesc(x, atb_x.desc); + fillAtbTensorDesc(y, atb_y.desc); + + atb_x.deviceData = reinterpret_cast(x.ptr()); + atb_x.dataSize = x.bytes(); + atb_y.deviceData = 
reinterpret_cast(y.ptr()); + atb_y.dataSize = y.bytes(); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + outTensors.push_back(atb_y); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SiLUOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + { + ASCEND_TIME_SCOPE("AscendSiLUOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SiLUOp Execute failed, status={}", static_cast(st)); + } + + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendSiLUOp.hpp b/mllm/backends/ascend/ops/AscendSiLUOp.hpp new file mode 100644 index 000000000..3e0ee27be --- /dev/null +++ b/mllm/backends/ascend/ops/AscendSiLUOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/SiLUOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendSiLUOp final : public aops::SiLUOp { + public: + explicit AscendSiLUOp(const aops::SiLUOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendSiLUOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::SiLUOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp new file mode 100644 index 000000000..25d09081a --- /dev/null +++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp @@ -0,0 +1,135 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendSoftmaxOp::AscendSoftmaxOp(const aops::SoftmaxOpOptions& options) : aops::SoftmaxOp(options) {} + +void AscendSoftmaxOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +void AscendSoftmaxOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 1); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + auto& y = outputs[0]; + + // Validate that input tensors are FP16 + if (x.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendSoftmaxOp: Input tensor must be FP16, but got dtype={}", + static_cast(x.dtype())); + } + 
if (y.dtype() != MLLM_TYPE_F16) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendSoftmaxOp: Output tensor must be FP16, but got dtype={}", + static_cast(y.dtype())); + } + + if (x.dtype() != y.dtype()) { + NYI("AscendSoftmaxOp currently requires x/y have same dtype"); + } + if (x.numel() != y.numel()) { + NYI("AscendSoftmaxOp requires x/y have same numel"); + } + + // Configure Softmax parameters + atb::infer::SoftmaxParam softmaxParam; + + // Convert axis to positive index if negative + int axis = options_.axis; + if (axis < 0) { + axis = static_cast(x.rank()) + axis; + } + + // ATB expects axes as SVector + softmaxParam.axes.push_back(static_cast(axis)); + + // Create ATB operation + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(softmaxParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "ATB CreateOperation(Softmax) failed, status={}", + static_cast(st)); + } + + // Get global ATB context + atb::Context* atb_ctx = getGlobalAtbContext(); + + // Prepare ATB tensors + atb::Tensor atb_x; + atb::Tensor atb_y; + + fillAtbTensor(x, atb_x); + fillAtbTensor(y, atb_y); + + // Setup input/output tensors + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + outTensors.push_back(atb_y); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + // Setup operation (calculate required workspace size) + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "ATB SoftmaxOp Setup failed, status={}", + static_cast(st)); + } + + // Allocate workspace if needed + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + + // Execute operation + { + 
ASCEND_TIME_SCOPE("AscendSoftmaxOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "ATB SoftmaxOp Execute failed, status={}", + static_cast(st)); + } + + // Synchronize stream + syncGlobalAtbStream(); + + // Free workspace + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + // Destroy operation + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp new file mode 100644 index 000000000..262be2fcb --- /dev/null +++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/SoftmaxOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendSoftmaxOp final : public aops::SoftmaxOp { + public: + explicit AscendSoftmaxOp(const aops::SoftmaxOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendSoftmaxOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::SoftmaxOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendViewOp.cpp b/mllm/backends/ascend/ops/AscendViewOp.cpp new file mode 100644 index 000000000..c1d1ce435 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendViewOp.cpp @@ -0,0 +1,16 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/ops/AscendViewOp.hpp" + +namespace mllm::ascend { + +AscendViewOp::AscendViewOp(const aops::ViewOpOptions& options) : aops::ViewOp(options) {} + +void AscendViewOp::forward(const std::vector& inputs, std::vector& outputs) { + // View operation only changes metadata (shape), not actual data + // Just call the base class implementation which is empty + aops::ViewOp::forward(inputs, outputs); +} + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscendViewOp.hpp b/mllm/backends/ascend/ops/AscendViewOp.hpp new file mode 100644 index 000000000..33fcf6712 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendViewOp.hpp @@ -0,0 +1,25 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/ViewOp.hpp" + +namespace mllm::ascend { + +class AscendViewOp final : public aops::ViewOp { + public: + explicit AscendViewOp(const aops::ViewOpOptions& options); + + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendViewOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::ViewOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend \ No newline at end of file From 890bfeb898c55670055f5abb7eb1414cbf66a5fe Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 30 Jan 2026 23:25:01 +0800 Subject: [PATCH 2/4] feat(ascend): create tests for new ops --- tests/ascend/AscendAttentionKernelTest.hpp | 576 +++++++++++++++++++++ tests/ascend/AscendKernelTest.hpp | 70 +++ tests/ascend/AscendLinearKernelTest.hpp | 164 ++++++ tests/ascend/AscendRMSNormKernelTest.hpp | 85 +++ tests/ascend/AscendSiLUKernelTest.hpp | 67 +++ tests/ascend/AscendSoftmaxKernelTest.hpp | 129 +++++ tests/ascend/KernelTest.cpp | 198 +++++++ 7 files changed, 1289 insertions(+) create mode 100644 tests/ascend/AscendAttentionKernelTest.hpp 
create mode 100644 tests/ascend/AscendLinearKernelTest.hpp create mode 100644 tests/ascend/AscendRMSNormKernelTest.hpp create mode 100644 tests/ascend/AscendSiLUKernelTest.hpp create mode 100644 tests/ascend/AscendSoftmaxKernelTest.hpp diff --git a/tests/ascend/AscendAttentionKernelTest.hpp b/tests/ascend/AscendAttentionKernelTest.hpp new file mode 100644 index 000000000..b6bf9eb02 --- /dev/null +++ b/tests/ascend/AscendAttentionKernelTest.hpp @@ -0,0 +1,576 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/nn/Functional.hpp" +#include "KernelTestHelper.hpp" +#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp" +#include +#include +#include + +class AscendAttentionKernelTest : public KernelTest { + public: + AscendAttentionKernelTest() = default; + ~AscendAttentionKernelTest() override = default; + + // Test Scaled Dot-Product Attention using existing operators + // Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V + bool ScaledDotProductAttentionFloat16Test(const std::vector>& test_cases) { + using namespace mllm; // NOLINT + + for (const auto& [q_shape, k_shape, v_shape] : test_cases) { + // Validate shapes: Q=[B, S_q, D], K=[B, S_kv, D], V=[B, S_kv, D] + MLLM_RT_ASSERT_EQ(q_shape.size(), 3); + MLLM_RT_ASSERT_EQ(k_shape.size(), 3); + MLLM_RT_ASSERT_EQ(v_shape.size(), 3); + MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]); // Same batch size + MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]); + MLLM_RT_ASSERT_EQ(q_shape[2], k_shape[2]); // Same D dimension + MLLM_RT_ASSERT_EQ(k_shape[1], v_shape[1]); // K and V have same sequence length + + int32_t B = static_cast(q_shape[0]); + int32_t S_q = static_cast(q_shape[1]); + int32_t S_kv = static_cast(k_shape[1]); + int32_t D = static_cast(q_shape[2]); + + // 1. 
Create random FP16 inputs on CPU + Tensor Q_cpu = Tensor::random(q_shape, -1.0f, 1.0f, kFloat16, kCPU); + Tensor K_cpu = Tensor::random(k_shape, -1.0f, 1.0f, kFloat16, kCPU); + Tensor V_cpu = Tensor::random(v_shape, -1.0f, 1.0f, kFloat16, kCPU); + + // 2. Compute reference result on CPU using FP32 for better precision + Tensor Q_cpu_fp32 = Tensor::zeros(q_shape, kFloat32, kCPU); + Tensor K_cpu_fp32 = Tensor::zeros(k_shape, kFloat32, kCPU); + Tensor V_cpu_fp32 = Tensor::zeros(v_shape, kFloat32, kCPU); + + // Convert FP16 to FP32 + { + auto* q_fp16 = Q_cpu.ptr(); + auto* k_fp16 = K_cpu.ptr(); + auto* v_fp16 = V_cpu.ptr(); + auto* q_fp32 = Q_cpu_fp32.ptr(); + auto* k_fp32 = K_cpu_fp32.ptr(); + auto* v_fp32 = V_cpu_fp32.ptr(); + + for (size_t i = 0; i < Q_cpu.numel(); ++i) { + q_fp32[i] = MLLM_FP16_TO_FP32(q_fp16[i]); + } + for (size_t i = 0; i < K_cpu.numel(); ++i) { + k_fp32[i] = MLLM_FP16_TO_FP32(k_fp16[i]); + } + for (size_t i = 0; i < V_cpu.numel(); ++i) { + v_fp32[i] = MLLM_FP16_TO_FP32(v_fp16[i]); + } + } + + // Compute reference attention on CPU (FP32) + Tensor ref_cpu_fp32 = Tensor::zeros({B, S_q, D}, kFloat32, kCPU); + { + auto* q_ptr = Q_cpu_fp32.ptr(); + auto* k_ptr = K_cpu_fp32.ptr(); + auto* v_ptr = V_cpu_fp32.ptr(); + auto* out_ptr = ref_cpu_fp32.ptr(); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + for (int32_t b = 0; b < B; ++b) { + // Compute Q @ K^T for this batch + std::vector scores(S_q * S_kv, 0.0f); + + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t j = 0; j < S_kv; ++j) { + float sum = 0.0f; + for (int32_t k = 0; k < D; ++k) { + float q_val = q_ptr[b * S_q * D + i * D + k]; + float k_val = k_ptr[b * S_kv * D + j * D + k]; + sum += q_val * k_val; + } + scores[i * S_kv + j] = sum * scale; + } + } + + // Apply softmax along the last dimension (S_kv) + std::vector attn_weights(S_q * S_kv); + for (int32_t i = 0; i < S_q; ++i) { + // Find max for numerical stability + float max_val = -std::numeric_limits::infinity(); + for (int32_t j = 0; 
j < S_kv; ++j) { + max_val = std::max(max_val, scores[i * S_kv + j]); + } + + // Compute exp and sum + float sum_exp = 0.0f; + for (int32_t j = 0; j < S_kv; ++j) { + float exp_val = std::exp(scores[i * S_kv + j] - max_val); + attn_weights[i * S_kv + j] = exp_val; + sum_exp += exp_val; + } + + // Normalize + for (int32_t j = 0; j < S_kv; ++j) { + attn_weights[i * S_kv + j] /= sum_exp; + } + } + + // Compute output: attn_weights @ V + // out[S_q, D] = attn_weights[S_q, S_kv] @ V[S_kv, D] + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t k = 0; k < D; ++k) { + float sum = 0.0f; + for (int32_t j = 0; j < S_kv; ++j) { + float attn_val = attn_weights[i * S_kv + j]; + float v_val = v_ptr[b * S_kv * D + j * D + k]; + sum += attn_val * v_val; + } + out_ptr[b * S_q * D + i * D + k] = sum; + } + } + } + } + + // Convert reference back to FP16 + Tensor ref_cpu = Tensor::zeros({B, S_q, D}, kFloat16, kCPU); + { + auto* ref_fp32 = ref_cpu_fp32.ptr(); + auto* ref_fp16 = ref_cpu.ptr(); + for (size_t i = 0; i < ref_cpu.numel(); ++i) { + ref_fp16[i] = MLLM_FP32_TO_FP16(ref_fp32[i]); + } + } + + // 3. Move inputs to Ascend and compute attention using existing operators + auto Q_ascend = Q_cpu.to(kAscend); + auto K_ascend = K_cpu.to(kAscend); + auto V_ascend = V_cpu.to(kAscend); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + // Step 1: Q @ K^T (transpose_b=true) + auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true); + + // Step 2: Scale by 1/sqrt(d_k) + auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU) * scale; + auto scale_tensor = scale_tensor_cpu.to(kAscend); + auto scaled_scores = scores * scale_tensor; + + // Step 3: Softmax along last dimension + auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1); + + // Step 4: attn_weights @ V + auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false); + + // 4. 
Move result back to CPU and compare + auto output_cpu = output_ascend.to(kCPU); + + auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f); + if (!result.is_close) { + MLLM_ERROR("Attention test failed for shape Q=[{},{},{}], K=[{},{},{}], V=[{},{},{}]", + q_shape[0], q_shape[1], q_shape[2], + k_shape[0], k_shape[1], k_shape[2], + v_shape[0], v_shape[1], v_shape[2]); + MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}", + result.max_absolute_diff, result.max_relative_diff); + return false; + } + } + return true; + } + + //===----------------------------------------------------------------------===// + // Multi-Head Attention with optional Causal Mask + // + // Input shapes: Q=[B, H, S_q, D], K=[B, H, S_kv, D], V=[B, H, S_kv, D] + // where H = num_heads, D = head_dim + // Mask shape: [1, 1, S_q, S_kv] (broadcastable to [B, H, S_q, S_kv]) + // + // Attention(Q, K, V, mask) = softmax(Q @ K^T / sqrt(d_k) + mask) @ V + //===----------------------------------------------------------------------===// + bool MultiHeadAttentionFloat16Test( + const std::vector>& test_cases) { + using namespace mllm; // NOLINT + + for (const auto& [q_shape, k_shape, v_shape, use_mask] : test_cases) { + // Validate shapes: Q=[B, H, S_q, D], K=[B, H, S_kv, D], V=[B, H, S_kv, D] + MLLM_RT_ASSERT_EQ(q_shape.size(), 4); + MLLM_RT_ASSERT_EQ(k_shape.size(), 4); + MLLM_RT_ASSERT_EQ(v_shape.size(), 4); + MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]); // Same batch size + MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]); + MLLM_RT_ASSERT_EQ(q_shape[1], k_shape[1]); // Same num_heads + MLLM_RT_ASSERT_EQ(q_shape[1], v_shape[1]); + MLLM_RT_ASSERT_EQ(q_shape[3], k_shape[3]); // Same head_dim + MLLM_RT_ASSERT_EQ(k_shape[2], v_shape[2]); // K and V have same sequence length + + int32_t B = static_cast(q_shape[0]); + int32_t H = static_cast(q_shape[1]); // num_heads + int32_t S_q = static_cast(q_shape[2]); + int32_t S_kv = static_cast(k_shape[2]); + int32_t D = static_cast(q_shape[3]); // head_dim 
+ + // 1. Create random FP16 inputs on CPU + Tensor Q_cpu = Tensor::random(q_shape, -0.5f, 0.5f, kFloat16, kCPU); + Tensor K_cpu = Tensor::random(k_shape, -0.5f, 0.5f, kFloat16, kCPU); + Tensor V_cpu = Tensor::random(v_shape, -0.5f, 0.5f, kFloat16, kCPU); + + // 2. Create causal mask if needed + // Causal mask: mask[i, j] = 0 if j <= i, else -inf (large negative value) + Tensor mask_cpu; + if (use_mask) { + mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU); + auto* mask_ptr = mask_cpu.ptr(); + + // Fill causal mask: upper triangular part is masked (-inf) + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t j = 0; j < S_kv; ++j) { + int32_t offset = S_kv - S_q; + if (j > i + offset) { + mask_ptr[i * S_kv + j] = MLLM_FP32_TO_FP16(-10000.0f); + } + } + } + } + + // 3. Compute reference result on CPU using FP32 for better precision + Tensor ref_cpu = computeMultiHeadAttentionCPU(Q_cpu, K_cpu, V_cpu, mask_cpu, use_mask); + + // 4. Move inputs to Ascend and compute attention + auto Q_ascend = Q_cpu.to(kAscend); + auto K_ascend = K_cpu.to(kAscend); + auto V_ascend = V_cpu.to(kAscend); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + // Step 1: Q @ K^T (transpose_b=true) + auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true); + + // Step 2: Scale by 1/sqrt(d_k) + auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU); + { + auto* scale_ptr = scale_tensor_cpu.ptr(); + scale_ptr[0] = MLLM_FP32_TO_FP16(scale); + } + auto scale_tensor = scale_tensor_cpu.to(kAscend); + auto scaled_scores = scores * scale_tensor; + + // Step 3: Add mask if needed (broadcasting: [1, 1, S_q, S_kv] -> [B, H, S_q, S_kv]) + if (use_mask) { + auto mask_ascend = mask_cpu.to(kAscend); + scaled_scores = scaled_scores + mask_ascend; + } + + // Step 4: Softmax along last dimension + auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1); + + // Step 5: attn_weights @ V + // [B, H, S_q, S_kv] @ [B, H, S_kv, D] -> [B, H, S_q, D] + auto output_ascend = 
mllm::nn::functional::matmul(attn_weights, V_ascend, false, false); + + // 5. Move result back to CPU and compare + auto output_cpu = output_ascend.to(kCPU); + + auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f); + if (!result.is_close) { + MLLM_ERROR("Multi-head attention test failed for shape Q=[{},{},{},{}], K=[{},{},{},{}], V=[{},{},{},{}], mask={}", + q_shape[0], q_shape[1], q_shape[2], q_shape[3], + k_shape[0], k_shape[1], k_shape[2], k_shape[3], + v_shape[0], v_shape[1], v_shape[2], v_shape[3], + use_mask ? "true" : "false"); + MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}", + result.max_absolute_diff, result.max_relative_diff); + return false; + } + + MLLM_INFO("Multi-head attention test passed: B={}, H={}, S_q={}, S_kv={}, D={}, mask={}", + B, H, S_q, S_kv, D, use_mask ? "true" : "false"); + } + return true; + } + + //===----------------------------------------------------------------------===// + // Multi-Head Attention with Grouped Query Attention (GQA) support + // + // GQA: num_q_heads > num_kv_heads, each KV head is shared by multiple Q heads + // Input shapes: Q=[B, H_q, S_q, D], K=[B, H_kv, S_kv, D], V=[B, H_kv, S_kv, D] + //===----------------------------------------------------------------------===// + bool GroupedQueryAttentionFloat16Test( + const std::vector>& test_cases) { + using namespace mllm; // NOLINT + + for (const auto& [q_shape, k_shape, v_shape, use_mask] : test_cases) { + // Validate shapes + MLLM_RT_ASSERT_EQ(q_shape.size(), 4); + MLLM_RT_ASSERT_EQ(k_shape.size(), 4); + MLLM_RT_ASSERT_EQ(v_shape.size(), 4); + MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]); // Same batch size + MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]); + MLLM_RT_ASSERT_EQ(k_shape[1], v_shape[1]); // KV have same num_heads + MLLM_RT_ASSERT_EQ(q_shape[3], k_shape[3]); // Same head_dim + MLLM_RT_ASSERT_EQ(k_shape[2], v_shape[2]); // K and V have same sequence length + + int32_t B = static_cast(q_shape[0]); + int32_t H_q = 
static_cast(q_shape[1]); // num query heads + int32_t H_kv = static_cast(k_shape[1]); // num KV heads + int32_t S_q = static_cast(q_shape[2]); + int32_t S_kv = static_cast(k_shape[2]); + int32_t D = static_cast(q_shape[3]); + + MLLM_RT_ASSERT_EQ(H_q % H_kv, 0); + int32_t num_groups = H_q / H_kv; + + // 1. Create random FP16 inputs on CPU + Tensor Q_cpu = Tensor::random(q_shape, -0.5f, 0.5f, kFloat16, kCPU); + Tensor K_cpu = Tensor::random(k_shape, -0.5f, 0.5f, kFloat16, kCPU); + Tensor V_cpu = Tensor::random(v_shape, -0.5f, 0.5f, kFloat16, kCPU); + + // 2. Create causal mask if needed + Tensor mask_cpu; + if (use_mask) { + mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU); + auto* mask_ptr = mask_cpu.ptr(); + int32_t offset = S_kv - S_q; + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t j = 0; j < S_kv; ++j) { + if (j > i + offset) { + mask_ptr[i * S_kv + j] = MLLM_FP32_TO_FP16(-10000.0f); + } + } + } + } + + // 3. Compute reference on CPU + Tensor ref_cpu = computeGQACPU(Q_cpu, K_cpu, V_cpu, mask_cpu, use_mask, num_groups); + + // 4. 
Compute on Ascend + auto Q_ascend = Q_cpu.to(kAscend); + auto K_cpu_expanded = repeatKVHeads(K_cpu, num_groups); + auto V_cpu_expanded = repeatKVHeads(V_cpu, num_groups); + auto K_ascend = K_cpu_expanded.to(kAscend); + auto V_ascend = V_cpu_expanded.to(kAscend); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + // Q @ K^T + auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true); + + // Scale + auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU); + { + auto* scale_ptr = scale_tensor_cpu.ptr(); + scale_ptr[0] = MLLM_FP32_TO_FP16(scale); + } + auto scaled_scores = scores * scale_tensor_cpu.to(kAscend); + + // Add mask + if (use_mask) { + scaled_scores = scaled_scores + mask_cpu.to(kAscend); + } + + // Softmax + auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1); + + // attn_weights @ V + auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false); + + // 5. Compare + auto output_cpu = output_ascend.to(kCPU); + auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f); + if (!result.is_close) { + MLLM_ERROR("GQA test failed: B={}, H_q={}, H_kv={}, S_q={}, S_kv={}, D={}, mask={}", + B, H_q, H_kv, S_q, S_kv, D, use_mask ? "true" : "false"); + MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}", + result.max_absolute_diff, result.max_relative_diff); + return false; + } + + MLLM_INFO("GQA test passed: B={}, H_q={}, H_kv={}, S_q={}, S_kv={}, D={}, mask={}", + B, H_q, H_kv, S_q, S_kv, D, use_mask ? 
"true" : "false"); + } + return true; + } + + private: + //===----------------------------------------------------------------------===// + // Helper: Compute Multi-Head Attention reference on CPU (FP32) + //===----------------------------------------------------------------------===// + mllm::Tensor computeMultiHeadAttentionCPU( + const mllm::Tensor& Q_cpu, + const mllm::Tensor& K_cpu, + const mllm::Tensor& V_cpu, + const mllm::Tensor& mask_cpu, + bool use_mask) { + using namespace mllm; // NOLINT + + int32_t B = static_cast(Q_cpu.shape()[0]); + int32_t H = static_cast(Q_cpu.shape()[1]); + int32_t S_q = static_cast(Q_cpu.shape()[2]); + int32_t S_kv = static_cast(K_cpu.shape()[2]); + int32_t D = static_cast(Q_cpu.shape()[3]); + + // Convert inputs to FP32 + Tensor Q_fp32 = Tensor::zeros({B, H, S_q, D}, kFloat32, kCPU); + Tensor K_fp32 = Tensor::zeros({B, H, S_kv, D}, kFloat32, kCPU); + Tensor V_fp32 = Tensor::zeros({B, H, S_kv, D}, kFloat32, kCPU); + + auto* q_fp16 = Q_cpu.ptr(); + auto* k_fp16 = K_cpu.ptr(); + auto* v_fp16 = V_cpu.ptr(); + auto* q_fp32 = Q_fp32.ptr(); + auto* k_fp32 = K_fp32.ptr(); + auto* v_fp32 = V_fp32.ptr(); + + for (size_t i = 0; i < Q_cpu.numel(); ++i) { + q_fp32[i] = MLLM_FP16_TO_FP32(q_fp16[i]); + } + for (size_t i = 0; i < K_cpu.numel(); ++i) { + k_fp32[i] = MLLM_FP16_TO_FP32(k_fp16[i]); + } + for (size_t i = 0; i < V_cpu.numel(); ++i) { + v_fp32[i] = MLLM_FP16_TO_FP32(v_fp16[i]); + } + + // Convert mask to FP32 if needed + const mllm_fp16_t* mask_fp16 = nullptr; + if (use_mask) { + mask_fp16 = mask_cpu.ptr(); + } + + Tensor output_fp32 = Tensor::zeros({B, H, S_q, D}, kFloat32, kCPU); + auto* out_ptr = output_fp32.ptr(); + + float scale = 1.0f / std::sqrt(static_cast(D)); + + for (int32_t b = 0; b < B; ++b) { + for (int32_t h = 0; h < H; ++h) { + std::vector scores(S_q * S_kv, 0.0f); + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t j = 0; j < S_kv; ++j) { + float sum = 0.0f; + for (int32_t k = 0; k < D; ++k) { + float q_val = 
q_fp32[((b * H + h) * S_q + i) * D + k]; + float k_val = k_fp32[((b * H + h) * S_kv + j) * D + k]; + sum += q_val * k_val; + } + scores[i * S_kv + j] = sum * scale; + + // Add mask (mask is broadcastable: [1, 1, S_q, S_kv]) + if (use_mask) { + float mask_val = MLLM_FP16_TO_FP32(mask_fp16[i * S_kv + j]); + scores[i * S_kv + j] += mask_val; + } + } + } + + // Softmax along last dimension + std::vector attn_weights(S_q * S_kv); + for (int32_t i = 0; i < S_q; ++i) { + float max_val = -std::numeric_limits::infinity(); + for (int32_t j = 0; j < S_kv; ++j) { + max_val = std::max(max_val, scores[i * S_kv + j]); + } + + float sum_exp = 0.0f; + for (int32_t j = 0; j < S_kv; ++j) { + float exp_val = std::exp(scores[i * S_kv + j] - max_val); + attn_weights[i * S_kv + j] = exp_val; + sum_exp += exp_val; + } + + for (int32_t j = 0; j < S_kv; ++j) { + attn_weights[i * S_kv + j] /= sum_exp; + } + } + + // Compute output: attn_weights @ V + for (int32_t i = 0; i < S_q; ++i) { + for (int32_t k = 0; k < D; ++k) { + float sum = 0.0f; + for (int32_t j = 0; j < S_kv; ++j) { + float attn_val = attn_weights[i * S_kv + j]; + float v_val = v_fp32[((b * H + h) * S_kv + j) * D + k]; + sum += attn_val * v_val; + } + out_ptr[((b * H + h) * S_q + i) * D + k] = sum; + } + } + } + } + + // Convert output back to FP16 + Tensor output_fp16 = Tensor::zeros({B, H, S_q, D}, kFloat16, kCPU); + auto* out_fp16 = output_fp16.ptr(); + for (size_t i = 0; i < output_fp16.numel(); ++i) { + out_fp16[i] = MLLM_FP32_TO_FP16(out_ptr[i]); + } + + return output_fp16; + } + + //===----------------------------------------------------------------------===// + // Helper: Repeat KV heads for GQA + // [B, H_kv, S, D] -> [B, H_q, S, D] where H_q = H_kv * num_groups + //===----------------------------------------------------------------------===// + mllm::Tensor repeatKVHeads(const mllm::Tensor& kv, int32_t num_groups) { + using namespace mllm; // NOLINT + + if (num_groups == 1) { + return kv; + } + + int32_t B = 
static_cast(kv.shape()[0]); + int32_t H_kv = static_cast(kv.shape()[1]); + int32_t S = static_cast(kv.shape()[2]); + int32_t D = static_cast(kv.shape()[3]); + int32_t H_q = H_kv * num_groups; + + Tensor expanded = Tensor::zeros({B, H_q, S, D}, kv.dtype(), kCPU); + auto* src = kv.ptr(); + auto* dst = expanded.ptr(); + + for (int32_t b = 0; b < B; ++b) { + for (int32_t h_kv = 0; h_kv < H_kv; ++h_kv) { + for (int32_t g = 0; g < num_groups; ++g) { + int32_t h_q = h_kv * num_groups + g; + for (int32_t s = 0; s < S; ++s) { + for (int32_t d = 0; d < D; ++d) { + size_t src_idx = ((b * H_kv + h_kv) * S + s) * D + d; + size_t dst_idx = ((b * H_q + h_q) * S + s) * D + d; + dst[dst_idx] = src[src_idx]; + } + } + } + } + } + + return expanded; + } + + //===----------------------------------------------------------------------===// + // Helper: Compute GQA reference on CPU + //===----------------------------------------------------------------------===// + mllm::Tensor computeGQACPU( + const mllm::Tensor& Q_cpu, + const mllm::Tensor& K_cpu, + const mllm::Tensor& V_cpu, + const mllm::Tensor& mask_cpu, + bool use_mask, + int32_t num_groups) { + // Expand KV heads and compute standard MHA + auto K_expanded = repeatKVHeads(K_cpu, num_groups); + auto V_expanded = repeatKVHeads(V_cpu, num_groups); + return computeMultiHeadAttentionCPU(Q_cpu, K_expanded, V_expanded, mask_cpu, use_mask); + } +}; \ No newline at end of file diff --git a/tests/ascend/AscendKernelTest.hpp b/tests/ascend/AscendKernelTest.hpp index 138ee5ae8..a01028906 100644 --- a/tests/ascend/AscendKernelTest.hpp +++ b/tests/ascend/AscendKernelTest.hpp @@ -48,5 +48,75 @@ class AscendKernelTest : public KernelTest { } return true; } + + // Test Sub operation with different shapes + bool SubFloat16Test(const std::vector& shapes) { + using namespace mllm; // NOLINT + for (auto& shape : shapes) { + // 1. 
Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + + // 2. Compute reference result (FP16) on CPU + Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* y_ptr = y_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + auto num_elements = x_cpu.numel(); + for (size_t i = 0; i < num_elements; ++i) { + r_ptr[i] = x_ptr[i] - y_ptr[i]; + } + } + + // 3. Move inputs to Ascend and run Sub (z = x - y) + auto x_ascend = x_cpu.to(kAscend); + auto y_ascend = y_cpu.to(kAscend); + auto z_ascend = x_ascend - y_ascend; + + // 4. Move result back to CPU and compare with reference using allClose + auto z_cpu = z_ascend.to(kCPU); + auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + return false; + } + } + return true; + } + + // Test Mul operation with different shapes + bool MulFloat16Test(const std::vector& shapes) { + using namespace mllm; // NOLINT + for (auto& shape : shapes) { + // 1. Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + + // 2. Compute reference result (FP16) on CPU + Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* y_ptr = y_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + auto num_elements = x_cpu.numel(); + for (size_t i = 0; i < num_elements; ++i) { + r_ptr[i] = x_ptr[i] * y_ptr[i]; + } + } + + // 3. Move inputs to Ascend and run Mul (z = x * y) + auto x_ascend = x_cpu.to(kAscend); + auto y_ascend = y_cpu.to(kAscend); + auto z_ascend = x_ascend * y_ascend; + + // 4. 
Move result back to CPU and compare with reference using allClose + auto z_cpu = z_ascend.to(kCPU); + auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + return false; + } + } + return true; + } }; diff --git a/tests/ascend/AscendLinearKernelTest.hpp b/tests/ascend/AscendLinearKernelTest.hpp new file mode 100644 index 000000000..4a7a6fed3 --- /dev/null +++ b/tests/ascend/AscendLinearKernelTest.hpp @@ -0,0 +1,164 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/core/OpTypes.hpp" +#include "mllm/core/aops/LinearOp.hpp" +#include "mllm/engine/Context.hpp" +#include "mllm/nn/Functional.hpp" +#include "KernelTestHelper.hpp" +#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp" +#include +#include +#include + + +class AscendLinearKernelTest : public KernelTest { + public: + AscendLinearKernelTest() = default; + ~AscendLinearKernelTest() override = default; + + bool LinearFloat16Test(const std::vector>& test_cases) { + using namespace mllm; // NOLINT + for (auto& test_case : test_cases) { + auto input_shape = std::get<0>(test_case); + int in_channels = std::get<1>(test_case); + int out_channels = std::get<2>(test_case); + + std::cout << "[LinearTest] Testing shape=["; + for (size_t i = 0; i < input_shape.size(); ++i) { + std::cout << input_shape[i] << (i < input_shape.size() - 1 ? ", " : ""); + } + std::cout << "], in=" << in_channels << ", out=" << out_channels << std::endl; + + // 1. Construct random FP16 inputs on CPU + // x: [M, K] where K = in_channels + Tensor x_cpu = Tensor::random(input_shape, -1, 1, kFloat16, kCPU); + + // Weight shape for ATB: [K, N] where K=in_channels, N=out_channels + Tensor weight_cpu = Tensor::random({in_channels, out_channels}, -0.5, 0.5, kFloat16, kCPU); + + // 2. 
Compute reference result on CPU + // y = x @ weight, where x is [M, K], weight is [K, N], output is [M, N] + auto output_shape = input_shape; + output_shape[output_shape.size() - 1] = out_channels; + Tensor ref_cpu = Tensor::zeros(output_shape, kFloat16, kCPU); + + { + auto* x_ptr = x_cpu.ptr(); + auto* w_ptr = weight_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + + size_t batch_size = 1; + for (size_t i = 0; i < input_shape.size() - 1; ++i) { + batch_size *= input_shape[i]; + } + + for (size_t b = 0; b < batch_size; ++b) { + for (int o = 0; o < out_channels; ++o) { + float sum = 0.0f; + for (int i = 0; i < in_channels; ++i) { + float x_val = MLLM_FP16_TO_FP32(x_ptr[b * in_channels + i]); + float w_val = MLLM_FP16_TO_FP32(w_ptr[i * out_channels + o]); // weight is [K, N] + sum += x_val * w_val; + } + r_ptr[b * out_channels + o] = MLLM_FP32_TO_FP16(sum); + } + } + } + + // 3. Move inputs to Ascend and run Linear via matmul + auto x_ascend = x_cpu.to(kAscend); + auto weight_ascend = weight_cpu.to(kAscend); + + // Use matmul: y = x @ weight + auto y_ascend = nn::functional::matmul(x_ascend, weight_ascend, false, false); + + // 4. Move result back to CPU and compare with reference + auto y_cpu = y_ascend.to(kCPU); + auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + std::cout << "[LinearTest] FAILED!" << std::endl; + return false; + } + std::cout << "[LinearTest] PASSED" << std::endl; + } + return true; + } + + + bool LinearWithBiasFloat16Test(const std::vector>& test_cases) { + using namespace mllm; // NOLINT + for (auto& test_case : test_cases) { + auto input_shape = std::get<0>(test_case); + int in_channels = std::get<1>(test_case); + int out_channels = std::get<2>(test_case); + + std::cout << "[LinearWithBiasTest] Testing shape=["; + for (size_t i = 0; i < input_shape.size(); ++i) { + std::cout << input_shape[i] << (i < input_shape.size() - 1 ? 
", " : ""); + } + std::cout << "], in=" << in_channels << ", out=" << out_channels << std::endl; + + // 1. Create random input, weight and bias on CPU + Tensor x_cpu = Tensor::random(input_shape, -1, 1, kFloat16, kCPU); + // Weight shape: [out_channels, in_channels] + Tensor weight_cpu = Tensor::random({out_channels, in_channels}, -0.5, 0.5, kFloat16, kCPU); + // Bias shape: [1, out_channels] for ATB Linear (2D tensor required) + Tensor bias_cpu = Tensor::random({1, out_channels}, -0.1, 0.1, kFloat16, kCPU); + + // 2. Compute reference result on CPU + auto output_shape = input_shape; + output_shape[output_shape.size() - 1] = out_channels; + Tensor ref_cpu = Tensor::zeros(output_shape, kFloat16, kCPU); + + { + auto* x_ptr = x_cpu.ptr(); + auto* w_ptr = weight_cpu.ptr(); + auto* b_ptr = bias_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + + size_t batch_size = 1; + for (size_t i = 0; i < input_shape.size() - 1; ++i) { + batch_size *= input_shape[i]; + } + + // y = x @ W^T + b, where W is [out_channels, in_channels] + for (size_t b = 0; b < batch_size; ++b) { + for (int o = 0; o < out_channels; ++o) { + float sum = 0.0f; + for (int i = 0; i < in_channels; ++i) { + float x_val = MLLM_FP16_TO_FP32(x_ptr[b * in_channels + i]); + float w_val = MLLM_FP16_TO_FP32(w_ptr[o * in_channels + i]); + sum += x_val * w_val; + } + float bias_val = MLLM_FP16_TO_FP32(b_ptr[o]); + sum += bias_val; + r_ptr[b * out_channels + o] = MLLM_FP32_TO_FP16(sum); + } + } + } + + // 3. Move tensors to Ascend and run linear + auto x_ascend = x_cpu.to(kAscend); + auto weight_ascend = weight_cpu.to(kAscend); + auto bias_ascend = bias_cpu.to(kAscend); + + // Use nn::functional::linear directly + auto y_ascend = nn::functional::linear(x_ascend, weight_ascend, bias_ascend); + + // 4. Compare result with reference + auto y_cpu = y_ascend.to(kCPU); + auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + std::cout << "[LinearWithBiasTest] FAILED!" 
<< std::endl; + return false; + } + std::cout << "[LinearWithBiasTest] PASSED" << std::endl; + } + return true; + } +}; \ No newline at end of file diff --git a/tests/ascend/AscendRMSNormKernelTest.hpp b/tests/ascend/AscendRMSNormKernelTest.hpp new file mode 100644 index 000000000..1e2dea58a --- /dev/null +++ b/tests/ascend/AscendRMSNormKernelTest.hpp @@ -0,0 +1,85 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/nn/layers/RMSNorm.hpp" +#include "KernelTestHelper.hpp" +#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp" +#include +#include + +class AscendRMSNormKernelTest : public KernelTest { + public: + AscendRMSNormKernelTest() = default; + ~AscendRMSNormKernelTest() override = default; + + // Test RMSNorm operation with different shapes + bool RMSNormFloat16Test(const std::vector>& test_cases) { + using namespace mllm; // NOLINT + for (auto& test_case : test_cases) { + auto input_shape = std::get<0>(test_case); + int norm_size = std::get<1>(test_case); + float epsilon = std::get<2>(test_case); + + // Validate that norm_size matches the last dimension of input_shape + assert(norm_size == static_cast(input_shape.back()) && + "norm_size must equal the last dimension of input_shape"); + + // 1. Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(input_shape, -2, 2, kFloat16, kCPU); + + // Weight shape: [norm_size] + Tensor weight_cpu = Tensor::random({norm_size}, 0.5, 1.5, kFloat16, kCPU); + + // 2. 
Compute reference result (FP16) on CPU + // RMSNorm: y = x * weight / sqrt(mean(x^2) + epsilon) + Tensor ref_cpu = Tensor::zeros(input_shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* w_ptr = weight_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + + size_t batch_size = 1; + for (size_t i = 0; i < input_shape.size() - 1; ++i) { + batch_size *= input_shape[i]; + } + + // Perform RMSNorm for each batch + for (size_t b = 0; b < batch_size; ++b) { + float sum_squares = 0.0f; + for (int i = 0; i < norm_size; ++i) { + float x_val = MLLM_FP16_TO_FP32(x_ptr[b * norm_size + i]); + sum_squares += x_val * x_val; + } + float rms = std::sqrt(sum_squares / norm_size + epsilon); + + // Normalize and scale by weight + for (int i = 0; i < norm_size; ++i) { + float x_val = MLLM_FP16_TO_FP32(x_ptr[b * norm_size + i]); + float w_val = MLLM_FP16_TO_FP32(w_ptr[i]); + float result = (x_val / rms) * w_val; + r_ptr[b * norm_size + i] = MLLM_FP32_TO_FP16(result); + } + } + } + + // 3. Move inputs to Ascend and run RMSNorm + auto x_ascend = x_cpu.to(kAscend); + auto weight_ascend = weight_cpu.to(kAscend); + + // Use functional API - one line to execute the operator + auto y_ascend = nn::functional::rmsNorm(x_ascend, weight_ascend, epsilon); + + // 4. Move result back to CPU and compare with reference using allClose + auto y_cpu = y_ascend.to(kCPU); + auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + return false; + } + } + return true; + } +}; \ No newline at end of file diff --git a/tests/ascend/AscendSiLUKernelTest.hpp b/tests/ascend/AscendSiLUKernelTest.hpp new file mode 100644 index 000000000..ce3ceb130 --- /dev/null +++ b/tests/ascend/AscendSiLUKernelTest.hpp @@ -0,0 +1,67 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/nn/Functional.hpp" +#include "KernelTestHelper.hpp" +#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp" +#include +#include + +class AscendSiLUKernelTest : public KernelTest { + public: + AscendSiLUKernelTest() = default; + ~AscendSiLUKernelTest() override = default; + + // Test SiLU operation with different shapes + bool SiLUFloat16Test(const std::vector& shapes) { + using namespace mllm; // NOLINT + for (auto& shape : shapes) { + // 1. Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(shape, -5, 5, kFloat16, kCPU); + + // 2. Compute reference result (FP16) on CPU + // SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x)) + Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + auto num_elements = x_cpu.numel(); + for (size_t i = 0; i < num_elements; ++i) { + // Convert FP16 to FP32 for computation + float x_val = MLLM_FP16_TO_FP32(x_ptr[i]); + + // Compute sigmoid(x) = 1 / (1 + exp(-x)) + float sigmoid_x; + if (x_val >= 0) { + sigmoid_x = 1.0f / (1.0f + std::exp(-x_val)); + } else { + float exp_x = std::exp(x_val); + sigmoid_x = exp_x / (1.0f + exp_x); + } + + // SiLU(x) = x * sigmoid(x) + float result = x_val * sigmoid_x; + + // Convert back to FP16 + r_ptr[i] = MLLM_FP32_TO_FP16(result); + } + } + + // 3. Move inputs to Ascend and run SiLU + auto x_ascend = x_cpu.to(kAscend); + auto y_ascend = mllm::nn::functional::silu(x_ascend); + + // 4. 
Move result back to CPU and compare with reference using allClose + auto y_cpu = y_ascend.to(kCPU); + auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + return false; + } + } + return true; + } +}; \ No newline at end of file diff --git a/tests/ascend/AscendSoftmaxKernelTest.hpp b/tests/ascend/AscendSoftmaxKernelTest.hpp new file mode 100644 index 000000000..9003c714d --- /dev/null +++ b/tests/ascend/AscendSoftmaxKernelTest.hpp @@ -0,0 +1,129 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/nn/Functional.hpp" +#include "KernelTestHelper.hpp" +#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp" +#include +#include + +class AscendSoftmaxKernelTest : public KernelTest { + public: + AscendSoftmaxKernelTest() = default; + ~AscendSoftmaxKernelTest() override = default; + + // Test Softmax operation with different shapes and axes + bool SoftmaxFloat16Test(const std::vector& shapes, const std::vector& axes) { + using namespace mllm; // NOLINT + for (auto& shape : shapes) { + for (auto axis : axes) { + // 1. Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(shape, -5, 5, kFloat16, kCPU); + + // 2. 
Compute reference result (FP16) on CPU + // Softmax(x_i) = exp(x_i - max(x)) / sum(exp(x_j - max(x))) + Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + + // Convert axis to positive index + int ndim = static_cast(shape.size()); + int pos_axis = axis; + if (pos_axis < 0) { + pos_axis = ndim + pos_axis; + } + + // Calculate strides + std::vector strides(ndim); + strides[ndim - 1] = 1; + for (int i = ndim - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * shape[i + 1]; + } + + size_t outer_size = 1; + for (int i = 0; i < pos_axis; ++i) { + outer_size *= shape[i]; + } + + size_t axis_size = shape[pos_axis]; + + size_t inner_size = 1; + for (int i = pos_axis + 1; i < ndim; ++i) { + inner_size *= shape[i]; + } + + // Compute softmax for each slice along the axis + for (size_t outer = 0; outer < outer_size; ++outer) { + for (size_t inner = 0; inner < inner_size; ++inner) { + // Find max value for numerical stability + float max_val = -std::numeric_limits::infinity(); + for (size_t i = 0; i < axis_size; ++i) { + size_t idx = outer * strides[pos_axis > 0 ? pos_axis - 1 : 0] * shape[pos_axis] + + i * (pos_axis < ndim - 1 ? strides[pos_axis] : 1) + inner; + if (pos_axis == 0) { + idx = i * strides[0] + inner; + } else if (pos_axis == ndim - 1) { + idx = outer * axis_size + i; + } else { + idx = outer * strides[pos_axis - 1] * shape[pos_axis] + i * strides[pos_axis] + inner; + } + float val = MLLM_FP16_TO_FP32(x_ptr[idx]); + max_val = std::max(max_val, val); + } + + // Compute exp(x - max) and sum + float sum_exp = 0.0f; + std::vector exp_vals(axis_size); + for (size_t i = 0; i < axis_size; ++i) { + size_t idx = outer * strides[pos_axis > 0 ? pos_axis - 1 : 0] * shape[pos_axis] + + i * (pos_axis < ndim - 1 ? 
strides[pos_axis] : 1) + inner; + if (pos_axis == 0) { + idx = i * strides[0] + inner; + } else if (pos_axis == ndim - 1) { + idx = outer * axis_size + i; + } else { + idx = outer * strides[pos_axis - 1] * shape[pos_axis] + i * strides[pos_axis] + inner; + } + float val = MLLM_FP16_TO_FP32(x_ptr[idx]); + exp_vals[i] = std::exp(val - max_val); + sum_exp += exp_vals[i]; + } + + // Compute softmax and store result + for (size_t i = 0; i < axis_size; ++i) { + size_t idx = outer * strides[pos_axis > 0 ? pos_axis - 1 : 0] * shape[pos_axis] + + i * (pos_axis < ndim - 1 ? strides[pos_axis] : 1) + inner; + if (pos_axis == 0) { + idx = i * strides[0] + inner; + } else if (pos_axis == ndim - 1) { + idx = outer * axis_size + i; + } else { + idx = outer * strides[pos_axis - 1] * shape[pos_axis] + i * strides[pos_axis] + inner; + } + float result = exp_vals[i] / sum_exp; + r_ptr[idx] = MLLM_FP32_TO_FP16(result); + } + } + } + } + + // 3. Move inputs to Ascend and run Softmax + auto x_ascend = x_cpu.to(kAscend); + auto y_ascend = mllm::nn::functional::softmax(x_ascend, axis); + + // 4. Move result back to CPU and compare with reference using allClose + auto y_cpu = y_ascend.to(kCPU); + auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + return false; + } + } + } + return true; + } +}; \ No newline at end of file diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp index b0489f545..4e1747e82 100644 --- a/tests/ascend/KernelTest.cpp +++ b/tests/ascend/KernelTest.cpp @@ -25,6 +25,204 @@ TEST_F(AscendKernelTest, AddFloat16) { true); } +//===----------------------------------------------------------------------===// +// Element wise SUB. 
+// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +TEST_F(AscendKernelTest, SubFloat16) { + EXPECT_EQ(SubFloat16Test({ + {2, 3}, + {1, 1}, + {4, 4}, + {8, 8}, + {16, 16}, + {32, 32}, + }), + true); +} + +//===----------------------------------------------------------------------===// +// Element wise MUL. +// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +TEST_F(AscendKernelTest, MulFloat16) { + EXPECT_EQ(MulFloat16Test({ + {2, 3}, + {1, 1}, + {4, 4}, + {8, 8}, + {16, 16}, + {32, 32}, + }), + true); +} + +//===----------------------------------------------------------------------===// +// SiLU activation function. +// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +#include "AscendSiLUKernelTest.hpp" +TEST_F(AscendSiLUKernelTest, SiLUFloat16) { + EXPECT_EQ(SiLUFloat16Test({ + {2, 3}, + {1, 1}, + {4, 4}, + {8, 8}, + {16, 16}, + {32, 32}, + {1, 1024}, + {128, 128}, + }), + true); +} + +//===----------------------------------------------------------------------===// +// Linear layer (MatMul based test). +// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +#include "AscendLinearKernelTest.hpp" +TEST_F(AscendLinearKernelTest, LinearFloat16) { + EXPECT_EQ(LinearFloat16Test({ + // {input_shape, in_channels, out_channels} + {{2, 3}, 3, 4}, + {{1, 8}, 8, 16}, + {{4, 16}, 16, 32}, + {{8, 32}, 32, 64}, + {{1, 1024}, 1024, 512}, + }), + true); +} + +TEST_F(AscendLinearKernelTest, LinearWithBiasFloat16) { + EXPECT_EQ(LinearWithBiasFloat16Test({ + // {input_shape, in_channels, out_channels} + {{2, 3}, 3, 4}, + {{1, 8}, 8, 16}, + {{4, 16}, 16, 32}, + }), + true); +} + +//===----------------------------------------------------------------------===// +// RMSNorm layer. 
+// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +#include "AscendRMSNormKernelTest.hpp" +TEST_F(AscendRMSNormKernelTest, RMSNormFloat16) { + EXPECT_EQ(RMSNormFloat16Test({ + // {input_shape, norm_size, epsilon} + // Note: ATB RMSNorm requires last dim to be multiple of 16 (FP16 alignment) + {{2, 16}, 16, 1e-5f}, + {{1, 32}, 32, 1e-5f}, + {{4, 64}, 64, 1e-6f}, + {{8, 128}, 128, 1e-5f}, + {{1, 1024}, 1024, 1e-5f}, + {{128, 256}, 256, 1e-5f}, + }), + true); +} + +//===----------------------------------------------------------------------===// +// Softmax activation function. +// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +#include "AscendSoftmaxKernelTest.hpp" +TEST_F(AscendSoftmaxKernelTest, SoftmaxFloat16) { + EXPECT_EQ(SoftmaxFloat16Test({ + {2, 3}, + {1, 8}, + {4, 4}, + {8, 8}, + {16, 16}, + {1, 1024}, + {128, 128}, + }, + {-1, 0, 1} // Test different axes + ), + true); +} + +//===----------------------------------------------------------------------===// +// Scaled Dot-Product Attention (using existing operators). +// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +#include "AscendAttentionKernelTest.hpp" +TEST_F(AscendAttentionKernelTest, ScaledDotProductAttentionFloat16) { + EXPECT_EQ(ScaledDotProductAttentionFloat16Test({ + // {Q_shape, K_shape, V_shape} + // Format: [B, S, D] + {{1, 4, 8}, {1, 4, 8}, {1, 4, 8}}, // Small: B=1, S=4, D=8 + {{1, 8, 16}, {1, 8, 16}, {1, 8, 16}}, // Medium: B=1, S=8, D=16 + {{2, 4, 8}, {2, 4, 8}, {2, 4, 8}}, // Batch=2 + {{1, 16, 32}, {1, 16, 32}, {1, 16, 32}}, // Larger: B=1, S=16, D=32 + {{1, 8, 64}, {1, 8, 64}, {1, 8, 64}}, // D=64 (common head dim) + }), + true); +} + +//===----------------------------------------------------------------------===// +// Multi-Head Attention with Causal Mask. 
+// +// FP16 (Ascend currently uses FP16) +// Input format: [B, H, S, D] where H = num_heads, D = head_dim +//===----------------------------------------------------------------------===// +TEST_F(AscendAttentionKernelTest, MultiHeadAttentionFloat16) { + EXPECT_EQ(MultiHeadAttentionFloat16Test({ + // {Q_shape, K_shape, V_shape, use_causal_mask} + // Format: [B, H, S, D] + + // Without mask + {{1, 1, 4, 8}, {1, 1, 4, 8}, {1, 1, 4, 8}, false}, // Single head, no mask + {{1, 4, 8, 16}, {1, 4, 8, 16}, {1, 4, 8, 16}, false}, // 4 heads, no mask + {{1, 8, 16, 64}, {1, 8, 16, 64}, {1, 8, 16, 64}, false}, // 8 heads, D=64 + + // With causal mask + {{1, 1, 4, 8}, {1, 1, 4, 8}, {1, 1, 4, 8}, true}, // Single head, with mask + {{1, 4, 8, 16}, {1, 4, 8, 16}, {1, 4, 8, 16}, true}, // 4 heads, with mask + {{1, 8, 16, 64}, {1, 8, 16, 64}, {1, 8, 16, 64}, true}, // 8 heads, with mask + {{2, 4, 8, 32}, {2, 4, 8, 32}, {2, 4, 8, 32}, true}, // Batch=2, with mask + + // Different S_q and S_kv (useful for KV cache scenarios) + {{1, 4, 1, 32}, {1, 4, 8, 32}, {1, 4, 8, 32}, true}, // S_q=1, S_kv=8 (decode) + {{1, 4, 4, 32}, {1, 4, 16, 32}, {1, 4, 16, 32}, true}, // S_q < S_kv + }), + true); +} + +//===----------------------------------------------------------------------===// +// Grouped Query Attention (GQA). 
+// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +TEST_F(AscendAttentionKernelTest, GroupedQueryAttentionFloat16) { + EXPECT_EQ(GroupedQueryAttentionFloat16Test({ + // {Q_shape [B, H_q, S_q, D], K_shape [B, H_kv, S_kv, D], V_shape, use_mask} + + // GQA with 2 groups (H_q = 4, H_kv = 2) + {{1, 4, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, false}, + {{1, 4, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, true}, + + // GQA with 4 groups (H_q = 8, H_kv = 2) + {{1, 8, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, false}, + {{1, 8, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, true}, + + // MQA (Multi-Query Attention): H_kv = 1 + {{1, 4, 8, 32}, {1, 1, 8, 32}, {1, 1, 8, 32}, true}, + {{1, 8, 16, 64}, {1, 1, 16, 64}, {1, 1, 16, 64}, true}, + + // Batch > 1 + {{2, 8, 8, 32}, {2, 2, 8, 32}, {2, 2, 8, 32}, true}, + }), + true); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); From 206c752d6115d5a51ef9bdcfd7d37fd136a01201 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 30 Jan 2026 23:27:19 +0800 Subject: [PATCH 3/4] fix(ascend): fix some problems for new ops --- mllm/backends/ascend/AscendBackend.cpp | 10 ++++++++-- mllm/backends/ascend/AscendCommon.cpp | 10 +++++++--- mllm/backends/ascend/AscendCommon.hpp | 3 +++ mllm/nn/Functional.cpp | 17 +++++++++++++++++ mllm/nn/Functional.hpp | 4 ++++ tasks/build_arm_ascend.yaml | 4 ++-- 6 files changed, 41 insertions(+), 7 deletions(-) diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp index 5ec76413a..6c17774b6 100644 --- a/mllm/backends/ascend/AscendBackend.cpp +++ b/mllm/backends/ascend/AscendBackend.cpp @@ -8,12 +8,18 @@ #include "mllm/backends/ascend/ops/AscendElewiseOps.hpp" #include "mllm/backends/ascend/ops/AscendX2XOp.hpp" +#include "mllm/backends/ascend/ops/AscendSiLUOp.hpp" +#include "mllm/backends/ascend/ops/AscendLinearOp.hpp" +#include "mllm/backends/ascend/ops/AscendRMSNormOp.hpp" +#include 
"mllm/backends/ascend/ops/AscendViewOp.hpp" +#include "mllm/backends/ascend/ops/AscendMatMulOp.hpp" +#include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp" namespace mllm::ascend { AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) { - regOpFactory(); - regOpFactory(); + regOpFactory(); auto& devices = AscendDeviceMetaInfo::instance().devices; for (const auto& device : devices) { const auto bytes_to_mb = [](size_t bytes) { return bytes / (1024.0 * 1024.0); }; diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index 140a5a31e..252571583 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -217,6 +217,13 @@ void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) { } } +void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor) { + fillAtbTensorDesc(t, atb_tensor.desc); + atb_tensor.deviceData = reinterpret_cast(t.ptr()); + // Use MLLM tensor's actual bytes as dataSize to match allocated memory + atb_tensor.dataSize = t.bytes(); +} + AscendDeviceMetaInfo::AscendDeviceMetaInfo() { #ifndef ASCENDC_CPU_DEBUG // Initialize ACL to query devices @@ -231,7 +238,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() { ret = aclrtGetDeviceCount(&device_count); if (ret != ACL_SUCCESS) { MLLM_ERROR("Failed to get Ascend device count: {}", ret); - aclFinalize(); return; } @@ -266,8 +272,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() { devices.push_back(info); } - // Finalize ACL after enumeration - aclFinalize(); #else // In CPU debug mode, add a dummy device AscendDeviceInfo info; diff --git a/mllm/backends/ascend/AscendCommon.hpp b/mllm/backends/ascend/AscendCommon.hpp index 8d74c8707..5a2b69dc8 100644 --- a/mllm/backends/ascend/AscendCommon.hpp +++ b/mllm/backends/ascend/AscendCommon.hpp @@ -41,6 +41,9 @@ void syncGlobalAtbStream(); // Convert MLLM Tensor metadata to ATB TensorDesc void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc); +// Setup ATB 
Tensor from an MLLM Tensor (desc and deviceData); dataSize is the MLLM tensor's byte size (t.bytes()) +void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor); + // Ascend device information structure struct AscendDeviceInfo { std::string name; diff --git a/mllm/nn/Functional.cpp b/mllm/nn/Functional.cpp index 4e70b092a..e1e015432 100644 --- a/mllm/nn/Functional.cpp +++ b/mllm/nn/Functional.cpp @@ -7,6 +7,7 @@ #include "mllm/core/aops/FlashAttention2Op.hpp" #include "mllm/core/aops/GatherOp.hpp" #include "mllm/core/aops/MatMulOp.hpp" +#include "mllm/core/aops/LinearOp.hpp" #include "mllm/core/aops/ReduceOps.hpp" #include "mllm/core/aops/Scatter2ShardsOp.hpp" #include "mllm/core/aops/SigmoidOp.hpp" @@ -16,6 +17,7 @@ #include "mllm/core/aops/ViewOp.hpp" #include "mllm/core/aops/TopKOp.hpp" #include "mllm/core/aops/SiLUOp.hpp" +#include "mllm/core/aops/RMSNormOp.hpp" #include "mllm/core/aops/PadOp.hpp" #include "mllm/core/aops/MaskedScatterOp.hpp" #include "mllm/core/aops/InterpolateOp.hpp" @@ -33,6 +35,16 @@ Tensor matmul(const Tensor& A, const Tensor& B, bool transpose_A, bool transpose {A, B})[0]; } +Tensor linear(const Tensor& x, const Tensor& weight, const Tensor& bias) { + aops::LinearOpOptions opts{}; + opts.setRedirect(true); + if (bias.isNil()) { + return Context::instance().buildOpAndSubmitTask(OpTypes::kLinear, opts, {x, weight})[0]; + } else { + return Context::instance().buildOpAndSubmitTask(OpTypes::kLinear, opts, {x, weight, bias})[0]; + } +} + Tensor view(const Tensor& x, const std::vector& shape) { return Context::instance().buildOpAndSubmitTask(OpTypes::kView, aops::ViewOpOptions{.to_shape = shape}, {x})[0]; } @@ -126,6 +138,11 @@ Tensor silu_(const Tensor& x) { return Context::instance().buildOpAndSubmitTask(OpTypes::kSiLU, opt, {x})[0]; } +Tensor rmsNorm(const Tensor& x, const Tensor& weight, float epsilon, bool add_unit_offset) { + return Context::instance().buildOpAndSubmitTask( + OpTypes::kRMSNorm, aops::RMSNormOpOptions{.epsilon = epsilon, .add_unit_offset = add_unit_offset}, {x, 
weight})[0]; +} + void scatter2Shards(const Tensor& src, const Tensor& shards_pointer, int32_t dim) { Context::instance().buildOpAndSubmitTask(OpTypes::kScatter2Shards, aops::Scatter2ShardsOpOptions{.dim = dim}, {src, shards_pointer}); diff --git a/mllm/nn/Functional.hpp b/mllm/nn/Functional.hpp index 31a57812c..c85b716e9 100644 --- a/mllm/nn/Functional.hpp +++ b/mllm/nn/Functional.hpp @@ -20,6 +20,8 @@ namespace mllm::nn::functional { Tensor matmul(const Tensor& A, const Tensor& B, bool transpose_A = false, bool transpose_B = false, aops::MatMulOpType type = aops::MatMulOpType::kDefault); +Tensor linear(const Tensor& x, const Tensor& weight, const Tensor& bias = Tensor()); + Tensor view(const Tensor& x, const std::vector& shape); std::vector split(const Tensor& x, int32_t split_size_or_sections, int32_t dim); @@ -131,6 +133,8 @@ Tensor mean(const Tensor& x, int32_t dim = std::numeric_limits::max(), Tensor silu(const Tensor& x); Tensor silu_(const Tensor& x); +Tensor rmsNorm(const Tensor& x, const Tensor& weight, float epsilon = 1e-5f, bool add_unit_offset = false); + void scatter2Shards(const Tensor& src, const Tensor& shards_pointer, int32_t dim); // If you want causal mask attention. Use Flash attention instead. 
diff --git a/tasks/build_arm_ascend.yaml b/tasks/build_arm_ascend.yaml index 17ffd3f10..111a162ba 100644 --- a/tasks/build_arm_ascend.yaml +++ b/tasks/build_arm_ascend.yaml @@ -1,6 +1,6 @@ Tasks: - CMakeConfigTask: - cmake_cfg_path: "build-arm-ascend" + cmake_cfg_path: "build-ascend" cmake_build_type: "ReleaseDebInfo" cmake_extra_args: - "-DMLLM_CROSS_COMPILE=ON" @@ -15,4 +15,4 @@ Tasks: - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF" - CMakeBuildTask: - cmake_cfg_path: "build-arm-ascend" + cmake_cfg_path: "build-ascend" From 642b39765284d858929fb2af10a80b7bf071c283 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 31 Jan 2026 01:12:33 +0800 Subject: [PATCH 4/4] fix(ascend): fix some problems reported by CodeRabbit --- mllm/backends/ascend/ops/AscendLinearOp.cpp | 4 +- mllm/backends/ascend/ops/AscendRMSNormOp.cpp | 2 +- mllm/backends/ascend/ops/AscendSoftmaxOp.cpp | 19 ++++------ tests/ascend/AscendAttentionKernelTest.hpp | 1 + tests/ascend/AscendSoftmaxKernelTest.hpp | 40 +++----------------- 5 files changed, 18 insertions(+), 48 deletions(-) diff --git a/mllm/backends/ascend/ops/AscendLinearOp.cpp b/mllm/backends/ascend/ops/AscendLinearOp.cpp index a8b986984..049563170 100644 --- a/mllm/backends/ascend/ops/AscendLinearOp.cpp +++ b/mllm/backends/ascend/ops/AscendLinearOp.cpp @@ -21,8 +21,9 @@ AscendLinearOp::AscendLinearOp(const aops::LinearOpOptions& options) : aops::Lin void AscendLinearOp::reshape(const std::vector& inputs, std::vector& outputs) { if (options().isRedirect()) { + MLLM_RT_ASSERT(inputs.size() >= 1); const auto& input = inputs[0]; - const auto& weight = inputs[1]; + const auto& weight = inputs.size() >= 2 ? 
inputs[1] : this->weight(); auto out_shape = input.shape(); out_shape[out_shape.size() - 1] = weight.shape()[0]; // out_channels outputs.emplace_back(Tensor::empty(out_shape, input.dtype(), input.device())); @@ -37,6 +38,7 @@ void AscendLinearOp::setup(const std::vector& inputs, std::vector& inputs, std::vector& outputs) { MLLM_RT_ASSERT(inputs.size() >= 1 && inputs.size() <= 3); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); const Tensor* weight_ptr = nullptr; const Tensor* bias_ptr = nullptr; diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.cpp b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp index 639639e22..7ce8c74d3 100644 --- a/mllm/backends/ascend/ops/AscendRMSNormOp.cpp +++ b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp @@ -25,7 +25,7 @@ void AscendRMSNormOp::setup(const std::vector& inputs, std::vector& inputs, std::vector& outputs) { - //MLLM_RT_ASSERT(inputs.size() == 1 || inputs.size() == 2, "AscendRMSNormOp expects 1 or 2 inputs"); + MLLM_RT_ASSERT(inputs.size() == 1 || inputs.size() == 2); MLLM_RT_ASSERT_EQ(outputs.size(), 1); const auto& x = inputs[0]; diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp index 25d09081a..0050ae718 100644 --- a/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp +++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp @@ -54,14 +54,18 @@ void AscendSoftmaxOp::forward(const std::vector& inputs, std::vector(x.rank()); if (axis < 0) { - axis = static_cast(x.rank()) + axis; + axis = rank + axis; + } + if (axis < 0 || axis >= rank) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, + "AscendSoftmaxOp: axis {} out of range for rank {}", + axis, rank); } - // ATB expects axes as SVector softmaxParam.axes.push_back(static_cast(axis)); - // Create ATB operation atb::Operation* op = nullptr; auto st = atb::CreateOperation(softmaxParam, &op); if (st != atb::NO_ERROR || op == nullptr) { @@ -70,17 +74,14 @@ void AscendSoftmaxOp::forward(const std::vector& inputs, std::vector(st)); } - // Get global ATB 
context atb::Context* atb_ctx = getGlobalAtbContext(); - // Prepare ATB tensors atb::Tensor atb_x; atb::Tensor atb_y; fillAtbTensor(x, atb_x); fillAtbTensor(y, atb_y); - // Setup input/output tensors atb::SVector inTensors; atb::SVector outTensors; inTensors.push_back(atb_x); @@ -90,7 +91,6 @@ void AscendSoftmaxOp::forward(const std::vector& inputs, std::vectorSetup(vp, workspaceSize, atb_ctx); if (st != atb::NO_ERROR) { @@ -99,7 +99,6 @@ void AscendSoftmaxOp::forward(const std::vector& inputs, std::vector(st)); } - // Allocate workspace if needed void* workspace = nullptr; int workspace_block_id = -1; if (workspaceSize > 0) { @@ -108,7 +107,6 @@ void AscendSoftmaxOp::forward(const std::vector& inputs, std::vectorExecute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); @@ -119,16 +117,13 @@ void AscendSoftmaxOp::forward(const std::vector& inputs, std::vector(st)); } - // Synchronize stream syncGlobalAtbStream(); - // Free workspace if (workspace_block_id != -1) { auto& mem_mgr = getAscendMemoryManager(); mem_mgr.freeBlock(workspace_block_id); } - // Destroy operation atb::DestroyOperation(op); } diff --git a/tests/ascend/AscendAttentionKernelTest.hpp b/tests/ascend/AscendAttentionKernelTest.hpp index b6bf9eb02..80e5542bb 100644 --- a/tests/ascend/AscendAttentionKernelTest.hpp +++ b/tests/ascend/AscendAttentionKernelTest.hpp @@ -225,6 +225,7 @@ class AscendAttentionKernelTest : public KernelTest { // Causal mask: mask[i, j] = 0 if j <= i, else -inf (large negative value) Tensor mask_cpu; if (use_mask) { + MLLM_RT_ASSERT(S_kv >= S_q); mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU); auto* mask_ptr = mask_cpu.ptr(); diff --git a/tests/ascend/AscendSoftmaxKernelTest.hpp b/tests/ascend/AscendSoftmaxKernelTest.hpp index 9003c714d..95b6fe4c8 100644 --- a/tests/ascend/AscendSoftmaxKernelTest.hpp +++ b/tests/ascend/AscendSoftmaxKernelTest.hpp @@ -38,13 +38,6 @@ class AscendSoftmaxKernelTest : public KernelTest { pos_axis = ndim + pos_axis; } - // 
Calculate strides - std::vector strides(ndim); - strides[ndim - 1] = 1; - for (int i = ndim - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * shape[i + 1]; - } - size_t outer_size = 1; for (int i = 0; i < pos_axis; ++i) { outer_size *= shape[i]; @@ -60,18 +53,13 @@ class AscendSoftmaxKernelTest : public KernelTest { // Compute softmax for each slice along the axis for (size_t outer = 0; outer < outer_size; ++outer) { for (size_t inner = 0; inner < inner_size; ++inner) { + auto idx_at = [&](size_t i) -> size_t { + return (outer * axis_size + i) * inner_size + inner; + }; // Find max value for numerical stability float max_val = -std::numeric_limits::infinity(); for (size_t i = 0; i < axis_size; ++i) { - size_t idx = outer * strides[pos_axis > 0 ? pos_axis - 1 : 0] * shape[pos_axis] + - i * (pos_axis < ndim - 1 ? strides[pos_axis] : 1) + inner; - if (pos_axis == 0) { - idx = i * strides[0] + inner; - } else if (pos_axis == ndim - 1) { - idx = outer * axis_size + i; - } else { - idx = outer * strides[pos_axis - 1] * shape[pos_axis] + i * strides[pos_axis] + inner; - } + size_t idx = idx_at(i); float val = MLLM_FP16_TO_FP32(x_ptr[idx]); max_val = std::max(max_val, val); } @@ -80,15 +68,7 @@ class AscendSoftmaxKernelTest : public KernelTest { float sum_exp = 0.0f; std::vector exp_vals(axis_size); for (size_t i = 0; i < axis_size; ++i) { - size_t idx = outer * strides[pos_axis > 0 ? pos_axis - 1 : 0] * shape[pos_axis] + - i * (pos_axis < ndim - 1 ? 
strides[pos_axis] : 1) + inner; - if (pos_axis == 0) { - idx = i * strides[0] + inner; - } else if (pos_axis == ndim - 1) { - idx = outer * axis_size + i; - } else { - idx = outer * strides[pos_axis - 1] * shape[pos_axis] + i * strides[pos_axis] + inner; - } + size_t idx = idx_at(i); float val = MLLM_FP16_TO_FP32(x_ptr[idx]); exp_vals[i] = std::exp(val - max_val); sum_exp += exp_vals[i]; @@ -96,15 +76,7 @@ class AscendSoftmaxKernelTest : public KernelTest { // Compute softmax and store result for (size_t i = 0; i < axis_size; ++i) { - size_t idx = outer * strides[pos_axis > 0 ? pos_axis - 1 : 0] * shape[pos_axis] + - i * (pos_axis < ndim - 1 ? strides[pos_axis] : 1) + inner; - if (pos_axis == 0) { - idx = i * strides[0] + inner; - } else if (pos_axis == ndim - 1) { - idx = outer * axis_size + i; - } else { - idx = outer * strides[pos_axis - 1] * shape[pos_axis] + i * strides[pos_axis] + inner; - } + size_t idx = idx_at(i); float result = exp_vals[i] / sum_exp; r_ptr[idx] = MLLM_FP32_TO_FP16(result); }