From 9bef2618343c06b9b86d1f8cde815a12c35ee088 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Fri, 30 Jan 2026 23:23:47 +0800
Subject: [PATCH 1/4] feat(ascend): add new ops (Sub, Mul, Linear, MatMul,
 RMSNorm, SiLU, Softmax, View)

---
 mllm/backends/ascend/ops/AscendElewiseOps.cpp | 173 +++++++++++++++++-
 mllm/backends/ascend/ops/AscendElewiseOps.hpp |  30 +++
 mllm/backends/ascend/ops/AscendLinearOp.cpp   | 167 +++++++++++++++++
 mllm/backends/ascend/ops/AscendLinearOp.hpp   |  28 +++
 mllm/backends/ascend/ops/AscendMatMulOp.cpp   | 147 +++++++++++++++
 mllm/backends/ascend/ops/AscendMatMulOp.hpp   |  27 +++
 mllm/backends/ascend/ops/AscendRMSNormOp.cpp  | 106 +++++++++++
 mllm/backends/ascend/ops/AscendRMSNormOp.hpp  |  27 +++
 mllm/backends/ascend/ops/AscendSiLUOp.cpp     | 115 ++++++++++++
 mllm/backends/ascend/ops/AscendSiLUOp.hpp     |  27 +++
 mllm/backends/ascend/ops/AscendSoftmaxOp.cpp  | 135 ++++++++++++++
 mllm/backends/ascend/ops/AscendSoftmaxOp.hpp  |  27 +++
 mllm/backends/ascend/ops/AscendViewOp.cpp     |  16 ++
 mllm/backends/ascend/ops/AscendViewOp.hpp     |  25 +++
 14 files changed, 1047 insertions(+), 3 deletions(-)
 create mode 100644 mllm/backends/ascend/ops/AscendLinearOp.cpp
 create mode 100644 mllm/backends/ascend/ops/AscendLinearOp.hpp
 create mode 100644 mllm/backends/ascend/ops/AscendMatMulOp.cpp
 create mode 100644 mllm/backends/ascend/ops/AscendMatMulOp.hpp
 create mode 100644 mllm/backends/ascend/ops/AscendRMSNormOp.cpp
 create mode 100644 mllm/backends/ascend/ops/AscendRMSNormOp.hpp
 create mode 100644 mllm/backends/ascend/ops/AscendSiLUOp.cpp
 create mode 100644 mllm/backends/ascend/ops/AscendSiLUOp.hpp
 create mode 100644 mllm/backends/ascend/ops/AscendSoftmaxOp.cpp
 create mode 100644 mllm/backends/ascend/ops/AscendSoftmaxOp.hpp
 create mode 100644 mllm/backends/ascend/ops/AscendViewOp.cpp
 create mode 100644 mllm/backends/ascend/ops/AscendViewOp.hpp
diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.cpp b/mllm/backends/ascend/ops/AscendElewiseOps.cpp
index 762ef1dfe..be1e1b671 100644
--- a/mllm/backends/ascend/ops/AscendElewiseOps.cpp
+++ b/mllm/backends/ascend/ops/AscendElewiseOps.cpp
@@ -34,9 +34,6 @@ void AscendAddOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>
   if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
     NYI("AscendAddOp currently requires x/y/z have same dtype");
   }
-  if (x.numel() != y.numel() || x.numel() != z.numel()) {
-    NYI("AscendAddOp demo only supports no-broadcast case (numel equal)");
-  }
 
   atb::infer::ElewiseParam addParam;
   addParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_ADD;
@@ -106,4 +103,174 @@ void AscendAddOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>
   atb::DestroyOperation(op);
 }
 
+AscendSubOp::AscendSubOp(const aops::SubOpOptions& options) : aops::SubOp(options) {}
+// setup() adds no Ascend-specific work; it only forwards to BaseOp::setup.
+void AscendSubOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);
+}
+
+// Element-wise subtraction on the NPU: outputs[0] = inputs[0] - inputs[1], run through ATB ELEWISE_SUB.
+// Requires exactly two inputs and one already-allocated output, all sharing one dtype. ATB failures are
+// reported through MLLM_ERROR_EXIT. syncGlobalAtbStream() is called before the function returns.
+void AscendSubOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& y = inputs[1];
+  auto& z = outputs[0];
+
+  if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
+    NYI("AscendSubOp currently requires x/y/z have same dtype");
+  }
+
+  atb::infer::ElewiseParam subParam;
+  subParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_SUB;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(subParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_SUB) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  // Wrap each mllm tensor for ATB: descriptor first, then the raw device pointer and byte size.
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  atb::Tensor atb_z;
+  fillAtbTensorDesc(x, atb_x.desc);
+  fillAtbTensorDesc(y, atb_y.desc);
+  fillAtbTensorDesc(z, atb_z.desc);
+  atb_x.deviceData = reinterpret_cast<uint8_t*>(x.ptr<void>());
+  atb_x.dataSize = x.bytes();
+  atb_y.deviceData = reinterpret_cast<uint8_t*>(y.ptr<void>());
+  atb_y.dataSize = y.bytes();
+  atb_z.deviceData = reinterpret_cast<uint8_t*>(z.ptr<void>());
+  atb_z.dataSize = z.bytes();
+
+  atb::VariantPack vp;
+  vp.inTensors.push_back(atb_x);
+  vp.inTensors.push_back(atb_y);
+  vp.outTensors.push_back(atb_z);
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  // Scratch memory requested by Setup is borrowed from the Ascend memory manager for this call only.
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+    if (workspace == nullptr) {  // never hand ATB a null workspace together with a non-zero size
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp workspace allocation failed, size={}", workspaceSize);
+    }
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendSubOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  // Sync the stream first, then hand the workspace block back to the memory manager.
+  syncGlobalAtbStream();
+  if (workspace_block_id != -1) {
+    getAscendMemoryManager().freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+AscendMulOp::AscendMulOp(const aops::MulOpOptions& options) : aops::MulOp(options) {}
+// setup() adds no Ascend-specific work; it only forwards to BaseOp::setup.
+void AscendMulOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);
+}
+
+// Element-wise multiplication on the NPU: outputs[0] = inputs[0] * inputs[1], run through ATB ELEWISE_MUL.
+// Requires exactly two inputs and one already-allocated output, all sharing one dtype. ATB failures are
+// reported through MLLM_ERROR_EXIT. syncGlobalAtbStream() is called before the function returns.
+void AscendMulOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& y = inputs[1];
+  auto& z = outputs[0];
+
+  if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
+    NYI("AscendMulOp currently requires x/y/z have same dtype");
+  }
+
+  atb::infer::ElewiseParam mulParam;
+  mulParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_MUL;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(mulParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_MUL) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  // Wrap each mllm tensor for ATB: descriptor first, then the raw device pointer and byte size.
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  atb::Tensor atb_z;
+  fillAtbTensorDesc(x, atb_x.desc);
+  fillAtbTensorDesc(y, atb_y.desc);
+  fillAtbTensorDesc(z, atb_z.desc);
+  atb_x.deviceData = reinterpret_cast<uint8_t*>(x.ptr<void>());
+  atb_x.dataSize = x.bytes();
+  atb_y.deviceData = reinterpret_cast<uint8_t*>(y.ptr<void>());
+  atb_y.dataSize = y.bytes();
+  atb_z.deviceData = reinterpret_cast<uint8_t*>(z.ptr<void>());
+  atb_z.dataSize = z.bytes();
+
+  atb::VariantPack vp;
+  vp.inTensors.push_back(atb_x);
+  vp.inTensors.push_back(atb_y);
+  vp.outTensors.push_back(atb_z);
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  // Scratch memory requested by Setup is borrowed from the Ascend memory manager for this call only.
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+    if (workspace == nullptr) {  // never hand ATB a null workspace together with a non-zero size
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp workspace allocation failed, size={}", workspaceSize);
+    }
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendMulOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  // Sync the stream first, then hand the workspace block back to the memory manager.
+  syncGlobalAtbStream();
+  if (workspace_block_id != -1) {
+    getAscendMemoryManager().freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
 }  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.hpp b/mllm/backends/ascend/ops/AscendElewiseOps.hpp
index 26117cbc2..9122e20cb 100644
--- a/mllm/backends/ascend/ops/AscendElewiseOps.hpp
+++ b/mllm/backends/ascend/ops/AscendElewiseOps.hpp
@@ -24,4 +24,34 @@ class AscendAddOpFactory final : public TypedOpFactory<OpTypes::kAdd, aops::AddO
   }
 };
 
+// Ascend (ATB) backend implementation of aops::SubOp; method bodies are in AscendElewiseOps.cpp.
+class AscendSubOp final : public aops::SubOp {
+ public:
+  explicit AscendSubOp(const aops::SubOpOptions& options);
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendSubOpFactory final : public TypedOpFactory<OpTypes::kSub, aops::SubOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::SubOpOptions& options) override {
+    return std::make_shared<AscendSubOp>(options);  // a new AscendSubOp per call, built from the options
+  }
+};
+
+// Ascend (ATB) backend implementation of aops::MulOp; method bodies are in AscendElewiseOps.cpp.
+class AscendMulOp final : public aops::MulOp {
+ public:
+  explicit AscendMulOp(const aops::MulOpOptions& options);
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendMulOpFactory final : public TypedOpFactory<OpTypes::kMul, aops::MulOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::MulOpOptions& options) override {
+    return std::make_shared<AscendMulOp>(options);  // a new AscendMulOp per call, built from the options
+  }
+};
+
 }  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendLinearOp.cpp b/mllm/backends/ascend/ops/AscendLinearOp.cpp
new file mode 100644
index 000000000..a8b986984
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendLinearOp.cpp
@@ -0,0 +1,167 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendLinearOp.hpp"
+
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+// Construction only forwards the options to aops::LinearOp.
+AscendLinearOp::AscendLinearOp(const aops::LinearOpOptions& options) : aops::LinearOp(options) {}
+
+// Output shape: redirect mode -> input shape with last dim = inputs[1].shape()[0]; else base-class rule.
+void AscendLinearOp::reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  if (!options().isRedirect()) { aops::LinearOp::reshape(inputs, outputs); return; }
+  // Redirect mode reads the weight from inputs[1], so both tensors must be present.
+  MLLM_RT_ASSERT(inputs.size() >= 2);
+  const auto& input = inputs[0];
+  auto out_shape = input.shape();
+  out_shape[out_shape.size() - 1] = inputs[1].shape()[0];  // out_channels
+  outputs.emplace_back(Tensor::empty(out_shape, input.dtype(), input.device()));
+}
+
+void AscendLinearOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);  // no Ascend-specific setup; defer to the base implementation
+}
+
+// Runs y = x * W^T (+ bias) on the NPU through ATB's Linear operation (transposeB = true).
+// Accepted inputs: {x}, {x, weight} or {x, weight, bias}; with a single input the op's own weight()/bias()
+// are used (bias only when options().bias is set). x, weight, bias and the output must all be FP16, and
+// bias must be 2D [1, out_channels]. Violations and ATB failures are reported through MLLM_ERROR_EXIT.
+void AscendLinearOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT(inputs.size() >= 1 && inputs.size() <= 3);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);  // outputs[0] is dereferenced below
+
+  // Weight/bias come from the op itself (1 input) or from the call arguments (2-3 inputs).
+  const Tensor* weight_ptr = nullptr;
+  const Tensor* bias_ptr = nullptr;
+  if (inputs.size() == 1) {
+    weight_ptr = &weight();
+    if (options().bias) { bias_ptr = &bias(); }
+  } else {
+    weight_ptr = &inputs[1];
+    if (inputs.size() == 3) { bias_ptr = &inputs[2]; }
+  }
+
+  const auto& x = inputs[0];
+  auto& y = outputs[0];
+
+  // Validate that all tensors are FP16
+  if (x.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendLinearOp: Input tensor must be FP16, but got dtype={}",
+                    static_cast<int>(x.dtype()));
+  }
+  if (weight_ptr->dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendLinearOp: Weight tensor must be FP16, but got dtype={}",
+                    static_cast<int>(weight_ptr->dtype()));
+  }
+  if (bias_ptr != nullptr && bias_ptr->dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendLinearOp: Bias tensor must be FP16, but got dtype={}",
+                    static_cast<int>(bias_ptr->dtype()));
+  }
+  if (y.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendLinearOp: Output tensor must be FP16, but got dtype={}",
+                    static_cast<int>(y.dtype()));
+  }
+
+  // Validate bias dimensions: ATB Linear requires bias to be 2D [1, out_channels]
+  if (bias_ptr != nullptr) {
+    const auto& bias_shape = bias_ptr->shape();
+    if (bias_shape.size() == 1) {
+      MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                      "AscendLinearOp: Bias tensor must be 2D [1, out_channels], but got 1D shape with size={}. "
+                      "Please reshape the bias tensor before passing to AscendLinearOp.",
+                      bias_shape[0]);
+    }
+    if (bias_shape.size() != 2 || bias_shape[0] != 1) {
+      MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                      "AscendLinearOp: Bias tensor must be 2D with shape [1, out_channels], but got shape=[{}, {}]",
+                      bias_shape.size() >= 1 ? bias_shape[0] : 0,
+                      bias_shape.size() >= 2 ? bias_shape[1] : 0);
+    }
+  }
+
+  atb::infer::LinearParam linearParam;
+  linearParam.transposeA = false;
+  linearParam.transposeB = true;  // Set to true because weight is [out_channels, in_channels]
+  linearParam.hasBias = (bias_ptr != nullptr);
+  linearParam.outDataType = ACL_DT_UNDEFINED;
+  linearParam.enAccum = false;
+  linearParam.matmulType = atb::infer::LinearParam::MATMUL_UNDEFINED;
+  linearParam.quantMode = atb::infer::LinearParam::QUANT_UNDEFINED;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(linearParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(Linear) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  atb::Tensor atb_x;
+  atb::Tensor atb_weight;
+  atb::Tensor atb_y;
+  fillAtbTensor(x, atb_x);
+  fillAtbTensor(*weight_ptr, atb_weight);
+  fillAtbTensor(y, atb_y);
+
+  atb::VariantPack vp;
+  vp.inTensors.push_back(atb_x);
+  vp.inTensors.push_back(atb_weight);
+  if (bias_ptr != nullptr) {
+    atb::Tensor atb_bias;
+    fillAtbTensor(*bias_ptr, atb_bias);
+    vp.inTensors.push_back(atb_bias);
+  }
+  vp.outTensors.push_back(atb_y);
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB LinearOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  // Scratch memory requested by Setup is borrowed from the Ascend memory manager for this call only.
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+    if (workspace == nullptr) {  // never hand ATB a null workspace together with a non-zero size
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB LinearOp workspace allocation failed, size={}", workspaceSize);
+    }
+  }
+
+  {
+    ASCEND_TIME_SCOPE("AscendLinearOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB LinearOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+  if (workspace_block_id != -1) {
+    getAscendMemoryManager().freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendLinearOp.hpp b/mllm/backends/ascend/ops/AscendLinearOp.hpp
new file mode 100644
index 000000000..c1b490c70
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendLinearOp.hpp
@@ -0,0 +1,28 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/LinearOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+// Ascend (ATB) backend implementation of aops::LinearOp; method bodies are in AscendLinearOp.cpp.
+class AscendLinearOp final : public aops::LinearOp {
+ public:
+  explicit AscendLinearOp(const aops::LinearOpOptions& options);
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendLinearOpFactory final : public TypedOpFactory<OpTypes::kLinear, aops::LinearOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::LinearOpOptions& options) override {
+    return std::make_shared<AscendLinearOp>(options);  // a new AscendLinearOp per call, built from the options
+  }
+};
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendMatMulOp.cpp b/mllm/backends/ascend/ops/AscendMatMulOp.cpp
new file mode 100644
index 000000000..b08cc77fb
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendMatMulOp.cpp
@@ -0,0 +1,147 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendMatMulOp.hpp"
+
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+AscendMatMulOp::AscendMatMulOp(const aops::MatMulOpOptions& options) : aops::MatMulOp(options) {}
+// setup() adds no Ascend-specific work; it only forwards to BaseOp::setup.
+void AscendMatMulOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);
+}
+
+namespace {
+
+// Describes tensor `t` to ATB as an FP16, ND-format tensor with the caller-supplied `shape` (reshape without
+// copy): only the descriptor differs from `t`, the device pointer is reused. dataSize is derived from the
+// descriptor, so callers must make sure `t` really holds FP16 data with the same element count as `shape`.
+void fillAtbTensorWithShape(const Tensor& t, atb::Tensor& atb_tensor, const std::vector<int64_t>& shape) {
+  atb::TensorDesc desc;
+  // desc.shape.dims has a fixed capacity; refuse ranks that would write past its end.
+  MLLM_RT_ASSERT(shape.size() <= sizeof(desc.shape.dims) / sizeof(desc.shape.dims[0]));
+  desc.dtype = ACL_FLOAT16;  // Ascend uses FP16
+  desc.format = ACL_FORMAT_ND;
+  desc.shape.dimNum = shape.size();
+  for (size_t i = 0; i < shape.size(); ++i) { desc.shape.dims[i] = shape[i]; }
+  atb_tensor.desc = desc;
+  atb_tensor.dataSize = atb::Utils::GetTensorSize(atb_tensor);
+  atb_tensor.deviceData = reinterpret_cast<uint8_t*>(t.ptr<void>());
+}
+
+}  // namespace
+
+// Computes C = op(A) * op(B) on the NPU using ATB's Linear operation with no bias; op() applies
+// options_.transpose_a / options_.transpose_b. All three tensors must be FP16 because the descriptors
+// built below are always FP16. When A is 4D, A/B/C are presented to ATB as 3D [B*H, S, D] views (no copy).
+void AscendMatMulOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& A = inputs[0];
+  const auto& B = inputs[1];
+  auto& C = outputs[0];
+
+  // fillAtbTensorWithShape() describes every buffer as FP16, so reject anything else up front.
+  if (A.dtype() != MLLM_TYPE_F16 || B.dtype() != MLLM_TYPE_F16 || C.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "AscendMatMulOp: A/B/C must be FP16, but got dtypes={}/{}/{}",
+                    static_cast<int>(A.dtype()), static_cast<int>(B.dtype()), static_cast<int>(C.dtype()));
+  }
+
+  // ATB Linear/MatMul only supports 2D/3D tensors, so 4D [B, H, S, D] buffers are described as 3D [B*H, S, D].
+  const auto& a_shape = A.shape();
+  const auto& b_shape = B.shape();
+  const auto& c_shape = C.shape();
+  const bool is_4d = (a_shape.size() == 4);
+  if (is_4d && (b_shape.size() != 4 || c_shape.size() != 4)) {  // the 4D path indexes dims 0..3 of B and C
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "AscendMatMulOp: 4D A requires 4D B and C, but got ranks B={}, C={}",
+                    b_shape.size(), c_shape.size());
+  }
+
+  std::vector<int64_t> atb_a_shape, atb_b_shape, atb_c_shape;
+  if (is_4d) {
+    // Reshape [B, H, S, D] -> [B*H, S, D]
+    int64_t batch_heads_a = static_cast<int64_t>(a_shape[0]) * static_cast<int64_t>(a_shape[1]);
+    int64_t batch_heads_b = static_cast<int64_t>(b_shape[0]) * static_cast<int64_t>(b_shape[1]);
+    int64_t batch_heads_c = static_cast<int64_t>(c_shape[0]) * static_cast<int64_t>(c_shape[1]);
+    atb_a_shape = {batch_heads_a, static_cast<int64_t>(a_shape[2]), static_cast<int64_t>(a_shape[3])};
+    atb_b_shape = {batch_heads_b, static_cast<int64_t>(b_shape[2]), static_cast<int64_t>(b_shape[3])};
+    atb_c_shape = {batch_heads_c, static_cast<int64_t>(c_shape[2]), static_cast<int64_t>(c_shape[3])};
+  } else {
+    // 2D or 3D: use original shapes
+    for (auto dim : a_shape) atb_a_shape.push_back(static_cast<int64_t>(dim));
+    for (auto dim : b_shape) atb_b_shape.push_back(static_cast<int64_t>(dim));
+    for (auto dim : c_shape) atb_c_shape.push_back(static_cast<int64_t>(dim));
+  }
+
+  // Create LinearParam for ATB (used for MatMul)
+  atb::infer::LinearParam linearParam;
+  linearParam.transposeA = options_.transpose_a;
+  linearParam.transposeB = options_.transpose_b;
+  linearParam.hasBias = false;
+  linearParam.outDataType = ACL_DT_UNDEFINED;
+  linearParam.enAccum = false;
+  linearParam.matmulType = atb::infer::LinearParam::MATMUL_UNDEFINED;
+  linearParam.quantMode = atb::infer::LinearParam::QUANT_UNDEFINED;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(linearParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(MatMul) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+  atb::Tensor atb_A, atb_B, atb_C;
+  fillAtbTensorWithShape(A, atb_A, atb_a_shape);
+  fillAtbTensorWithShape(B, atb_B, atb_b_shape);
+  fillAtbTensorWithShape(C, atb_C, atb_c_shape);
+
+  atb::VariantPack vp;
+  vp.inTensors.push_back(atb_A);
+  vp.inTensors.push_back(atb_B);
+  vp.outTensors.push_back(atb_C);
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MatMulOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+    if (workspace == nullptr) {  // never hand ATB a null workspace together with a non-zero size
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MatMulOp workspace allocation failed, size={}", workspaceSize);
+    }
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendMatMulOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MatMulOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+  if (workspace_block_id != -1) {
+    getAscendMemoryManager().freeBlock(workspace_block_id);
+  }
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendMatMulOp.hpp b/mllm/backends/ascend/ops/AscendMatMulOp.hpp
new file mode 100644
index 000000000..5c10a4525
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendMatMulOp.hpp
@@ -0,0 +1,27 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/MatMulOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+// Ascend (ATB) backend implementation of aops::MatMulOp; method bodies are in AscendMatMulOp.cpp.
+class AscendMatMulOp final : public aops::MatMulOp {
+ public:
+  explicit AscendMatMulOp(const aops::MatMulOpOptions& options);
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendMatMulOpFactory final : public TypedOpFactory<OpTypes::kMatMul, aops::MatMulOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::MatMulOpOptions& options) override {
+    return std::make_shared<AscendMatMulOp>(options);  // a new AscendMatMulOp per call, built from the options
+  }
+};
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.cpp b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp
new file mode 100644
index 000000000..639639e22
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp
@@ -0,0 +1,106 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendRMSNormOp.hpp"
+
+#include <acl/acl.h>
+#include <iostream>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+AscendRMSNormOp::AscendRMSNormOp(const aops::RMSNormOpOptions& options) : aops::RMSNormOp(options) {}
+// setup() adds no Ascend-specific work; it only forwards to BaseOp::setup.
+void AscendRMSNormOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);
+}
+
+// RMS-normalizes inputs[0] into outputs[0] on the NPU with ATB's RmsNorm (RMS_NORM_NORM, unquantized,
+// epsilon = options_.epsilon). The weight tensor is inputs[1] when two inputs are passed, otherwise the
+// op's own weight_. Input and output must agree in dtype and element count.
+void AscendRMSNormOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT(inputs.size() == 1 || inputs.size() == 2);  // inputs[0] is read unconditionally below
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& weight = (inputs.size() == 2) ? inputs[1] : weight_;
+  auto& y = outputs[0];
+
+  if (x.dtype() != y.dtype()) {
+    NYI("AscendRMSNormOp currently requires x/y have same dtype");
+  }
+  if (x.numel() != y.numel()) {
+    NYI("AscendRMSNormOp requires x/y have same numel");
+  }
+
+  atb::infer::RmsNormParam rmsNormParam;
+  rmsNormParam.layerType = atb::infer::RmsNormParam::RmsNormType::RMS_NORM_NORM;
+  rmsNormParam.normParam.quantType = atb::infer::QuantType::QUANT_UNQUANT;
+  rmsNormParam.normParam.epsilon = options_.epsilon;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(rmsNormParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(RMS_NORM) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  // Wrap the mllm tensors for ATB; inputs are passed in the order (x, weight).
+  atb::Tensor atb_x;
+  atb::Tensor atb_weight;
+  atb::Tensor atb_y;
+  fillAtbTensor(x, atb_x);
+  fillAtbTensor(weight, atb_weight);
+  fillAtbTensor(y, atb_y);
+
+  atb::VariantPack vp;
+  vp.inTensors.push_back(atb_x);
+  vp.inTensors.push_back(atb_weight);
+  vp.outTensors.push_back(atb_y);
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB RMSNormOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  // Scratch memory requested by Setup is borrowed from the Ascend memory manager for this call only.
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+    if (workspace == nullptr) {  // never hand ATB a null workspace together with a non-zero size
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB RMSNormOp workspace allocation failed, size={}", workspaceSize);
+    }
+  }
+
+  {
+    ASCEND_TIME_SCOPE("AscendRMSNormOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB RMSNormOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  // Sync the stream first, then hand the workspace block back to the memory manager.
+  syncGlobalAtbStream();
+  if (workspace_block_id != -1) {
+    getAscendMemoryManager().freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.hpp b/mllm/backends/ascend/ops/AscendRMSNormOp.hpp
new file mode 100644
index 000000000..2bfd7db16
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendRMSNormOp.hpp
@@ -0,0 +1,27 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/RMSNormOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+// Ascend (ATB) backend implementation of aops::RMSNormOp; method bodies are in AscendRMSNormOp.cpp.
+class AscendRMSNormOp final : public aops::RMSNormOp {
+ public:
+  explicit AscendRMSNormOp(const aops::RMSNormOpOptions& options);
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendRMSNormOpFactory final : public TypedOpFactory<OpTypes::kRMSNorm, aops::RMSNormOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::RMSNormOpOptions& options) override {
+    return std::make_shared<AscendRMSNormOp>(options);  // a new AscendRMSNormOp per call, built from the options
+  }
+};
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendSiLUOp.cpp b/mllm/backends/ascend/ops/AscendSiLUOp.cpp
new file mode 100644
index 000000000..8c6ec3e69
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendSiLUOp.cpp
@@ -0,0 +1,115 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendSiLUOp.hpp"
+
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+AscendSiLUOp::AscendSiLUOp(const aops::SiLUOpOptions& options) : aops::SiLUOp(options) {}
+// setup() adds no Ascend-specific work; it only forwards to BaseOp::setup.
+void AscendSiLUOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);
+}
+
+// Applies SiLU to inputs[0] and writes outputs[0] on the NPU, using ATB's Activation operation with
+// ACTIVATION_SWISH (all other ActivationParam fields keep their defaults). Input and output must both be
+// FP16 and hold the same number of elements; ATB failures are reported through MLLM_ERROR_EXIT.
+void AscendSiLUOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 1);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  auto& y = outputs[0];
+
+  // Validate that both tensors are FP16
+  if (x.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendSiLUOp: Input tensor must be FP16, but got dtype={}",
+                    static_cast<int>(x.dtype()));
+  }
+  if (y.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendSiLUOp: Output tensor must be FP16, but got dtype={}",
+                    static_cast<int>(y.dtype()));
+  }
+
+  // x/y dtype equality follows from the two FP16 checks above, so only the element count is compared.
+  if (x.numel() != y.numel()) {
+    NYI("AscendSiLUOp requires x/y have same numel");
+  }
+
+  atb::infer::ActivationParam siluParam;
+  siluParam.activationType = atb::infer::ACTIVATION_SWISH;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(siluParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ACTIVATION_SWISH) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  // Wrap each mllm tensor for ATB: descriptor first, then the raw device pointer and byte size.
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  fillAtbTensorDesc(x, atb_x.desc);
+  fillAtbTensorDesc(y, atb_y.desc);
+  atb_x.deviceData = reinterpret_cast<uint8_t*>(x.ptr<void>());
+  atb_x.dataSize = x.bytes();
+  atb_y.deviceData = reinterpret_cast<uint8_t*>(y.ptr<void>());
+  atb_y.dataSize = y.bytes();
+
+  atb::VariantPack vp;
+  vp.inTensors.push_back(atb_x);
+  vp.outTensors.push_back(atb_y);
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SiLUOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  // Scratch memory requested by Setup is borrowed from the Ascend memory manager for this call only.
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+    if (workspace == nullptr) {  // never hand ATB a null workspace together with a non-zero size
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SiLUOp workspace allocation failed, size={}", workspaceSize);
+    }
+  }
+
+  {
+    ASCEND_TIME_SCOPE("AscendSiLUOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SiLUOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  // Sync the stream first, then hand the workspace block back to the memory manager.
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    getAscendMemoryManager().freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendSiLUOp.hpp b/mllm/backends/ascend/ops/AscendSiLUOp.hpp
new file mode 100644
index 000000000..3e0ee27be
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendSiLUOp.hpp
@@ -0,0 +1,27 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/SiLUOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+class AscendSiLUOp final : public aops::SiLUOp {  // Ascend-backend override of aops::SiLUOp
+ public:
+  explicit AscendSiLUOp(const aops::SiLUOpOptions& options);
+  // setup() and forward() override the base-class hooks; their definitions live in AscendSiLUOp.cpp.
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendSiLUOpFactory final : public TypedOpFactory<OpTypes::kSiLU, aops::SiLUOpOptions> {  // factory keyed by OpTypes::kSiLU
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::SiLUOpOptions& options) override {
+    return std::make_shared<AscendSiLUOp>(options);  // a new AscendSiLUOp per call, returned as BaseOp
+  }
+};
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp
new file mode 100644
index 000000000..25d09081a
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp
@@ -0,0 +1,135 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp"
+
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+AscendSoftmaxOp::AscendSoftmaxOp(const aops::SoftmaxOpOptions& options) : aops::SoftmaxOp(options) {}  // only forwards options to the base op
+
+void AscendSoftmaxOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);  // no Ascend-specific setup; delegates directly to BaseOp::setup
+}
+
+// Runs softmax along options_.axis through an ATB Softmax operation; x and y must both be FP16.
+// Validation and ATB failures are reported through MLLM_ERROR_EXIT / NYI.
+void AscendSoftmaxOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 1);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  auto& y = outputs[0];
+
+  // Validate that both tensors are FP16 (which also guarantees x and y share a dtype)
+  if (x.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendSoftmaxOp: Input tensor must be FP16, but got dtype={}",
+                    static_cast<int>(x.dtype()));
+  }
+  if (y.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendSoftmaxOp: Output tensor must be FP16, but got dtype={}",
+                    static_cast<int>(y.dtype()));
+  }
+
+  if (x.numel() != y.numel()) {
+    NYI("AscendSoftmaxOp requires x/y have same numel");
+  }
+
+  // Convert a negative axis to a positive index and reject out-of-range values before ATB sees them
+  const int rank = static_cast<int>(x.rank());
+  int axis = options_.axis;
+  if (axis < 0) {
+    axis += rank;
+  }
+  if (axis < 0 || axis >= rank) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "AscendSoftmaxOp: axis={} is out of range for rank={}",
+                    static_cast<int>(options_.axis), rank);
+  }
+
+  // Configure Softmax parameters; ATB expects axes as SVector<int64_t>
+  atb::infer::SoftmaxParam softmaxParam;
+  softmaxParam.axes.push_back(static_cast<int64_t>(axis));
+
+  // Create ATB operation
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(softmaxParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "ATB CreateOperation(Softmax) failed, status={}",
+                    static_cast<int>(st));
+  }
+
+  // Get global ATB context
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  // Prepare ATB tensors
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  fillAtbTensor(x, atb_x);
+  fillAtbTensor(y, atb_y);
+
+  // Setup input/output tensors
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  outTensors.push_back(atb_y);
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  // Setup operation (calculate required workspace size)
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "ATB SoftmaxOp Setup failed, status={}",
+                    static_cast<int>(st));
+  }
+
+  // Allocate workspace if needed (NOTE(review): the uint64_t size is narrowed to uint32_t below)
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+
+  // Execute operation
+  {
+    ASCEND_TIME_SCOPE("AscendSoftmaxOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "ATB SoftmaxOp Execute failed, status={}",
+                    static_cast<int>(st));
+  }
+
+  // Synchronize stream before the workspace and the operation are released
+  syncGlobalAtbStream();
+
+  // Free workspace
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  // Destroy operation
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp
new file mode 100644
index 000000000..262be2fcb
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp
@@ -0,0 +1,27 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/SoftmaxOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+class AscendSoftmaxOp final : public aops::SoftmaxOp {  // Ascend-backend override of aops::SoftmaxOp
+ public:
+  explicit AscendSoftmaxOp(const aops::SoftmaxOpOptions& options);
+  // setup() and forward() override the base-class hooks; their definitions live in AscendSoftmaxOp.cpp.
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendSoftmaxOpFactory final : public TypedOpFactory<OpTypes::kSoftmax, aops::SoftmaxOpOptions> {  // factory keyed by OpTypes::kSoftmax
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::SoftmaxOpOptions& options) override {
+    return std::make_shared<AscendSoftmaxOp>(options);  // a new AscendSoftmaxOp per call, returned as BaseOp
+  }
+};
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendViewOp.cpp b/mllm/backends/ascend/ops/AscendViewOp.cpp
new file mode 100644
index 000000000..c1d1ce435
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendViewOp.cpp
@@ -0,0 +1,16 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendViewOp.hpp"
+
+namespace mllm::ascend {
+
+AscendViewOp::AscendViewOp(const aops::ViewOpOptions& options) : aops::ViewOp(options) {}  // only forwards options to the base op
+
+void AscendViewOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  // A view only changes metadata (shape), not the underlying data, so nothing is launched on the device here.
+  // Delegates to aops::ViewOp::forward (NOTE(review): assumed to be a no-op in the base class - confirm).
+  aops::ViewOp::forward(inputs, outputs);
+}
+
+}  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendViewOp.hpp b/mllm/backends/ascend/ops/AscendViewOp.hpp
new file mode 100644
index 000000000..33fcf6712
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendViewOp.hpp
@@ -0,0 +1,25 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/ViewOp.hpp"
+
+namespace mllm::ascend {
+
+class AscendViewOp final : public aops::ViewOp {  // Ascend-backend override of aops::ViewOp
+ public:
+  explicit AscendViewOp(const aops::ViewOpOptions& options);
+  // Only forward() is overridden here; its definition lives in AscendViewOp.cpp.
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendViewOpFactory final : public TypedOpFactory<OpTypes::kView, aops::ViewOpOptions> {  // factory keyed by OpTypes::kView
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::ViewOpOptions& options) override {
+    return std::make_shared<AscendViewOp>(options);  // a new AscendViewOp per call, returned as BaseOp
+  }
+};
+
+}  // namespace mllm::ascend
\ No newline at end of file

From 890bfeb898c55670055f5abb7eb1414cbf66a5fe Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Fri, 30 Jan 2026 23:25:01 +0800
Subject: [PATCH 2/4] feat(ascend): create tests for new ops

---
 tests/ascend/AscendAttentionKernelTest.hpp | 576 +++++++++++++++++++++
 tests/ascend/AscendKernelTest.hpp          |  70 +++
 tests/ascend/AscendLinearKernelTest.hpp    | 164 ++++++
 tests/ascend/AscendRMSNormKernelTest.hpp   |  85 +++
 tests/ascend/AscendSiLUKernelTest.hpp      |  67 +++
 tests/ascend/AscendSoftmaxKernelTest.hpp   | 129 +++++
 tests/ascend/KernelTest.cpp                | 198 +++++++
 7 files changed, 1289 insertions(+)
 create mode 100644 tests/ascend/AscendAttentionKernelTest.hpp
 create mode 100644 tests/ascend/AscendLinearKernelTest.hpp
 create mode 100644 tests/ascend/AscendRMSNormKernelTest.hpp
 create mode 100644 tests/ascend/AscendSiLUKernelTest.hpp
 create mode 100644 tests/ascend/AscendSoftmaxKernelTest.hpp

diff --git a/tests/ascend/AscendAttentionKernelTest.hpp b/tests/ascend/AscendAttentionKernelTest.hpp
new file mode 100644
index 000000000..b6bf9eb02
--- /dev/null
+++ b/tests/ascend/AscendAttentionKernelTest.hpp
@@ -0,0 +1,576 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+#include <limits>
+
+class AscendAttentionKernelTest : public KernelTest {
+ public:
+  AscendAttentionKernelTest() = default;
+  ~AscendAttentionKernelTest() override = default;
+
+  // Test Scaled Dot-Product Attention using existing operators
+  // Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V
+  bool ScaledDotProductAttentionFloat16Test(const std::vector<std::tuple<mllm::Tensor::shape_t, mllm::Tensor::shape_t, mllm::Tensor::shape_t>>& test_cases) {
+    using namespace mllm;  // NOLINT
+
+    for (const auto& [q_shape, k_shape, v_shape] : test_cases) {
+      // Validate shapes: Q=[B, S_q, D], K=[B, S_kv, D], V=[B, S_kv, D]
+      MLLM_RT_ASSERT_EQ(q_shape.size(), 3);
+      MLLM_RT_ASSERT_EQ(k_shape.size(), 3);
+      MLLM_RT_ASSERT_EQ(v_shape.size(), 3);
+      MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]);  // Same batch size
+      MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]);
+      MLLM_RT_ASSERT_EQ(q_shape[2], k_shape[2]);  // Same D dimension
+      MLLM_RT_ASSERT_EQ(k_shape[1], v_shape[1]);  // K and V have same sequence length
+      MLLM_RT_ASSERT_EQ(q_shape[2], v_shape[2]);  // V shares D: the reference below indexes V with stride D
+
+      int32_t B = static_cast<int32_t>(q_shape[0]);
+      int32_t S_q = static_cast<int32_t>(q_shape[1]);
+      int32_t S_kv = static_cast<int32_t>(k_shape[1]);
+      int32_t D = static_cast<int32_t>(q_shape[2]);
+      // Softmax scaling factor 1/sqrt(D), shared by the CPU reference and the Ascend path
+      const float scale = 1.0f / std::sqrt(static_cast<float>(D));
+
+      // 1. Create random FP16 inputs on CPU
+      Tensor Q_cpu = Tensor::random(q_shape, -1.0f, 1.0f, kFloat16, kCPU);
+      Tensor K_cpu = Tensor::random(k_shape, -1.0f, 1.0f, kFloat16, kCPU);
+      Tensor V_cpu = Tensor::random(v_shape, -1.0f, 1.0f, kFloat16, kCPU);
+
+      // 2. Compute reference result on CPU using FP32 for better precision
+      Tensor Q_cpu_fp32 = Tensor::zeros(q_shape, kFloat32, kCPU);
+      Tensor K_cpu_fp32 = Tensor::zeros(k_shape, kFloat32, kCPU);
+      Tensor V_cpu_fp32 = Tensor::zeros(v_shape, kFloat32, kCPU);
+
+      // Convert FP16 to FP32
+      {
+        auto* q_fp16 = Q_cpu.ptr<mllm_fp16_t>();
+        auto* k_fp16 = K_cpu.ptr<mllm_fp16_t>();
+        auto* v_fp16 = V_cpu.ptr<mllm_fp16_t>();
+        auto* q_fp32 = Q_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* k_fp32 = K_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* v_fp32 = V_cpu_fp32.ptr<mllm_fp32_t>();
+
+        for (size_t i = 0; i < Q_cpu.numel(); ++i) {
+          q_fp32[i] = MLLM_FP16_TO_FP32(q_fp16[i]);
+        }
+        for (size_t i = 0; i < K_cpu.numel(); ++i) {
+          k_fp32[i] = MLLM_FP16_TO_FP32(k_fp16[i]);
+        }
+        for (size_t i = 0; i < V_cpu.numel(); ++i) {
+          v_fp32[i] = MLLM_FP16_TO_FP32(v_fp16[i]);
+        }
+      }
+
+      // Compute reference attention on CPU (FP32)
+      Tensor ref_cpu_fp32 = Tensor::zeros({B, S_q, D}, kFloat32, kCPU);
+      {
+        auto* q_ptr = Q_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* k_ptr = K_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* v_ptr = V_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* out_ptr = ref_cpu_fp32.ptr<mllm_fp32_t>();
+
+
+        for (int32_t b = 0; b < B; ++b) {
+          // Compute Q @ K^T for this batch
+          std::vector<float> scores(S_q * S_kv, 0.0f);
+
+          for (int32_t i = 0; i < S_q; ++i) {
+            for (int32_t j = 0; j < S_kv; ++j) {
+              float sum = 0.0f;
+              for (int32_t k = 0; k < D; ++k) {
+                float q_val = q_ptr[b * S_q * D + i * D + k];
+                float k_val = k_ptr[b * S_kv * D + j * D + k];
+                sum += q_val * k_val;
+              }
+              scores[i * S_kv + j] = sum * scale;
+            }
+          }
+
+          // Apply softmax along the last dimension (S_kv)
+          std::vector<float> attn_weights(S_q * S_kv);
+          for (int32_t i = 0; i < S_q; ++i) {
+            // Find max for numerical stability
+            float max_val = -std::numeric_limits<float>::infinity();
+            for (int32_t j = 0; j < S_kv; ++j) {
+              max_val = std::max(max_val, scores[i * S_kv + j]);
+            }
+
+            // Compute exp and sum
+            float sum_exp = 0.0f;
+            for (int32_t j = 0; j < S_kv; ++j) {
+              float exp_val = std::exp(scores[i * S_kv + j] - max_val);
+              attn_weights[i * S_kv + j] = exp_val;
+              sum_exp += exp_val;
+            }
+
+            // Normalize
+            for (int32_t j = 0; j < S_kv; ++j) {
+              attn_weights[i * S_kv + j] /= sum_exp;
+            }
+          }
+
+          // Compute output: attn_weights @ V
+          // out[S_q, D] = attn_weights[S_q, S_kv] @ V[S_kv, D]
+          for (int32_t i = 0; i < S_q; ++i) {
+            for (int32_t k = 0; k < D; ++k) {
+              float sum = 0.0f;
+              for (int32_t j = 0; j < S_kv; ++j) {
+                float attn_val = attn_weights[i * S_kv + j];
+                float v_val = v_ptr[b * S_kv * D + j * D + k];
+                sum += attn_val * v_val;
+              }
+              out_ptr[b * S_q * D + i * D + k] = sum;
+            }
+          }
+        }
+      }
+
+      // Convert reference back to FP16
+      Tensor ref_cpu = Tensor::zeros({B, S_q, D}, kFloat16, kCPU);
+      {
+        auto* ref_fp32 = ref_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* ref_fp16 = ref_cpu.ptr<mllm_fp16_t>();
+        for (size_t i = 0; i < ref_cpu.numel(); ++i) {
+          ref_fp16[i] = MLLM_FP32_TO_FP16(ref_fp32[i]);
+        }
+      }
+
+      // 3. Move inputs to Ascend and compute attention using existing operators
+      auto Q_ascend = Q_cpu.to(kAscend);
+      auto K_ascend = K_cpu.to(kAscend);
+      auto V_ascend = V_cpu.to(kAscend);
+
+      // Step 1: Q @ K^T (transpose_b=true)
+      auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true);
+
+      // Step 2: Scale by 1/sqrt(d_k)
+      auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU) * scale;
+      auto scale_tensor = scale_tensor_cpu.to(kAscend);
+      auto scaled_scores = scores * scale_tensor;
+
+      // Step 3: Softmax along last dimension
+      auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1);
+
+      // Step 4: attn_weights @ V
+      auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false);
+
+      // 4. Move result back to CPU and compare
+      auto output_cpu = output_ascend.to(kCPU);
+
+      auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f);
+      if (!result.is_close) {
+        MLLM_ERROR("Attention test failed for shape Q=[{},{},{}], K=[{},{},{}], V=[{},{},{}]",
+                   q_shape[0], q_shape[1], q_shape[2],
+                   k_shape[0], k_shape[1], k_shape[2],
+                   v_shape[0], v_shape[1], v_shape[2]);
+        MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}",
+                   result.max_absolute_diff, result.max_relative_diff);
+        return false;
+      }
+    }
+    return true;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Multi-Head Attention with optional Causal Mask
+  //
+  // Input shapes: Q=[B, H, S_q, D], K=[B, H, S_kv, D], V=[B, H, S_kv, D]
+  // where H = num_heads, D = head_dim
+  // Mask shape: [1, 1, S_q, S_kv] (broadcastable to [B, H, S_q, S_kv])
+  //
+  // Attention(Q, K, V, mask) = softmax(Q @ K^T / sqrt(d_k) + mask) @ V
+  //===----------------------------------------------------------------------===//
+  bool MultiHeadAttentionFloat16Test(
+      const std::vector<std::tuple<
+          mllm::Tensor::shape_t,  // Q shape [B, H, S_q, D]
+          mllm::Tensor::shape_t,  // K shape [B, H, S_kv, D]
+          mllm::Tensor::shape_t,  // V shape [B, H, S_kv, D]
+          bool                     // use_causal_mask
+      >>& test_cases) {
+    using namespace mllm;  // NOLINT
+
+    for (const auto& [q_shape, k_shape, v_shape, use_mask] : test_cases) {
+      // Validate shapes: Q=[B, H, S_q, D], K=[B, H, S_kv, D], V=[B, H, S_kv, D]
+      MLLM_RT_ASSERT_EQ(q_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(k_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(v_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]);  // Same batch size
+      MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]);
+      MLLM_RT_ASSERT_EQ(q_shape[1], k_shape[1]);  // Same num_heads
+      MLLM_RT_ASSERT_EQ(q_shape[1], v_shape[1]);
+      MLLM_RT_ASSERT_EQ(q_shape[3], k_shape[3]);  // Same head_dim
+      MLLM_RT_ASSERT_EQ(q_shape[3], v_shape[3]);  // V too: the CPU reference sizes its V buffer from Q's D
+      MLLM_RT_ASSERT_EQ(k_shape[2], v_shape[2]);  // K and V have same sequence length
+
+      int32_t B = static_cast<int32_t>(q_shape[0]);
+      int32_t H = static_cast<int32_t>(q_shape[1]);  // num_heads
+      int32_t S_q = static_cast<int32_t>(q_shape[2]);
+      int32_t S_kv = static_cast<int32_t>(k_shape[2]);
+      int32_t D = static_cast<int32_t>(q_shape[3]);  // head_dim
+
+      // 1. Create random FP16 inputs on CPU
+      Tensor Q_cpu = Tensor::random(q_shape, -0.5f, 0.5f, kFloat16, kCPU);
+      Tensor K_cpu = Tensor::random(k_shape, -0.5f, 0.5f, kFloat16, kCPU);
+      Tensor V_cpu = Tensor::random(v_shape, -0.5f, 0.5f, kFloat16, kCPU);
+
+      // 2. Create causal mask if needed
+      // Causal mask: mask[i, j] = 0 if j <= i + (S_kv - S_q), else -10000 (stand-in for -inf)
+      Tensor mask_cpu;
+      if (use_mask) {
+        mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU);
+        auto* mask_ptr = mask_cpu.ptr<mllm_fp16_t>();
+        // Fill causal mask: entries to the right of the offset diagonal are masked
+        const int32_t offset = S_kv - S_q;
+        for (int32_t i = 0; i < S_q; ++i) {
+          for (int32_t j = 0; j < S_kv; ++j) {
+            if (j > i + offset) {
+              mask_ptr[i * S_kv + j] = MLLM_FP32_TO_FP16(-10000.0f);
+            }
+          }
+        }
+      }
+
+      // 3. Compute reference result on CPU using FP32 for better precision
+      Tensor ref_cpu = computeMultiHeadAttentionCPU(Q_cpu, K_cpu, V_cpu, mask_cpu, use_mask);
+
+      // 4. Move inputs to Ascend and compute attention
+      auto Q_ascend = Q_cpu.to(kAscend);
+      auto K_ascend = K_cpu.to(kAscend);
+      auto V_ascend = V_cpu.to(kAscend);
+
+      float scale = 1.0f / std::sqrt(static_cast<float>(D));
+
+      // Step 1: Q @ K^T (transpose_b=true)
+      auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true);
+
+      // Step 2: Scale by 1/sqrt(d_k)
+      auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU);
+      {
+        auto* scale_ptr = scale_tensor_cpu.ptr<mllm_fp16_t>();
+        scale_ptr[0] = MLLM_FP32_TO_FP16(scale);
+      }
+      auto scale_tensor = scale_tensor_cpu.to(kAscend);
+      auto scaled_scores = scores * scale_tensor;
+
+      // Step 3: Add mask if needed (broadcasting: [1, 1, S_q, S_kv] -> [B, H, S_q, S_kv])
+      if (use_mask) {
+        auto mask_ascend = mask_cpu.to(kAscend);
+        scaled_scores = scaled_scores + mask_ascend;
+      }
+
+      // Step 4: Softmax along last dimension
+      auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1);
+
+      // Step 5: attn_weights @ V
+      // [B, H, S_q, S_kv] @ [B, H, S_kv, D] -> [B, H, S_q, D]
+      auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false);
+
+      // 5. Move result back to CPU and compare
+      auto output_cpu = output_ascend.to(kCPU);
+
+      auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f);
+      if (!result.is_close) {
+        MLLM_ERROR("Multi-head attention test failed for shape Q=[{},{},{},{}], K=[{},{},{},{}], V=[{},{},{},{}], mask={}",
+                   q_shape[0], q_shape[1], q_shape[2], q_shape[3],
+                   k_shape[0], k_shape[1], k_shape[2], k_shape[3],
+                   v_shape[0], v_shape[1], v_shape[2], v_shape[3],
+                   use_mask ? "true" : "false");
+        MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}",
+                   result.max_absolute_diff, result.max_relative_diff);
+        return false;
+      }
+
+      MLLM_INFO("Multi-head attention test passed: B={}, H={}, S_q={}, S_kv={}, D={}, mask={}",
+                B, H, S_q, S_kv, D, use_mask ? "true" : "false");
+    }
+    return true;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Multi-Head Attention with Grouped Query Attention (GQA) support
+  //
+  // GQA: num_q_heads > num_kv_heads, each KV head is shared by multiple Q heads
+  // Input shapes: Q=[B, H_q, S_q, D], K=[B, H_kv, S_kv, D], V=[B, H_kv, S_kv, D]
+  //===----------------------------------------------------------------------===//
+  bool GroupedQueryAttentionFloat16Test(
+      const std::vector<std::tuple<
+          mllm::Tensor::shape_t,  // Q shape [B, H_q, S_q, D]
+          mllm::Tensor::shape_t,  // K shape [B, H_kv, S_kv, D]
+          mllm::Tensor::shape_t,  // V shape [B, H_kv, S_kv, D]
+          bool                     // use_causal_mask
+      >>& test_cases) {
+    using namespace mllm;  // NOLINT
+
+    for (const auto& [q_shape, k_shape, v_shape, use_mask] : test_cases) {
+      // Validate shapes
+      MLLM_RT_ASSERT_EQ(q_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(k_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(v_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]);  // Same batch size
+      MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]);
+      MLLM_RT_ASSERT_EQ(k_shape[1], v_shape[1]);  // KV have same num_heads
+      MLLM_RT_ASSERT_EQ(q_shape[3], k_shape[3]);  // Same head_dim
+      MLLM_RT_ASSERT_EQ(q_shape[3], v_shape[3]);  // V too: the CPU reference sizes its V buffer from Q's D
+      MLLM_RT_ASSERT_EQ(k_shape[2], v_shape[2]);  // K and V have same sequence length
+
+      int32_t B = static_cast<int32_t>(q_shape[0]);
+      int32_t H_q = static_cast<int32_t>(q_shape[1]);   // num query heads
+      int32_t H_kv = static_cast<int32_t>(k_shape[1]);  // num KV heads
+      int32_t S_q = static_cast<int32_t>(q_shape[2]);
+      int32_t S_kv = static_cast<int32_t>(k_shape[2]);
+      int32_t D = static_cast<int32_t>(q_shape[3]);
+
+      MLLM_RT_ASSERT_EQ(H_q % H_kv, 0);
+      int32_t num_groups = H_q / H_kv;
+
+      // 1. Create random FP16 inputs on CPU
+      Tensor Q_cpu = Tensor::random(q_shape, -0.5f, 0.5f, kFloat16, kCPU);
+      Tensor K_cpu = Tensor::random(k_shape, -0.5f, 0.5f, kFloat16, kCPU);
+      Tensor V_cpu = Tensor::random(v_shape, -0.5f, 0.5f, kFloat16, kCPU);
+
+      // 2. Create causal mask if needed
+      Tensor mask_cpu;
+      if (use_mask) {
+        mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU);
+        auto* mask_ptr = mask_cpu.ptr<mllm_fp16_t>();
+        int32_t offset = S_kv - S_q;
+        for (int32_t i = 0; i < S_q; ++i) {
+          for (int32_t j = 0; j < S_kv; ++j) {
+            if (j > i + offset) {
+              mask_ptr[i * S_kv + j] = MLLM_FP32_TO_FP16(-10000.0f);
+            }
+          }
+        }
+      }
+
+      // 3. Compute reference on CPU
+      Tensor ref_cpu = computeGQACPU(Q_cpu, K_cpu, V_cpu, mask_cpu, use_mask, num_groups);
+
+      // 4. Compute on Ascend
+      auto Q_ascend = Q_cpu.to(kAscend);
+      auto K_cpu_expanded = repeatKVHeads(K_cpu, num_groups);
+      auto V_cpu_expanded = repeatKVHeads(V_cpu, num_groups);
+      auto K_ascend = K_cpu_expanded.to(kAscend);
+      auto V_ascend = V_cpu_expanded.to(kAscend);
+
+      float scale = 1.0f / std::sqrt(static_cast<float>(D));
+      // Q @ K^T
+      auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true);
+
+      // Scale
+      auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU);
+      {
+        auto* scale_ptr = scale_tensor_cpu.ptr<mllm_fp16_t>();
+        scale_ptr[0] = MLLM_FP32_TO_FP16(scale);
+      }
+      auto scaled_scores = scores * scale_tensor_cpu.to(kAscend);
+
+      // Add mask
+      if (use_mask) {
+        scaled_scores = scaled_scores + mask_cpu.to(kAscend);
+      }
+
+      // Softmax
+      auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1);
+
+      // attn_weights @ V
+      auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false);
+
+      // 5. Compare
+      auto output_cpu = output_ascend.to(kCPU);
+      auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f);
+      if (!result.is_close) {
+        MLLM_ERROR("GQA test failed: B={}, H_q={}, H_kv={}, S_q={}, S_kv={}, D={}, mask={}",
+                   B, H_q, H_kv, S_q, S_kv, D, use_mask ? "true" : "false");
+        MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}",
+                   result.max_absolute_diff, result.max_relative_diff);
+        return false;
+      }
+
+      MLLM_INFO("GQA test passed: B={}, H_q={}, H_kv={}, S_q={}, S_kv={}, D={}, mask={}",
+                B, H_q, H_kv, S_q, S_kv, D, use_mask ? "true" : "false");
+    }
+    return true;
+  }
+
+ private:
+  //===----------------------------------------------------------------------===//
+  // Helper: Compute Multi-Head Attention reference on CPU (FP32)
+  //===----------------------------------------------------------------------===//
+  mllm::Tensor computeMultiHeadAttentionCPU(
+      const mllm::Tensor& Q_cpu,
+      const mllm::Tensor& K_cpu,
+      const mllm::Tensor& V_cpu,
+      const mllm::Tensor& mask_cpu,
+      bool use_mask) {
+    using namespace mllm;  // NOLINT
+    // B, H, S_q and D are read from Q's shape; S_kv is read from K's shape.
+    int32_t B = static_cast<int32_t>(Q_cpu.shape()[0]);
+    int32_t H = static_cast<int32_t>(Q_cpu.shape()[1]);
+    int32_t S_q = static_cast<int32_t>(Q_cpu.shape()[2]);
+    int32_t S_kv = static_cast<int32_t>(K_cpu.shape()[2]);
+    int32_t D = static_cast<int32_t>(Q_cpu.shape()[3]);
+
+    // Convert inputs to FP32 (K and V must fit the [B, H, S_kv, D] buffers allocated here)
+    Tensor Q_fp32 = Tensor::zeros({B, H, S_q, D}, kFloat32, kCPU);
+    Tensor K_fp32 = Tensor::zeros({B, H, S_kv, D}, kFloat32, kCPU);
+    Tensor V_fp32 = Tensor::zeros({B, H, S_kv, D}, kFloat32, kCPU);
+
+    auto* q_fp16 = Q_cpu.ptr<mllm_fp16_t>();
+    auto* k_fp16 = K_cpu.ptr<mllm_fp16_t>();
+    auto* v_fp16 = V_cpu.ptr<mllm_fp16_t>();
+    auto* q_fp32 = Q_fp32.ptr<mllm_fp32_t>();
+    auto* k_fp32 = K_fp32.ptr<mllm_fp32_t>();
+    auto* v_fp32 = V_fp32.ptr<mllm_fp32_t>();
+
+    for (size_t i = 0; i < Q_cpu.numel(); ++i) {
+      q_fp32[i] = MLLM_FP16_TO_FP32(q_fp16[i]);
+    }
+    for (size_t i = 0; i < K_cpu.numel(); ++i) {
+      k_fp32[i] = MLLM_FP16_TO_FP32(k_fp16[i]);
+    }
+    for (size_t i = 0; i < V_cpu.numel(); ++i) {
+      v_fp32[i] = MLLM_FP16_TO_FP32(v_fp16[i]);
+    }
+
+    // The mask stays in FP16; each entry is widened to FP32 when it is added to a score below
+    const mllm_fp16_t* mask_fp16 = nullptr;
+    if (use_mask) {
+      mask_fp16 = mask_cpu.ptr<mllm_fp16_t>();
+    }
+
+    Tensor output_fp32 = Tensor::zeros({B, H, S_q, D}, kFloat32, kCPU);
+    auto* out_ptr = output_fp32.ptr<mllm_fp32_t>();
+
+    float scale = 1.0f / std::sqrt(static_cast<float>(D));
+
+    for (int32_t b = 0; b < B; ++b) {
+      for (int32_t h = 0; h < H; ++h) {
+        std::vector<float> scores(S_q * S_kv, 0.0f);
+        for (int32_t i = 0; i < S_q; ++i) {
+          for (int32_t j = 0; j < S_kv; ++j) {
+            float sum = 0.0f;
+            for (int32_t k = 0; k < D; ++k) {
+              float q_val = q_fp32[((b * H + h) * S_q + i) * D + k];
+              float k_val = k_fp32[((b * H + h) * S_kv + j) * D + k];
+              sum += q_val * k_val;
+            }
+            scores[i * S_kv + j] = sum * scale;
+
+            // Add mask: the same [S_q, S_kv] slice is reused for every batch and head
+            if (use_mask) {
+              float mask_val = MLLM_FP16_TO_FP32(mask_fp16[i * S_kv + j]);
+              scores[i * S_kv + j] += mask_val;
+            }
+          }
+        }
+
+        // Softmax along last dimension (the row maximum is subtracted before exp)
+        std::vector<float> attn_weights(S_q * S_kv);
+        for (int32_t i = 0; i < S_q; ++i) {
+          float max_val = -std::numeric_limits<float>::infinity();
+          for (int32_t j = 0; j < S_kv; ++j) {
+            max_val = std::max(max_val, scores[i * S_kv + j]);
+          }
+
+          float sum_exp = 0.0f;
+          for (int32_t j = 0; j < S_kv; ++j) {
+            float exp_val = std::exp(scores[i * S_kv + j] - max_val);
+            attn_weights[i * S_kv + j] = exp_val;
+            sum_exp += exp_val;
+          }
+
+          for (int32_t j = 0; j < S_kv; ++j) {
+            attn_weights[i * S_kv + j] /= sum_exp;
+          }
+        }
+
+        // Compute output: attn_weights @ V
+        for (int32_t i = 0; i < S_q; ++i) {
+          for (int32_t k = 0; k < D; ++k) {
+            float sum = 0.0f;
+            for (int32_t j = 0; j < S_kv; ++j) {
+              float attn_val = attn_weights[i * S_kv + j];
+              float v_val = v_fp32[((b * H + h) * S_kv + j) * D + k];
+              sum += attn_val * v_val;
+            }
+            out_ptr[((b * H + h) * S_q + i) * D + k] = sum;
+          }
+        }
+      }
+    }
+
+    // Convert output back to FP16
+    Tensor output_fp16 = Tensor::zeros({B, H, S_q, D}, kFloat16, kCPU);
+    auto* out_fp16 = output_fp16.ptr<mllm_fp16_t>();
+    for (size_t i = 0; i < output_fp16.numel(); ++i) {
+      out_fp16[i] = MLLM_FP32_TO_FP16(out_ptr[i]);
+    }
+
+    return output_fp16;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Helper: Repeat KV heads for GQA
+  // [B, H_kv, S, D] -> [B, H_q, S, D] where H_q = H_kv * num_groups
+  //===----------------------------------------------------------------------===//
+  mllm::Tensor repeatKVHeads(const mllm::Tensor& kv, int32_t num_groups) {
+    using namespace mllm;  // NOLINT
+
+    if (num_groups == 1) {
+      return kv;  // nothing to repeat: the input tensor itself is returned
+    }
+
+    int32_t B = static_cast<int32_t>(kv.shape()[0]);
+    int32_t H_kv = static_cast<int32_t>(kv.shape()[1]);
+    int32_t S = static_cast<int32_t>(kv.shape()[2]);
+    int32_t D = static_cast<int32_t>(kv.shape()[3]);
+    int32_t H_q = H_kv * num_groups;
+
+    // NOTE: elements are accessed as FP16 whatever kv.dtype() is; the callers in this class pass FP16.
+    Tensor expanded = Tensor::zeros({B, H_q, S, D}, kv.dtype(), kCPU);
+    auto* src = kv.ptr<mllm_fp16_t>();
+    auto* dst = expanded.ptr<mllm_fp16_t>();
+
+    // A head is one contiguous run of S * D elements; offsets use size_t so they cannot overflow int32.
+    const size_t head_elems = static_cast<size_t>(S) * static_cast<size_t>(D);
+    for (int32_t b = 0; b < B; ++b) {
+      for (int32_t h_kv = 0; h_kv < H_kv; ++h_kv) {
+        const size_t src_base = (static_cast<size_t>(b) * H_kv + h_kv) * head_elems;
+        for (int32_t g = 0; g < num_groups; ++g) {
+          const size_t dst_base = (static_cast<size_t>(b) * H_q + (h_kv * num_groups + g)) * head_elems;
+          for (size_t e = 0; e < head_elems; ++e) {
+            dst[dst_base + e] = src[src_base + e];
+          }
+        }
+      }
+    }
+
+    return expanded;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Helper: Compute GQA reference on CPU
+  //===----------------------------------------------------------------------===//
+  mllm::Tensor computeGQACPU(
+      const mllm::Tensor& Q_cpu,
+      const mllm::Tensor& K_cpu,
+      const mllm::Tensor& V_cpu,
+      const mllm::Tensor& mask_cpu,
+      bool use_mask,
+      int32_t num_groups) {
+    // GQA reduces to plain MHA once every KV head has been repeated num_groups times.
+    return computeMultiHeadAttentionCPU(Q_cpu,
+                                        repeatKVHeads(K_cpu, num_groups),
+                                        repeatKVHeads(V_cpu, num_groups), mask_cpu, use_mask);
+  }
+};
\ No newline at end of file
diff --git a/tests/ascend/AscendKernelTest.hpp b/tests/ascend/AscendKernelTest.hpp
index 138ee5ae8..a01028906 100644
--- a/tests/ascend/AscendKernelTest.hpp
+++ b/tests/ascend/AscendKernelTest.hpp
@@ -48,5 +48,75 @@ class AscendKernelTest : public KernelTest {
     }
     return true;
   }
+
+  // Checks element-wise FP16 subtraction on Ascend against a CPU reference; false on the first mismatching shape.
+  bool SubFloat16Test(const std::vector<mllm::Tensor::shape_t>& shapes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      // 1. Construct random FP16 inputs on CPU
+      Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+      Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+
+      // 2. Compute reference result (FP16) on CPU
+      Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+        auto* y_ptr = y_cpu.ptr<mllm_fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+        auto num_elements = x_cpu.numel();
+        for (size_t i = 0; i < num_elements; ++i) {
+          r_ptr[i] = x_ptr[i] - y_ptr[i];  // arithmetic applied directly to mllm_fp16_t values
+        }
+      }
+
+      // 3. Move inputs to Ascend and run Sub (z = x - y)
+      auto x_ascend = x_cpu.to(kAscend);
+      auto y_ascend = y_cpu.to(kAscend);
+      auto z_ascend = x_ascend - y_ascend;
+
+      // 4. Move result back to CPU and compare with reference using allClose
+      auto z_cpu = z_ascend.to(kCPU);
+      auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f);  // NOTE(review): presumably rtol/atol - confirm order
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Checks element-wise FP16 multiplication on Ascend against a CPU reference; false on the first mismatching shape.
+  bool MulFloat16Test(const std::vector<mllm::Tensor::shape_t>& shapes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      // 1. Construct random FP16 inputs on CPU
+      Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+      Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU);
+
+      // 2. Compute reference result (FP16) on CPU
+      Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+        auto* y_ptr = y_cpu.ptr<mllm_fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+        auto num_elements = x_cpu.numel();
+        for (size_t i = 0; i < num_elements; ++i) {
+          r_ptr[i] = x_ptr[i] * y_ptr[i];  // arithmetic applied directly to mllm_fp16_t values
+        }
+      }
+
+      // 3. Move inputs to Ascend and run Mul (z = x * y)
+      auto x_ascend = x_cpu.to(kAscend);
+      auto y_ascend = y_cpu.to(kAscend);
+      auto z_ascend = x_ascend * y_ascend;
+
+      // 4. Move result back to CPU and compare with reference using allClose
+      auto z_cpu = z_ascend.to(kCPU);
+      auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f);  // NOTE(review): presumably rtol/atol - confirm order
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
 };
 
diff --git a/tests/ascend/AscendLinearKernelTest.hpp b/tests/ascend/AscendLinearKernelTest.hpp
new file mode 100644
index 000000000..4a7a6fed3
--- /dev/null
+++ b/tests/ascend/AscendLinearKernelTest.hpp
@@ -0,0 +1,164 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/core/OpTypes.hpp"
+#include "mllm/core/aops/LinearOp.hpp"
+#include "mllm/engine/Context.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+#include <iostream>
+
+
+class AscendLinearKernelTest : public KernelTest {  // FP16 matmul / linear checks against CPU references
+ public:
+  AscendLinearKernelTest() = default;
+  ~AscendLinearKernelTest() override = default;
+  // Checks nn::functional::matmul(x, W) with W laid out [in, out]; each case is {input_shape, in, out}.
+  bool LinearFloat16Test(const std::vector<std::tuple<mllm::Tensor::shape_t, int, int>>& test_cases) {
+    using namespace mllm;  // NOLINT
+    for (auto& test_case : test_cases) {
+      auto input_shape = std::get<0>(test_case);
+      int in_channels = std::get<1>(test_case);  // must equal input_shape.back(); not validated here
+      int out_channels = std::get<2>(test_case);
+
+      std::cout << "[LinearTest] Testing shape=[";
+      for (size_t i = 0; i < input_shape.size(); ++i) {
+        std::cout << input_shape[i] << (i < input_shape.size() - 1 ? ", " : "");
+      }
+      std::cout << "], in=" << in_channels << ", out=" << out_channels << std::endl;
+
+      // 1. Construct random FP16 inputs on CPU
+      // x: [M, K] where K = in_channels
+      Tensor x_cpu = Tensor::random(input_shape, -1, 1, kFloat16, kCPU);
+
+      // Weight shape for ATB: [K, N] where K=in_channels, N=out_channels
+      Tensor weight_cpu = Tensor::random({in_channels, out_channels}, -0.5, 0.5, kFloat16, kCPU);
+
+      // 2. Compute reference result on CPU
+      // y = x @ weight, where x is [M, K], weight is [K, N], output is [M, N]
+      auto output_shape = input_shape;
+      output_shape[output_shape.size() - 1] = out_channels;  // only the last dimension changes
+      Tensor ref_cpu = Tensor::zeros(output_shape, kFloat16, kCPU);
+
+      {
+        auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+        auto* w_ptr = weight_cpu.ptr<mllm_fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+
+        size_t batch_size = 1;  // product of all leading dims, i.e. the number of input rows
+        for (size_t i = 0; i < input_shape.size() - 1; ++i) {
+          batch_size *= input_shape[i];
+        }
+
+        for (size_t b = 0; b < batch_size; ++b) {
+          for (int o = 0; o < out_channels; ++o) {
+            float sum = 0.0f;  // accumulate in FP32, round to FP16 once per output element
+            for (int i = 0; i < in_channels; ++i) {
+              float x_val = MLLM_FP16_TO_FP32(x_ptr[b * in_channels + i]);
+              float w_val = MLLM_FP16_TO_FP32(w_ptr[i * out_channels + o]);  // weight is [K, N]
+              sum += x_val * w_val;
+            }
+            r_ptr[b * out_channels + o] = MLLM_FP32_TO_FP16(sum);
+          }
+        }
+      }
+
+      // 3. Move inputs to Ascend and run Linear via matmul
+      auto x_ascend = x_cpu.to(kAscend);
+      auto weight_ascend = weight_cpu.to(kAscend);
+
+      // Use matmul: y = x @ weight
+      auto y_ascend = nn::functional::matmul(x_ascend, weight_ascend, false, false);  // no transpose on either operand
+
+      // 4. Move result back to CPU and compare with reference
+      auto y_cpu = y_ascend.to(kCPU);
+      auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);  // NOTE(review): presumably rtol/atol - confirm order
+      if (!result.is_close) {
+        std::cout << "[LinearTest] FAILED!" << std::endl;
+        return false;
+      }
+      std::cout << "[LinearTest] PASSED" << std::endl;
+    }
+    return true;
+  }
+
+  // Checks nn::functional::linear(x, W, b) with W laid out [out, in] and b laid out [1, out].
+  bool LinearWithBiasFloat16Test(const std::vector<std::tuple<mllm::Tensor::shape_t, int, int>>& test_cases) {
+    using namespace mllm;  // NOLINT
+    for (auto& test_case : test_cases) {
+      auto input_shape = std::get<0>(test_case);
+      int in_channels = std::get<1>(test_case);  // must equal input_shape.back(); not validated here
+      int out_channels = std::get<2>(test_case);
+
+      std::cout << "[LinearWithBiasTest] Testing shape=[";
+      for (size_t i = 0; i < input_shape.size(); ++i) {
+        std::cout << input_shape[i] << (i < input_shape.size() - 1 ? ", " : "");
+      }
+      std::cout << "], in=" << in_channels << ", out=" << out_channels << std::endl;
+
+      // 1. Create random input, weight and bias on CPU
+      Tensor x_cpu = Tensor::random(input_shape, -1, 1, kFloat16, kCPU);
+      // Weight shape: [out_channels, in_channels]
+      Tensor weight_cpu = Tensor::random({out_channels, in_channels}, -0.5, 0.5, kFloat16, kCPU);
+      // Bias shape: [1, out_channels] for ATB Linear (2D tensor required)
+      Tensor bias_cpu = Tensor::random({1, out_channels}, -0.1, 0.1, kFloat16, kCPU);
+
+      // 2. Compute reference result on CPU
+      auto output_shape = input_shape;
+      output_shape[output_shape.size() - 1] = out_channels;  // only the last dimension changes
+      Tensor ref_cpu = Tensor::zeros(output_shape, kFloat16, kCPU);
+
+      {
+        auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+        auto* w_ptr = weight_cpu.ptr<mllm_fp16_t>();
+        auto* b_ptr = bias_cpu.ptr<mllm_fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+
+        size_t batch_size = 1;  // product of all leading dims, i.e. the number of input rows
+        for (size_t i = 0; i < input_shape.size() - 1; ++i) {
+          batch_size *= input_shape[i];
+        }
+
+        // y = x @ W^T + b, where W is [out_channels, in_channels]
+        for (size_t b = 0; b < batch_size; ++b) {
+          for (int o = 0; o < out_channels; ++o) {
+            float sum = 0.0f;  // accumulate in FP32, round to FP16 once per output element
+            for (int i = 0; i < in_channels; ++i) {
+              float x_val = MLLM_FP16_TO_FP32(x_ptr[b * in_channels + i]);
+              float w_val = MLLM_FP16_TO_FP32(w_ptr[o * in_channels + i]);  // row o of W holds the weights of output o
+              sum += x_val * w_val;
+            }
+            float bias_val = MLLM_FP16_TO_FP32(b_ptr[o]);  // the same bias row is added to every input row
+            sum += bias_val;
+            r_ptr[b * out_channels + o] = MLLM_FP32_TO_FP16(sum);
+          }
+        }
+      }
+
+      // 3. Move tensors to Ascend and run linear
+      auto x_ascend = x_cpu.to(kAscend);
+      auto weight_ascend = weight_cpu.to(kAscend);
+      auto bias_ascend = bias_cpu.to(kAscend);
+
+      // Use nn::functional::linear directly
+      auto y_ascend = nn::functional::linear(x_ascend, weight_ascend, bias_ascend);
+
+      // 4. Compare result with reference
+      auto y_cpu = y_ascend.to(kCPU);
+      auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);  // NOTE(review): presumably rtol/atol - confirm order
+      if (!result.is_close) {
+        std::cout << "[LinearWithBiasTest] FAILED!" << std::endl;
+        return false;
+      }
+      std::cout << "[LinearWithBiasTest] PASSED" << std::endl;
+    }
+    return true;
+  }
+};
\ No newline at end of file
diff --git a/tests/ascend/AscendRMSNormKernelTest.hpp b/tests/ascend/AscendRMSNormKernelTest.hpp
new file mode 100644
index 000000000..1e2dea58a
--- /dev/null
+++ b/tests/ascend/AscendRMSNormKernelTest.hpp
@@ -0,0 +1,85 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/layers/RMSNorm.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+
+class AscendRMSNormKernelTest : public KernelTest {  // FP16 RMSNorm check against a CPU reference
+ public:
+  AscendRMSNormKernelTest() = default;
+  ~AscendRMSNormKernelTest() override = default;
+
+  // Checks nn::functional::rmsNorm on Ascend; each case is {input_shape, norm_size, epsilon}. False on first mismatch.
+  bool RMSNormFloat16Test(const std::vector<std::tuple<mllm::Tensor::shape_t, int, float>>& test_cases) {
+    using namespace mllm;  // NOLINT
+    for (auto& test_case : test_cases) {
+      auto input_shape = std::get<0>(test_case);
+      int norm_size = std::get<1>(test_case);
+      float epsilon = std::get<2>(test_case);
+
+      // Validate that norm_size matches the last dimension of input_shape
+      assert(norm_size == static_cast<int>(input_shape.back()) &&  // assert is a no-op when NDEBUG is defined
+             "norm_size must equal the last dimension of input_shape");
+
+      // 1. Construct random FP16 inputs on CPU
+      Tensor x_cpu = Tensor::random(input_shape, -2, 2, kFloat16, kCPU);
+
+      // Weight shape: [norm_size]
+      Tensor weight_cpu = Tensor::random({norm_size}, 0.5, 1.5, kFloat16, kCPU);
+
+      // 2. Compute reference result (FP16) on CPU
+      // RMSNorm: y = x * weight / sqrt(mean(x^2) + epsilon)
+      Tensor ref_cpu = Tensor::zeros(input_shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+        auto* w_ptr = weight_cpu.ptr<mllm_fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+
+        size_t batch_size = 1;  // number of rows that are normalized independently
+        for (size_t i = 0; i < input_shape.size() - 1; ++i) {
+          batch_size *= input_shape[i];
+        }
+
+        // Perform RMSNorm for each batch
+        for (size_t b = 0; b < batch_size; ++b) {
+          float sum_squares = 0.0f;  // FP32 accumulator over one row
+          for (int i = 0; i < norm_size; ++i) {
+            float x_val = MLLM_FP16_TO_FP32(x_ptr[b * norm_size + i]);
+            sum_squares += x_val * x_val;
+          }
+          float rms = std::sqrt(sum_squares / norm_size + epsilon);  // epsilon is added inside the square root
+
+          // Normalize and scale by weight
+          for (int i = 0; i < norm_size; ++i) {
+            float x_val = MLLM_FP16_TO_FP32(x_ptr[b * norm_size + i]);
+            float w_val = MLLM_FP16_TO_FP32(w_ptr[i]);  // the same weight vector is used for every row
+            float result = (x_val / rms) * w_val;
+            r_ptr[b * norm_size + i] = MLLM_FP32_TO_FP16(result);
+          }
+        }
+      }
+
+      // 3. Move inputs to Ascend and run RMSNorm
+      auto x_ascend = x_cpu.to(kAscend);
+      auto weight_ascend = weight_cpu.to(kAscend);
+
+      // Use functional API - one line to execute the operator
+      auto y_ascend = nn::functional::rmsNorm(x_ascend, weight_ascend, epsilon);
+
+      // 4. Move result back to CPU and compare with reference using allClose
+      auto y_cpu = y_ascend.to(kCPU);
+      auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);  // NOTE(review): presumably rtol/atol - confirm order
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
\ No newline at end of file
diff --git a/tests/ascend/AscendSiLUKernelTest.hpp b/tests/ascend/AscendSiLUKernelTest.hpp
new file mode 100644
index 000000000..ce3ceb130
--- /dev/null
+++ b/tests/ascend/AscendSiLUKernelTest.hpp
@@ -0,0 +1,67 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+
+class AscendSiLUKernelTest : public KernelTest {  // FP16 SiLU check against a CPU reference
+ public:
+  AscendSiLUKernelTest() = default;
+  ~AscendSiLUKernelTest() override = default;
+
+  // Checks nn::functional::silu on Ascend for each shape using random inputs in [-5, 5]; false on first mismatch.
+  bool SiLUFloat16Test(const std::vector<mllm::Tensor::shape_t>& shapes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      // 1. Construct random FP16 inputs on CPU
+      Tensor x_cpu = Tensor::random(shape, -5, 5, kFloat16, kCPU);
+
+      // 2. Compute reference result (FP16) on CPU
+      // SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x))
+      Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+        auto num_elements = x_cpu.numel();
+        for (size_t i = 0; i < num_elements; ++i) {
+          // Convert FP16 to FP32 for computation
+          float x_val = MLLM_FP16_TO_FP32(x_ptr[i]);
+
+          // Compute sigmoid(x) = 1 / (1 + exp(-x))
+          float sigmoid_x;
+          if (x_val >= 0) {
+            sigmoid_x = 1.0f / (1.0f + std::exp(-x_val));
+          } else {
+            float exp_x = std::exp(x_val);  // negative branch only ever calls exp on a negative argument
+            sigmoid_x = exp_x / (1.0f + exp_x);
+          }
+
+          // SiLU(x) = x * sigmoid(x)
+          float result = x_val * sigmoid_x;
+
+          // Convert back to FP16
+          r_ptr[i] = MLLM_FP32_TO_FP16(result);
+        }
+      }
+
+      // 3. Move inputs to Ascend and run SiLU
+      auto x_ascend = x_cpu.to(kAscend);
+      auto y_ascend = mllm::nn::functional::silu(x_ascend);
+
+      // 4. Move result back to CPU and compare with reference using allClose
+      auto y_cpu = y_ascend.to(kCPU);
+      auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);  // NOTE(review): presumably rtol/atol - confirm order
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
\ No newline at end of file
diff --git a/tests/ascend/AscendSoftmaxKernelTest.hpp b/tests/ascend/AscendSoftmaxKernelTest.hpp
new file mode 100644
index 000000000..9003c714d
--- /dev/null
+++ b/tests/ascend/AscendSoftmaxKernelTest.hpp
@@ -0,0 +1,129 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+
+class AscendSoftmaxKernelTest : public KernelTest {
+ public:
+  AscendSoftmaxKernelTest() = default;
+  ~AscendSoftmaxKernelTest() override = default;
+
+  // Checks softmax on the Ascend backend against a CPU reference.
+  //
+  // For every (shape, axis) pair a random FP16 tensor in [-5, 5] is created on CPU, the
+  // reference softmax is computed there, and the Ascend result is compared against it
+  // with allClose (both tolerances 1e-2).
+  //
+  // @param shapes Tensor shapes to test; each shape is combined with every entry of axes.
+  // @param axes   Softmax axes; a negative axis counts back from the last dimension.
+  // @return true if every combination matches; false on the first mismatch or when an
+  //         axis is out of range for a shape.
+  bool SoftmaxFloat16Test(const std::vector<mllm::Tensor::shape_t>& shapes, const std::vector<int>& axes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      for (auto axis : axes) {
+        // 1. Construct random FP16 inputs on CPU
+        Tensor x_cpu = Tensor::random(shape, -5, 5, kFloat16, kCPU);
+
+        // 2. Compute reference result (FP16) on CPU
+        // Softmax(x_i) = exp(x_i - max(x)) / sum(exp(x_j - max(x)))
+        // Inputs are widened to FP32 for max/exp/sum; only the final value is rounded to FP16.
+        Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+        {
+          auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+          auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+
+          // Convert axis to positive index
+          int ndim = static_cast<int>(shape.size());
+          int pos_axis = axis;
+          if (pos_axis < 0) {
+            pos_axis = ndim + pos_axis;
+          }
+
+          // Fail the test instead of reading shape[pos_axis] out of bounds.
+          if (pos_axis < 0 || pos_axis >= ndim) {
+            return false;
+          }
+
+          // View the contiguous row-major tensor as [outer_size, axis_size, inner_size]:
+          //   outer_size = product of the dims before the axis,
+          //   inner_size = product of the dims after the axis.
+          size_t outer_size = 1;
+          for (int i = 0; i < pos_axis; ++i) {
+            outer_size *= shape[i];
+          }
+
+          size_t axis_size = shape[pos_axis];
+
+          size_t inner_size = 1;
+          for (int i = pos_axis + 1; i < ndim; ++i) {
+            inner_size *= shape[i];
+          }
+
+          // Flat offset of element (outer, i, inner) in that 3-D view. This single formula is
+          // valid for the first, the last and any middle axis. The earlier per-case code used
+          // outer * strides[axis - 1] * shape[axis] for middle axes, which applies the axis
+          // extent twice and therefore addresses the wrong elements for rank >= 3 tensors.
+          auto offset_of = [axis_size, inner_size](size_t outer, size_t i, size_t inner) {
+            return (outer * axis_size + i) * inner_size + inner;
+          };
+
+          // Scratch buffer for exp(x - max); every slice overwrites all of its entries.
+          std::vector<float> exp_vals(axis_size);
+
+          // Compute softmax for each slice along the axis
+          // (one slice per (outer, inner) pair, axis_size elements each).
+          for (size_t outer = 0; outer < outer_size; ++outer) {
+            for (size_t inner = 0; inner < inner_size; ++inner) {
+              // Find max value for numerical stability
+              // (subtracting it keeps every exp() argument <= 0, so exp cannot overflow).
+              float max_val = -std::numeric_limits<float>::infinity();
+              for (size_t i = 0; i < axis_size; ++i) {
+                const size_t idx = offset_of(outer, i, inner);
+                float val = MLLM_FP16_TO_FP32(x_ptr[idx]);
+                max_val = std::max(max_val, val);
+              }
+
+              // Compute exp(x - max) and sum
+              float sum_exp = 0.0f;
+              for (size_t i = 0; i < axis_size; ++i) {
+                const size_t idx = offset_of(outer, i, inner);
+                float val = MLLM_FP16_TO_FP32(x_ptr[idx]);
+                exp_vals[i] = std::exp(val - max_val);
+                sum_exp += exp_vals[i];
+              }
+
+              // Compute softmax and store result
+              for (size_t i = 0; i < axis_size; ++i) {
+                const size_t idx = offset_of(outer, i, inner);
+                float result = exp_vals[i] / sum_exp;
+                r_ptr[idx] = MLLM_FP32_TO_FP16(result);
+              }
+            }
+          }
+        }
+
+        // 3. Move inputs to Ascend and run Softmax
+        // The caller's axis (possibly negative) is passed through, not the normalized one.
+        auto x_ascend = x_cpu.to(kAscend);
+        auto y_ascend = mllm::nn::functional::softmax(x_ascend, axis);
+
+        // 4. Move result back to CPU and compare with reference using allClose
+        auto y_cpu = y_ascend.to(kCPU);
+        auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);
+        if (!result.is_close) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+};
\ No newline at end of file
diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp
index b0489f545..4e1747e82 100644
--- a/tests/ascend/KernelTest.cpp
+++ b/tests/ascend/KernelTest.cpp
@@ -25,6 +25,204 @@ TEST_F(AscendKernelTest, AddFloat16) {
             true);
 }
 
+//===----------------------------------------------------------------------===//
+// Element wise SUB.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, SubFloat16) {  // element-wise Sub over small 2-D shapes, incl. non-square and 1x1
+  EXPECT_EQ(SubFloat16Test({
+                {2, 3},
+                {1, 1},
+                {4, 4},
+                {8, 8},
+                {16, 16},
+                {32, 32},
+            }),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// Element wise MUL.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, MulFloat16) {  // element-wise Mul over small 2-D shapes, incl. non-square and 1x1
+  EXPECT_EQ(MulFloat16Test({
+                {2, 3},
+                {1, 1},
+                {4, 4},
+                {8, 8},
+                {16, 16},
+                {32, 32},
+            }),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// SiLU activation function.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendSiLUKernelTest.hpp"
+TEST_F(AscendSiLUKernelTest, SiLUFloat16) {  // SiLU over 2-D shapes from 1x1 up to 128x128
+  EXPECT_EQ(SiLUFloat16Test({
+                {2, 3},
+                {1, 1},
+                {4, 4},
+                {8, 8},
+                {16, 16},
+                {32, 32},
+                {1, 1024},
+                {128, 128},
+            }),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// Linear layer (MatMul based test).
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendLinearKernelTest.hpp"
+TEST_F(AscendLinearKernelTest, LinearFloat16) {  // bias-free case; the helper runs it through matmul
+  EXPECT_EQ(LinearFloat16Test({
+                // {input_shape, in_channels, out_channels}; in_channels equals the last dim of input_shape
+                {{2, 3}, 3, 4},
+                {{1, 8}, 8, 16},
+                {{4, 16}, 16, 32},
+                {{8, 32}, 32, 64},
+                {{1, 1024}, 1024, 512},
+            }),
+            true);
+}
+
+TEST_F(AscendLinearKernelTest, LinearWithBiasFloat16) {  // exercises nn::functional::linear with a bias tensor
+  EXPECT_EQ(LinearWithBiasFloat16Test({
+                // {input_shape, in_channels, out_channels}; in_channels equals the last dim of input_shape
+                {{2, 3}, 3, 4},
+                {{1, 8}, 8, 16},
+                {{4, 16}, 16, 32},
+            }),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// RMSNorm layer.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendRMSNormKernelTest.hpp"
+TEST_F(AscendRMSNormKernelTest, RMSNormFloat16) {  // every case keeps norm_size equal to the last input dim
+  EXPECT_EQ(RMSNormFloat16Test({
+                // {input_shape, norm_size, epsilon}
+                // Note: ATB RMSNorm requires last dim to be multiple of 16 (FP16 alignment)
+                {{2, 16}, 16, 1e-5f},
+                {{1, 32}, 32, 1e-5f},
+                {{4, 64}, 64, 1e-6f},
+                {{8, 128}, 128, 1e-5f},
+                {{1, 1024}, 1024, 1e-5f},
+                {{128, 256}, 256, 1e-5f},
+            }),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// Softmax activation function.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendSoftmaxKernelTest.hpp"
+TEST_F(AscendSoftmaxKernelTest, SoftmaxFloat16) {  // every shape below is combined with every axis below
+  EXPECT_EQ(SoftmaxFloat16Test({
+                {2, 3},
+                {1, 8},
+                {4, 4},
+                {8, 8},
+                {16, 16},
+                {1, 1024},
+                {128, 128},
+            },
+            {-1, 0, 1}  // Test different axes (all shapes are 2-D, so -1 and 1 select the same axis)
+            ),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// Scaled Dot-Product Attention (using existing operators).
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendAttentionKernelTest.hpp"
+TEST_F(AscendAttentionKernelTest, ScaledDotProductAttentionFloat16) {  // helper is declared in AscendAttentionKernelTest.hpp
+  EXPECT_EQ(ScaledDotProductAttentionFloat16Test({
+                // {Q_shape, K_shape, V_shape}
+                // Format: [B, S, D]; Q, K and V use identical shapes in every case below
+                {{1, 4, 8}, {1, 4, 8}, {1, 4, 8}},      // Small: B=1, S=4, D=8
+                {{1, 8, 16}, {1, 8, 16}, {1, 8, 16}},   // Medium: B=1, S=8, D=16
+                {{2, 4, 8}, {2, 4, 8}, {2, 4, 8}},      // Batch=2
+                {{1, 16, 32}, {1, 16, 32}, {1, 16, 32}}, // Larger: B=1, S=16, D=32
+                {{1, 8, 64}, {1, 8, 64}, {1, 8, 64}},   // D=64 (common head dim)
+            }),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// Multi-Head Attention with Causal Mask.
+//
+// FP16 (Ascend currently uses FP16)
+// Input format: [B, H, S, D] where H = num_heads, D = head_dim
+//===----------------------------------------------------------------------===//
+TEST_F(AscendAttentionKernelTest, MultiHeadAttentionFloat16) {  // Q, K and V have the same head count in every case
+  EXPECT_EQ(MultiHeadAttentionFloat16Test({
+                // {Q_shape, K_shape, V_shape, use_causal_mask}
+                // Format: [B, H, S, D]
+
+                // Without mask
+                {{1, 1, 4, 8}, {1, 1, 4, 8}, {1, 1, 4, 8}, false},      // Single head, no mask
+                {{1, 4, 8, 16}, {1, 4, 8, 16}, {1, 4, 8, 16}, false},   // 4 heads, no mask
+                {{1, 8, 16, 64}, {1, 8, 16, 64}, {1, 8, 16, 64}, false}, // 8 heads, D=64
+
+                // With causal mask
+                {{1, 1, 4, 8}, {1, 1, 4, 8}, {1, 1, 4, 8}, true},       // Single head, with mask
+                {{1, 4, 8, 16}, {1, 4, 8, 16}, {1, 4, 8, 16}, true},    // 4 heads, with mask
+                {{1, 8, 16, 64}, {1, 8, 16, 64}, {1, 8, 16, 64}, true}, // 8 heads, with mask
+                {{2, 4, 8, 32}, {2, 4, 8, 32}, {2, 4, 8, 32}, true},    // Batch=2, with mask
+
+                // Different S_q and S_kv (useful for KV cache scenarios)
+                {{1, 4, 1, 32}, {1, 4, 8, 32}, {1, 4, 8, 32}, true},    // S_q=1, S_kv=8 (decode)
+                {{1, 4, 4, 32}, {1, 4, 16, 32}, {1, 4, 16, 32}, true},  // S_q < S_kv
+            }),
+            true);
+}
+
+//===----------------------------------------------------------------------===//
+// Grouped Query Attention (GQA).
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendAttentionKernelTest, GroupedQueryAttentionFloat16) {  // H_q is a multiple of H_kv in every case
+  EXPECT_EQ(GroupedQueryAttentionFloat16Test({
+                // {Q_shape [B, H_q, S_q, D], K_shape [B, H_kv, S_kv, D], V_shape, use_mask}
+
+                // GQA with 2 groups (H_q = 4, H_kv = 2)
+                {{1, 4, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, false},
+                {{1, 4, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, true},
+
+                // GQA with 4 groups (H_q = 8, H_kv = 2)
+                {{1, 8, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, false},
+                {{1, 8, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, true},
+
+                // MQA (Multi-Query Attention): H_kv = 1
+                {{1, 4, 8, 32}, {1, 1, 8, 32}, {1, 1, 8, 32}, true},
+                {{1, 8, 16, 64}, {1, 1, 16, 64}, {1, 1, 16, 64}, true},
+
+                // Batch > 1
+                {{2, 8, 8, 32}, {2, 2, 8, 32}, {2, 2, 8, 32}, true},
+            }),
+            true);
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   

From 206c752d6115d5a51ef9bdcfd7d37fd136a01201 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Fri, 30 Jan 2026 23:27:19 +0800
Subject: [PATCH 3/4] fix(ascend): fix some problems for new ops

---
 mllm/backends/ascend/AscendBackend.cpp | 10 ++++++++--
 mllm/backends/ascend/AscendCommon.cpp  | 10 +++++++---
 mllm/backends/ascend/AscendCommon.hpp  |  3 +++
 mllm/nn/Functional.cpp                 | 17 +++++++++++++++++
 mllm/nn/Functional.hpp                 |  4 ++++
 tasks/build_arm_ascend.yaml            |  4 ++--
 6 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp
index 5ec76413a..6c17774b6 100644
--- a/mllm/backends/ascend/AscendBackend.cpp
+++ b/mllm/backends/ascend/AscendBackend.cpp
@@ -8,12 +8,18 @@
 
 #include "mllm/backends/ascend/ops/AscendElewiseOps.hpp"
 #include "mllm/backends/ascend/ops/AscendX2XOp.hpp"
+#include "mllm/backends/ascend/ops/AscendSiLUOp.hpp"
+#include "mllm/backends/ascend/ops/AscendLinearOp.hpp"
+#include "mllm/backends/ascend/ops/AscendRMSNormOp.hpp"
+#include "mllm/backends/ascend/ops/AscendViewOp.hpp"
+#include "mllm/backends/ascend/ops/AscendMatMulOp.hpp"
+#include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp"
 
 namespace mllm::ascend {
 
 AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) {
-  regOpFactory<AscendAddOpFactory>();
-  regOpFactory<AscendX2XOpFactory>();
+ regOpFactory<AscendAddOpFactory,AscendSubOpFactory,AscendMulOpFactory,AscendX2XOpFactory,AscendSiLUOpFactory,
+              AscendLinearOpFactory,AscendRMSNormOpFactory,AscendViewOpFactory,AscendMatMulOpFactory,AscendSoftmaxOpFactory>();
   auto& devices = AscendDeviceMetaInfo::instance().devices;
   for (const auto& device : devices) {
     const auto bytes_to_mb = [](size_t bytes) { return bytes / (1024.0 * 1024.0); };
diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp
index 140a5a31e..252571583 100644
--- a/mllm/backends/ascend/AscendCommon.cpp
+++ b/mllm/backends/ascend/AscendCommon.cpp
@@ -217,6 +217,13 @@ void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) {
   }
 }
 
+void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor) {  // fills desc, deviceData and dataSize of atb_tensor from t
+  fillAtbTensorDesc(t, atb_tensor.desc);  // tensor metadata goes into the ATB descriptor
+  atb_tensor.deviceData = reinterpret_cast<uint8_t*>(t.ptr<void>());  // points at t's buffer; nothing is copied here
+  // Use MLLM tensor's actual bytes as dataSize to match allocated memory
+  atb_tensor.dataSize = t.bytes();
+}
+
 AscendDeviceMetaInfo::AscendDeviceMetaInfo() {
 #ifndef ASCENDC_CPU_DEBUG
   // Initialize ACL to query devices
@@ -231,7 +238,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() {
   ret = aclrtGetDeviceCount(&device_count);
   if (ret != ACL_SUCCESS) {
     MLLM_ERROR("Failed to get Ascend device count: {}", ret);
-    aclFinalize();
     return;
   }
 
@@ -266,8 +272,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() {
     devices.push_back(info);
   }
 
-  // Finalize ACL after enumeration
-  aclFinalize();
 #else
   // In CPU debug mode, add a dummy device
   AscendDeviceInfo info;
diff --git a/mllm/backends/ascend/AscendCommon.hpp b/mllm/backends/ascend/AscendCommon.hpp
index 8d74c8707..5a2b69dc8 100644
--- a/mllm/backends/ascend/AscendCommon.hpp
+++ b/mllm/backends/ascend/AscendCommon.hpp
@@ -41,6 +41,9 @@ void syncGlobalAtbStream();
 // Convert MLLM Tensor metadata to ATB TensorDesc
 void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc);
 
+// Fill an ATB Tensor (desc, device data pointer, dataSize) from an MLLM Tensor; dataSize is the MLLM tensor's byte size
+void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor);
+
 // Ascend device information structure
 struct AscendDeviceInfo {
   std::string name;
diff --git a/mllm/nn/Functional.cpp b/mllm/nn/Functional.cpp
index 4e70b092a..e1e015432 100644
--- a/mllm/nn/Functional.cpp
+++ b/mllm/nn/Functional.cpp
@@ -7,6 +7,7 @@
 #include "mllm/core/aops/FlashAttention2Op.hpp"
 #include "mllm/core/aops/GatherOp.hpp"
 #include "mllm/core/aops/MatMulOp.hpp"
+#include "mllm/core/aops/LinearOp.hpp"
 #include "mllm/core/aops/ReduceOps.hpp"
 #include "mllm/core/aops/Scatter2ShardsOp.hpp"
 #include "mllm/core/aops/SigmoidOp.hpp"
@@ -16,6 +17,7 @@
 #include "mllm/core/aops/ViewOp.hpp"
 #include "mllm/core/aops/TopKOp.hpp"
 #include "mllm/core/aops/SiLUOp.hpp"
+#include "mllm/core/aops/RMSNormOp.hpp"
 #include "mllm/core/aops/PadOp.hpp"
 #include "mllm/core/aops/MaskedScatterOp.hpp"
 #include "mllm/core/aops/InterpolateOp.hpp"
@@ -33,6 +35,16 @@ Tensor matmul(const Tensor& A, const Tensor& B, bool transpose_A, bool transpose
       {A, B})[0];
 }
 
+Tensor linear(const Tensor& x, const Tensor& weight, const Tensor& bias) {
+  // A nil bias is left out of the op inputs instead of being forwarded.
+  aops::LinearOpOptions linear_opts{};
+  linear_opts.setRedirect(true);
+  if (!bias.isNil()) {
+    return Context::instance().buildOpAndSubmitTask(OpTypes::kLinear, linear_opts, {x, weight, bias})[0];
+  }
+  return Context::instance().buildOpAndSubmitTask(OpTypes::kLinear, linear_opts, {x, weight})[0];
+}
+
 Tensor view(const Tensor& x, const std::vector<int32_t>& shape) {
   return Context::instance().buildOpAndSubmitTask(OpTypes::kView, aops::ViewOpOptions{.to_shape = shape}, {x})[0];
 }
@@ -126,6 +138,11 @@ Tensor silu_(const Tensor& x) {
   return Context::instance().buildOpAndSubmitTask(OpTypes::kSiLU, opt, {x})[0];
 }
 
+Tensor rmsNorm(const Tensor& x, const Tensor& weight, float epsilon, bool add_unit_offset) {
+  aops::RMSNormOpOptions norm_opts{.epsilon = epsilon, .add_unit_offset = add_unit_offset};
+  return Context::instance().buildOpAndSubmitTask(OpTypes::kRMSNorm, norm_opts, {x, weight})[0];
+}
+
 void scatter2Shards(const Tensor& src, const Tensor& shards_pointer, int32_t dim) {
   Context::instance().buildOpAndSubmitTask(OpTypes::kScatter2Shards, aops::Scatter2ShardsOpOptions{.dim = dim},
                                            {src, shards_pointer});
diff --git a/mllm/nn/Functional.hpp b/mllm/nn/Functional.hpp
index 31a57812c..c85b716e9 100644
--- a/mllm/nn/Functional.hpp
+++ b/mllm/nn/Functional.hpp
@@ -20,6 +20,8 @@ namespace mllm::nn::functional {
 Tensor matmul(const Tensor& A, const Tensor& B, bool transpose_A = false, bool transpose_B = false,
               aops::MatMulOpType type = aops::MatMulOpType::kDefault);
 
+Tensor linear(const Tensor& x, const Tensor& weight, const Tensor& bias = Tensor());
+
 Tensor view(const Tensor& x, const std::vector<int32_t>& shape);
 
 std::vector<Tensor> split(const Tensor& x, int32_t split_size_or_sections, int32_t dim);
@@ -131,6 +133,8 @@ Tensor mean(const Tensor& x, int32_t dim = std::numeric_limits<int32_t>::max(),
 Tensor silu(const Tensor& x);
 Tensor silu_(const Tensor& x);
 
+Tensor rmsNorm(const Tensor& x, const Tensor& weight, float epsilon = 1e-5f, bool add_unit_offset = false);
+
 void scatter2Shards(const Tensor& src, const Tensor& shards_pointer, int32_t dim);
 
 // If you want causal mask attention. Use Flash attention instead.
diff --git a/tasks/build_arm_ascend.yaml b/tasks/build_arm_ascend.yaml
index 17ffd3f10..111a162ba 100644
--- a/tasks/build_arm_ascend.yaml
+++ b/tasks/build_arm_ascend.yaml
@@ -1,6 +1,6 @@
 Tasks:
   - CMakeConfigTask:
-      cmake_cfg_path: "build-arm-ascend"
+      cmake_cfg_path: "build-ascend"
       cmake_build_type: "ReleaseDebInfo"
       cmake_extra_args:
         - "-DMLLM_CROSS_COMPILE=ON"
@@ -15,4 +15,4 @@ Tasks:
         - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF"
 
   - CMakeBuildTask:
-      cmake_cfg_path: "build-arm-ascend"
+      cmake_cfg_path: "build-ascend"

From 642b39765284d858929fb2af10a80b7bf071c283 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Sat, 31 Jan 2026 01:12:33 +0800
Subject: [PATCH 4/4] fix(ascend): fix some problems reported by CodeRabbit

---
 mllm/backends/ascend/ops/AscendLinearOp.cpp  |  4 +-
 mllm/backends/ascend/ops/AscendRMSNormOp.cpp |  2 +-
 mllm/backends/ascend/ops/AscendSoftmaxOp.cpp | 19 ++++------
 tests/ascend/AscendAttentionKernelTest.hpp   |  1 +
 tests/ascend/AscendSoftmaxKernelTest.hpp     | 40 +++-----------------
 5 files changed, 18 insertions(+), 48 deletions(-)

diff --git a/mllm/backends/ascend/ops/AscendLinearOp.cpp b/mllm/backends/ascend/ops/AscendLinearOp.cpp
index a8b986984..049563170 100644
--- a/mllm/backends/ascend/ops/AscendLinearOp.cpp
+++ b/mllm/backends/ascend/ops/AscendLinearOp.cpp
@@ -21,8 +21,9 @@ AscendLinearOp::AscendLinearOp(const aops::LinearOpOptions& options) : aops::Lin
 
 void AscendLinearOp::reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
   if (options().isRedirect()) {
+    MLLM_RT_ASSERT(inputs.size() >= 1);
     const auto& input = inputs[0];
-    const auto& weight = inputs[1];
+    const auto& weight = inputs.size() >= 2 ? inputs[1] : this->weight();
     auto out_shape = input.shape();
     out_shape[out_shape.size() - 1] = weight.shape()[0];  // out_channels
     outputs.emplace_back(Tensor::empty(out_shape, input.dtype(), input.device()));
@@ -37,6 +38,7 @@ void AscendLinearOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor
 
 void AscendLinearOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
   MLLM_RT_ASSERT(inputs.size() >= 1 && inputs.size() <= 3);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
 
   const Tensor* weight_ptr = nullptr;
   const Tensor* bias_ptr = nullptr;
diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.cpp b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp
index 639639e22..7ce8c74d3 100644
--- a/mllm/backends/ascend/ops/AscendRMSNormOp.cpp
+++ b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp
@@ -25,7 +25,7 @@ void AscendRMSNormOp::setup(const std::vector<Tensor>& inputs, std::vector<Tenso
 }
 
 void AscendRMSNormOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
-  //MLLM_RT_ASSERT(inputs.size() == 1 || inputs.size() == 2, "AscendRMSNormOp expects 1 or 2 inputs");
+  MLLM_RT_ASSERT(inputs.size() == 1 || inputs.size() == 2);
   MLLM_RT_ASSERT_EQ(outputs.size(), 1);
 
   const auto& x = inputs[0];
diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp
index 25d09081a..0050ae718 100644
--- a/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp
+++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp
@@ -54,14 +54,18 @@ void AscendSoftmaxOp::forward(const std::vector<Tensor>& inputs, std::vector<Ten
 
   // Convert axis to positive index if negative
   int axis = options_.axis;
+  const int rank = static_cast<int>(x.rank());
   if (axis < 0) {
-    axis = static_cast<int>(x.rank()) + axis;
+    axis = rank + axis;
+  }
+  if (axis < 0 || axis >= rank) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendSoftmaxOp: axis {} out of range for rank {}",
+                    axis, rank);
   }
 
-  // ATB expects axes as SVector<int64_t>
   softmaxParam.axes.push_back(static_cast<int64_t>(axis));
 
-  // Create ATB operation
   atb::Operation* op = nullptr;
   auto st = atb::CreateOperation(softmaxParam, &op);
   if (st != atb::NO_ERROR || op == nullptr) {
@@ -70,17 +74,14 @@ void AscendSoftmaxOp::forward(const std::vector<Tensor>& inputs, std::vector<Ten
                     static_cast<int>(st));
   }
 
-  // Get global ATB context
   atb::Context* atb_ctx = getGlobalAtbContext();
 
-  // Prepare ATB tensors
   atb::Tensor atb_x;
   atb::Tensor atb_y;
 
   fillAtbTensor(x, atb_x);
   fillAtbTensor(y, atb_y);
 
-  // Setup input/output tensors
   atb::SVector<atb::Tensor> inTensors;
   atb::SVector<atb::Tensor> outTensors;
   inTensors.push_back(atb_x);
@@ -90,7 +91,6 @@ void AscendSoftmaxOp::forward(const std::vector<Tensor>& inputs, std::vector<Ten
   vp.inTensors = inTensors;
   vp.outTensors = outTensors;
 
-  // Setup operation (calculate required workspace size)
   uint64_t workspaceSize = 0;
   st = op->Setup(vp, workspaceSize, atb_ctx);
   if (st != atb::NO_ERROR) {
@@ -99,7 +99,6 @@ void AscendSoftmaxOp::forward(const std::vector<Tensor>& inputs, std::vector<Ten
                     static_cast<int>(st));
   }
 
-  // Allocate workspace if needed
   void* workspace = nullptr;
   int workspace_block_id = -1;
   if (workspaceSize > 0) {
@@ -108,7 +107,6 @@ void AscendSoftmaxOp::forward(const std::vector<Tensor>& inputs, std::vector<Ten
     mem_mgr.getBlockPtr(workspace_block_id, workspace);
   }
 
-  // Execute operation
   {
     ASCEND_TIME_SCOPE("AscendSoftmaxOp::forward");
     st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
@@ -119,16 +117,13 @@ void AscendSoftmaxOp::forward(const std::vector<Tensor>& inputs, std::vector<Ten
                     static_cast<int>(st));
   }
 
-  // Synchronize stream
   syncGlobalAtbStream();
 
-  // Free workspace
   if (workspace_block_id != -1) {
     auto& mem_mgr = getAscendMemoryManager();
     mem_mgr.freeBlock(workspace_block_id);
   }
 
-  // Destroy operation
   atb::DestroyOperation(op);
 }
 
diff --git a/tests/ascend/AscendAttentionKernelTest.hpp b/tests/ascend/AscendAttentionKernelTest.hpp
index b6bf9eb02..80e5542bb 100644
--- a/tests/ascend/AscendAttentionKernelTest.hpp
+++ b/tests/ascend/AscendAttentionKernelTest.hpp
@@ -225,6 +225,7 @@ class AscendAttentionKernelTest : public KernelTest {
       // Causal mask: mask[i, j] = 0 if j <= i, else -inf (large negative value)
       Tensor mask_cpu;
       if (use_mask) {
+        MLLM_RT_ASSERT(S_kv >= S_q);
         mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU);
         auto* mask_ptr = mask_cpu.ptr<mllm_fp16_t>();
 
diff --git a/tests/ascend/AscendSoftmaxKernelTest.hpp b/tests/ascend/AscendSoftmaxKernelTest.hpp
index 9003c714d..95b6fe4c8 100644
--- a/tests/ascend/AscendSoftmaxKernelTest.hpp
+++ b/tests/ascend/AscendSoftmaxKernelTest.hpp
@@ -38,13 +38,6 @@ class AscendSoftmaxKernelTest : public KernelTest {
             pos_axis = ndim + pos_axis;
           }
 
-          // Calculate strides
-          std::vector<size_t> strides(ndim);
-          strides[ndim - 1] = 1;
-          for (int i = ndim - 2; i >= 0; --i) {
-            strides[i] = strides[i + 1] * shape[i + 1];
-          }
-
           size_t outer_size = 1;
           for (int i = 0; i < pos_axis; ++i) {
             outer_size *= shape[i];
@@ -60,18 +53,13 @@ class AscendSoftmaxKernelTest : public KernelTest {
           // Compute softmax for each slice along the axis
           for (size_t outer = 0; outer < outer_size; ++outer) {
             for (size_t inner = 0; inner < inner_size; ++inner) {
+              auto idx_at = [&](size_t i) -> size_t {
+                return (outer * axis_size + i) * inner_size + inner;
+              };
               // Find max value for numerical stability
               float max_val = -std::numeric_limits<float>::infinity();
               for (size_t i = 0; i < axis_size; ++i) {
-                size_t idx = outer * strides[pos_axis > 0 ? pos_axis - 1 : 0] * shape[pos_axis] +
-                             i * (pos_axis < ndim - 1 ? strides[pos_axis] : 1) + inner;
-                if (pos_axis == 0) {
-                  idx = i * strides[0] + inner;
-                } else if (pos_axis == ndim - 1) {
-                  idx = outer * axis_size + i;
-                } else {
-                  idx = outer * strides[pos_axis - 1] * shape[pos_axis] + i * strides[pos_axis] + inner;
-                }
+                size_t idx = idx_at(i);
                 float val = MLLM_FP16_TO_FP32(x_ptr[idx]);
                 max_val = std::max(max_val, val);
               }
@@ -80,15 +68,7 @@ class AscendSoftmaxKernelTest : public KernelTest {
               float sum_exp = 0.0f;
               std::vector<float> exp_vals(axis_size);
               for (size_t i = 0; i < axis_size; ++i) {
-                size_t idx = outer * strides[pos_axis > 0 ? pos_axis - 1 : 0] * shape[pos_axis] +
-                             i * (pos_axis < ndim - 1 ? strides[pos_axis] : 1) + inner;
-                if (pos_axis == 0) {
-                  idx = i * strides[0] + inner;
-                } else if (pos_axis == ndim - 1) {
-                  idx = outer * axis_size + i;
-                } else {
-                  idx = outer * strides[pos_axis - 1] * shape[pos_axis] + i * strides[pos_axis] + inner;
-                }
+                size_t idx = idx_at(i);
                 float val = MLLM_FP16_TO_FP32(x_ptr[idx]);
                 exp_vals[i] = std::exp(val - max_val);
                 sum_exp += exp_vals[i];
@@ -96,15 +76,7 @@ class AscendSoftmaxKernelTest : public KernelTest {
 
               // Compute softmax and store result
               for (size_t i = 0; i < axis_size; ++i) {
-                size_t idx = outer * strides[pos_axis > 0 ? pos_axis - 1 : 0] * shape[pos_axis] +
-                             i * (pos_axis < ndim - 1 ? strides[pos_axis] : 1) + inner;
-                if (pos_axis == 0) {
-                  idx = i * strides[0] + inner;
-                } else if (pos_axis == ndim - 1) {
-                  idx = outer * axis_size + i;
-                } else {
-                  idx = outer * strides[pos_axis - 1] * shape[pos_axis] + i * strides[pos_axis] + inner;
-                }
+                size_t idx = idx_at(i);
                 float result = exp_vals[i] / sum_exp;
                 r_ptr[idx] = MLLM_FP32_TO_FP16(result);
               }