diff --git a/CMakeLists.txt b/CMakeLists.txt
index a19e80df3..1a5e95900 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,9 +59,6 @@ option(MLLM_KERNEL_THREADS_VENDOR_APPLE_GCD "Enable Apple GCD Threads" OFF)
 option(MLLM_PERFETTO_ENABLE "Enable perfetto" OFF)
 option(MLLM_TRACY_ENABLE "Enable Tracy. A more advanced profiler" OFF)
 
-# NPU AOT things
-option(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE "Enable Qualcomm NPU AOT on X86 devices" OFF)
-
 # Platform Hints
 option(MLLM_ANDROID_BURST_PERFORMANCE_HINTS "If MLLM need use APerformanceHintManager to tell android we need best performance" OFF)
 
diff --git a/examples/ascend_add_demo/README.md b/examples/ascend_add_demo/README.md
new file mode 100644
index 000000000..3c799332f
--- /dev/null
+++ b/examples/ascend_add_demo/README.md
@@ -0,0 +1,81 @@
+# Ascend Add Op Demo
+
+这是一个简单的 demo，用于测试 Ascend 后端的 Add 算子实现。
+
+## 功能
+
+- 初始化 Ascend 后端和内存池
+- 创建两个输入张量（shape: [2, 3]）
+- 在 Ascend NPU 上执行 Add 操作
+- 验证计算结果是否正确
+
+## 编译和运行
+
+### 方法 1: 使用自动化脚本（推荐）
+
+```bash
+cd examples/ascend_add_demo  # 在项目根目录下执行
+./build_and_run.sh
+```
+
+脚本会自动：
+- 检查环境变量
+- 配置 CMake
+- 编译项目
+- 运行 demo
+
+### 方法 2: 手动编译
+
+确保已经设置了必要的环境变量：
+- `ASCEND_HOME_PATH`: Ascend SDK 路径（例如: `/usr/local/Ascend/ascend-toolkit/latest`）
+- `ATB_HOME_PATH`: ATB 库路径（例如: `/usr/local/Ascend/nnal/nnal/atb/latest/atb/cxx_abi_0`）
+
+在项目根目录下：
+
+```bash
+# 1. 创建构建目录
+mkdir -p build-ascend-demo && cd build-ascend-demo
+
+# 2. 配置 CMake
+cmake .. \
+    -DMLLM_BUILD_ASCEND_BACKEND=ON \
+    -DMLLM_ENABLE_EXAMPLE=ON \
+    -DCMAKE_BUILD_TYPE=Release
+
+# 3. 编译
+make ascend_add_demo -j$(nproc)
+
+# 4. 运行
+./examples/ascend_add_demo/ascend_add_demo
+```
+
+## 预期输出
+
+```
+=== Ascend Add Op Demo ===
+1. Initializing Ascend backend...
+   ✓ Ascend backend initialized
+
+2. Creating input tensors...
+   Input x shape: [2, 3]
+   Input y shape: [2, 3]
+
+3. Transferring tensors to Ascend device...
+   ✓ Tensors transferred to Ascend
+
+4. Executing Add operation on Ascend...
+   ✓ Add operation completed
+
+5. Transferring result back to CPU and verifying...
+   Expected result: [11, 22, 33, 44, 55, 66]
+   Actual result:   [11, 22, 33, 44, 55, 66]
+
+✓ Test PASSED! All values match expected results.
+```
+
+## 注意事项
+
+- 当前实现使用 float16 数据类型
+- 需要 Ascend NPU 设备可用
+- 确保已正确安装 Ascend SDK 和 ATB 库
+
diff --git a/examples/ascend_add_demo/build_and_run.sh b/examples/ascend_add_demo/build_and_run.sh
new file mode 100755
index 000000000..94e3563df
--- /dev/null
+++ b/examples/ascend_add_demo/build_and_run.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Build and run the Ascend Add demo.
+#
+# Requires ASCEND_HOME_PATH and ATB_HOME_PATH to be set. Configures CMake in
+# <project root>/build-ascend-demo, builds the ascend_add_demo target and
+# runs the resulting binary. Exits non-zero if an environment variable is
+# missing, configuration or the build fails, or the demo itself fails.
+
+set -e  # Abort on any command failure that is not explicitly tested.
+
+# ANSI colors for status output.
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+echo -e "${GREEN}=== Ascend Add Demo 编译和运行脚本 ===${NC}\n"
+
+# Check the required environment variables.
+echo -e "${YELLOW}检查环境变量...${NC}"
+if [ -z "$ASCEND_HOME_PATH" ]; then
+    echo -e "${RED}错误: ASCEND_HOME_PATH 未设置${NC}"
+    exit 1
+fi
+if [ -z "$ATB_HOME_PATH" ]; then
+    echo -e "${RED}错误: ATB_HOME_PATH 未设置${NC}"
+    exit 1
+fi
+echo -e "${GREEN}✓ ASCEND_HOME_PATH: $ASCEND_HOME_PATH${NC}"
+echo -e "${GREEN}✓ ATB_HOME_PATH: $ATB_HOME_PATH${NC}\n"
+
+# Resolve the project root relative to this script so it works from any cwd.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+BUILD_DIR="$PROJECT_ROOT/build-ascend-demo"
+
+echo -e "${YELLOW}项目根目录: $PROJECT_ROOT${NC}"
+echo -e "${YELLOW}构建目录: $BUILD_DIR${NC}\n"
+
+# Create the build directory if it does not exist yet.
+if [ ! -d "$BUILD_DIR" ]; then
+    echo -e "${YELLOW}创建构建目录...${NC}"
+    mkdir -p "$BUILD_DIR"
+fi
+
+cd "$BUILD_DIR"
+
+# Configure CMake (a failure here aborts the script through `set -e`).
+echo -e "\n${YELLOW}配置 CMake...${NC}"
+cmake "$PROJECT_ROOT" \
+    -DMLLM_BUILD_ASCEND_BACKEND=ON \
+    -DMLLM_ENABLE_EXAMPLE=ON \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+
+# Build. The command is tested directly by `if`: `set -e` does not abort on a
+# command used as an `if` condition, whereas a separate `$?` check afterwards
+# would never be reached on failure and the error message would not print.
+echo -e "\n${YELLOW}开始编译...${NC}"
+if ! make ascend_add_demo -j"$(nproc)"; then
+    echo -e "\n${RED}✗ 编译失败${NC}"
+    exit 1
+fi
+echo -e "\n${GREEN}✓ 编译成功！${NC}\n"
+
+# Run the demo, again testing the command itself so a failure is reported.
+echo -e "${YELLOW}运行 demo...${NC}\n"
+if ! ./examples/ascend_add_demo/ascend_add_demo; then
+    echo -e "\n${RED}✗ Demo 运行失败${NC}"
+    exit 1
+fi
+echo -e "\n${GREEN}✓ Demo 运行成功！${NC}"
+
diff --git a/mllm/CMakeLists.txt b/mllm/CMakeLists.txt
index 06fa5aab2..615643afc 100644
--- a/mllm/CMakeLists.txt
+++ b/mllm/CMakeLists.txt
@@ -24,7 +24,6 @@ add_library(
   ${MLLM_RT_MODELS_SRC}
   ${MLLM_RT_COMPILE_SRC}
   ${MLLM_RT_AUTO_TUNE_SRC}
-  ${MLLM_QUALCOMM_AOT_SRC}
   ${WENET_AUDIO_SOURCES}
 )
 
diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp
index 5ec76413a..7bd12d6e1 100644
--- a/mllm/backends/ascend/AscendBackend.cpp
+++ b/mllm/backends/ascend/AscendBackend.cpp
@@ -8,12 +8,18 @@
 
 #include "mllm/backends/ascend/ops/AscendElewiseOps.hpp"
 #include "mllm/backends/ascend/ops/AscendX2XOp.hpp"
+#include "mllm/backends/ascend/ops/AscendSiLUOp.hpp"
+#include "mllm/backends/ascend/ops/AscendLinearOp.hpp"
+#include "mllm/backends/ascend/ops/AscendRMSNormOp.hpp"
+#include "mllm/backends/ascend/ops/AscendViewOp.hpp"
+#include "mllm/backends/ascend/ops/AscendMatMulOp.hpp"
+#include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp"
 
 namespace mllm::ascend {
 
 AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) {
-  regOpFactory<AscendAddOpFactory>();
-  regOpFactory<AscendX2XOpFactory>();
+  regOpFactory<AscendAddOpFactory,AscendSubOpFactory,AscendMulOpFactory,AscendX2XOpFactory,AscendSiLUOpFactory,
+              AscendLinearOpFactory,AscendRMSNormOpFactory,AscendViewOpFactory,AscendMatMulOpFactory,AscendSoftmaxOpFactory>();
   auto& devices = AscendDeviceMetaInfo::instance().devices;
   for (const auto& device : devices) {
     const auto bytes_to_mb = [](size_t bytes) { return bytes / (1024.0 * 1024.0); };
diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp
index 140a5a31e..a1ada40cf 100644
--- a/mllm/backends/ascend/AscendCommon.cpp
+++ b/mllm/backends/ascend/AscendCommon.cpp
@@ -207,6 +207,13 @@ void syncGlobalAtbStream() {
 }
 
 void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) {
+  // Validate that the tensor is FP16
+  if (t.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "fillAtbTensorDesc: Tensor must be FP16, but got dtype={}",
+                    static_cast<int>(t.dtype()));
+  }
+
   desc.dtype = ACL_FLOAT16; // Currently hardcoded as per demo, can be expanded later
   desc.format = ACL_FORMAT_ND;
 
@@ -217,6 +224,13 @@ void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) {
   }
 }
 
+void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor) {
+  // dataSize is taken from the MLLM tensor itself so that it matches the memory allocated for it.
+  atb_tensor.dataSize = t.bytes();
+  atb_tensor.deviceData = reinterpret_cast<uint8_t*>(t.ptr<void>());
+  fillAtbTensorDesc(t, atb_tensor.desc);
+}
+
 AscendDeviceMetaInfo::AscendDeviceMetaInfo() {
 #ifndef ASCENDC_CPU_DEBUG
   // Initialize ACL to query devices
@@ -231,7 +245,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() {
   ret = aclrtGetDeviceCount(&device_count);
   if (ret != ACL_SUCCESS) {
     MLLM_ERROR("Failed to get Ascend device count: {}", ret);
-    aclFinalize();
     return;
   }
 
@@ -265,9 +278,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() {
 
     devices.push_back(info);
   }
-
-  // Finalize ACL after enumeration
-  aclFinalize();
 #else
   // In CPU debug mode, add a dummy device
   AscendDeviceInfo info;
diff --git a/mllm/backends/ascend/AscendCommon.hpp b/mllm/backends/ascend/AscendCommon.hpp
index 8d74c8707..5a2b69dc8 100644
--- a/mllm/backends/ascend/AscendCommon.hpp
+++ b/mllm/backends/ascend/AscendCommon.hpp
@@ -41,6 +41,9 @@ void syncGlobalAtbStream();
 // Convert MLLM Tensor metadata to ATB TensorDesc
 void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc);
 
+// Fill an ATB Tensor from an MLLM Tensor: descriptor, device data pointer, and dataSize (taken from t.bytes())
+void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor);
+
 // Ascend device information structure
 struct AscendDeviceInfo {
   std::string name;
diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.cpp b/mllm/backends/ascend/ops/AscendElewiseOps.cpp
index 762ef1dfe..38bc4b139 100644
--- a/mllm/backends/ascend/ops/AscendElewiseOps.cpp
+++ b/mllm/backends/ascend/ops/AscendElewiseOps.cpp
@@ -34,9 +34,9 @@ void AscendAddOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>
   if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
     NYI("AscendAddOp currently requires x/y/z have same dtype");
   }
-  if (x.numel() != y.numel() || x.numel() != z.numel()) {
-    NYI("AscendAddOp demo only supports no-broadcast case (numel equal)");
-  }
+
+  // ATB ELEWISE_ADD supports broadcasting automatically
+  // No need to check numel equality
 
   atb::infer::ElewiseParam addParam;
   addParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_ADD;
@@ -53,6 +53,88 @@ void AscendAddOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>
   atb::Tensor atb_y;
   atb::Tensor atb_z;
 
+  fillAtbTensor(x, atb_x);
+  fillAtbTensor(y, atb_y);
+  fillAtbTensor(z, atb_z);
+
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  inTensors.push_back(atb_y);
+  outTensors.push_back(atb_z);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendAddOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+AscendSubOp::AscendSubOp(const aops::SubOpOptions& options) : aops::SubOp(options) {}  // Only forwards options to aops::SubOp.
+
+void AscendSubOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);  // No Ascend-specific setup here; delegates to BaseOp.
+}
+
+void AscendSubOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& y = inputs[1];
+  auto& z = outputs[0];
+
+  if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
+    NYI("AscendSubOp currently requires x/y/z have same dtype");
+  }
+
+  // ATB ELEWISE_SUB supports broadcasting automatically
+  // No need to check numel equality
+
+  atb::infer::ElewiseParam subParam;
+  subParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_SUB;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(subParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_SUB) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  atb::Tensor atb_z;
+
   fillAtbTensorDesc(x, atb_x.desc);
   fillAtbTensorDesc(y, atb_y.desc);
   fillAtbTensorDesc(z, atb_z.desc);
@@ -77,7 +159,7 @@ void AscendAddOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>
   uint64_t workspaceSize = 0;
   st = op->Setup(vp, workspaceSize, atb_ctx);
   if (st != atb::NO_ERROR) {
-    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Setup failed, status={}", static_cast<int>(st));
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Setup failed, status={}", static_cast<int>(st));
   }
 
   void* workspace = nullptr;
@@ -88,13 +170,100 @@ void AscendAddOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>
     mem_mgr.getBlockPtr(workspace_block_id, workspace);
   }
   {
-    ASCEND_TIME_SCOPE("AscendAddOp::forward");
+    ASCEND_TIME_SCOPE("AscendSubOp::forward");
     st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
   }
   if (st != atb::NO_ERROR) {
-    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Execute failed, status={}", static_cast<int>(st));
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SubOp Execute failed, status={}", static_cast<int>(st));
   }
+  
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+AscendMulOp::AscendMulOp(const aops::MulOpOptions& options) : aops::MulOp(options) {}  // Only forwards options to aops::MulOp.
+
+void AscendMulOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);  // No Ascend-specific setup here; delegates to BaseOp.
+}
+
+void AscendMulOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& y = inputs[1];
+  auto& z = outputs[0];
+
+  if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) {
+    NYI("AscendMulOp currently requires x/y/z have same dtype");
+  }
+
+  // ATB ELEWISE_MUL supports broadcasting automatically
+  // No need to check numel equality
+
+  atb::infer::ElewiseParam mulParam;
+  mulParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_MUL;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(mulParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_MUL) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+  atb::Tensor atb_z;
+
+  fillAtbTensorDesc(x, atb_x.desc);
+  fillAtbTensorDesc(y, atb_y.desc);
+  fillAtbTensorDesc(z, atb_z.desc);
+
+  atb_x.deviceData = reinterpret_cast<uint8_t*>(x.ptr<void>());
+  atb_x.dataSize = x.bytes();
+  atb_y.deviceData = reinterpret_cast<uint8_t*>(y.ptr<void>());
+  atb_y.dataSize = y.bytes();
+  atb_z.deviceData = reinterpret_cast<uint8_t*>(z.ptr<void>());
+  atb_z.dataSize = z.bytes();
 
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  inTensors.push_back(atb_y);
+  outTensors.push_back(atb_z);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendMulOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MulOp Execute failed, status={}", static_cast<int>(st));
+  }
   
   syncGlobalAtbStream();
 
diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.hpp b/mllm/backends/ascend/ops/AscendElewiseOps.hpp
index 26117cbc2..9122e20cb 100644
--- a/mllm/backends/ascend/ops/AscendElewiseOps.hpp
+++ b/mllm/backends/ascend/ops/AscendElewiseOps.hpp
@@ -24,4 +24,34 @@ class AscendAddOpFactory final : public TypedOpFactory<OpTypes::kAdd, aops::AddO
   }
 };
 
+class AscendSubOp final : public aops::SubOp {  // Ascend backend implementation of the element-wise Sub op.
+ public:
+  explicit AscendSubOp(const aops::SubOpOptions& options);
+
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;  // Delegates to BaseOp::setup.
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;  // Asserts 2 inputs, 1 output.
+};
+
+class AscendSubOpFactory final : public TypedOpFactory<OpTypes::kSub, aops::SubOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::SubOpOptions& options) override {
+    return std::make_shared<AscendSubOp>(options);  // Each call creates a new AscendSubOp from the given options.
+  }
+};
+
+class AscendMulOp final : public aops::MulOp {  // Ascend backend implementation of the element-wise Mul op.
+ public:
+  explicit AscendMulOp(const aops::MulOpOptions& options);
+
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;  // Delegates to BaseOp::setup.
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;  // Asserts 2 inputs, 1 output.
+};
+
+class AscendMulOpFactory final : public TypedOpFactory<OpTypes::kMul, aops::MulOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::MulOpOptions& options) override {
+    return std::make_shared<AscendMulOp>(options);  // Each call creates a new AscendMulOp from the given options.
+  }
+};
+
 }  // namespace mllm::ascend
\ No newline at end of file
diff --git a/mllm/backends/ascend/ops/AscendLinearOp.cpp b/mllm/backends/ascend/ops/AscendLinearOp.cpp
new file mode 100644
index 000000000..41040cf74
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendLinearOp.cpp
@@ -0,0 +1,167 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendLinearOp.hpp"
+
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+AscendLinearOp::AscendLinearOp(const aops::LinearOpOptions& options) : aops::LinearOp(options) {}  // Only forwards options to aops::LinearOp.
+
+void AscendLinearOp::reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  // Redirect mode: output shape is the input shape with the last dim set to weight.shape()[0]; otherwise defer to aops::LinearOp.
+  if (!options().isRedirect()) { return aops::LinearOp::reshape(inputs, outputs); }
+  // In redirect mode the weight is read from inputs[1], so both tensors must be present.
+  MLLM_RT_ASSERT(inputs.size() >= 2);
+  const auto& input = inputs[0];
+  const auto& weight = inputs[1];
+  auto out_shape = input.shape();
+  out_shape[out_shape.size() - 1] = weight.shape()[0];  // out_channels
+  outputs.emplace_back(Tensor::empty(out_shape, input.dtype(), input.device()));
+}
+
+void AscendLinearOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);  // No Ascend-specific setup here; delegates to BaseOp.
+}
+
+void AscendLinearOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  // Runs an ATB Linear operation computing y = x * weight^T (+ bias), then calls syncGlobalAtbStream().
+  // inputs is [x] (weight/bias taken from this op), [x, weight] or [x, weight, bias]; outputs is [y]. All must be FP16.
+  MLLM_RT_ASSERT(inputs.size() >= 1 && inputs.size() <= 3);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);  // outputs[0] is written below, so it must exist.
+
+  const Tensor* weight_ptr = nullptr;
+  const Tensor* bias_ptr = nullptr;
+
+  if (inputs.size() == 1) {
+    weight_ptr = &weight();
+    if (options().bias) { bias_ptr = &bias(); }
+  } else if (inputs.size() == 2) {
+    weight_ptr = &inputs[1];
+  } else if (inputs.size() == 3) {
+    weight_ptr = &inputs[1];
+    bias_ptr = &inputs[2];
+  }
+
+  const auto& x = inputs[0];
+  auto& y = outputs[0];
+
+  // Validate that input tensors are FP16
+  if (x.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendLinearOp: Input tensor must be FP16, but got dtype={}",
+                    static_cast<int>(x.dtype()));
+  }
+  if (weight_ptr->dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendLinearOp: Weight tensor must be FP16, but got dtype={}",
+                    static_cast<int>(weight_ptr->dtype()));
+  }
+  if (bias_ptr != nullptr && bias_ptr->dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendLinearOp: Bias tensor must be FP16, but got dtype={}",
+                    static_cast<int>(bias_ptr->dtype()));
+  }
+
+  // Validate bias dimensions: ATB Linear requires bias to be 2D [1, out_channels]
+  if (bias_ptr != nullptr) {
+    const auto& bias_shape = bias_ptr->shape();
+    if (bias_shape.size() == 1) {
+      MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                      "AscendLinearOp: Bias tensor must be 2D [1, out_channels], but got 1D shape with size={}. "
+                      "Please reshape the bias tensor before passing to AscendLinearOp.",
+                      bias_shape[0]);
+    }
+    if (bias_shape.size() != 2 || bias_shape[0] != 1) {
+      MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                      "AscendLinearOp: Bias tensor must be 2D with shape [1, out_channels], but got shape=[{}, {}]",
+                      bias_shape.size() >= 1 ? bias_shape[0] : 0,
+                      bias_shape.size() >= 2 ? bias_shape[1] : 0);
+    }
+  }
+
+  atb::infer::LinearParam linearParam;
+  linearParam.transposeA = false;
+  linearParam.transposeB = true;  // Set to true because weight is [out_channels, in_channels]
+  linearParam.hasBias = (bias_ptr != nullptr);
+  linearParam.outDataType = ACL_DT_UNDEFINED;
+  linearParam.enAccum = false;
+  linearParam.matmulType = atb::infer::LinearParam::MATMUL_UNDEFINED;
+  linearParam.quantMode = atb::infer::LinearParam::QUANT_UNDEFINED;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(linearParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(Linear) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+  atb::Tensor atb_x;
+  atb::Tensor atb_weight;
+  atb::Tensor atb_y;
+  atb::Tensor atb_bias;
+
+  fillAtbTensor(x, atb_x);
+  fillAtbTensor(*weight_ptr, atb_weight);
+  fillAtbTensor(y, atb_y);
+
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  inTensors.push_back(atb_weight);
+
+  if (bias_ptr != nullptr) {
+    fillAtbTensor(*bias_ptr, atb_bias);
+    inTensors.push_back(atb_bias);
+  }
+  outTensors.push_back(atb_y);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB LinearOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+
+  {
+    ASCEND_TIME_SCOPE("AscendLinearOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB LinearOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendLinearOp.hpp b/mllm/backends/ascend/ops/AscendLinearOp.hpp
new file mode 100644
index 000000000..a0b78de4c
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendLinearOp.hpp
@@ -0,0 +1,28 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/LinearOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+class AscendLinearOp final : public aops::LinearOp {  // Ascend backend implementation of the Linear op.
+ public:
+  explicit AscendLinearOp(const aops::LinearOpOptions& options);
+
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;  // Delegates to BaseOp::setup.
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;  // Accepts 1 to 3 inputs.
+  void reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;  // Handles redirect mode itself.
+};
+
+class AscendLinearOpFactory final : public TypedOpFactory<OpTypes::kLinear, aops::LinearOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::LinearOpOptions& options) override {
+    return std::make_shared<AscendLinearOp>(options);  // Each call creates a new AscendLinearOp from the given options.
+  }
+};
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendMatMulOp.cpp b/mllm/backends/ascend/ops/AscendMatMulOp.cpp
new file mode 100644
index 000000000..9e2013695
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendMatMulOp.cpp
@@ -0,0 +1,147 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendMatMulOp.hpp"
+
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+AscendMatMulOp::AscendMatMulOp(const aops::MatMulOpOptions& options) : aops::MatMulOp(options) {}  // Only forwards options to aops::MatMulOp.
+
+void AscendMatMulOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);  // No Ascend-specific setup here; delegates to BaseOp.
+}
+
+namespace {
+
+// Fills `atb_tensor` so that it views `t`'s data with the given `shape` (reshape without copy): FP16 / ND descriptor,
+// dataSize derived from that descriptor. The dtype is hardcoded to FP16, so a non-FP16 `t` is rejected up front.
+void fillAtbTensorWithShape(const Tensor& t, atb::Tensor& atb_tensor, const std::vector<int64_t>& shape) {
+  if (t.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "AscendMatMulOp: Tensor must be FP16, but got dtype={}", static_cast<int>(t.dtype()));
+  }
+  atb::TensorDesc desc;
+  desc.dtype = ACL_FLOAT16;  // Ascend uses FP16
+  desc.format = ACL_FORMAT_ND;
+  desc.shape.dimNum = shape.size();
+  for (size_t i = 0; i < shape.size(); ++i) { desc.shape.dims[i] = shape[i]; }
+  atb_tensor.desc = desc;
+  atb_tensor.dataSize = atb::Utils::GetTensorSize(atb_tensor);
+  atb_tensor.deviceData = reinterpret_cast<uint8_t*>(t.ptr<void>());
+}
+
+}  // namespace
+
+void AscendMatMulOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  // Multiplies inputs[0] (A) by inputs[1] (B) into outputs[0] (C) using an ATB Linear operation without bias.
+  // Transposition of A/B comes from options_; 4D tensors are passed to ATB as 3D by folding the first two dims.
+  MLLM_RT_ASSERT_EQ(inputs.size(), 2);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& A = inputs[0];
+  const auto& B = inputs[1];
+  auto& C = outputs[0];
+
+  // ATB Linear/MatMul only supports 2D/3D tensors.
+  // For 4D tensors [B, H, S, D], we reshape to 3D [B*H, S, D], compute, then reshape back.
+  const auto& a_shape = A.shape();
+  const auto& b_shape = B.shape();
+  const auto& c_shape = C.shape();
+
+  bool is_4d = (a_shape.size() == 4);
+  // Prepare shapes for ATB
+  std::vector<int64_t> atb_a_shape, atb_b_shape, atb_c_shape;
+
+  if (is_4d) {
+    // The fold below indexes dims 0..3 of B and C as well, so they must also be 4D.
+    MLLM_RT_ASSERT(b_shape.size() == 4 && c_shape.size() == 4);
+    // Reshape [B, H, S, D] -> [B*H, S, D]
+    int64_t batch_heads_a = static_cast<int64_t>(a_shape[0]) * static_cast<int64_t>(a_shape[1]);
+    int64_t batch_heads_b = static_cast<int64_t>(b_shape[0]) * static_cast<int64_t>(b_shape[1]);
+    int64_t batch_heads_c = static_cast<int64_t>(c_shape[0]) * static_cast<int64_t>(c_shape[1]);
+    atb_a_shape = {batch_heads_a, static_cast<int64_t>(a_shape[2]), static_cast<int64_t>(a_shape[3])};
+    atb_b_shape = {batch_heads_b, static_cast<int64_t>(b_shape[2]), static_cast<int64_t>(b_shape[3])};
+    atb_c_shape = {batch_heads_c, static_cast<int64_t>(c_shape[2]), static_cast<int64_t>(c_shape[3])};
+  } else {
+    // 2D or 3D: use original shapes
+    for (auto dim : a_shape) atb_a_shape.push_back(static_cast<int64_t>(dim));
+    for (auto dim : b_shape) atb_b_shape.push_back(static_cast<int64_t>(dim));
+    for (auto dim : c_shape) atb_c_shape.push_back(static_cast<int64_t>(dim));
+  }
+
+  // Create LinearParam for ATB (used for MatMul)
+  atb::infer::LinearParam linearParam;
+  linearParam.transposeA = options_.transpose_a;
+  linearParam.transposeB = options_.transpose_b;
+  linearParam.hasBias = false;
+  linearParam.outDataType = ACL_DT_UNDEFINED;
+  linearParam.enAccum = false;
+  linearParam.matmulType = atb::infer::LinearParam::MATMUL_UNDEFINED;
+  linearParam.quantMode = atb::infer::LinearParam::QUANT_UNDEFINED;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(linearParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(MatMul) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+  atb::Tensor atb_A, atb_B, atb_C;
+  fillAtbTensorWithShape(A, atb_A, atb_a_shape);
+  fillAtbTensorWithShape(B, atb_B, atb_b_shape);
+  fillAtbTensorWithShape(C, atb_C, atb_c_shape);
+
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_A);
+  inTensors.push_back(atb_B);
+  outTensors.push_back(atb_C);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MatMulOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+
+  {
+    ASCEND_TIME_SCOPE("AscendMatMulOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB MatMulOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendMatMulOp.hpp b/mllm/backends/ascend/ops/AscendMatMulOp.hpp
new file mode 100644
index 000000000..059464b25
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendMatMulOp.hpp
@@ -0,0 +1,27 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/MatMulOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+class AscendMatMulOp final : public aops::MatMulOp {  // Ascend backend implementation of the MatMul op.
+ public:
+  explicit AscendMatMulOp(const aops::MatMulOpOptions& options);
+
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;  // Delegates to BaseOp::setup.
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;  // Asserts 2 inputs, 1 output.
+};
+
+class AscendMatMulOpFactory final : public TypedOpFactory<OpTypes::kMatMul, aops::MatMulOpOptions> {
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::MatMulOpOptions& options) override {
+    return std::make_shared<AscendMatMulOp>(options);  // Each call creates a new AscendMatMulOp from the given options.
+  }
+};
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.cpp b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp
new file mode 100644
index 000000000..54b3eeda2
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendRMSNormOp.cpp
@@ -0,0 +1,106 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendRMSNormOp.hpp"
+
+#include <acl/acl.h>
+#include <iostream>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+AscendRMSNormOp::AscendRMSNormOp(const aops::RMSNormOpOptions& options) : aops::RMSNormOp(options) {}  // Forwards options to the base op; the body is empty.
+
+void AscendRMSNormOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);  // No Ascend-specific setup. NOTE(review): calls BaseOp::setup, not aops::RMSNormOp::setup — confirm intended.
+}
+
+void AscendRMSNormOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  // Runs RMSNorm on inputs[0] through an ATB operation created and destroyed within this call; the result goes to outputs[0].
+  MLLM_RT_ASSERT(inputs.size() == 1 || inputs.size() == 2);  // inputs[0] is read below, so reject an empty input list
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  const auto& weight = (inputs.size() == 2) ? inputs[1] : weight_;  // an explicit second input takes precedence over weight_
+  auto& y = outputs[0];
+
+  if (x.dtype() != y.dtype()) {
+    NYI("AscendRMSNormOp currently requires x/y have same dtype");
+  }
+  if (x.numel() != y.numel()) {
+    NYI("AscendRMSNormOp requires x/y have same numel");
+  }
+
+  // Unquantized RMS_NORM_NORM layer type, with the epsilon taken from the op options.
+  atb::infer::RmsNormParam rmsNormParam;
+  rmsNormParam.layerType = atb::infer::RmsNormParam::RmsNormType::RMS_NORM_NORM;
+  rmsNormParam.normParam.quantType = atb::infer::QuantType::QUANT_UNQUANT;
+  rmsNormParam.normParam.epsilon = options_.epsilon;
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(rmsNormParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(RMS_NORM) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  atb::Tensor atb_x;
+  atb::Tensor atb_weight;
+  atb::Tensor atb_y;
+
+  fillAtbTensor(x, atb_x);
+  fillAtbTensor(weight, atb_weight);
+  fillAtbTensor(y, atb_y);
+
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);  // input order handed to ATB: x, then weight
+  inTensors.push_back(atb_weight);
+  outTensors.push_back(atb_y);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  uint64_t workspaceSize = 0;  // passed to Setup first, then to Execute together with the workspace pointer
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB RMSNormOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;  // stays -1 when no workspace block is allocated
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);  // NOTE(review): narrows the 64-bit size to uint32_t
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendRMSNormOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB RMSNormOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendRMSNormOp.hpp b/mllm/backends/ascend/ops/AscendRMSNormOp.hpp
new file mode 100644
index 000000000..65b899509
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendRMSNormOp.hpp
@@ -0,0 +1,27 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/RMSNormOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+class AscendRMSNormOp final : public aops::RMSNormOp {  // Ascend-backend specialisation of aops::RMSNormOp.
+ public:
+  explicit AscendRMSNormOp(const aops::RMSNormOpOptions& options);
+  // Op hooks overridden for the Ascend backend; only declared here, the bodies are defined out of line.
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendRMSNormOpFactory final : public TypedOpFactory<OpTypes::kRMSNorm, aops::RMSNormOpOptions> {  // Factory for OpTypes::kRMSNorm.
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::RMSNormOpOptions& options) override {
+    return std::make_shared<AscendRMSNormOp>(options);  // Every call builds a new op owned by the returned shared_ptr.
+  }
+};
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendSiLUOp.cpp b/mllm/backends/ascend/ops/AscendSiLUOp.cpp
new file mode 100644
index 000000000..3a2299bf2
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendSiLUOp.cpp
@@ -0,0 +1,115 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendSiLUOp.hpp"
+
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+AscendSiLUOp::AscendSiLUOp(const aops::SiLUOpOptions& options) : aops::SiLUOp(options) {}  // Forwards options to the base op; the body is empty.
+
+void AscendSiLUOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);  // No Ascend-specific setup. NOTE(review): calls BaseOp::setup, not aops::SiLUOp::setup — confirm intended.
+}
+
+void AscendSiLUOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  // Applies the activation to inputs[0] through an ATB operation created and destroyed within this call; writes outputs[0].
+  MLLM_RT_ASSERT_EQ(inputs.size(), 1);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  auto& y = outputs[0];
+
+  // Both the input and the output tensor must be FP16; any other dtype is reported through MLLM_ERROR_EXIT.
+  if (x.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendSiLUOp: Input tensor must be FP16, but got dtype={}",
+                    static_cast<int>(x.dtype()));
+  }
+  if (y.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendSiLUOp: Output tensor must be FP16, but got dtype={}",
+                    static_cast<int>(y.dtype()));
+  }
+
+  if (x.dtype() != y.dtype()) {  // NOTE(review): both dtypes were required to be F16 above; looks unreachable if MLLM_ERROR_EXIT never returns — confirm
+    NYI("AscendSiLUOp currently requires x/y have same dtype");
+  }
+  if (x.numel() != y.numel()) {
+    NYI("AscendSiLUOp requires x/y have same numel");
+  }
+
+  atb::infer::ActivationParam siluParam;
+  siluParam.activationType = atb::infer::ACTIVATION_SWISH;  // SiLU is requested as ATB's SWISH; other fields keep their defaults — TODO confirm default scale is 1
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(siluParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ACTIVATION_SWISH) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+
+  fillAtbTensorDesc(x, atb_x.desc);  // only the descriptors are filled by the helper; data pointer and size are set by hand below
+  fillAtbTensorDesc(y, atb_y.desc);
+
+  atb_x.deviceData = reinterpret_cast<uint8_t*>(x.ptr<void>());
+  atb_x.dataSize = x.bytes();
+  atb_y.deviceData = reinterpret_cast<uint8_t*>(y.ptr<void>());
+  atb_y.dataSize = y.bytes();
+
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  outTensors.push_back(atb_y);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  uint64_t workspaceSize = 0;  // passed to Setup first, then to Execute together with the workspace pointer
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SiLUOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;  // stays -1 when no workspace block is allocated
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);  // NOTE(review): narrows the 64-bit size to uint32_t
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+  {
+    ASCEND_TIME_SCOPE("AscendSiLUOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SiLUOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendSiLUOp.hpp b/mllm/backends/ascend/ops/AscendSiLUOp.hpp
new file mode 100644
index 000000000..421dd49d3
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendSiLUOp.hpp
@@ -0,0 +1,27 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/SiLUOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+class AscendSiLUOp final : public aops::SiLUOp {  // Ascend-backend specialisation of aops::SiLUOp.
+ public:
+  explicit AscendSiLUOp(const aops::SiLUOpOptions& options);
+  // Op hooks overridden for the Ascend backend; only declared here, the bodies are defined out of line.
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendSiLUOpFactory final : public TypedOpFactory<OpTypes::kSiLU, aops::SiLUOpOptions> {  // Factory for OpTypes::kSiLU.
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::SiLUOpOptions& options) override {
+    return std::make_shared<AscendSiLUOp>(options);  // Every call builds a new op owned by the returned shared_ptr.
+  }
+};
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp
new file mode 100644
index 000000000..db0ef47ea
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.cpp
@@ -0,0 +1,135 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp"
+
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+AscendSoftmaxOp::AscendSoftmaxOp(const aops::SoftmaxOpOptions& options) : aops::SoftmaxOp(options) {}  // Forwards options to the base op; the body is empty.
+
+void AscendSoftmaxOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  BaseOp::setup(inputs, outputs);  // No Ascend-specific setup. NOTE(review): calls BaseOp::setup, not aops::SoftmaxOp::setup — confirm intended.
+}
+
+void AscendSoftmaxOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  // Softmax of inputs[0] along options_.axis through an ATB operation created and destroyed within this call; writes outputs[0].
+  MLLM_RT_ASSERT_EQ(inputs.size(), 1);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& x = inputs[0];
+  auto& y = outputs[0];
+
+  // Both the input and the output tensor must be FP16; any other dtype is reported through MLLM_ERROR_EXIT.
+  if (x.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendSoftmaxOp: Input tensor must be FP16, but got dtype={}",
+                    static_cast<int>(x.dtype()));
+  }
+  if (y.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "AscendSoftmaxOp: Output tensor must be FP16, but got dtype={}",
+                    static_cast<int>(y.dtype()));
+  }
+
+  if (x.dtype() != y.dtype()) {  // NOTE(review): both dtypes were required to be F16 above; looks unreachable if MLLM_ERROR_EXIT never returns — confirm
+    NYI("AscendSoftmaxOp currently requires x/y have same dtype");
+  }
+  if (x.numel() != y.numel()) {
+    NYI("AscendSoftmaxOp requires x/y have same numel");
+  }
+
+  // Softmax parameters: only the axes list is set below, every other field keeps its default.
+  atb::infer::SoftmaxParam softmaxParam;
+
+  // A negative axis counts from the end: adding the rank turns it into the matching non-negative index.
+  int axis = options_.axis;
+  if (axis < 0) {
+    axis = static_cast<int>(x.rank()) + axis;
+  }
+
+  // The single normalised axis is the only entry pushed into the axes list. NOTE(review): it is not range-checked here.
+  softmaxParam.axes.push_back(static_cast<int64_t>(axis));
+
+  // Create the ATB operation; a failure status or a null handle is fatal.
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(softmaxParam, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "ATB CreateOperation(Softmax) failed, status={}",
+                    static_cast<int>(st));
+  }
+
+  // The context comes from the backend-wide accessor rather than being created here.
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  // ATB-side views of the input and output tensors, populated by fillAtbTensor.
+  atb::Tensor atb_x;
+  atb::Tensor atb_y;
+
+  fillAtbTensor(x, atb_x);
+  fillAtbTensor(y, atb_y);
+
+  // One input tensor and one output tensor are packed into the variant pack.
+  atb::SVector<atb::Tensor> inTensors;
+  atb::SVector<atb::Tensor> outTensors;
+  inTensors.push_back(atb_x);
+  outTensors.push_back(atb_y);
+
+  atb::VariantPack vp;
+  vp.inTensors = inTensors;
+  vp.outTensors = outTensors;
+
+  // workspaceSize is passed to Setup first and later to Execute together with the workspace pointer.
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "ATB SoftmaxOp Setup failed, status={}",
+                    static_cast<int>(st));
+  }
+
+  // A workspace block is allocated only for a non-zero size; the id stays -1 otherwise.
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(static_cast<uint32_t>(workspaceSize), workspace_block_id);  // NOTE(review): narrows the 64-bit size to uint32_t
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+  }
+
+  // Execute inside its own scope so ASCEND_TIME_SCOPE covers just this call.
+  {
+    ASCEND_TIME_SCOPE("AscendSoftmaxOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "ATB SoftmaxOp Execute failed, status={}",
+                    static_cast<int>(st));
+  }
+
+  // Synchronize the global ATB stream before the workspace is released.
+  syncGlobalAtbStream();
+
+  // Free the workspace block if one was allocated above.
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);  // Destroy the operation created at the top of this call.
+}
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp b/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp
new file mode 100644
index 000000000..c52cb85e0
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendSoftmaxOp.hpp
@@ -0,0 +1,27 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/SoftmaxOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+class AscendSoftmaxOp final : public aops::SoftmaxOp {  // Ascend-backend specialisation of aops::SoftmaxOp.
+ public:
+  explicit AscendSoftmaxOp(const aops::SoftmaxOpOptions& options);
+  // Op hooks overridden for the Ascend backend; only declared here, the bodies are defined out of line.
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendSoftmaxOpFactory final : public TypedOpFactory<OpTypes::kSoftmax, aops::SoftmaxOpOptions> {  // Factory for OpTypes::kSoftmax.
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::SoftmaxOpOptions& options) override {
+    return std::make_shared<AscendSoftmaxOp>(options);  // Every call builds a new op owned by the returned shared_ptr.
+  }
+};
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendViewOp.cpp b/mllm/backends/ascend/ops/AscendViewOp.cpp
new file mode 100644
index 000000000..e7780cab2
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendViewOp.cpp
@@ -0,0 +1,16 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendViewOp.hpp"
+
+namespace mllm::ascend {
+
+AscendViewOp::AscendViewOp(const aops::ViewOpOptions& options) : aops::ViewOp(options) {}  // Forwards options to the base op; the body is empty.
+
+void AscendViewOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  // No Ascend-specific work is done for a view: the call is forwarded unchanged to aops::ViewOp::forward.
+  // NOTE(review): the earlier comment said the base implementation is empty (metadata-only op) — confirm in aops::ViewOp.
+  aops::ViewOp::forward(inputs, outputs);
+}
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendViewOp.hpp b/mllm/backends/ascend/ops/AscendViewOp.hpp
new file mode 100644
index 000000000..50918dcf9
--- /dev/null
+++ b/mllm/backends/ascend/ops/AscendViewOp.hpp
@@ -0,0 +1,25 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/ViewOp.hpp"
+
+namespace mllm::ascend {
+
+class AscendViewOp final : public aops::ViewOp {  // Ascend-backend specialisation of aops::ViewOp.
+ public:
+  explicit AscendViewOp(const aops::ViewOpOptions& options);
+  // Only forward is overridden here; it is declared in this header and defined out of line.
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+class AscendViewOpFactory final : public TypedOpFactory<OpTypes::kView, aops::ViewOpOptions> {  // NOTE(review): OpTypes is used but mllm/core/OpTypes.hpp is not included directly by this header.
+ public:
+  std::shared_ptr<BaseOp> createOpImpl(const aops::ViewOpOptions& options) override {
+    return std::make_shared<AscendViewOp>(options);  // Every call builds a new op owned by the returned shared_ptr.
+  }
+};
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
deleted file mode 100644
index 0f67bab56..000000000
--- a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
+++ /dev/null
@@ -1,726 +0,0 @@
-// Copyright (c) MLLM Team.
-// Licensed under the MIT License.
-#include <memory>
-#include <fstream>
-
-#include <QnnTypes.h>
-
-#include <QnnContext.h>
-#include <HTP/QnnHtpDevice.h>
-#include <HTP/QnnHtpCommon.h>
-#include <HTP/QnnHtpContext.h>
-#include <HTP/QnnHtpGraph.h>
-
-#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp"
-#include "mllm/core/DataTypes.hpp"
-#include "mllm/utils/Common.hpp"
-#include "mllm/backends/qnn/QNNTypeMacros.hpp"
-#include "mllm/compile/ir/linalg/Attribute.hpp"
-#include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp"
-#include "mllm/backends/qnn/aot/QnnTargetMachine.hpp"
-#include "mllm/backends/qnn/QNNUtils.hpp"
-#include "mllm/utils/Log.hpp"
-
-namespace mllm::qnn::aot {
-
-QnnAOTNodeTensor::QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight) {
-  auto type = parseQnnTensorTypeFromIR(v);
-  auto name = v->name();
-  auto quant = parseQnnQuantizeParamFromIR(v);
-
-  if (force_static_weight || type == QNN_TENSOR_TYPE_STATIC) {
-    tensor_wrapper_ = mllm::qnn::QNNTensorWrapper::createStaticTensor(name, v->tensor_, quant);
-  } else {
-    tensor_wrapper_ = mllm::qnn::QNNTensorWrapper::create(name, type, v->tensor_, quant);
-  }
-  setupComplexTensorQuantization(v);  // per-channel and LPBQ cases
-}
-
-Qnn_TensorType_t QnnAOTNodeTensor::parseQnnTensorTypeFromIR(const ir::tensor::TensorValue::ptr_t& v) {
-  auto type = v->tensor_.memType();
-  Qnn_TensorType_t ret_qnn_tensor_type = QNN_TENSOR_TYPE_UNDEFINED;
-  switch (type) {
-    case kTensorMemTypes_Start: {
-      break;
-    }
-
-    // For MLLM Frame work to use
-    case kNormal: {
-      ret_qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE;
-      break;
-    }
-    case kExtraInput: {
-      ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ;
-      break;
-    }
-    case kExtraOutput: {
-      ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
-      break;
-    }
-    case kManual: {
-      ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE;
-      break;
-    }
-    case kGlobal: {
-      ret_qnn_tensor_type = QNN_TENSOR_TYPE_STATIC;
-      break;
-    }
-
-    // Framework need to judge if this tensor is mmap from disk.
-    case kParams_Start:
-    case kParamsMMAP:
-    case kParamsNormal:
-    case kParams_End: {
-      ret_qnn_tensor_type = QNN_TENSOR_TYPE_STATIC;
-      break;
-    }
-
-    // For QNN Backend to use.
-    case kQnnAppRead: {
-      ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ;
-      break;
-    }
-    case kQnnAppWrite: {
-      ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
-      break;
-    }
-    case kQnnAppReadWrite: {
-      ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE;
-      break;
-    }
-    case kTensorMemTypes_End: break;
-  }
-
-  // Check Attribute. The Attribute priority is higher than tensor type
-  if (v->getAttr("qnn_graph_outputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; }
-  if (v->getAttr("qnn_graph_inputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; }
-  if (v->getAttr("constant")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_STATIC; }
-
-  return ret_qnn_tensor_type;
-}
-
-Qnn_DataType_t QnnAOTNodeTensor::parseQnnDataTypeFromIR(const ir::tensor::TensorValue::ptr_t& v) {
-  return mllm::qnn::mllmDataTypeToQnnDataType(v->tensor_.dtype());
-}
-
-std::string QnnAOTNodeTensor::parseQnnTensorNameFromIR(const ir::tensor::TensorValue::ptr_t& v) { return v->name(); }
-
-Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::tensor::TensorValue::ptr_t& v) {
-  Qnn_QuantizeParams_t ret = QNN_QUANTIZE_PARAMS_INIT;
-
-  MLLM_RT_ASSERT(v);
-  MLLM_RT_ASSERT(v->getAttr("quant_recipe"));
-  auto quant_spec = v->getAttr("quant_recipe")->cast_<ir::linalg::LinalgIRQuantizatonSpecAttr>()->spec_;
-
-  switch (quant_spec->type) {
-    case ir::linalg::QuantizationSpecType::kRaw:
-    case ir::linalg::QuantizationSpecType::kSymPerChannel:
-    case ir::linalg::QuantizationSpecType::kLPBQ: {
-      break;
-    }
-    case ir::linalg::QuantizationSpecType::kAsymPerTensor: {
-      auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecAsymPerTensor>(quant_spec);
-      ret.encodingDefinition = QNN_DEFINITION_DEFINED;
-      ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
-      if (!cfg->scale || !cfg->zero_point) {
-        MLLM_ERROR_EXIT(ExitCode::kCoreError, "AsymPerTensor quant recipe has no scale or zero point. tensor: {}", v->name());
-      }
-      ret.scaleOffsetEncoding =
-          Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = -cfg->zero_point.item<int32_t>()};
-      MLLM_INFO("Configuring AsymPerTensor quantization for tensor: {}, scale: {}, zero_point: {}", v->name(),
-                cfg->scale.item<float>(), cfg->zero_point.item<int32_t>());
-      break;
-    }
-    case ir::linalg::QuantizationSpecType::kSymPerTensor: {
-      auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecSymPerTensor>(quant_spec);
-      ret.encodingDefinition = QNN_DEFINITION_DEFINED;
-      ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
-      if (!cfg->scale) {
-        MLLM_ERROR_EXIT(ExitCode::kCoreError, "SymPerTensor quant recipe has no scale. tensor: {}", v->name());
-      }
-      ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = 0};
-      MLLM_INFO("Configuring SymPerTensor quantization for tensor: {}, scale: {}", v->name(), cfg->scale.item<float>());
-      break;
-    }
-    default: {
-      MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle kNone type");
-    }
-  }
-
-  return ret;
-}
-
-void QnnAOTNodeTensor::setupComplexTensorQuantization(const ir::tensor::TensorValue::ptr_t& v) {
-  MLLM_RT_ASSERT(v->getAttr("quant_recipe"));
-  auto quant_spec = v->getAttr("quant_recipe")->cast_<ir::linalg::LinalgIRQuantizatonSpecAttr>()->spec_;
-
-  switch (quant_spec->type) {
-    case ir::linalg::QuantizationSpecType::kSymPerChannel: {
-      auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecSymPerChannel>(quant_spec);
-
-      // Prepare data
-      auto num_scale_offsets = (uint32_t)v->tensor_.size(cfg->ch_axis);
-      std::vector<Qnn_ScaleOffset_t> scale_offsets(num_scale_offsets);
-      MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale.size(0));
-      MLLM_RT_ASSERT_EQ(cfg->scale.dtype(), kFloat32);
-      for (int i = 0; i < num_scale_offsets; ++i) {
-        scale_offsets[i].scale = cfg->scale.at<float>({i});
-        scale_offsets[i].offset = 0;
-      }
-
-      tensor_wrapper_->setScaleOffsetQuantization(scale_offsets, cfg->ch_axis);
-      break;
-    }
-    case ir::linalg::QuantizationSpecType::kLPBQ: {
-      MLLM_INFO("Solving LPBQ quantization for tensor: {}", v->tensor_.name());
-      // This LPBQ Type is for Conv2D Only !!! Linear has diff layout cmp with conv2d
-
-      auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecLPBQ>(quant_spec);
-
-      // Prepare data
-      auto num_scale_offsets = (uint32_t)v->tensor_.size(-1);
-      std::vector<Qnn_ScaleOffset_t> scale_offsets(num_scale_offsets);
-      MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale_level_1_fp.size(-1));
-      MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.dtype(), kUInt8);
-      MLLM_RT_ASSERT_EQ(cfg->scale_level_1_fp.dtype(), kFloat32);
-      MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.rank(), 1);
-      MLLM_RT_ASSERT_EQ(cfg->scale_level_1_fp.rank(), 1);
-      for (int i = 0; i < num_scale_offsets; ++i) {
-        scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({i});
-        scale_offsets[i].offset = 0;
-      }
-
-      Qnn_BlockwiseExpansion_t blockwise_expansion;
-      blockwise_expansion.axis = v->tensor_.rank() - 1;
-      blockwise_expansion.scaleOffsets = nullptr;  // Will be set by setBlockwiseQuantization
-      blockwise_expansion.numBlocksPerAxis = v->tensor_.size(-2) / cfg->block_size;
-      blockwise_expansion.blockScaleBitwidth = 4;  // 4 bits for uint4 scale
-      blockwise_expansion.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8;
-      blockwise_expansion.blocksScale8 = cfg->scale_level_0_int.ptr<mllm_uint8_t>();
-
-      tensor_wrapper_->setBlockwiseQuantization(blockwise_expansion, scale_offsets);
-      break;
-    }
-    default: break;
-  }
-}
-
-// QnnAOTNodeOperation implementations
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addInputs(const std::vector<QnnAOTNodeTensor::ptr_t>& ins) {
-  inputs.insert(inputs.end(), ins.begin(), ins.end());
-  return shared_from_this();
-}
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addOutputs(const std::vector<QnnAOTNodeTensor::ptr_t>& ous) {
-  outputs.insert(outputs.end(), ous.begin(), ous.end());
-  return shared_from_this();
-}
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceInput(const QnnAOTNodeTensor::ptr_t& input) {
-  inputs.push_back(input);
-  return shared_from_this();
-}
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceOutput(const QnnAOTNodeTensor::ptr_t& output) {
-  outputs.push_back(output);
-  return shared_from_this();
-}
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addParamScalar(
-    const std::vector<std::shared_ptr<mllm::qnn::QNNParamScalarWrapper>>& params) {
-  param_scalar.insert(param_scalar.end(), params.begin(), params.end());
-  return shared_from_this();
-}
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceParamScalar(
-    const std::shared_ptr<mllm::qnn::QNNParamScalarWrapper>& param) {
-  param_scalar.push_back(param);
-  return shared_from_this();
-}
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::addParamTensor(
-    const std::vector<std::shared_ptr<mllm::qnn::QNNParamTensorWrapper>>& params) {
-  param_tensor.insert(param_tensor.end(), params.begin(), params.end());
-  return shared_from_this();
-}
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::emplaceParamTensor(
-    const std::shared_ptr<mllm::qnn::QNNParamTensorWrapper>& param) {
-  param_tensor.push_back(param);
-  return shared_from_this();
-}
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::setOpName(const std::string& op_name) {
-  op_name_ = op_name;
-  return shared_from_this();
-}
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::setName(const std::string& name) {
-  name_ = name;
-  return shared_from_this();
-}
-
-std::string QnnAOTNodeOperation::getName() { return name_; }
-
-QnnAOTNodeOperation::ptr_t QnnAOTNodeOperation::setPackageName(const std::string& package_name) {
-  package_name_ = package_name;
-  return shared_from_this();
-}
-
-QnnAOTGraph::QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle_t backendHandle,
-                         Qnn_ContextHandle_t contextHandle, const std::string& graphName) {
-  qnn_model_ = std::make_shared<mllm::qnn::QNNModel>(qnnInterface, backendHandle);
-
-  // Short Depth Conv On HMX Off
-  QnnHtpGraph_CustomConfig_t* p_custom_config = nullptr;
-  // FIXME: @chenghuaWang The code below will make llm inference slow!!!
-  // p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t));
-  // p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF;
-  // p_custom_config->shortDepthConvOnHmxOff = true;
-  // htp_graph_configs.push_back(static_cast<QnnGraph_CustomConfig_t>(p_custom_config));
-
-  // Fold Relu Activation Into Conv Off
-  p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t));
-  p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF;
-  p_custom_config->foldReluActivationIntoConvOff = true;
-  htp_graph_configs.push_back(static_cast<QnnGraph_CustomConfig_t>(p_custom_config));
-
-  // FIXME: If need or not
-  p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t));
-  p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
-  p_custom_config->precision = QNN_PRECISION_FLOAT16;
-  htp_graph_configs.push_back(static_cast<QnnGraph_CustomConfig_t>(p_custom_config));
-
-  // Optimization level
-  p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t));
-  p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-  p_custom_config->optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
-  p_custom_config->optimizationOption.floatValue = 3;
-  htp_graph_configs.push_back(static_cast<QnnGraph_CustomConfig_t>(p_custom_config));
-
-  // VTCM Size
-  p_custom_config = (QnnHtpGraph_CustomConfig_t*)malloc(sizeof(QnnHtpGraph_CustomConfig_t));
-  p_custom_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
-  p_custom_config->vtcmSizeInMB = 8;
-  htp_graph_configs.push_back(static_cast<QnnGraph_CustomConfig_t>(p_custom_config));
-
-  qnn_graph_configs.resize(htp_graph_configs.size());
-  qnn_graph_configs.reserve(htp_graph_configs.size() + 1);
-  for (int i = 0; i < htp_graph_configs.size(); ++i) {
-    qnn_graph_configs[i].option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-    qnn_graph_configs[i].customConfig = htp_graph_configs[i];
-    qnn_graph_config_pass_in_.push_back(&qnn_graph_configs[i]);
-  }
-
-  qnn_graph_config_pass_in_.push_back(nullptr);
-
-  qnn_model_->initialize(contextHandle, graphName.c_str(), false, 1, qnn_graph_config_pass_in_.data());
-}
-
-void QnnAOTGraph::addTensor(const QnnAOTNodeTensor::ptr_t& tensor) {
-  qnn_model_->addTensorWrapper(tensor->getWrapper());
-  all_tensors_.insert({tensor->getWrapper()->getName(), tensor});
-}
-
-void QnnAOTGraph::addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op) {
-  std::vector<std::string> inputNames;
-  for (auto& in : qnn_op->inputs) inputNames.push_back(in->getWrapper()->getName());
-
-  std::vector<std::string> outputNames;
-  for (auto& out : qnn_op->outputs) outputNames.push_back(out->getWrapper()->getName());
-
-  for (auto& in : qnn_op->inputs) qnn_model_->addTensorWrapper(in->getWrapper());
-  for (auto& out : qnn_op->outputs) qnn_model_->addTensorWrapper(out->getWrapper());
-
-  qnn_model_->addNode(QNN_OPCONFIG_VERSION_1, qnn_op->name_, qnn_op->package_name_, qnn_op->op_name_, qnn_op->param_tensor,
-                      qnn_op->param_scalar, inputNames, outputNames);
-
-  op_node_.insert({qnn_op->getName(), qnn_op});
-}
-
-bool QnnAOTGraph::compile() {
-  if (is_compiled_) { return true; }
-  bool ret = qnn_model_->finalizeGraph(nullptr, nullptr) == mllm::qnn::MODEL_NO_ERROR;
-  is_compiled_ = true;
-  return ret;
-}
-
-const std::vector<std::string> QnnDynSymbolLoader::possible_qnn_dyn_lib_paths_{
-    "/opt/qcom/aistack/qairt/2.41.0.251128/lib/x86_64-linux-clang/",
-};
-
-QnnDynSymbolLoader::~QnnDynSymbolLoader() {
-  for (auto& item : libs_) {
-    if (item.second.handle_) { dlclose(item.second.handle_); }
-  }
-}
-
-bool QnnDynSymbolLoader::loadQnnDynLib(const std::string& lib_name, int flag) {
-  for (auto const& path : possible_qnn_dyn_lib_paths_) {
-    auto real_path = path + lib_name;
-    auto handle = dlopen(real_path.c_str(), flag);
-    if (handle) {
-      auto descriptor = QnnDynLibDescriptor{.lib_name_ = lib_name, .lib_path_ = path, .handle_ = handle};
-      libs_.insert({lib_name, descriptor});
-      MLLM_INFO("QnnDynSymbolLoader::loadQnnDynLib {} success.", real_path);
-      return true;
-    } else {
-      char* error = dlerror();
-      MLLM_ERROR("QnnDynSymbolLoader::loadQnnDynLib try for {} failed: {}", real_path, error ? error : "Unknown error");
-    }
-  }
-  MLLM_ERROR("QnnDynSymbolLoader::loadQnnDynLib {} failed.", lib_name);
-  return false;
-}
-
-bool QnnDynSymbolLoader::loadQnnDynLibAtPath(const std::string& path, const std::string& lib_name, int flag) {
-  auto real_path = path + lib_name;
-  auto handle = dlopen(real_path.c_str(), flag);
-  if (handle) {
-    auto descriptor = QnnDynLibDescriptor{.lib_name_ = lib_name, .lib_path_ = path, .handle_ = handle};
-    libs_.insert({lib_name, descriptor});
-    MLLM_INFO("QnnDynSymbolLoader::loadQnnDynLib {} success.", real_path);
-    return true;
-  } else {
-    char* error = dlerror();
-    MLLM_ERROR("QnnDynSymbolLoader::loadQnnDynLib try for {} failed: {}", real_path, error ? error : "Unknown error");
-  }
-  MLLM_ERROR("QnnDynSymbolLoader::loadQnnDynLib {} failed.", lib_name);
-  return false;
-}
-
-QnnAOTEnv::QnnAOTEnv(const QcomTargetMachine& target_machine) : target_machine_(target_machine) { _setup(); }
-
-QnnAOTEnv::QnnAOTEnv(const std::string& lib_path, const QcomTargetMachine& target_machine) : target_machine_(target_machine) {
-  _setup(lib_path);
-}
-
-void QnnAOTEnv::_setup(const std::string& path) {
-  auto& loader = QnnDynSymbolLoader::instance();
-  std::string htp_backend_lib_name = "libQnnHtp.so";
-  // GLOBAL Load
-  if (path.empty()) {
-    if (!loader.loadQnnDynLib(htp_backend_lib_name,
-                              QnnDynSymbolLoader::DynFlag::kRTLD_NOW | QnnDynSymbolLoader::DynFlag::kRTLD_GLOBAL)) {
-      MLLM_ERROR("QnnAOTEnv::QnnAOTEnv {} failed.", htp_backend_lib_name);
-      exit(1);
-    }
-  } else {
-    if (!loader.loadQnnDynLibAtPath(path, htp_backend_lib_name,
-                                    QnnDynSymbolLoader::DynFlag::kRTLD_NOW | QnnDynSymbolLoader::DynFlag::kRTLD_GLOBAL)) {
-      MLLM_ERROR("QnnAOTEnv::QnnAOTEnv {} failed.", htp_backend_lib_name);
-      exit(1);
-    }
-  }
-
-  auto qnn_interface_get_providers_func =
-      loader(htp_backend_lib_name).func<QnnFuncSymbols::QnnInterfaceGetProvidersFuncType>("QnnInterface_getProviders");
-
-  QnnInterface_t** interface_providers = nullptr;
-  uint32_t num_providers = 0;
-
-  MLLM_RT_ASSERT_EQ(qnn_interface_get_providers_func((const QnnInterface_t***)&interface_providers, &num_providers),
-                    QNN_SUCCESS);
-  MLLM_RT_ASSERT(interface_providers != nullptr);
-  MLLM_RT_ASSERT(num_providers != 0);
-
-  MLLM_INFO("QnnAOTEnv::QnnAOTEnv get HTP num_providers: {}", num_providers);
-
-  bool found_valid_interface = false;
-  // Get correct provider
-  for (size_t provider_id = 0; provider_id < num_providers; provider_id++) {
-    if (QNN_API_VERSION_MAJOR == interface_providers[provider_id]->apiVersion.coreApiVersion.major
-        && QNN_API_VERSION_MINOR <= interface_providers[provider_id]->apiVersion.coreApiVersion.minor) {
-      found_valid_interface = true;
-      qnn_htp_func_symbols_.qnn_interface_ = interface_providers[provider_id]->QNN_INTERFACE_VER_NAME;
-      break;
-    }
-  }
-  MLLM_RT_ASSERT_EQ(found_valid_interface, true);
-
-  // Check if this HTP Backend has specific property
-  if (nullptr != qnn_htp_func_symbols_.qnn_interface_.propertyHasCapability) {
-    auto status = qnn_htp_func_symbols_.qnn_interface_.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE);
-    if (status == QNN_PROPERTY_NOT_SUPPORTED) { MLLM_WARN("Device property is not supported"); }
-
-    MLLM_RT_ASSERT(status != QNN_PROPERTY_ERROR_UNKNOWN_KEY);
-  }
-
-  // Try to config this target machine
-  {
-    auto device_custom_config = createDecideCustomConfigInfo();
-    QnnHtpDevice_CustomConfig_t* p_custom_config = nullptr;
-
-    switch (target_machine_.soc_htp_security_pd_session) {
-      case QcomSecurityPDSession::kHtpSignedPd: {
-        p_custom_config = (QnnHtpDevice_CustomConfig_t*)malloc(sizeof(QnnHtpDevice_CustomConfig_t));
-        unreachable_handle_.push_back(p_custom_config);
-        p_custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_SIGNEDPD;
-        p_custom_config->useSignedProcessDomain.useSignedProcessDomain = true;
-        p_custom_config->useSignedProcessDomain.deviceId = 0;
-        device_custom_config.push_back(static_cast<QnnDevice_CustomConfig_t>(p_custom_config));
-        break;
-      }
-      case QcomSecurityPDSession::kHtpUnsignedPd:
-      default: break;
-    }
-
-    const std::vector<QnnDevice_PlatformInfo_t*> device_platform_info = createDevicePlatformInfo();
-    uint32_t num_custom_configs = device_platform_info.size() + device_custom_config.size();
-    target_machine_qnn_config_.resize(num_custom_configs);
-
-    for (std::size_t i = 0; i < device_custom_config.size(); ++i) {
-      target_machine_qnn_config_[i].option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
-      target_machine_qnn_config_[i].customConfig = device_custom_config[i];
-      target_machine_qnn_config_ptrs_.push_back(&target_machine_qnn_config_[i]);
-    }
-
-    if (!device_platform_info.empty()) {
-      // The length of platform info can only be 1.
-      MLLM_RT_ASSERT_EQ(device_platform_info.size(), 1u);
-      target_machine_qnn_config_[device_custom_config.size()].option = QNN_DEVICE_CONFIG_OPTION_PLATFORM_INFO;
-      target_machine_qnn_config_[device_custom_config.size()].hardwareInfo = device_platform_info.back();
-      target_machine_qnn_config_ptrs_.push_back(&target_machine_qnn_config_[device_custom_config.size()]);
-    }
-
-    // null terminated
-    target_machine_qnn_config_ptrs_.push_back(nullptr);
-  }
-}
-
-std::shared_ptr<QnnDeviceAndContext> QnnAOTEnv::createContext(const std::string& name, bool weights_sharing) {
-  // Check if context with this name already exists
-  if (contexts_.count(name) > 0) {
-    MLLM_WARN("Context '{}' already exists, reusing the existing context", name);
-    return contexts_[name];
-  }
-
-  std::shared_ptr<QnnDeviceAndContext> context = std::make_shared<QnnDeviceAndContext>();
-  context->name_ = name;
-
-  // 1. create logger and register callback.
-  // clang-format off
-  MLLM_RT_ASSERT_EQ(qnn_htp_func_symbols_.qnn_interface_.logCreate(__mllmQnnLoggerCallback,QNN_LOG_LEVEL_VERBOSE, &context->log_), QNN_SUCCESS)
-  MLLM_RT_ASSERT_EQ(QNN_BACKEND_NO_ERROR, qnn_htp_func_symbols_.qnn_interface_.backendCreate(context->log_, (const QnnBackend_Config_t**)context->bk_cfg_, &context->bk_handle_))
-  // clang-format on
-
-  // 2. Create HTP Device
-  // clang-format off
-  if (nullptr != qnn_htp_func_symbols_.qnn_interface_.deviceCreate) {
-    auto status = qnn_htp_func_symbols_.qnn_interface_.deviceCreate(context->log_, target_machine_qnn_config_ptrs_.data(), &context->device_handle_);
-    MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS);
-  }
-  // clang-format on
-
-  // 3. Create Profile
-  {
-    auto status = qnn_htp_func_symbols_.qnn_interface_.profileCreate(context->bk_handle_, QNN_PROFILE_LEVEL_DETAILED,
-                                                                     &context->profile_bk_handle_);
-    MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS);
-  }
-
-  // 4. Create Context
-  {
-    auto cfgs = createContextCustomConfig(weights_sharing);
-    if (cfgs.size()) {
-      context->qnn_context_config_ = (QnnContext_Config_t**)malloc(sizeof(QnnContext_Config_t*) * (cfgs.size() + 1));
-      unreachable_handle_.emplace_back(context->qnn_context_config_);
-    }
-    for (int i = 0; i < cfgs.size(); ++i) {
-      context->qnn_context_config_[i] = (QnnContext_Config_t*)malloc(sizeof(QnnContext_Config_t));
-      context->qnn_context_config_[i]->option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
-      context->qnn_context_config_[i]->customConfig = cfgs[i];
-      unreachable_handle_.emplace_back(context->qnn_context_config_[i]);
-    }
-    if (cfgs.size()) { context->qnn_context_config_[cfgs.size()] = nullptr; }
-    auto status = qnn_htp_func_symbols_.qnn_interface_.contextCreate(context->bk_handle_, context->device_handle_,
-                                                                     (const QnnContext_Config_t**)context->qnn_context_config_,
-                                                                     &context->qnn_ctx_handle_);
-    MLLM_RT_ASSERT_EQ(QNN_CONTEXT_NO_ERROR, status);
-  }
-
-  // 5. Register MLLM's Qnn Opset
-  // clang-format off
-  {
-    // FIXME(wch): we need to register our own opset of qnn.
-  }
-  // clang-format on
-
-  MLLM_RT_ASSERT_EQ(contexts_.count(name), 0);
-  contexts_[name] = context;
-  return context;
-}
-
-void QnnAOTEnv::saveContext(const std::string& name, const std::string& path) {
-  if (contexts_.find(name) == contexts_.end()) {
-    MLLM_ERROR("QnnAOTEnv::saveContext Context {} not found", name);
-    return;
-  }
-  auto context = contexts_[name];
-
-  uint64_t binarySize = 0;
-  uint64_t writtenSize = 0;
-
-  auto status = qnn_htp_func_symbols_.qnn_interface_.contextGetBinarySize(context->qnn_ctx_handle_, &binarySize);
-  MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS);
-
-  std::vector<uint8_t> binaryBuffer(binarySize);
-
-  status = qnn_htp_func_symbols_.qnn_interface_.contextGetBinary(
-      context->qnn_ctx_handle_, reinterpret_cast<void*>(binaryBuffer.data()), binarySize, &writtenSize);
-  MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS);
-
-  if (binarySize < writtenSize) {
-    MLLM_ERROR("QNN context binary size mismatch: expected {} bytes, but wrote {} bytes.", binarySize, writtenSize);
-  }
-
-  std::ofstream file(path, std::ios::binary);
-  if (!file.is_open()) {
-    MLLM_ERROR("Failed to open file {} for writing QNN context.", path);
-    return;
-  }
-  file.write(reinterpret_cast<char*>(binaryBuffer.data()), writtenSize);
-  file.close();
-
-  MLLM_INFO("QNN context {} saved to {} written {}", name, path, writtenSize);
-}
-
-void QnnAOTEnv::destroyContext(const std::string& name) {
-  // TODO
-}
-
-std::vector<QnnDevice_PlatformInfo_t*> QnnAOTEnv::createDevicePlatformInfo() {
-  std::vector<QnnDevice_PlatformInfo_t*> ret;
-  QnnDevice_PlatformInfo_t* p_platform_info = nullptr;
-  QnnDevice_HardwareDeviceInfo_t* p_hw_device_info = nullptr;
-  QnnHtpDevice_DeviceInfoExtension_t* p_device_info_extension = nullptr;
-  QnnDevice_CoreInfo_t* p_core_info = nullptr;
-
-  p_platform_info = (QnnDevice_PlatformInfo_t*)malloc(sizeof(QnnDevice_PlatformInfo_t));
-  unreachable_handle_.push_back(p_platform_info);
-  p_platform_info->version = QNN_DEVICE_PLATFORM_INFO_VERSION_1;
-  p_platform_info->v1.numHwDevices = 1;
-
-  p_hw_device_info = (QnnDevice_HardwareDeviceInfo_t*)malloc(sizeof(QnnDevice_HardwareDeviceInfo_t));
-  unreachable_handle_.push_back(p_hw_device_info);
-  p_hw_device_info->version = QNN_DEVICE_HARDWARE_DEVICE_INFO_VERSION_1;
-  p_hw_device_info->v1.deviceId = 0;
-  p_hw_device_info->v1.deviceType = 0;
-  p_hw_device_info->v1.numCores = 1;
-
-  p_device_info_extension = (QnnHtpDevice_DeviceInfoExtension_t*)malloc(sizeof(QnnHtpDevice_DeviceInfoExtension_t));
-  unreachable_handle_.push_back(p_device_info_extension);
-  // clang-format off
-  p_device_info_extension->devType = QNN_HTP_DEVICE_TYPE_ON_CHIP;
-  p_device_info_extension->onChipDevice.vtcmSize = target_machine_.soc_htp_vtcm_total_memory_size;  // in MB
-  p_device_info_extension->onChipDevice.signedPdSupport = target_machine_.soc_htp_security_pd_session == QcomSecurityPDSession::kHtpSignedPd;
-  p_device_info_extension->onChipDevice.socModel = static_cast<uint32_t>(target_machine_.soc_htp_chipset);
-  p_device_info_extension->onChipDevice.arch = static_cast<QnnHtpDevice_Arch_t>(target_machine_.soc_htp_arch);
-  p_device_info_extension->onChipDevice.dlbcSupport = true;
-  p_hw_device_info->v1.deviceInfoExtension = p_device_info_extension;
-  // clang-format on
-
-  p_core_info = (QnnDevice_CoreInfo_t*)malloc(sizeof(QnnDevice_CoreInfo_t));
-  unreachable_handle_.push_back(p_core_info);
-  p_core_info->version = QNN_DEVICE_CORE_INFO_VERSION_1;
-  p_core_info->v1.coreId = 0;
-  p_core_info->v1.coreType = 0;
-  p_core_info->v1.coreInfoExtension = nullptr;
-  p_hw_device_info->v1.cores = p_core_info;
-
-  p_platform_info->v1.hwDevices = p_hw_device_info;
-  ret.push_back(p_platform_info);
-
-  return ret;
-}
-
-std::vector<QnnDevice_CustomConfig_t> QnnAOTEnv::createDecideCustomConfigInfo() {
-  std::vector<QnnDevice_CustomConfig_t> ret;
-
-  QnnHtpDevice_CustomConfig_t* p_custom_config = (QnnHtpDevice_CustomConfig_t*)malloc(sizeof(QnnHtpDevice_CustomConfig_t));
-  unreachable_handle_.push_back(p_custom_config);
-  p_custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC;
-  p_custom_config->socModel = static_cast<uint32_t>(target_machine_.soc_htp_chipset);
-  ret.push_back(static_cast<QnnDevice_CustomConfig_t>(p_custom_config));
-
-  return ret;
-}
-
-std::vector<QnnContext_CustomConfig_t> QnnAOTEnv::createContextCustomConfig(bool weights_sharing) {
-  std::vector<QnnContext_CustomConfig_t> ret;
-  QnnHtpContext_CustomConfig_t* p_custom_config = nullptr;
-
-  if (weights_sharing) {
-    p_custom_config = (QnnHtpContext_CustomConfig_t*)malloc(sizeof(QnnHtpContext_CustomConfig_t));
-    unreachable_handle_.push_back(p_custom_config);
-    p_custom_config->option = QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED;
-    p_custom_config->weightSharingEnabled = true;
-    ret.push_back(static_cast<QnnContext_CustomConfig_t>(p_custom_config));
-  }
-
-  return ret;
-}
-
-QnnAOTGraph::ptr_t QnnAOTEnv::captureAOTGraph(const std::string& qnn_context_name, const std::string& g_name) {
-  if (contexts_.find(qnn_context_name) == contexts_.end()) {
-    MLLM_ERROR("Context {} not found", qnn_context_name);
-    return nullptr;
-  }
-  auto& ctx = contexts_[qnn_context_name];
-  if (ctx->graphs_.find(g_name) == ctx->graphs_.end()) {
-    ctx->graphs_[g_name] =
-        std::make_shared<QnnAOTGraph>(qnn_htp_func_symbols_.qnn_interface_, ctx->bk_handle_, ctx->qnn_ctx_handle_, g_name);
-  }
-  return ctx->graphs_[g_name];
-}
-
-void QnnAOTEnv::captureAOTNodeOp(const std::string& qnn_context_name, const std::string& graph_name,
-                                 const QnnAOTNodeOperation::ptr_t& op) {
-  MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1);
-  MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1);
-  contexts_[qnn_context_name]->graphs_[graph_name]->addOperation(op);
-}
-
-QnnAOTNodeTensor::ptr_t QnnAOTEnv::captureQnnAOTNodeTensor(const std::string& qnn_context_name, const std::string& graph_name,
-                                                           const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight) {
-  auto __qnn_tensor_name = v->name();
-
-  bool __qnn_enable_static_weight = force_static_weight;
-
-  // Check if this value want static qnn weight. The static qnn weight will be shared through one context in diff graphs!
-  if (v->tensor_.memType() == kGlobal || (v->tensor_.memType() <= kParams_End && v->tensor_.memType() >= kParams_Start)
-      || v->getAttr("constant")) {
-    __qnn_enable_static_weight = true;
-  }
-
-  MLLM_RT_ASSERT_EQ(contexts_.count(qnn_context_name), 1);
-  MLLM_RT_ASSERT_EQ(contexts_[qnn_context_name]->graphs_.count(graph_name), 1);
-  auto graph = contexts_[qnn_context_name]->graphs_[graph_name];
-
-  // If normal weight is cached, we return it directly
-  if (graph->all_tensors_.count(__qnn_tensor_name)) { return graph->all_tensors_[__qnn_tensor_name]; }
-
-  QnnAOTNodeTensor::ptr_t ret = nullptr;
-
-  // If static weight is cached, we return it directly.
-  if (__qnn_enable_static_weight) {
-    if (contexts_[qnn_context_name]->static_tensor_.count(__qnn_tensor_name)) {
-      ret = contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name];
-    }
-  }
-
-  // There has no Tensor in the cache.
-  if (ret == nullptr) {
-    ret = QnnAOTNodeTensor::create(v, __qnn_enable_static_weight);
-
-    if (__qnn_enable_static_weight) { contexts_[qnn_context_name]->static_tensor_[__qnn_tensor_name] = ret; }
-  }
-
-  graph->addTensor(ret);
-
-  return ret;
-}
-
-std::shared_ptr<QnnDeviceAndContext> QnnAOTEnv::getContext(const std::string& name) { return contexts_[name]; }
-
-}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.hpp b/mllm/backends/qnn/aot/QnnWrappersAPI.hpp
deleted file mode 100644
index 6cb424bc6..000000000
--- a/mllm/backends/qnn/aot/QnnWrappersAPI.hpp
+++ /dev/null
@@ -1,251 +0,0 @@
-// Copyright (c) MLLM Team.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include <dlfcn.h>
-#include <cstdlib>
-#include <vector>
-#include <string>
-#include <memory>
-#include <functional>
-#include <unordered_map>
-
-#include <QnnTypes.h>
-#include <QnnCommon.h>
-#include <QnnContext.h>
-#include <QnnInterface.h>
-#include <QnnSdkBuildId.h>
-#include <HTP/QnnHtpDevice.h>
-#include <System/QnnSystemInterface.h>
-
-#include "mllm/utils/Common.hpp"
-#include "mllm/compile/ir/tensor/Value.hpp"
-#include "mllm/compile/ir/linalg/Attribute.hpp"
-#include "mllm/backends/qnn/aot/QnnTargetMachine.hpp"
-#include "mllm/backends/qnn/QNNModel.hpp"
-#include "mllm/backends/qnn/QNNUtils.hpp"
-
-namespace mllm::qnn::aot {
-
-void __mllmLoggerCallback4QnnLogger(const char* fmt, QnnLog_Level_t level, uint64_t times_tamp, va_list argp);
-
-// Collection of symbols that we need to load from qnn dyn lib.
-struct QnnFuncSymbols {
-  using QnnInterfaceGetProvidersFuncType = Qnn_ErrorHandle_t(const QnnInterface_t*** providerList, uint32_t* numProviders);
-  using QnnSystemInterfaceGetProvidersFuncType = Qnn_ErrorHandle_t(const QnnSystemInterface_t*** providerList,
-                                                                   uint32_t* numProviders);
-
-  QNN_INTERFACE_VER_TYPE qnn_interface_;
-  QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface_;
-};
-
-class QnnAOTNodeTensor : public std::enable_shared_from_this<QnnAOTNodeTensor> {
- public:
-  using ptr_t = std::shared_ptr<QnnAOTNodeTensor>;
-
-  static inline ptr_t create(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight = false) {
-    return std::make_shared<QnnAOTNodeTensor>(v, force_static_weight);
-  }
-
-  explicit QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight = false);
-
-  std::shared_ptr<mllm::qnn::QNNTensorWrapper> getWrapper() { return tensor_wrapper_; }
-
- private:
-  Qnn_TensorType_t parseQnnTensorTypeFromIR(const ir::tensor::TensorValue::ptr_t& v);
-
-  Qnn_DataType_t parseQnnDataTypeFromIR(const ir::tensor::TensorValue::ptr_t& v);
-
-  std::string parseQnnTensorNameFromIR(const ir::tensor::TensorValue::ptr_t& v);
-
-  Qnn_QuantizeParams_t parseQnnQuantizeParamFromIR(const ir::tensor::TensorValue::ptr_t& v);
-
-  // intend for per-channel and LPBQ quantization
-  void setupComplexTensorQuantization(const ir::tensor::TensorValue::ptr_t& v);
-
-  std::shared_ptr<mllm::qnn::QNNTensorWrapper> tensor_wrapper_;
-};
-
-class QnnAOTNodeOperation : public std::enable_shared_from_this<QnnAOTNodeOperation> {
- public:
-  using ptr_t = std::shared_ptr<QnnAOTNodeOperation>;
-
-  static inline ptr_t create(const std::string& op_name) {
-    auto ret = std::make_shared<QnnAOTNodeOperation>();
-    ret->op_name_ = op_name;
-    return ret;
-  }
-
-  QnnAOTNodeOperation::ptr_t addInputs(const std::vector<QnnAOTNodeTensor::ptr_t>& ins);
-
-  QnnAOTNodeOperation::ptr_t addOutputs(const std::vector<QnnAOTNodeTensor::ptr_t>& ous);
-
-  QnnAOTNodeOperation::ptr_t emplaceInput(const QnnAOTNodeTensor::ptr_t& input);
-
-  QnnAOTNodeOperation::ptr_t emplaceOutput(const QnnAOTNodeTensor::ptr_t& output);
-
-  QnnAOTNodeOperation::ptr_t addParamScalar(const std::vector<std::shared_ptr<mllm::qnn::QNNParamScalarWrapper>>& params);
-
-  QnnAOTNodeOperation::ptr_t emplaceParamScalar(const std::shared_ptr<mllm::qnn::QNNParamScalarWrapper>& param);
-
-  QnnAOTNodeOperation::ptr_t addParamTensor(const std::vector<std::shared_ptr<mllm::qnn::QNNParamTensorWrapper>>& params);
-
-  QnnAOTNodeOperation::ptr_t emplaceParamTensor(const std::shared_ptr<mllm::qnn::QNNParamTensorWrapper>& param);
-
-  QnnAOTNodeOperation::ptr_t setOpName(const std::string& op_name);
-
-  QnnAOTNodeOperation::ptr_t setName(const std::string& name);
-
-  std::string getName();
-
-  QnnAOTNodeOperation::ptr_t setPackageName(const std::string& package_name);
-
-  std::string name_;
-  std::string op_name_;
-  std::string package_name_ = "qti.aisw";
-  std::vector<std::shared_ptr<mllm::qnn::QNNParamScalarWrapper>> param_scalar;
-  std::vector<std::shared_ptr<mllm::qnn::QNNParamTensorWrapper>> param_tensor;
-  std::vector<QnnAOTNodeTensor::ptr_t> inputs;
-  std::vector<QnnAOTNodeTensor::ptr_t> outputs;
-};
-
-struct QnnDeviceAndContext;
-class QnnAOTGraph : public std::enable_shared_from_this<QnnAOTGraph> {
- public:
-  using ptr_t = std::shared_ptr<QnnAOTGraph>;
-
-  QnnAOTGraph(QNN_INTERFACE_VER_TYPE& qnnInterface, Qnn_BackendHandle_t backendHandle, Qnn_ContextHandle_t contextHandle,
-              const std::string& graphName);
-
-  void addOperation(const QnnAOTNodeOperation::ptr_t& qnn_op);
-
-  void addTensor(const QnnAOTNodeTensor::ptr_t& tensor);
-
-  bool compile();
-
-  bool is_compiled_ = false;
-  std::unordered_map<std::string, QnnAOTNodeOperation::ptr_t> op_node_;
-  std::unordered_map<std::string, QnnAOTNodeTensor::ptr_t> all_tensors_;
-
- private:
-  std::shared_ptr<mllm::qnn::QNNModel> qnn_model_;
-  std::vector<QnnGraph_Config_t> qnn_graph_configs;
-  std::vector<QnnGraph_CustomConfig_t> htp_graph_configs;
-  std::vector<const QnnGraph_Config_t*> qnn_graph_config_pass_in_;
-};
-
-struct QnnDeviceAndContext {
-  using ptr_t = std::shared_ptr<QnnDeviceAndContext>;
-
-  std::string name_;
-  Qnn_LogHandle_t log_ = nullptr;
-  Qnn_BackendHandle_t bk_handle_ = nullptr;
-  Qnn_DeviceHandle_t device_handle_ = nullptr;
-  QnnBackend_Config_t** bk_cfg_ = nullptr;
-  QnnContext_Config_t** qnn_context_config_ = nullptr;
-  Qnn_ProfileHandle_t profile_bk_handle_ = nullptr;
-  Qnn_ContextHandle_t qnn_ctx_handle_;
-
-  std::unordered_map<std::string, QnnAOTGraph::ptr_t> graphs_;              //< for persistence keep graphs.
-  std::unordered_map<std::string, QnnAOTNodeTensor::ptr_t> static_tensor_;  //< for weight sharing.
-};
-
-struct QnnDynLibDescriptor {
-  std::string lib_name_;
-  std::string lib_path_;
-  void* handle_ = nullptr;
-
-  template<typename FuncType>
-  std::function<FuncType> func(const std::string& symbol_name) {
-    if (handle_ == nullptr) { MLLM_ERROR_EXIT(ExitCode::kCoreError, "QnnDynSymbolLoader: handle is nullptr."); }
-    auto func_ptr = dlsym(handle_, symbol_name.c_str());
-    MLLM_RT_ASSERT(func_ptr != nullptr);
-    return (FuncType*)(func_ptr);
-  };
-};
-
-class QnnDynSymbolLoader {
- public:
-  enum DynFlag : int {  // NOLINT performance-enum-size
-    kRTLD_NOW = RTLD_NOW,
-    kRTLD_LOCAL = RTLD_LOCAL,
-    kRTLD_GLOBAL = RTLD_GLOBAL,
-  };
-
-  static QnnDynSymbolLoader& instance() {
-    static QnnDynSymbolLoader instance;
-    return instance;
-  }
-
-  ~QnnDynSymbolLoader();
-
-  QnnDynSymbolLoader() = default;
-
-  QnnDynSymbolLoader(const QnnDynSymbolLoader&) = delete;
-
-  QnnDynSymbolLoader& operator=(const QnnDynSymbolLoader&) = delete;
-
-  bool loadQnnDynLib(const std::string& lib_name, int flag);
-
-  bool loadQnnDynLibAtPath(const std::string& path, const std::string& lib_name, int flag);
-
-  inline QnnDynLibDescriptor& operator()(const std::string& lib_name) { return libs_.at(lib_name); }
-
- private:
-  std::unordered_map<std::string, QnnDynLibDescriptor> libs_;
-  static const std::vector<std::string> possible_qnn_dyn_lib_paths_;
-};
-
-// Device and Dynamic Lib included
-class QnnAOTEnv {
- public:
-  using ptr_t = std::shared_ptr<QnnAOTEnv>;
-
-  explicit QnnAOTEnv(const QcomTargetMachine& target_machine);
-
-  QnnAOTEnv(const std::string& lib_path, const QcomTargetMachine& target_machine);
-
-  std::shared_ptr<QnnDeviceAndContext> createContext(const std::string& name, bool weights_sharing = false);
-
-  void saveContext(const std::string& name, const std::string& path);
-
-  void destroyContext(const std::string& name);
-
-  // This is for All PUs, such as CPU, GPU, NPU
-  std::vector<QnnDevice_PlatformInfo_t*> createDevicePlatformInfo();
-
-  // This function is for NPU only.
-  std::vector<QnnDevice_CustomConfig_t> createDecideCustomConfigInfo();
-
-  std::vector<QnnContext_CustomConfig_t> createContextCustomConfig(bool weights_sharing);
-
-  // Functions for build qnn graphs
-  QnnAOTGraph::ptr_t captureAOTGraph(const std::string& qnn_context_name, const std::string& g_name);
-
-  void captureAOTNodeOp(const std::string& qnn_context_name, const std::string& graph_name,
-                        const QnnAOTNodeOperation::ptr_t& op);
-
-  QnnAOTNodeTensor::ptr_t captureQnnAOTNodeTensor(const std::string& qnn_context_name, const std::string& graph_name,
-                                                  const ir::tensor::TensorValue::ptr_t& v, bool force_static_weight = false);
-
-  inline QnnFuncSymbols& getFuncSymbol() { return qnn_htp_func_symbols_; }
-
-  std::shared_ptr<QnnDeviceAndContext> getContext(const std::string& name);
-
- private:
-  void _setup(const std::string& path = "");
-
-  QcomTargetMachine target_machine_;
-  QnnFuncSymbols qnn_htp_func_symbols_;
-  std::unordered_map<std::string, std::shared_ptr<QnnDeviceAndContext>> contexts_;
-
-  // device config for all to use
-  std::vector<QnnDevice_Config_t> target_machine_qnn_config_;
-  std::vector<const QnnDevice_Config_t*> target_machine_qnn_config_ptrs_;
-
-  // void* handle that should be freed when QnnAOTEnv end
-  std::vector<void*> unreachable_handle_;
-};
-
-}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot/README.md b/mllm/backends/qnn/aot/README.md
deleted file mode 100644
index d2d28d1d4..000000000
--- a/mllm/backends/qnn/aot/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Qnn AOT
-
-This is the Qnn AOT API for X86 platform to build executable qnn model. This is not depends on QNNBackend target.
diff --git a/mllm/backends/qnn/aot_rt/README.md b/mllm/backends/qnn/aot_rt/README.md
deleted file mode 100644
index f7930caee..000000000
--- a/mllm/backends/qnn/aot_rt/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# Runtime of AOT Models
diff --git a/mllm/ffi/CMakeLists.txt b/mllm/ffi/CMakeLists.txt
index 549d0a68f..c46d15af5 100644
--- a/mllm/ffi/CMakeLists.txt
+++ b/mllm/ffi/CMakeLists.txt
@@ -13,24 +13,11 @@ add_library(MllmFFIExtension SHARED
   ${CMAKE_CURRENT_LIST_DIR}/ModelService.cc
   ${CMAKE_CURRENT_LIST_DIR}/Nn.cc
   ${CMAKE_CURRENT_LIST_DIR}/Compile.cc
-  ${CMAKE_CURRENT_LIST_DIR}/qualcomm/QnnAOT.cc
 )
 target_link_libraries(MllmFFIExtension PUBLIC tvm_ffi_header)
 target_link_libraries(MllmFFIExtension PUBLIC tvm_ffi_shared MllmRT MllmCPUBackend)
 set_target_properties(MllmFFIExtension PROPERTIES PREFIX "")
 
-if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
-  # Build
-  target_include_directories(MllmFFIExtension PRIVATE
-    $ENV{QAIRT_SDK_ROOT}/include # QNN SDK include
-    $ENV{QAIRT_SDK_ROOT}/include/QNN # QNN SDK include
-  )
-  add_compile_definitions(
-    MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE
-  )
-endif()
-
-
 # Set the depend search path. Windows do not need this, it will search dlls in the same directory first.
 if(APPLE)
     set_target_properties(MllmFFIExtension PROPERTIES
diff --git a/mllm/ffi/Extension.cc b/mllm/ffi/Extension.cc
index cb999191d..6dfb51f99 100644
--- a/mllm/ffi/Extension.cc
+++ b/mllm/ffi/Extension.cc
@@ -370,10 +370,6 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                         });
 }
 
-//===----------------------------------------------------------------------===//
-// REGISTER: _Context Functions.
-//===----------------------------------------------------------------------===//
-
 //===----------------------------------------------------------------------===//
 // REGISTER: Quantize && Packing Functions.
 //===----------------------------------------------------------------------===//
diff --git a/mllm/ffi/qualcomm/QnnAOT.cc b/mllm/ffi/qualcomm/QnnAOT.cc
deleted file mode 100644
index e36cad641..000000000
--- a/mllm/ffi/qualcomm/QnnAOT.cc
+++ /dev/null
@@ -1,211 +0,0 @@
-// Copyright (c) MLLM Team.
-// Licensed under the MIT License.
-
-#include <tvm/ffi/any.h>
-#include <tvm/ffi/string.h>
-#include <tvm/ffi/container/shape.h>
-#include <tvm/ffi/container/tensor.h>
-#include <tvm/ffi/reflection/registry.h>
-#include <memory>
-
-#include "mllm/backends/qnn/aot/QnnTargetMachine.hpp"
-#include "mllm/ffi/qualcomm/QnnAOT.hh"
-
-#ifdef MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE
-
-TVM_FFI_STATIC_INIT_BLOCK() {
-  namespace refl = tvm::ffi::reflection;
-
-  refl::ObjectDef<::mllm::ffi::QcomHTPArchObj>();
-
-  refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.NONE", []() {
-    auto ret = mllm::qnn::aot::QcomHTPArch::NONE;
-    return mllm::ffi::QcomHTPArch(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V68", []() {
-    auto ret = mllm::qnn::aot::QcomHTPArch::V68;
-    return mllm::ffi::QcomHTPArch(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V69", []() {
-    auto ret = mllm::qnn::aot::QcomHTPArch::V69;
-    return mllm::ffi::QcomHTPArch(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V73", []() {
-    auto ret = mllm::qnn::aot::QcomHTPArch::V73;
-    return mllm::ffi::QcomHTPArch(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V75", []() {
-    auto ret = mllm::qnn::aot::QcomHTPArch::V75;
-    return mllm::ffi::QcomHTPArch(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V79", []() {
-    auto ret = mllm::qnn::aot::QcomHTPArch::V79;
-    return mllm::ffi::QcomHTPArch(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomHTPArch.V81", []() {
-    auto ret = mllm::qnn::aot::QcomHTPArch::V81;
-    return mllm::ffi::QcomHTPArch(ret);
-  });
-
-  refl::ObjectDef<::mllm::ffi::QcomChipsetObj>();
-
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.UNKNOWN_SM", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::UNKNOWN_SM;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SA8295", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SA8295;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8350", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SM8350;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8450", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SM8450;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8475", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SM8475;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8550", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SM8550;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8650", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SM8650;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8750", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SM8750;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SM8850", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SM8850;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SSG2115P", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SSG2115P;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SSG2125P", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SSG2125P;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SXR1230P", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SXR1230P;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SXR2230P", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SXR2230P;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SXR2330P", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SXR2330P;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.QCS9100", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::QCS9100;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SAR2230P", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SAR2230P;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SA8255", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SA8255;
-    return mllm::ffi::QcomChipset(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomChipset.SW6100", []() {
-    auto ret = mllm::qnn::aot::QcomChipset::SW6100;
-    return mllm::ffi::QcomChipset(ret);
-  });
-
-  refl::ObjectDef<::mllm::ffi::QcomTryBestPerformanceObj>();
-
-  refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpDefault", []() {
-    auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpDefault;
-    return mllm::ffi::QcomTryBestPerformance(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpSustainedHighPerformance", []() {
-    auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpSustainedHighPerformance;
-    return mllm::ffi::QcomTryBestPerformance(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpBurst", []() {
-    auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpBurst;
-    return mllm::ffi::QcomTryBestPerformance(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpHighPerformance", []() {
-    auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpHighPerformance;
-    return mllm::ffi::QcomTryBestPerformance(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpPowerSaver", []() {
-    auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpPowerSaver;
-    return mllm::ffi::QcomTryBestPerformance(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpLowPowerSaver", []() {
-    auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpLowPowerSaver;
-    return mllm::ffi::QcomTryBestPerformance(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpHighPowerSaver", []() {
-    auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpHighPowerSaver;
-    return mllm::ffi::QcomTryBestPerformance(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpLowBalanced", []() {
-    auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpLowBalanced;
-    return mllm::ffi::QcomTryBestPerformance(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomTryBestPerformance.HtpBalanced", []() {
-    auto ret = mllm::qnn::aot::QcomTryBestPerformance::kHtpBalanced;
-    return mllm::ffi::QcomTryBestPerformance(ret);
-  });
-
-  refl::ObjectDef<::mllm::ffi::QcomSecurityPDSessionObj>();
-
-  refl::GlobalDef().def("mllm.qualcomm.QcomSecurityPDSession.HtpUnsignedPd", []() {
-    auto ret = mllm::qnn::aot::QcomSecurityPDSession::kHtpUnsignedPd;
-    return mllm::ffi::QcomSecurityPDSession(ret);
-  });
-  refl::GlobalDef().def("mllm.qualcomm.QcomSecurityPDSession.HtpSignedPd", []() {
-    auto ret = mllm::qnn::aot::QcomSecurityPDSession::kHtpSignedPd;
-    return mllm::ffi::QcomSecurityPDSession(ret);
-  });
-
-  refl::ObjectDef<mllm::ffi::QcomTargetMachineObj>().def_static(
-      "__create__",
-      [](const mllm::ffi::QcomChipset& chipset, const mllm::ffi::QcomHTPArch& arch,
-         const mllm::ffi::QcomTryBestPerformance& perf, const mllm::ffi::QcomSecurityPDSession& pd_session, uint32_t htp_vtcm) {
-        auto tm = mllm::qnn::aot::QcomTargetMachine{
-            .soc_htp_chipset = chipset.get()->chipset_,
-            .soc_htp_arch = arch.get()->htp_arch_,
-            .soc_htp_performance = perf.get()->perf_,
-            .soc_htp_security_pd_session = pd_session.get()->pd_,
-            .soc_htp_vtcm_total_memory_size = htp_vtcm,
-        };
-        return ::mllm::ffi::QcomTargetMachine(tm);
-      });
-
-  refl::ObjectDef<mllm::ffi::QnnAOTEnvObj>().def_static(
-      "__create__", [](const mllm::ffi::QcomTargetMachine& machine, const std::string& path) -> mllm::ffi::QnnAOTEnv {
-        if (path.empty()) {
-          auto tm = machine.get()->target_machine_;
-          auto s = std::make_shared<::mllm::qnn::aot::QnnAOTEnv>(tm);
-          return ::mllm::ffi::QnnAOTEnv(s);
-        } else {
-          auto tm = machine.get()->target_machine_;
-          auto s = std::make_shared<::mllm::qnn::aot::QnnAOTEnv>(path, tm);
-          return ::mllm::ffi::QnnAOTEnv(s);
-        }
-      });
-
-  refl::ObjectDef<::mllm::ffi::QnnDeviceAndContextObj>();
-
-  refl::GlobalDef().def("mllm.qualcomm.QnnAOTEnv.createContext",
-                        [](const mllm::ffi::QnnAOTEnv& self, const std::string& name, bool weights_sharing) {
-                          auto s = self.get()->qnn_aot_env_ptr_->createContext(name, weights_sharing);
-                          return mllm::ffi::QnnDeviceAndContext(s);
-                        });
-}
-
-#endif
diff --git a/mllm/ffi/qualcomm/QnnAOT.hh b/mllm/ffi/qualcomm/QnnAOT.hh
deleted file mode 100644
index f0feb46f3..000000000
--- a/mllm/ffi/qualcomm/QnnAOT.hh
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright (c) MLLM Team.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include <tvm/ffi/object.h>
-#include <tvm/ffi/memory.h>
-#include <tvm/ffi/container/shape.h>
-#include <tvm/ffi/container/tensor.h>
-
-#ifdef MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE
-#include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp"
-#endif
-
-namespace mllm::ffi {
-
-#ifdef MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE
-
-//===----------------------------------------------------------------------===//
-// MLLM Parameter File Define
-//===----------------------------------------------------------------------===//
-class QnnAOTEnvObj : public tvm::ffi::Object {
- public:
-  ::mllm::qnn::aot::QnnAOTEnv::ptr_t qnn_aot_env_ptr_ = nullptr;
-
-  explicit QnnAOTEnvObj(const ::mllm::qnn::aot::QnnAOTEnv::ptr_t& ptr) : qnn_aot_env_ptr_(ptr) { MLLM_EMPTY_SCOPE; }
-
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QnnAOTEnv", QnnAOTEnvObj, tvm::ffi::Object);
-};
-
-class QnnAOTEnv : public tvm::ffi::ObjectRef {
- public:
-  explicit QnnAOTEnv(::mllm::qnn::aot::QnnAOTEnv::ptr_t& ptr) { data_ = tvm::ffi::make_object<QnnAOTEnvObj>(ptr); }
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QnnAOTEnv, tvm::ffi::ObjectRef, QnnAOTEnvObj);  // NOLINT
-};
-
-//===----------------------------------------------------------------------===//
-// MLLM QnnDeviceAndContext Define
-//===----------------------------------------------------------------------===//
-class QnnDeviceAndContextObj : public tvm::ffi::Object {
- public:
-  std::shared_ptr<::mllm::qnn::aot::QnnDeviceAndContext> qnn_device_and_context_ptr_ = nullptr;
-
-  explicit QnnDeviceAndContextObj(const std::shared_ptr<::mllm::qnn::aot::QnnDeviceAndContext>& ptr)
-      : qnn_device_and_context_ptr_(ptr) {
-    MLLM_EMPTY_SCOPE;
-  }
-
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QnnDeviceAndContext", QnnDeviceAndContextObj, tvm::ffi::Object);
-};
-
-class QnnDeviceAndContext : public tvm::ffi::ObjectRef {
- public:
-  explicit QnnDeviceAndContext(std::shared_ptr<::mllm::qnn::aot::QnnDeviceAndContext>& ptr) {
-    data_ = tvm::ffi::make_object<QnnDeviceAndContextObj>(ptr);
-  }
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QnnDeviceAndContext, tvm::ffi::ObjectRef, QnnDeviceAndContextObj);  // NOLINT
-};
-
-//===----------------------------------------------------------------------===//
-// MLLM QcomHTPArch Define
-//===----------------------------------------------------------------------===//
-class QcomHTPArchObj : public tvm::ffi::Object {
- public:
-  mllm::qnn::aot::QcomHTPArch htp_arch_;
-
-  explicit QcomHTPArchObj(const mllm::qnn::aot::QcomHTPArch& obj) : htp_arch_(obj) { MLLM_EMPTY_SCOPE; }
-
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomHTPArch", QcomHTPArchObj, tvm::ffi::Object);
-};
-
-class QcomHTPArch : public tvm::ffi::ObjectRef {
- public:
-  explicit QcomHTPArch(mllm::qnn::aot::QcomHTPArch& ptr) { data_ = tvm::ffi::make_object<QcomHTPArchObj>(ptr); }
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomHTPArch, tvm::ffi::ObjectRef, QcomHTPArchObj);  // NOLINT
-};
-
-//===----------------------------------------------------------------------===//
-// MLLM QcomChipset Define
-//===----------------------------------------------------------------------===//
-class QcomChipsetObj : public tvm::ffi::Object {
- public:
-  mllm::qnn::aot::QcomChipset chipset_;
-
-  explicit QcomChipsetObj(const mllm::qnn::aot::QcomChipset& obj) : chipset_(obj) { MLLM_EMPTY_SCOPE; }
-
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomChipset", QcomChipsetObj, tvm::ffi::Object);
-};
-
-class QcomChipset : public tvm::ffi::ObjectRef {
- public:
-  explicit QcomChipset(mllm::qnn::aot::QcomChipset& ptr) { data_ = tvm::ffi::make_object<QcomChipsetObj>(ptr); }
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomChipset, tvm::ffi::ObjectRef, QcomChipsetObj);  // NOLINT
-};
-
-//===----------------------------------------------------------------------===//
-// MLLM QcomTryBestPerformance Define
-//===----------------------------------------------------------------------===//
-class QcomTryBestPerformanceObj : public tvm::ffi::Object {
- public:
-  mllm::qnn::aot::QcomTryBestPerformance perf_;
-
-  explicit QcomTryBestPerformanceObj(const mllm::qnn::aot::QcomTryBestPerformance& obj) : perf_(obj) { MLLM_EMPTY_SCOPE; }
-
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomTryBestPerformance", QcomTryBestPerformanceObj, tvm::ffi::Object);
-};
-
-class QcomTryBestPerformance : public tvm::ffi::ObjectRef {
- public:
-  explicit QcomTryBestPerformance(mllm::qnn::aot::QcomTryBestPerformance& ptr) {
-    data_ = tvm::ffi::make_object<QcomTryBestPerformanceObj>(ptr);
-  }
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomTryBestPerformance, tvm::ffi::ObjectRef, QcomTryBestPerformanceObj);  // NOLINT
-};
-
-//===----------------------------------------------------------------------===//
-// MLLM QcomSecurityPDSession Define
-//===----------------------------------------------------------------------===//
-class QcomSecurityPDSessionObj : public tvm::ffi::Object {
- public:
-  mllm::qnn::aot::QcomSecurityPDSession pd_;
-
-  explicit QcomSecurityPDSessionObj(const mllm::qnn::aot::QcomSecurityPDSession& obj) : pd_(obj) { MLLM_EMPTY_SCOPE; }
-
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomSecurityPDSession", QcomSecurityPDSessionObj, tvm::ffi::Object);
-};
-
-class QcomSecurityPDSession : public tvm::ffi::ObjectRef {
- public:
-  explicit QcomSecurityPDSession(mllm::qnn::aot::QcomSecurityPDSession& ptr) {
-    data_ = tvm::ffi::make_object<QcomSecurityPDSessionObj>(ptr);
-  }
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomSecurityPDSession, tvm::ffi::ObjectRef, QcomSecurityPDSessionObj);  // NOLINT
-};
-
-//===----------------------------------------------------------------------===//
-// MLLM QcomTargetMachine Define
-//===----------------------------------------------------------------------===//
-class QcomTargetMachineObj : public tvm::ffi::Object {
- public:
-  mllm::qnn::aot::QcomTargetMachine target_machine_;
-
-  explicit QcomTargetMachineObj(const mllm::qnn::aot::QcomTargetMachine& obj) : target_machine_(obj) { MLLM_EMPTY_SCOPE; }
-
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("mllm.qualcomm.QcomTargetMachine", QcomTargetMachineObj, tvm::ffi::Object);
-};
-
-class QcomTargetMachine : public tvm::ffi::ObjectRef {
- public:
-  explicit QcomTargetMachine(mllm::qnn::aot::QcomTargetMachine& ptr) {
-    data_ = tvm::ffi::make_object<QcomTargetMachineObj>(ptr);
-  }
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(QcomTargetMachine, tvm::ffi::ObjectRef, QcomTargetMachineObj);  // NOLINT
-};
-
-#endif
-
-}  // namespace mllm::ffi
diff --git a/mllm/models/smollm3_3B/modeling_smollm3.hpp b/mllm/models/smollm3_3B/modeling_smollm3.hpp
index 3d7ee4a2f..90798387d 100644
--- a/mllm/models/smollm3_3B/modeling_smollm3.hpp
+++ b/mllm/models/smollm3_3B/modeling_smollm3.hpp
@@ -157,7 +157,7 @@ class Smollm3Attention final : public nn::Module {
     auto [key_states_result, value_states_result] = past_kv_cache->updateKVCache(layer_idx_, key_states, value_states);
     key_states = std::move(key_states_result);
     value_states = std::move(value_states_result);
-
+    
     Tensor attn;
     if (key_states.dtype() == kFloat32) {
       attn = nn::functional::matmul(query_states, key_states, false, true) * (1.f / sqrtf(head_dim_));
diff --git a/mllm/nn/Functional.cpp b/mllm/nn/Functional.cpp
index 4e70b092a..e1e015432 100644
--- a/mllm/nn/Functional.cpp
+++ b/mllm/nn/Functional.cpp
@@ -7,6 +7,7 @@
 #include "mllm/core/aops/FlashAttention2Op.hpp"
 #include "mllm/core/aops/GatherOp.hpp"
 #include "mllm/core/aops/MatMulOp.hpp"
+#include "mllm/core/aops/LinearOp.hpp"
 #include "mllm/core/aops/ReduceOps.hpp"
 #include "mllm/core/aops/Scatter2ShardsOp.hpp"
 #include "mllm/core/aops/SigmoidOp.hpp"
@@ -16,6 +17,7 @@
 #include "mllm/core/aops/ViewOp.hpp"
 #include "mllm/core/aops/TopKOp.hpp"
 #include "mllm/core/aops/SiLUOp.hpp"
+#include "mllm/core/aops/RMSNormOp.hpp"
 #include "mllm/core/aops/PadOp.hpp"
 #include "mllm/core/aops/MaskedScatterOp.hpp"
 #include "mllm/core/aops/InterpolateOp.hpp"
@@ -33,6 +35,16 @@ Tensor matmul(const Tensor& A, const Tensor& B, bool transpose_A, bool transpose
       {A, B})[0];
 }
 
+// Functional linear: submits a kLinear op (redirect option set) on {x, weight}, adding bias only when non-nil.
+Tensor linear(const Tensor& x, const Tensor& weight, const Tensor& bias) {
+  aops::LinearOpOptions linear_opts{};
+  linear_opts.setRedirect(true);
+  // A nil bias is left out of the op's input list instead of being forwarded; the op's first output is returned.
+  const bool has_bias = !bias.isNil();
+  if (!has_bias) { return Context::instance().buildOpAndSubmitTask(OpTypes::kLinear, linear_opts, {x, weight})[0]; }
+  return Context::instance().buildOpAndSubmitTask(OpTypes::kLinear, linear_opts, {x, weight, bias})[0];
+}
+
 Tensor view(const Tensor& x, const std::vector<int32_t>& shape) {
   return Context::instance().buildOpAndSubmitTask(OpTypes::kView, aops::ViewOpOptions{.to_shape = shape}, {x})[0];
 }
@@ -126,6 +138,11 @@ Tensor silu_(const Tensor& x) {
   return Context::instance().buildOpAndSubmitTask(OpTypes::kSiLU, opt, {x})[0];
 }
 
+Tensor rmsNorm(const Tensor& x, const Tensor& weight, float epsilon, bool add_unit_offset) {  // kRMSNorm wrapper
+  aops::RMSNormOpOptions norm_opts{.epsilon = epsilon, .add_unit_offset = add_unit_offset};  // per-call op config
+  return Context::instance().buildOpAndSubmitTask(OpTypes::kRMSNorm, norm_opts, {x, weight})[0];  // first output
+}
+
 void scatter2Shards(const Tensor& src, const Tensor& shards_pointer, int32_t dim) {
   Context::instance().buildOpAndSubmitTask(OpTypes::kScatter2Shards, aops::Scatter2ShardsOpOptions{.dim = dim},
                                            {src, shards_pointer});
diff --git a/mllm/nn/Functional.hpp b/mllm/nn/Functional.hpp
index 31a57812c..c85b716e9 100644
--- a/mllm/nn/Functional.hpp
+++ b/mllm/nn/Functional.hpp
@@ -20,6 +20,8 @@ namespace mllm::nn::functional {
 Tensor matmul(const Tensor& A, const Tensor& B, bool transpose_A = false, bool transpose_B = false,
               aops::MatMulOpType type = aops::MatMulOpType::kDefault);
 
+Tensor linear(const Tensor& x, const Tensor& weight, const Tensor& bias = Tensor());
+
 Tensor view(const Tensor& x, const std::vector<int32_t>& shape);
 
 std::vector<Tensor> split(const Tensor& x, int32_t split_size_or_sections, int32_t dim);
@@ -131,6 +133,8 @@ Tensor mean(const Tensor& x, int32_t dim = std::numeric_limits<int32_t>::max(),
 Tensor silu(const Tensor& x);
 Tensor silu_(const Tensor& x);
 
+Tensor rmsNorm(const Tensor& x, const Tensor& weight, float epsilon = 1e-5f, bool add_unit_offset = false);
+
 void scatter2Shards(const Tensor& src, const Tensor& shards_pointer, int32_t dim);
 
 // If you want causal mask attention. Use Flash attention instead.
diff --git a/pymllm/backends/__init__.py b/pymllm/backends/__init__.py
deleted file mode 100644
index 5e926d580..000000000
--- a/pymllm/backends/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) MLLM Team.
-# Licensed under the MIT License.
-
-from . import cuda, qualcomm
diff --git a/pymllm/backends/cuda/__init__.py b/pymllm/backends/cuda/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/pymllm/backends/qualcomm/README.md b/pymllm/backends/qualcomm/README.md
deleted file mode 100644
index 27122dbc2..000000000
--- a/pymllm/backends/qualcomm/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# Qualcomm Qnn AOT API
diff --git a/pymllm/backends/qualcomm/__init__.py b/pymllm/backends/qualcomm/__init__.py
deleted file mode 100644
index bcd9c95de..000000000
--- a/pymllm/backends/qualcomm/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) MLLM Team.
-# Licensed under the MIT License.
-
-from . import qnn_aot_env
-from . import transformers
diff --git a/pymllm/backends/qualcomm/nn.py b/pymllm/backends/qualcomm/nn.py
deleted file mode 100644
index 0ba9aef55..000000000
--- a/pymllm/backends/qualcomm/nn.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from pymllm.nn._layers import Softmax, RoPE
-
-
-class QnnSoftmax(Softmax):
-    def __init__(self):
-        super().__init__()
-
-
-class QnnRoPE(RoPE):
-    def __init__(self):
-        super().__init__()
diff --git a/pymllm/backends/qualcomm/qnn_aot_env.py b/pymllm/backends/qualcomm/qnn_aot_env.py
deleted file mode 100644
index 8b0c0d2e1..000000000
--- a/pymllm/backends/qualcomm/qnn_aot_env.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from pymllm.ffi import is_qnn_aot_on_x86_enabled
-
-if is_qnn_aot_on_x86_enabled():
-    from pymllm.ffi import (
-        QnnDeviceAndContext,
-        QnnAOTEnv,
-        QcomChipset,
-        QcomHTPArch,
-        QcomSecurityPDSession,
-        QcomTargetMachine,
-        QcomTryBestPerformance,
-    )
-else:
-    # Define placeholder classes when QNN AOT is not enabled
-    QnnDeviceAndContext = None
-    QnnAOTEnv = None
-    QcomChipset = None
-    QcomHTPArch = None
-    QcomSecurityPDSession = None
-    QcomTargetMachine = None
-    QcomTryBestPerformance = None
\ No newline at end of file
diff --git a/pymllm/compile/mllm_ir/trace.py b/pymllm/compile/mllm_ir/trace.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/pymllm/nn/_layers.py b/pymllm/nn/_layers.py
index 5adc79cbf..cf5ec37ea 100644
--- a/pymllm/nn/_layers.py
+++ b/pymllm/nn/_layers.py
@@ -5,7 +5,7 @@
 from .. import ffi
 
 
-class _Layer:
+class Linear:
     def __init__(self):
         self.device: ffi.Device = ffi.cpu_()
         self.this_layer_name: str = None
diff --git a/pymllm/nn/_module.py b/pymllm/nn/_module.py
index a4f28e17c..f18c97460 100644
--- a/pymllm/nn/_module.py
+++ b/pymllm/nn/_module.py
@@ -2,7 +2,6 @@
 # Licensed under the MIT License.
 
 from .. import ffi
-from ._layers import _Layer
 
 
 class Module:
diff --git a/pymllm/tests/qualcomm/test_context_create.py b/pymllm/tests/qualcomm/test_context_create.py
deleted file mode 100644
index 18983daa7..000000000
--- a/pymllm/tests/qualcomm/test_context_create.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import pymllm as mllm
-from pymllm.backends.qualcomm.qnn_aot_env import (
-    QnnAOTEnv,
-    QnnDeviceAndContext,
-    QcomTryBestPerformance,
-    QcomSecurityPDSession,
-    QcomTargetMachine,
-    QcomChipset,
-    QcomHTPArch,
-)
-
-
-qnn_aot_env: QnnAOTEnv = QnnAOTEnv(
-    machine=QcomTargetMachine(
-        soc_htp_chipset=QcomChipset.SM8850(),
-        soc_htp_arch=QcomHTPArch.V81(),
-        soc_htp_performance=QcomTryBestPerformance.HtpBurst(),
-        soc_htp_security_pd_session=QcomSecurityPDSession.HtpUnsignedPd(),
-        soc_htp_vtcm=8,  # in MB
-    ),
-    path="/opt/qcom/aistack/qairt/2.41.0.251128/lib/x86_64-linux-clang/",
-)
-
-if __name__ == "__main__":
-    mllm.echo("Testing tvm-ffi compatibility")
-    qnn_context: QnnDeviceAndContext = qnn_aot_env.create_context(
-        "context.0", weights_sharing=False
-    )
diff --git a/pyproject.toml b/pyproject.toml
index 703d4456a..50a8eee99 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
 requires = [
-  "scikit-build-core>=0.11.0", "apache-tvm-ffi"
+  "scikit-build-core==0.10.0", "apache-tvm-ffi"
 ]
 build-backend = "scikit_build_core.build"
 
@@ -56,8 +56,6 @@ cmake.args = [
   "-DCMAKE_BUILD_TYPE=Release",
   "-DMLLM_ENABLE_PY_MLLM=on"
 ]
-sdist.exclude = [".*", ".*/*"]
-wheel.exclude = [".*", ".*/*"]
 minimum-version = "build-system.requires"
 
 # Build configuration
diff --git a/tasks/build_android.yaml b/tasks/build_android.yaml
index 2378738e9..e7c251ebe 100644
--- a/tasks/build_android.yaml
+++ b/tasks/build_android.yaml
@@ -2,10 +2,10 @@ Tasks:
   - CMakeConfigTask:
       cmake_cfg_path: "build-android-arm64-v8a"
       cmake_build_type: "Release"
-      cmake_toolchain_file: "$ANDROID_NDK_PATH/build/cmake/android.toolchain.cmake"
       cmake_extra_args:
         - "-DMLLM_CROSS_COMPILE=ON"
         - "-DMLLM_BUILD_ARM_BACKEND=ON"
+        - "-DMLLM_BUILD_ASCEND_BACKEND=ON"
         - "-DANDROID_PLATFORM=android-28"
         - "-DANDROID_ABI=arm64-v8a"
         - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=armv8.2-a+fp16+fp16fml+dotprod+i8mm;-ffast-math;-Wno-nan-infinity-disabled"'
diff --git a/tasks/build_x86.yaml b/tasks/build_x86.yaml
index 617f05f9c..a2b60952d 100644
--- a/tasks/build_x86.yaml
+++ b/tasks/build_x86.yaml
@@ -11,7 +11,6 @@ Tasks:
         - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=native"'
         - "-DMLLM_KERNEL_USE_THREADS=ON"
         - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=ON"
-        - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF"
 
   - CMakeBuildTask:
       cmake_cfg_path: "build"
diff --git a/tasks/build_x86_qnn_aot.yaml b/tasks/build_x86_qnn_aot.yaml
deleted file mode 100644
index d4809943f..000000000
--- a/tasks/build_x86_qnn_aot.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-Tasks:
-  - CMakeConfigTask:
-      cmake_cfg_path: "build-qnn-aot"
-      cmake_build_type: "RelWithDebInfo"
-      cmake_extra_args:
-        # Optional, If use Highway
-        - "-DHWY_ENABLE_TESTS=OFF"
-        - "-DHWY_ENABLE_EXAMPLES=OFF"
-        - "-DHWY_ENABLE_CONTRIB=OFF"
-        # Optional
-        - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=native"'
-        - "-DMLLM_KERNEL_USE_THREADS=ON"
-        - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=ON"
-        - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF"
-        - "-DMLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE=ON"
-
-  - CMakeBuildTask:
-      cmake_cfg_path: "build-qnn-aot"
diff --git a/tests/ascend/AscendAttentionKernelTest.hpp b/tests/ascend/AscendAttentionKernelTest.hpp
new file mode 100644
index 000000000..37a9b93c3
--- /dev/null
+++ b/tests/ascend/AscendAttentionKernelTest.hpp
@@ -0,0 +1,576 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+#include <limits>
+
+class AscendAttentionKernelTest : public KernelTest {
+ public:
+  AscendAttentionKernelTest() = default;
+  ~AscendAttentionKernelTest() override = default;
+
+  // Test Scaled Dot-Product Attention using existing operators
+  // Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V
+  // Returns false at the first case that fails allClose(5e-2f, 5e-2f) against the CPU reference.
+  bool ScaledDotProductAttentionFloat16Test(const std::vector<std::tuple<mllm::Tensor::shape_t, mllm::Tensor::shape_t, mllm::Tensor::shape_t>>& test_cases) {
+    using namespace mllm;  // NOLINT
+
+    for (const auto& [q_shape, k_shape, v_shape] : test_cases) {
+      // Validate shapes: Q=[B, S_q, D], K=[B, S_kv, D], V=[B, S_kv, D]
+      MLLM_RT_ASSERT_EQ(q_shape.size(), 3);
+      MLLM_RT_ASSERT_EQ(k_shape.size(), 3);
+      MLLM_RT_ASSERT_EQ(v_shape.size(), 3);
+      MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]);  // Same batch size
+      MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]);
+      MLLM_RT_ASSERT_EQ(q_shape[2], k_shape[2]);  // Same D dimension
+      MLLM_RT_ASSERT_EQ(q_shape[2], v_shape[2]);  // Reference indexes V with Q's D
+      MLLM_RT_ASSERT_EQ(k_shape[1], v_shape[1]);  // K and V have same sequence length
+
+      int32_t B = static_cast<int32_t>(q_shape[0]);
+      int32_t S_q = static_cast<int32_t>(q_shape[1]);
+      int32_t S_kv = static_cast<int32_t>(k_shape[1]);
+      int32_t D = static_cast<int32_t>(q_shape[2]);
+      // Scaling factor 1/sqrt(d_k), shared by the CPU reference and the Ascend path.
+      const float scale = 1.0f / std::sqrt(static_cast<float>(D));
+
+      // 1. Create random FP16 inputs on CPU
+      Tensor Q_cpu = Tensor::random(q_shape, -1.0f, 1.0f, kFloat16, kCPU);
+      Tensor K_cpu = Tensor::random(k_shape, -1.0f, 1.0f, kFloat16, kCPU);
+      Tensor V_cpu = Tensor::random(v_shape, -1.0f, 1.0f, kFloat16, kCPU);
+
+      // 2. Compute reference result on CPU using FP32 for better precision
+      Tensor Q_cpu_fp32 = Tensor::zeros(q_shape, kFloat32, kCPU);
+      Tensor K_cpu_fp32 = Tensor::zeros(k_shape, kFloat32, kCPU);
+      Tensor V_cpu_fp32 = Tensor::zeros(v_shape, kFloat32, kCPU);
+
+      // Convert FP16 to FP32
+      {
+        auto* q_fp16 = Q_cpu.ptr<mllm_fp16_t>();
+        auto* k_fp16 = K_cpu.ptr<mllm_fp16_t>();
+        auto* v_fp16 = V_cpu.ptr<mllm_fp16_t>();
+        auto* q_fp32 = Q_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* k_fp32 = K_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* v_fp32 = V_cpu_fp32.ptr<mllm_fp32_t>();
+
+        for (size_t i = 0; i < Q_cpu.numel(); ++i) {
+          q_fp32[i] = MLLM_FP16_TO_FP32(q_fp16[i]);
+        }
+        for (size_t i = 0; i < K_cpu.numel(); ++i) {
+          k_fp32[i] = MLLM_FP16_TO_FP32(k_fp16[i]);
+        }
+        for (size_t i = 0; i < V_cpu.numel(); ++i) {
+          v_fp32[i] = MLLM_FP16_TO_FP32(v_fp16[i]);
+        }
+      }
+
+      // Compute reference attention on CPU (FP32)
+      Tensor ref_cpu_fp32 = Tensor::zeros({B, S_q, D}, kFloat32, kCPU);
+      {
+        auto* q_ptr = Q_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* k_ptr = K_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* v_ptr = V_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* out_ptr = ref_cpu_fp32.ptr<mllm_fp32_t>();
+
+        for (int32_t b = 0; b < B; ++b) {
+          // Compute Q @ K^T for this batch
+          std::vector<float> scores(S_q * S_kv, 0.0f);
+
+          for (int32_t i = 0; i < S_q; ++i) {
+            for (int32_t j = 0; j < S_kv; ++j) {
+              float sum = 0.0f;
+              for (int32_t k = 0; k < D; ++k) {
+                float q_val = q_ptr[b * S_q * D + i * D + k];
+                float k_val = k_ptr[b * S_kv * D + j * D + k];
+                sum += q_val * k_val;
+              }
+              scores[i * S_kv + j] = sum * scale;
+            }
+          }
+
+          // Apply softmax along the last dimension (S_kv)
+          std::vector<float> attn_weights(S_q * S_kv);
+          for (int32_t i = 0; i < S_q; ++i) {
+            // Find max for numerical stability
+            float max_val = -std::numeric_limits<float>::infinity();
+            for (int32_t j = 0; j < S_kv; ++j) {
+              max_val = std::max(max_val, scores[i * S_kv + j]);
+            }
+
+            // Compute exp and sum
+            float sum_exp = 0.0f;
+            for (int32_t j = 0; j < S_kv; ++j) {
+              float exp_val = std::exp(scores[i * S_kv + j] - max_val);
+              attn_weights[i * S_kv + j] = exp_val;
+              sum_exp += exp_val;
+            }
+
+            // Normalize
+            for (int32_t j = 0; j < S_kv; ++j) {
+              attn_weights[i * S_kv + j] /= sum_exp;
+            }
+          }
+
+          // Compute output: attn_weights @ V
+          // out[S_q, D] = attn_weights[S_q, S_kv] @ V[S_kv, D]
+          for (int32_t i = 0; i < S_q; ++i) {
+            for (int32_t k = 0; k < D; ++k) {
+              float sum = 0.0f;
+              for (int32_t j = 0; j < S_kv; ++j) {
+                float attn_val = attn_weights[i * S_kv + j];
+                float v_val = v_ptr[b * S_kv * D + j * D + k];
+                sum += attn_val * v_val;
+              }
+              out_ptr[b * S_q * D + i * D + k] = sum;
+            }
+          }
+        }
+      }
+
+      // Convert reference back to FP16
+      Tensor ref_cpu = Tensor::zeros({B, S_q, D}, kFloat16, kCPU);
+      {
+        auto* ref_fp32 = ref_cpu_fp32.ptr<mllm_fp32_t>();
+        auto* ref_fp16 = ref_cpu.ptr<mllm_fp16_t>();
+        for (size_t i = 0; i < ref_cpu.numel(); ++i) {
+          ref_fp16[i] = MLLM_FP32_TO_FP16(ref_fp32[i]);
+        }
+      }
+
+      // 3. Move inputs to Ascend and compute attention using existing operators
+      auto Q_ascend = Q_cpu.to(kAscend);
+      auto K_ascend = K_cpu.to(kAscend);
+      auto V_ascend = V_cpu.to(kAscend);
+
+      // Step 1: Q @ K^T (transpose_b=true)
+      auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true);
+
+      // Step 2: Scale by 1/sqrt(d_k)
+      auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU) * scale;
+      auto scale_tensor = scale_tensor_cpu.to(kAscend);
+      auto scaled_scores = scores * scale_tensor;
+
+      // Step 3: Softmax along last dimension
+      auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1);
+
+      // Step 4: attn_weights @ V
+      auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false);
+
+      // 4. Move result back to CPU and compare
+      auto output_cpu = output_ascend.to(kCPU);
+
+      auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f);
+      if (!result.is_close) {
+        MLLM_ERROR("Attention test failed for shape Q=[{},{},{}], K=[{},{},{}], V=[{},{},{}]",
+                   q_shape[0], q_shape[1], q_shape[2],
+                   k_shape[0], k_shape[1], k_shape[2],
+                   v_shape[0], v_shape[1], v_shape[2]);
+        MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}",
+                   result.max_absolute_diff, result.max_relative_diff);
+        return false;
+      }
+    }
+    return true;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Multi-Head Attention with optional Causal Mask
+  // Input shapes: Q=[B, H, S_q, D], K=[B, H, S_kv, D], V=[B, H, S_kv, D]
+  // where H = num_heads, D = head_dim
+  // Mask shape: [1, 1, S_q, S_kv] (broadcastable to [B, H, S_q, S_kv])
+  // Attention(Q, K, V, mask) = softmax(Q @ K^T / sqrt(d_k) + mask) @ V
+  // Returns false at the first case that fails allClose(5e-2f, 5e-2f) against the CPU reference.
+  //===----------------------------------------------------------------------===//
+  bool MultiHeadAttentionFloat16Test(
+      const std::vector<std::tuple<
+          mllm::Tensor::shape_t,  // Q shape [B, H, S_q, D]
+          mllm::Tensor::shape_t,  // K shape [B, H, S_kv, D]
+          mllm::Tensor::shape_t,  // V shape [B, H, S_kv, D]
+          bool                     // use_causal_mask
+      >>& test_cases) {
+    using namespace mllm;  // NOLINT
+
+    for (const auto& [q_shape, k_shape, v_shape, use_mask] : test_cases) {
+      // Validate shapes: Q=[B, H, S_q, D], K=[B, H, S_kv, D], V=[B, H, S_kv, D]
+      MLLM_RT_ASSERT_EQ(q_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(k_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(v_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]);  // Same batch size
+      MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]);
+      MLLM_RT_ASSERT_EQ(q_shape[1], k_shape[1]);  // Same num_heads
+      MLLM_RT_ASSERT_EQ(q_shape[1], v_shape[1]);
+      MLLM_RT_ASSERT_EQ(q_shape[3], k_shape[3]);  // Same head_dim
+      MLLM_RT_ASSERT_EQ(q_shape[3], v_shape[3]);  // CPU reference indexes V with Q's head_dim
+      MLLM_RT_ASSERT_EQ(k_shape[2], v_shape[2]);  // K and V have same sequence length
+
+      int32_t B = static_cast<int32_t>(q_shape[0]);
+      int32_t H = static_cast<int32_t>(q_shape[1]);  // num_heads
+      int32_t S_q = static_cast<int32_t>(q_shape[2]);
+      int32_t S_kv = static_cast<int32_t>(k_shape[2]);
+      int32_t D = static_cast<int32_t>(q_shape[3]);  // head_dim
+
+      // 1. Create random FP16 inputs on CPU
+      Tensor Q_cpu = Tensor::random(q_shape, -0.5f, 0.5f, kFloat16, kCPU);
+      Tensor K_cpu = Tensor::random(k_shape, -0.5f, 0.5f, kFloat16, kCPU);
+      Tensor V_cpu = Tensor::random(v_shape, -0.5f, 0.5f, kFloat16, kCPU);
+
+      // 2. Create causal mask if needed
+      // Causal mask: mask[i, j] = 0 if j <= i + (S_kv - S_q), else a large negative value
+      Tensor mask_cpu;
+      if (use_mask) {
+        mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU);
+        auto* mask_ptr = mask_cpu.ptr<mllm_fp16_t>();
+        const int32_t offset = S_kv - S_q;  // loop-invariant, so computed once
+
+        // Fill causal mask: entries with j > i + offset get -10000 (finite stand-in for -inf)
+        for (int32_t i = 0; i < S_q; ++i) {
+          for (int32_t j = 0; j < S_kv; ++j) {
+            if (j > i + offset) {
+              mask_ptr[i * S_kv + j] = MLLM_FP32_TO_FP16(-10000.0f);
+            }
+          }
+        }
+      }
+
+      // 3. Compute reference result on CPU using FP32 for better precision
+      Tensor ref_cpu = computeMultiHeadAttentionCPU(Q_cpu, K_cpu, V_cpu, mask_cpu, use_mask);
+
+      // 4. Move inputs to Ascend and compute attention
+      auto Q_ascend = Q_cpu.to(kAscend);
+      auto K_ascend = K_cpu.to(kAscend);
+      auto V_ascend = V_cpu.to(kAscend);
+
+      float scale = 1.0f / std::sqrt(static_cast<float>(D));
+
+      // Step 1: Q @ K^T (transpose_b=true)
+      auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true);
+
+      // Step 2: Scale by 1/sqrt(d_k)
+      auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU);
+      {
+        auto* scale_ptr = scale_tensor_cpu.ptr<mllm_fp16_t>();
+        scale_ptr[0] = MLLM_FP32_TO_FP16(scale);
+      }
+      auto scale_tensor = scale_tensor_cpu.to(kAscend);
+      auto scaled_scores = scores * scale_tensor;
+
+      // Step 3: Add mask if needed (broadcasting: [1, 1, S_q, S_kv] -> [B, H, S_q, S_kv])
+      if (use_mask) {
+        auto mask_ascend = mask_cpu.to(kAscend);
+        scaled_scores = scaled_scores + mask_ascend;
+      }
+
+      // Step 4: Softmax along last dimension
+      auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1);
+
+      // Step 5: attn_weights @ V
+      // [B, H, S_q, S_kv] @ [B, H, S_kv, D] -> [B, H, S_q, D]
+      auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false);
+
+      // 5. Move result back to CPU and compare
+      auto output_cpu = output_ascend.to(kCPU);
+
+      auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f);
+      if (!result.is_close) {
+        MLLM_ERROR("Multi-head attention test failed for shape Q=[{},{},{},{}], K=[{},{},{},{}], V=[{},{},{},{}], mask={}",
+                   q_shape[0], q_shape[1], q_shape[2], q_shape[3],
+                   k_shape[0], k_shape[1], k_shape[2], k_shape[3],
+                   v_shape[0], v_shape[1], v_shape[2], v_shape[3],
+                   use_mask ? "true" : "false");
+        MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}",
+                   result.max_absolute_diff, result.max_relative_diff);
+        return false;
+      }
+
+      MLLM_INFO("Multi-head attention test passed: B={}, H={}, S_q={}, S_kv={}, D={}, mask={}",
+                B, H, S_q, S_kv, D, use_mask ? "true" : "false");
+    }
+    return true;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Multi-Head Attention with Grouped Query Attention (GQA) support
+  // GQA: num_q_heads > num_kv_heads, each KV head is shared by multiple Q heads
+  // Input shapes: Q=[B, H_q, S_q, D], K=[B, H_kv, S_kv, D], V=[B, H_kv, S_kv, D]
+  // Returns false at the first case that fails allClose(5e-2f, 5e-2f) against the CPU reference.
+  //===----------------------------------------------------------------------===//
+  bool GroupedQueryAttentionFloat16Test(
+      const std::vector<std::tuple<
+          mllm::Tensor::shape_t,  // Q shape [B, H_q, S_q, D]
+          mllm::Tensor::shape_t,  // K shape [B, H_kv, S_kv, D]
+          mllm::Tensor::shape_t,  // V shape [B, H_kv, S_kv, D]
+          bool                     // use_causal_mask
+      >>& test_cases) {
+    using namespace mllm;  // NOLINT
+
+    for (const auto& [q_shape, k_shape, v_shape, use_mask] : test_cases) {
+      // Validate shapes
+      MLLM_RT_ASSERT_EQ(q_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(k_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(v_shape.size(), 4);
+      MLLM_RT_ASSERT_EQ(q_shape[0], k_shape[0]);  // Same batch size
+      MLLM_RT_ASSERT_EQ(q_shape[0], v_shape[0]);
+      MLLM_RT_ASSERT_EQ(k_shape[1], v_shape[1]);  // KV have same num_heads
+      MLLM_RT_ASSERT_EQ(q_shape[3], k_shape[3]);  // Same head_dim
+      MLLM_RT_ASSERT_EQ(q_shape[3], v_shape[3]);  // CPU reference indexes V with Q's head_dim
+      MLLM_RT_ASSERT_EQ(k_shape[2], v_shape[2]);  // K and V have same sequence length
+
+      int32_t B = static_cast<int32_t>(q_shape[0]);
+      int32_t H_q = static_cast<int32_t>(q_shape[1]);   // num query heads
+      int32_t H_kv = static_cast<int32_t>(k_shape[1]);  // num KV heads
+      int32_t S_q = static_cast<int32_t>(q_shape[2]);
+      int32_t S_kv = static_cast<int32_t>(k_shape[2]);
+      int32_t D = static_cast<int32_t>(q_shape[3]);
+      MLLM_RT_ASSERT_EQ(H_q % H_kv, 0);  // each KV head must serve a whole number of Q heads
+      int32_t num_groups = H_q / H_kv;
+
+      // 1. Create random FP16 inputs on CPU
+      Tensor Q_cpu = Tensor::random(q_shape, -0.5f, 0.5f, kFloat16, kCPU);
+      Tensor K_cpu = Tensor::random(k_shape, -0.5f, 0.5f, kFloat16, kCPU);
+      Tensor V_cpu = Tensor::random(v_shape, -0.5f, 0.5f, kFloat16, kCPU);
+
+      // 2. Create causal mask if needed
+      Tensor mask_cpu;
+      if (use_mask) {
+        mask_cpu = Tensor::zeros({1, 1, S_q, S_kv}, kFloat16, kCPU);
+        auto* mask_ptr = mask_cpu.ptr<mllm_fp16_t>();
+        int32_t offset = S_kv - S_q;
+        for (int32_t i = 0; i < S_q; ++i) {
+          for (int32_t j = 0; j < S_kv; ++j) {
+            if (j > i + offset) {
+              mask_ptr[i * S_kv + j] = MLLM_FP32_TO_FP16(-10000.0f);
+            }
+          }
+        }
+      }
+
+      // 3. Compute reference on CPU
+      Tensor ref_cpu = computeGQACPU(Q_cpu, K_cpu, V_cpu, mask_cpu, use_mask, num_groups);
+
+      // 4. Compute on Ascend
+      auto Q_ascend = Q_cpu.to(kAscend);
+      auto K_cpu_expanded = repeatKVHeads(K_cpu, num_groups);
+      auto V_cpu_expanded = repeatKVHeads(V_cpu, num_groups);
+      auto K_ascend = K_cpu_expanded.to(kAscend);
+      auto V_ascend = V_cpu_expanded.to(kAscend);
+
+      float scale = 1.0f / std::sqrt(static_cast<float>(D));
+
+      // Q @ K^T
+      auto scores = mllm::nn::functional::matmul(Q_ascend, K_ascend, false, true);
+
+      // Scale
+      auto scale_tensor_cpu = Tensor::ones({1}, kFloat16, kCPU);
+      {
+        auto* scale_ptr = scale_tensor_cpu.ptr<mllm_fp16_t>();
+        scale_ptr[0] = MLLM_FP32_TO_FP16(scale);
+      }
+      auto scaled_scores = scores * scale_tensor_cpu.to(kAscend);
+
+      // Add mask
+      if (use_mask) {
+        scaled_scores = scaled_scores + mask_cpu.to(kAscend);
+      }
+
+      // Softmax
+      auto attn_weights = mllm::nn::functional::softmax(scaled_scores, -1);
+
+      // attn_weights @ V
+      auto output_ascend = mllm::nn::functional::matmul(attn_weights, V_ascend, false, false);
+
+      // 5. Compare
+      auto output_cpu = output_ascend.to(kCPU);
+      auto result = mllm::test::allClose(output_cpu, ref_cpu, 5e-2f, 5e-2f);
+      if (!result.is_close) {
+        MLLM_ERROR("GQA test failed: B={}, H_q={}, H_kv={}, S_q={}, S_kv={}, D={}, mask={}",
+                   B, H_q, H_kv, S_q, S_kv, D, use_mask ? "true" : "false");
+        MLLM_ERROR("Max absolute diff: {}, Max relative diff: {}",
+                   result.max_absolute_diff, result.max_relative_diff);
+        return false;
+      }
+
+      MLLM_INFO("GQA test passed: B={}, H_q={}, H_kv={}, S_q={}, S_kv={}, D={}, mask={}",
+                B, H_q, H_kv, S_q, S_kv, D, use_mask ? "true" : "false");
+    }
+    return true;
+  }
+
+ private:
+  //===----------------------------------------------------------------------===//
+  // Helper: Multi-Head Attention reference on CPU (FP16 in/out, FP32 accumulation)
+  //===----------------------------------------------------------------------===//
+  mllm::Tensor computeMultiHeadAttentionCPU(
+      const mllm::Tensor& Q_cpu,
+      const mllm::Tensor& K_cpu,
+      const mllm::Tensor& V_cpu,
+      const mllm::Tensor& mask_cpu,
+      bool use_mask) {
+    using namespace mllm;  // NOLINT
+    // Q: [B, H, S_q, D]; K, V indexed as [B, H, S_kv, D]; mask_cpu is only read when use_mask is true.
+    int32_t B = static_cast<int32_t>(Q_cpu.shape()[0]);
+    int32_t H = static_cast<int32_t>(Q_cpu.shape()[1]);
+    int32_t S_q = static_cast<int32_t>(Q_cpu.shape()[2]);
+    int32_t S_kv = static_cast<int32_t>(K_cpu.shape()[2]);
+    int32_t D = static_cast<int32_t>(Q_cpu.shape()[3]);
+
+    // Widen the FP16 inputs into FP32 scratch tensors (K and V are sized with Q's B, H and D)
+    Tensor Q_fp32 = Tensor::zeros({B, H, S_q, D}, kFloat32, kCPU);
+    Tensor K_fp32 = Tensor::zeros({B, H, S_kv, D}, kFloat32, kCPU);
+    Tensor V_fp32 = Tensor::zeros({B, H, S_kv, D}, kFloat32, kCPU);
+
+    auto* q_fp16 = Q_cpu.ptr<mllm_fp16_t>();
+    auto* k_fp16 = K_cpu.ptr<mllm_fp16_t>();
+    auto* v_fp16 = V_cpu.ptr<mllm_fp16_t>();
+    auto* q_fp32 = Q_fp32.ptr<mllm_fp32_t>();
+    auto* k_fp32 = K_fp32.ptr<mllm_fp32_t>();
+    auto* v_fp32 = V_fp32.ptr<mllm_fp32_t>();
+
+    for (size_t i = 0; i < Q_cpu.numel(); ++i) {
+      q_fp32[i] = MLLM_FP16_TO_FP32(q_fp16[i]);
+    }
+    for (size_t i = 0; i < K_cpu.numel(); ++i) {
+      k_fp32[i] = MLLM_FP16_TO_FP32(k_fp16[i]);
+    }
+    for (size_t i = 0; i < V_cpu.numel(); ++i) {
+      v_fp32[i] = MLLM_FP16_TO_FP32(v_fp16[i]);
+    }
+
+    // The mask stays FP16 here; each entry is widened to FP32 when it is added to a score
+    const mllm_fp16_t* mask_fp16 = nullptr;
+    if (use_mask) {
+      mask_fp16 = mask_cpu.ptr<mllm_fp16_t>();
+    }
+
+    Tensor output_fp32 = Tensor::zeros({B, H, S_q, D}, kFloat32, kCPU);
+    auto* out_ptr = output_fp32.ptr<mllm_fp32_t>();
+
+    float scale = 1.0f / std::sqrt(static_cast<float>(D));
+
+    for (int32_t b = 0; b < B; ++b) {
+      for (int32_t h = 0; h < H; ++h) {
+        std::vector<float> scores(S_q * S_kv, 0.0f);
+        for (int32_t i = 0; i < S_q; ++i) {
+          for (int32_t j = 0; j < S_kv; ++j) {
+            float sum = 0.0f;
+            for (int32_t k = 0; k < D; ++k) {
+              float q_val = q_fp32[((b * H + h) * S_q + i) * D + k];
+              float k_val = k_fp32[((b * H + h) * S_kv + j) * D + k];
+              sum += q_val * k_val;
+            }
+            scores[i * S_kv + j] = sum * scale;
+
+            // Add mask: the same [S_q, S_kv] slice is reused for every batch and head
+            if (use_mask) {
+              float mask_val = MLLM_FP16_TO_FP32(mask_fp16[i * S_kv + j]);
+              scores[i * S_kv + j] += mask_val;
+            }
+          }
+        }
+
+        // Softmax along last dimension (row max is subtracted before exp)
+        std::vector<float> attn_weights(S_q * S_kv);
+        for (int32_t i = 0; i < S_q; ++i) {
+          float max_val = -std::numeric_limits<float>::infinity();
+          for (int32_t j = 0; j < S_kv; ++j) {
+            max_val = std::max(max_val, scores[i * S_kv + j]);
+          }
+
+          float sum_exp = 0.0f;
+          for (int32_t j = 0; j < S_kv; ++j) {
+            float exp_val = std::exp(scores[i * S_kv + j] - max_val);
+            attn_weights[i * S_kv + j] = exp_val;
+            sum_exp += exp_val;
+          }
+
+          for (int32_t j = 0; j < S_kv; ++j) {
+            attn_weights[i * S_kv + j] /= sum_exp;
+          }
+        }
+
+        // Compute output: attn_weights[S_q, S_kv] @ V[S_kv, D] for this (b, h)
+        for (int32_t i = 0; i < S_q; ++i) {
+          for (int32_t k = 0; k < D; ++k) {
+            float sum = 0.0f;
+            for (int32_t j = 0; j < S_kv; ++j) {
+              float attn_val = attn_weights[i * S_kv + j];
+              float v_val = v_fp32[((b * H + h) * S_kv + j) * D + k];
+              sum += attn_val * v_val;
+            }
+            out_ptr[((b * H + h) * S_q + i) * D + k] = sum;
+          }
+        }
+      }
+    }
+
+    // Convert output back to FP16 so it can be compared with the FP16 device result
+    Tensor output_fp16 = Tensor::zeros({B, H, S_q, D}, kFloat16, kCPU);
+    auto* out_fp16 = output_fp16.ptr<mllm_fp16_t>();
+    for (size_t i = 0; i < output_fp16.numel(); ++i) {
+      out_fp16[i] = MLLM_FP32_TO_FP16(out_ptr[i]);
+    }
+
+    return output_fp16;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Helper: Repeat KV heads for GQA. [B, H_kv, S, D] -> [B, H_q, S, D], H_q = H_kv * num_groups
+  // Elements are read and written as mllm_fp16_t; `kv` is returned unchanged when num_groups == 1.
+  //===----------------------------------------------------------------------===//
+  mllm::Tensor repeatKVHeads(const mllm::Tensor& kv, int32_t num_groups) {
+    using namespace mllm;  // NOLINT
+
+    if (num_groups == 1) {
+      return kv;
+    }
+
+    const int32_t B = static_cast<int32_t>(kv.shape()[0]);
+    const int32_t H_kv = static_cast<int32_t>(kv.shape()[1]);
+    const int32_t S = static_cast<int32_t>(kv.shape()[2]);
+    const int32_t D = static_cast<int32_t>(kv.shape()[3]);
+    const int32_t H_q = H_kv * num_groups;
+
+    Tensor expanded = Tensor::zeros({B, H_q, S, D}, kv.dtype(), kCPU);
+    auto* src = kv.ptr<mllm_fp16_t>();
+    auto* dst = expanded.ptr<mllm_fp16_t>();
+
+    // One [S, D] head slice is contiguous; offsets are built in size_t so they cannot overflow int32.
+    const size_t head_elems = static_cast<size_t>(S) * static_cast<size_t>(D);
+    for (int32_t b = 0; b < B; ++b) {
+      for (int32_t h_kv = 0; h_kv < H_kv; ++h_kv) {
+        const size_t src_base = (static_cast<size_t>(b) * H_kv + h_kv) * head_elems;
+        for (int32_t g = 0; g < num_groups; ++g) {
+          const int32_t h_q = h_kv * num_groups + g;  // query head served by KV head h_kv
+          const size_t dst_base = (static_cast<size_t>(b) * H_q + h_q) * head_elems;
+          for (size_t e = 0; e < head_elems; ++e) {
+            dst[dst_base + e] = src[src_base + e];
+          }
+        }
+      }
+    }
+
+    return expanded;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Helper: GQA reference on CPU, built on top of the MHA reference
+  //===----------------------------------------------------------------------===//
+  mllm::Tensor computeGQACPU(
+      const mllm::Tensor& Q_cpu,
+      const mllm::Tensor& K_cpu,
+      const mllm::Tensor& V_cpu,
+      const mllm::Tensor& mask_cpu,
+      bool use_mask,
+      int32_t num_groups) {
+    // Give every query head its own copy of the KV head it shares, then reuse the MHA path
+    const auto keys_per_q_head = repeatKVHeads(K_cpu, num_groups);
+    const auto values_per_q_head = repeatKVHeads(V_cpu, num_groups);
+    return computeMultiHeadAttentionCPU(Q_cpu, keys_per_q_head, values_per_q_head, mask_cpu, use_mask);
+  }
+};
diff --git a/tests/ascend/AscendKernelTest.hpp b/tests/ascend/AscendKernelTest.hpp
index 138ee5ae8..a01028906 100644
--- a/tests/ascend/AscendKernelTest.hpp
+++ b/tests/ascend/AscendKernelTest.hpp
@@ -48,5 +48,75 @@ class AscendKernelTest : public KernelTest {
     }
     return true;
   }
+
+  // Checks element-wise FP16 subtraction on Ascend against a CPU reference, one shape at a time
+  bool SubFloat16Test(const std::vector<mllm::Tensor::shape_t>& shapes) {
+    using namespace mllm;  // NOLINT
+    for (auto& cur_shape : shapes) {
+      // Random FP16 operands are generated on the host
+      Tensor lhs_host = Tensor::random(cur_shape, -3, 3, kFloat16, kCPU);
+      Tensor rhs_host = Tensor::random(cur_shape, -3, 3, kFloat16, kCPU);
+
+      // Expected values, computed on the host directly in the FP16 element type
+      Tensor expected = Tensor::zeros(cur_shape, kFloat16, kCPU);
+      {
+        auto* lhs = lhs_host.ptr<mllm_fp16_t>();
+        auto* rhs = rhs_host.ptr<mllm_fp16_t>();
+        auto* dst = expected.ptr<mllm_fp16_t>();
+        auto count = lhs_host.numel();
+        for (size_t idx = 0; idx < count; ++idx) {
+          dst[idx] = lhs[idx] - rhs[idx];
+        }
+      }
+
+      // Upload both operands and subtract on the Ascend device
+      auto lhs_dev = lhs_host.to(kAscend);
+      auto rhs_dev = rhs_host.to(kAscend);
+      auto diff_dev = lhs_dev - rhs_dev;
+
+      // Download the device result; the first mismatching shape fails the whole test
+      auto diff_host = diff_dev.to(kCPU);
+      auto verdict = mllm::test::allClose(diff_host, expected, 1e-2f, 1e-2f);
+      if (!verdict.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Checks element-wise FP16 multiplication on Ascend against a CPU reference, one shape at a time
+  bool MulFloat16Test(const std::vector<mllm::Tensor::shape_t>& shapes) {
+    using namespace mllm;  // NOLINT
+    for (auto& cur_shape : shapes) {
+      // Random FP16 operands are generated on the host
+      Tensor lhs_host = Tensor::random(cur_shape, -3, 3, kFloat16, kCPU);
+      Tensor rhs_host = Tensor::random(cur_shape, -3, 3, kFloat16, kCPU);
+
+      // Expected values, computed on the host directly in the FP16 element type
+      Tensor expected = Tensor::zeros(cur_shape, kFloat16, kCPU);
+      {
+        auto* lhs = lhs_host.ptr<mllm_fp16_t>();
+        auto* rhs = rhs_host.ptr<mllm_fp16_t>();
+        auto* dst = expected.ptr<mllm_fp16_t>();
+        auto count = lhs_host.numel();
+        for (size_t idx = 0; idx < count; ++idx) {
+          dst[idx] = lhs[idx] * rhs[idx];
+        }
+      }
+
+      // Upload both operands and multiply on the Ascend device
+      auto lhs_dev = lhs_host.to(kAscend);
+      auto rhs_dev = rhs_host.to(kAscend);
+      auto prod_dev = lhs_dev * rhs_dev;
+
+      // Download the device result; the first mismatching shape fails the whole test
+      auto prod_host = prod_dev.to(kCPU);
+      auto verdict = mllm::test::allClose(prod_host, expected, 1e-2f, 1e-2f);
+      if (!verdict.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
 };
 
diff --git a/tests/ascend/AscendLinearKernelTest.hpp b/tests/ascend/AscendLinearKernelTest.hpp
new file mode 100644
index 000000000..b7fca56fa
--- /dev/null
+++ b/tests/ascend/AscendLinearKernelTest.hpp
@@ -0,0 +1,164 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/core/OpTypes.hpp"
+#include "mllm/core/aops/LinearOp.hpp"
+#include "mllm/engine/Context.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+#include <iostream>
+
+
+class AscendLinearKernelTest : public KernelTest {
+ public:
+  AscendLinearKernelTest() = default;
+  ~AscendLinearKernelTest() override = default;
+
+  // Bias-free linear (y = x @ W, W=[in, out]) on Ascend vs. an FP32-accumulated CPU reference.
+  bool LinearFloat16Test(const std::vector<std::tuple<mllm::Tensor::shape_t, int, int>>& test_cases) {
+    using namespace mllm;  // NOLINT
+    for (auto& test_case : test_cases) {
+      auto input_shape = std::get<0>(test_case);
+      int in_channels = std::get<1>(test_case);
+      int out_channels = std::get<2>(test_case);
+      MLLM_RT_ASSERT_EQ(static_cast<int>(input_shape.back()), in_channels);  // reference strides x by in_channels
+
+      std::cout << "[LinearTest] Testing shape=[";
+      for (size_t i = 0; i < input_shape.size(); ++i) {
+        std::cout << input_shape[i] << (i < input_shape.size() - 1 ? ", " : "");
+      }
+      std::cout << "], in=" << in_channels << ", out=" << out_channels << std::endl;
+
+      // 1. Construct random FP16 inputs on CPU
+      // x: [M, K] where K = in_channels
+      Tensor x_cpu = Tensor::random(input_shape, -1, 1, kFloat16, kCPU);
+
+      // Weight shape for ATB: [K, N] where K=in_channels, N=out_channels
+      Tensor weight_cpu = Tensor::random({in_channels, out_channels}, -0.5, 0.5, kFloat16, kCPU);
+
+      // 2. Compute reference result on CPU
+      // y = x @ weight, where x is [M, K], weight is [K, N], output is [M, N]
+      auto output_shape = input_shape;
+      output_shape[output_shape.size() - 1] = out_channels;
+      Tensor ref_cpu = Tensor::zeros(output_shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+        auto* w_ptr = weight_cpu.ptr<mllm_fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+        size_t batch_size = 1;  // product of all leading dims: number of rows of length in_channels
+        for (size_t i = 0; i < input_shape.size() - 1; ++i) {
+          batch_size *= input_shape[i];
+        }
+
+        for (size_t b = 0; b < batch_size; ++b) {
+          for (int o = 0; o < out_channels; ++o) {
+            float sum = 0.0f;
+            for (int i = 0; i < in_channels; ++i) {
+              float x_val = MLLM_FP16_TO_FP32(x_ptr[b * in_channels + i]);
+              float w_val = MLLM_FP16_TO_FP32(w_ptr[i * out_channels + o]);  // weight is [K, N]
+              sum += x_val * w_val;
+            }
+            r_ptr[b * out_channels + o] = MLLM_FP32_TO_FP16(sum);
+          }
+        }
+      }
+
+      // 3. Move inputs to Ascend and run Linear via matmul
+      auto x_ascend = x_cpu.to(kAscend);
+      auto weight_ascend = weight_cpu.to(kAscend);
+
+      // Use matmul: y = x @ weight
+      auto y_ascend = nn::functional::matmul(x_ascend, weight_ascend, false, false);
+
+      // 4. Move result back to CPU and compare with reference
+      auto y_cpu = y_ascend.to(kCPU);
+      auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);
+      if (!result.is_close) {
+        std::cout << "[LinearTest] FAILED!" << std::endl;
+        return false;
+      }
+      std::cout << "[LinearTest] PASSED" << std::endl;
+    }
+    return true;
+  }
+
+
+  // Linear with bias (y = x @ W^T + b, W=[out, in]) on Ascend vs. an FP32-accumulated CPU reference.
+  bool LinearWithBiasFloat16Test(const std::vector<std::tuple<mllm::Tensor::shape_t, int, int>>& test_cases) {
+    using namespace mllm;  // NOLINT
+    for (auto& test_case : test_cases) {
+      auto input_shape = std::get<0>(test_case);
+      int in_channels = std::get<1>(test_case);
+      int out_channels = std::get<2>(test_case);
+      MLLM_RT_ASSERT_EQ(static_cast<int>(input_shape.back()), in_channels);  // reference strides x by in_channels
+
+      std::cout << "[LinearWithBiasTest] Testing shape=[";
+      for (size_t i = 0; i < input_shape.size(); ++i) {
+        std::cout << input_shape[i] << (i < input_shape.size() - 1 ? ", " : "");
+      }
+      std::cout << "], in=" << in_channels << ", out=" << out_channels << std::endl;
+
+      // 1. Create random input, weight and bias on CPU
+      Tensor x_cpu = Tensor::random(input_shape, -1, 1, kFloat16, kCPU);
+      // Weight shape: [out_channels, in_channels]
+      Tensor weight_cpu = Tensor::random({out_channels, in_channels}, -0.5, 0.5, kFloat16, kCPU);
+      // Bias shape: [1, out_channels] for ATB Linear (2D tensor required)
+      Tensor bias_cpu = Tensor::random({1, out_channels}, -0.1, 0.1, kFloat16, kCPU);
+
+      // 2. Compute reference result on CPU
+      auto output_shape = input_shape;
+      output_shape[output_shape.size() - 1] = out_channels;
+      Tensor ref_cpu = Tensor::zeros(output_shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+        auto* w_ptr = weight_cpu.ptr<mllm_fp16_t>();
+        auto* b_ptr = bias_cpu.ptr<mllm_fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+        size_t batch_size = 1;  // product of all leading dims: number of rows of length in_channels
+        for (size_t i = 0; i < input_shape.size() - 1; ++i) {
+          batch_size *= input_shape[i];
+        }
+
+        // y = x @ W^T + b, where W is [out_channels, in_channels]
+        for (size_t b = 0; b < batch_size; ++b) {
+          for (int o = 0; o < out_channels; ++o) {
+            float sum = 0.0f;
+            for (int i = 0; i < in_channels; ++i) {
+              float x_val = MLLM_FP16_TO_FP32(x_ptr[b * in_channels + i]);
+              float w_val = MLLM_FP16_TO_FP32(w_ptr[o * in_channels + i]);
+              sum += x_val * w_val;
+            }
+            float bias_val = MLLM_FP16_TO_FP32(b_ptr[o]);
+            sum += bias_val;
+            r_ptr[b * out_channels + o] = MLLM_FP32_TO_FP16(sum);
+          }
+        }
+      }
+
+      // 3. Move tensors to Ascend and run linear
+      auto x_ascend = x_cpu.to(kAscend);
+      auto weight_ascend = weight_cpu.to(kAscend);
+      auto bias_ascend = bias_cpu.to(kAscend);
+
+      // Use nn::functional::linear directly
+      auto y_ascend = nn::functional::linear(x_ascend, weight_ascend, bias_ascend);
+
+      // 4. Compare result with reference
+      auto y_cpu = y_ascend.to(kCPU);
+      auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);
+      if (!result.is_close) {
+        std::cout << "[LinearWithBiasTest] FAILED!" << std::endl;
+        return false;
+      }
+      std::cout << "[LinearWithBiasTest] PASSED" << std::endl;
+    }
+    return true;
+  }
+};
diff --git a/tests/ascend/AscendRMSNormKernelTest.hpp b/tests/ascend/AscendRMSNormKernelTest.hpp
new file mode 100644
index 000000000..0af879c8f
--- /dev/null
+++ b/tests/ascend/AscendRMSNormKernelTest.hpp
@@ -0,0 +1,85 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/layers/RMSNorm.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+
+class AscendRMSNormKernelTest : public KernelTest {
+ public:
+  AscendRMSNormKernelTest() = default;
+  ~AscendRMSNormKernelTest() override = default;
+
+  // Compares the Ascend FP16 RMSNorm with a CPU reference that does its arithmetic in FP32.
+  // Each case is {input_shape, norm_size, epsilon}; norm_size must equal the last dimension
+  // of input_shape. Returns false on the first invalid or mismatching case, true otherwise.
+  bool RMSNormFloat16Test(const std::vector<std::tuple<mllm::Tensor::shape_t, int, float>>& test_cases) {
+    using namespace mllm;  // NOLINT
+    for (const auto& test_case : test_cases) {
+      const auto& input_shape = std::get<0>(test_case);
+      const int norm_size = std::get<1>(test_case);
+      const float epsilon = std::get<2>(test_case);
+
+      // Validated at runtime rather than with assert(): assert() is compiled out under NDEBUG,
+      // and an empty shape or a mismatched norm_size would send the loops below out of bounds.
+      if (input_shape.empty() || norm_size <= 0) { return false; }
+      if (norm_size != static_cast<int>(input_shape.back())) { return false; }
+
+      // 1. Random FP16 input plus a weight tensor of shape [norm_size], both on CPU.
+      Tensor x_cpu = Tensor::random(input_shape, -2, 2, kFloat16, kCPU);
+      Tensor weight_cpu = Tensor::random({norm_size}, 0.5, 1.5, kFloat16, kCPU);
+
+      // 2. CPU reference: y = x * weight / sqrt(mean(x^2) + epsilon), rounded to FP16 on store.
+      Tensor ref_cpu = Tensor::zeros(input_shape, kFloat16, kCPU);
+      {
+        auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+        auto* w_ptr = weight_cpu.ptr<mllm_fp16_t>();
+        auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+
+        // All leading dimensions are flattened into a row count.
+        size_t num_rows = 1;
+        for (size_t d = 0; d + 1 < input_shape.size(); ++d) { num_rows *= input_shape[d]; }
+        const auto row_len = static_cast<size_t>(norm_size);
+
+        for (size_t row = 0; row < num_rows; ++row) {
+          const size_t base = row * row_len;
+
+          // Root mean square of the row; epsilon is added inside the square root.
+          float sum_squares = 0.0f;
+          for (size_t i = 0; i < row_len; ++i) {
+            const float x_val = MLLM_FP16_TO_FP32(x_ptr[base + i]);
+            sum_squares += x_val * x_val;
+          }
+          const float rms = std::sqrt(sum_squares / norm_size + epsilon);
+
+          // Normalize each element and scale it by the matching weight.
+          for (size_t i = 0; i < row_len; ++i) {
+            const float x_val = MLLM_FP16_TO_FP32(x_ptr[base + i]);
+            const float w_val = MLLM_FP16_TO_FP32(w_ptr[i]);
+            const float y_val = (x_val / rms) * w_val;
+            r_ptr[base + i] = MLLM_FP32_TO_FP16(y_val);
+          }
+        }
+      }
+
+      // 3. Move the inputs to Ascend and run RMSNorm through the functional API.
+      auto x_ascend = x_cpu.to(kAscend);
+      auto weight_ascend = weight_cpu.to(kAscend);
+      auto y_ascend = nn::functional::rmsNorm(x_ascend, weight_ascend, epsilon);
+
+      // 4. Bring the result back to CPU and compare it with the reference via allClose.
+      auto y_cpu = y_ascend.to(kCPU);
+      auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);
+      if (!result.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
diff --git a/tests/ascend/AscendSiLUKernelTest.hpp b/tests/ascend/AscendSiLUKernelTest.hpp
new file mode 100644
index 000000000..aaa798f69
--- /dev/null
+++ b/tests/ascend/AscendSiLUKernelTest.hpp
@@ -0,0 +1,67 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+
+class AscendSiLUKernelTest : public KernelTest {
+ public:
+  AscendSiLUKernelTest() = default;
+  ~AscendSiLUKernelTest() override = default;
+
+  // Runs SiLU on Ascend for every shape in `shapes` and checks each FP16 result against
+  // a CPU reference evaluated in FP32.
+  // Returns false at the first shape whose result allClose rejects, true otherwise.
+  bool SiLUFloat16Test(const std::vector<mllm::Tensor::shape_t>& shapes) {
+    using namespace mllm;  // NOLINT
+
+    // sigmoid(v) = 1 / (1 + exp(-v)), written so that std::exp only ever receives a
+    // non-positive argument.
+    const auto sigmoid = [](float v) -> float {
+      if (v >= 0) {
+        return 1.0f / (1.0f + std::exp(-v));
+      }
+      const float exp_v = std::exp(v);
+      return exp_v / (1.0f + exp_v);
+    };
+
+    for (auto& shape : shapes) {
+      // 1. Random FP16 input on CPU.
+      Tensor host_in = Tensor::random(shape, -5, 5, kFloat16, kCPU);
+
+      // 2. FP16 reference on CPU: SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x)).
+      Tensor host_ref = Tensor::zeros(shape, kFloat16, kCPU);
+      {
+        auto* src = host_in.ptr<mllm_fp16_t>();
+        auto* dst = host_ref.ptr<mllm_fp16_t>();
+        auto count = host_in.numel();
+
+        // Widen each element to FP32, evaluate SiLU, then narrow back to FP16.
+        for (size_t idx = 0; idx < count; ++idx) {
+          float in_val = MLLM_FP16_TO_FP32(src[idx]);
+          float out_val = in_val * sigmoid(in_val);
+          dst[idx] = MLLM_FP32_TO_FP16(out_val);
+        }
+      }
+
+      // 3. Copy the input to Ascend and apply SiLU there.
+      auto device_in = host_in.to(kAscend);
+      auto device_out = mllm::nn::functional::silu(device_in);
+
+      // 4. Copy the output back to CPU and compare it with the reference via allClose.
+      auto host_out = device_out.to(kCPU);
+      auto verdict = mllm::test::allClose(host_out, host_ref, 1e-2f, 1e-2f);
+      if (!verdict.is_close) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
diff --git a/tests/ascend/AscendSoftmaxKernelTest.hpp b/tests/ascend/AscendSoftmaxKernelTest.hpp
new file mode 100644
index 000000000..2cc6d7b73
--- /dev/null
+++ b/tests/ascend/AscendSoftmaxKernelTest.hpp
@@ -0,0 +1,129 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp"
+#include "mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp"
+#include <vector>
+#include <cmath>
+
+class AscendSoftmaxKernelTest : public KernelTest {
+ public:
+  AscendSoftmaxKernelTest() = default;
+  ~AscendSoftmaxKernelTest() override = default;
+
+  // Runs Softmax on Ascend for every (shape, axis) pair and compares each FP16 result
+  // with a CPU reference whose arithmetic is done in FP32. Both tolerance arguments
+  // passed to allClose are 1e-2.
+  //
+  // shapes: tensor shapes to test.
+  // axes:   softmax axes applied to every shape; a negative axis counts back from the
+  //         last dimension.
+  //
+  // Returns false when an axis does not fit the rank of a shape (which covers an empty
+  // shape too) or as soon as allClose rejects a result; returns true otherwise.
+  bool SoftmaxFloat16Test(const std::vector<mllm::Tensor::shape_t>& shapes, const std::vector<int>& axes) {
+    using namespace mllm;  // NOLINT
+    for (auto& shape : shapes) {
+      for (auto axis : axes) {
+        // Convert the axis to a non-negative index. An axis outside [0, ndim) is rejected
+        // up front because it would otherwise be used to index past the end of `shape`.
+        const int ndim = static_cast<int>(shape.size());
+        const int pos_axis = axis < 0 ? ndim + axis : axis;
+        if (pos_axis < 0 || pos_axis >= ndim) {
+          return false;
+        }
+
+        // 1. Construct random FP16 inputs on CPU
+        Tensor x_cpu = Tensor::random(shape, -5, 5, kFloat16, kCPU);
+
+        // 2. Compute reference result (FP16) on CPU
+        // Softmax(x_i) = exp(x_i - max(x)) / sum(exp(x_j - max(x)))
+        Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU);
+        {
+          auto* x_ptr = x_cpu.ptr<mllm_fp16_t>();
+          auto* r_ptr = ref_cpu.ptr<mllm_fp16_t>();
+
+          // View the buffer as [outer_size, axis_size, inner_size]: the dimensions before
+          // the axis fold into outer_size and the dimensions after it into inner_size.
+          // Example: shape {2, 3, 4} with axis 1 gives outer_size = 2, axis_size = 3 and
+          // inner_size = 4.
+          // NOTE(review): this assumes both CPU tensors are contiguous and row-major.
+          size_t outer_size = 1;
+          for (int d = 0; d < pos_axis; ++d) {
+            outer_size *= shape[d];
+          }
+
+          const size_t axis_size = shape[pos_axis];
+
+          size_t inner_size = 1;
+          for (int d = pos_axis + 1; d < ndim; ++d) {
+            inner_size *= shape[d];
+          }
+
+          // Flat offset of element (outer, i, inner). One formula serves every axis
+          // position, since outer_size is 1 when pos_axis == 0 and inner_size is 1 when
+          // pos_axis == ndim - 1. For a middle axis each step of `outer` must advance by
+          // exactly axis_size * inner_size elements; a larger step runs off the buffer.
+          // In the example above, element (1, 2, 3) sits at (1 * 3 + 2) * 4 + 3 = 23.
+          const auto offset_of = [axis_size, inner_size](size_t outer, size_t i, size_t inner) {
+            return (outer * axis_size + i) * inner_size + inner;
+          };
+
+          // Reads input element (outer, i, inner) widened to FP32.
+          const auto load = [x_ptr, &offset_of](size_t outer, size_t i, size_t inner) -> float {
+            return MLLM_FP16_TO_FP32(x_ptr[offset_of(outer, i, inner)]);
+          };
+
+          // Scratch space for exp(x - max); allocated once and reused by every slice.
+          std::vector<float> exp_vals(axis_size);
+
+          // Compute softmax for each slice along the axis
+          for (size_t outer = 0; outer < outer_size; ++outer) {
+            for (size_t inner = 0; inner < inner_size; ++inner) {
+              // Find max value for numerical stability: subtracting it keeps every
+              // exponent at or below zero.
+              float max_val = -std::numeric_limits<float>::infinity();
+              for (size_t i = 0; i < axis_size; ++i) {
+                const float val = load(outer, i, inner);
+                max_val = std::max(max_val, val);
+              }
+
+              // Compute exp(x - max) and sum
+              float sum_exp = 0.0f;
+              for (size_t i = 0; i < axis_size; ++i) {
+                const float val = load(outer, i, inner);
+                exp_vals[i] = std::exp(val - max_val);
+                sum_exp += exp_vals[i];
+              }
+
+              // Compute softmax and store result
+              for (size_t i = 0; i < axis_size; ++i) {
+                const float result = exp_vals[i] / sum_exp;
+                r_ptr[offset_of(outer, i, inner)] = MLLM_FP32_TO_FP16(result);
+              }
+            }
+          }
+        }
+
+        // 3. Move inputs to Ascend and run Softmax. The axis is passed through exactly
+        //    as given (possibly negative), not the resolved pos_axis.
+        auto x_ascend = x_cpu.to(kAscend);
+        auto y_ascend = mllm::nn::functional::softmax(x_ascend, axis);
+
+        // 4. Move result back to CPU and compare with reference using allClose; the
+        //    first mismatch ends the whole test.
+        auto y_cpu = y_ascend.to(kCPU);
+        auto result = mllm::test::allClose(y_cpu, ref_cpu, 1e-2f, 1e-2f);
+        if (!result.is_close) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+};
diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp
index b0489f545..bccb7a154 100644
--- a/tests/ascend/KernelTest.cpp
+++ b/tests/ascend/KernelTest.cpp
@@ -25,6 +25,205 @@ TEST_F(AscendKernelTest, AddFloat16) {
             true);
 }
 
+//===----------------------------------------------------------------------===//
+// Element wise SUB.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, SubFloat16) {
+  const bool all_shapes_pass = SubFloat16Test({
+      {2, 3},  // non-square
+      {1, 1},  // single element
+      {4, 4},
+      {8, 8},
+      {16, 16},
+      {32, 32},
+  });
+  EXPECT_EQ(all_shapes_pass, true);
+}
+
+//===----------------------------------------------------------------------===//
+// Element wise MUL.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+TEST_F(AscendKernelTest, MulFloat16) {
+  const bool all_shapes_pass = MulFloat16Test({
+      {2, 3},  // non-square
+      {1, 1},  // single element
+      {4, 4},
+      {8, 8},
+      {16, 16},
+      {32, 32},
+  });
+  EXPECT_EQ(all_shapes_pass, true);
+}
+
+//===----------------------------------------------------------------------===//
+// SiLU activation function.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendSiLUKernelTest.hpp"
+TEST_F(AscendSiLUKernelTest, SiLUFloat16) {
+  const bool all_shapes_pass = SiLUFloat16Test({
+      {2, 3},  // non-square
+      {1, 1},  // single element
+      {4, 4},
+      {8, 8},
+      {16, 16},
+      {32, 32},
+      {1, 1024},  // one long row
+      {128, 128},
+  });
+  EXPECT_EQ(all_shapes_pass, true);
+}
+
+//===----------------------------------------------------------------------===//
+// Linear layer (MatMul based test).
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendLinearKernelTest.hpp"
+TEST_F(AscendLinearKernelTest, LinearFloat16) {
+  const bool all_cases_pass = LinearFloat16Test({
+      // Each case: {input_shape, in_channels, out_channels}.
+      {{2, 3}, 3, 4},
+      {{1, 8}, 8, 16},
+      {{4, 16}, 16, 32},
+      {{8, 32}, 32, 64},
+      {{1, 1024}, 1024, 512},
+  });
+  EXPECT_EQ(all_cases_pass, true);
+}
+
+TEST_F(AscendLinearKernelTest, LinearWithBiasFloat16) {
+  const bool all_cases_pass = LinearWithBiasFloat16Test({
+      // Each case: {input_shape, in_channels, out_channels}.
+      {{2, 3}, 3, 4},
+      {{1, 8}, 8, 16},
+      {{4, 16}, 16, 32},
+  });
+  EXPECT_EQ(all_cases_pass, true);
+}
+
+//===----------------------------------------------------------------------===//
+// RMSNorm layer.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendRMSNormKernelTest.hpp"
+TEST_F(AscendRMSNormKernelTest, RMSNormFloat16) {
+  const bool all_cases_pass = RMSNormFloat16Test({
+      // Each case: {input_shape, norm_size, epsilon}.
+      // Note: ATB RMSNorm requires the last dim to be a multiple of 16 (FP16 alignment).
+      {{2, 16}, 16, 1e-5f},
+      {{1, 32}, 32, 1e-5f},
+      {{4, 64}, 64, 1e-6f},
+      {{8, 128}, 128, 1e-5f},
+      {{1, 1024}, 1024, 1e-5f},
+      {{128, 256}, 256, 1e-5f},
+  });
+  EXPECT_EQ(all_cases_pass, true);
+}
+
+//===----------------------------------------------------------------------===//
+// Softmax activation function.
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendSoftmaxKernelTest.hpp"
+TEST_F(AscendSoftmaxKernelTest, SoftmaxFloat16) {
+  const bool all_cases_pass = SoftmaxFloat16Test(
+      {
+          {2, 3},
+          {1, 8},
+          {4, 4},
+          {8, 8},
+          {16, 16},
+          {1, 1024},
+          {128, 128},
+      },
+      {-1, 0, 1});  // Axes tried on every shape
+  EXPECT_EQ(all_cases_pass, true);
+}
+
+//===----------------------------------------------------------------------===//
+// Scaled Dot-Product Attention (using existing operators).
+//
+// FP16 (Ascend currently uses FP16)
+//===----------------------------------------------------------------------===//
+#include "AscendAttentionKernelTest.hpp"
+TEST_F(AscendAttentionKernelTest, ScaledDotProductAttentionFloat16) {
+  const bool all_cases_pass = ScaledDotProductAttentionFloat16Test({
+      // Each case: {Q_shape, K_shape, V_shape}.
+      // Shapes are laid out as [B, S, D].
+      {{1, 4, 8}, {1, 4, 8}, {1, 4, 8}},        // B=1, S=4, D=8
+      {{1, 8, 16}, {1, 8, 16}, {1, 8, 16}},     // B=1, S=8, D=16
+      {{2, 4, 8}, {2, 4, 8}, {2, 4, 8}},        // B=2
+      {{1, 16, 32}, {1, 16, 32}, {1, 16, 32}},  // B=1, S=16, D=32
+      {{1, 8, 64}, {1, 8, 64}, {1, 8, 64}},     // D=64 (common head dim)
+  });
+  EXPECT_EQ(all_cases_pass, true);
+}
+
+//===----------------------------------------------------------------------===//
+// Multi-Head Attention with Causal Mask.
+//
+// FP16 (Ascend currently uses FP16)
+// Input format: [B, H, S, D] where H = num_heads, D = head_dim
+//===----------------------------------------------------------------------===//
+TEST_F(AscendAttentionKernelTest, MultiHeadAttentionFloat16) {
+  const bool all_cases_pass = MultiHeadAttentionFloat16Test({
+      // Each case: {Q_shape, K_shape, V_shape, use_causal_mask}.
+      // Shapes are laid out as [B, H, S, D].
+
+      // Causal mask disabled
+      {{1, 1, 4, 8}, {1, 1, 4, 8}, {1, 1, 4, 8}, false},        // 1 head
+      {{1, 4, 8, 16}, {1, 4, 8, 16}, {1, 4, 8, 16}, false},     // 4 heads
+      {{1, 8, 16, 64}, {1, 8, 16, 64}, {1, 8, 16, 64}, false},  // 8 heads, D=64
+
+      // Causal mask enabled
+      {{1, 1, 4, 8}, {1, 1, 4, 8}, {1, 1, 4, 8}, true},        // 1 head
+      {{1, 4, 8, 16}, {1, 4, 8, 16}, {1, 4, 8, 16}, true},     // 4 heads
+      {{1, 8, 16, 64}, {1, 8, 16, 64}, {1, 8, 16, 64}, true},  // 8 heads, D=64
+      {{2, 4, 8, 32}, {2, 4, 8, 32}, {2, 4, 8, 32}, true},     // B=2
+
+      // Query length differs from key/value length (useful for KV cache scenarios)
+      {{1, 4, 1, 32}, {1, 4, 8, 32}, {1, 4, 8, 32}, true},    // S_q=1, S_kv=8 (decode)
+      {{1, 4, 4, 32}, {1, 4, 16, 32}, {1, 4, 16, 32}, true},  // S_q=4, S_kv=16
+  });
+  EXPECT_EQ(all_cases_pass, true);
+}
+
+//===----------------------------------------------------------------------===//
+// Grouped Query Attention (GQA).
+//
+// FP16 (Ascend currently uses FP16)
+// GQA: num_q_heads > num_kv_heads, each KV head is shared by multiple Q heads
+//===----------------------------------------------------------------------===//
+TEST_F(AscendAttentionKernelTest, GroupedQueryAttentionFloat16) {
+  const bool all_cases_pass = GroupedQueryAttentionFloat16Test({
+      // Each case: {Q_shape [B, H_q, S_q, D], K_shape [B, H_kv, S_kv, D], V_shape, use_mask}.
+
+      // H_q = 4, H_kv = 2: two query heads per KV head
+      {{1, 4, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, false},
+      {{1, 4, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, true},
+
+      // H_q = 8, H_kv = 2: four query heads per KV head
+      {{1, 8, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, false},
+      {{1, 8, 8, 32}, {1, 2, 8, 32}, {1, 2, 8, 32}, true},
+
+      // MQA (Multi-Query Attention): H_kv = 1
+      {{1, 4, 8, 32}, {1, 1, 8, 32}, {1, 1, 8, 32}, true},
+      {{1, 8, 16, 64}, {1, 1, 16, 64}, {1, 1, 16, 64}, true},
+
+      // Batch > 1
+      {{2, 8, 8, 32}, {2, 2, 8, 32}, {2, 2, 8, 32}, true},
+  });
+  EXPECT_EQ(all_cases_pass, true);
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);