UbiquitousLearning · lywbarca · Jan 28, 2026 · Jan 28, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -59,9 +59,6 @@ option(MLLM_KERNEL_THREADS_VENDOR_APPLE_GCD "Enable Apple GCD Threads" OFF)
 option(MLLM_PERFETTO_ENABLE "Enable perfetto" OFF)
 option(MLLM_TRACY_ENABLE "Enable Tracy. A more advanced profiler" OFF)
 
-# NPU AOT things
-option(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE "Enable Qualcomm NPU AOT on X86 devices" OFF)
-
 # Platform Hints
 option(MLLM_ANDROID_BURST_PERFORMANCE_HINTS "If MLLM need use APerformanceHintManager to tell android we need best performance" OFF)
 

diff --git a/examples/ascend_add_demo/README.md b/examples/ascend_add_demo/README.md
@@ -0,0 +1,81 @@
+# Ascend Add Op Demo
+
+这是一个简单的 demo，用于测试 Ascend 后端的 Add 算子实现。
+
+## 功能
+
+- 初始化 Ascend 后端和内存池
+- 创建两个输入张量（shape: [2, 3]）
+- 在 Ascend NPU 上执行 Add 操作
+- 验证计算结果是否正确
+
+## 编译和运行
+
+### 方法 1: 使用自动化脚本（推荐）
+
+```bash
+cd /path/to/mllm/examples/ascend_add_demo
+./build_and_run.sh
+```
+
+脚本会自动：
+- 检查环境变量
+- 配置 CMake
+- 编译项目
+- 运行 demo
+
+### 方法 2: 手动编译
+
+确保已经设置了必要的环境变量：
+- `ASCEND_HOME_PATH`: Ascend SDK 路径（例如: `/usr/local/Ascend/ascend-toolkit/latest`）
+- `ATB_HOME_PATH`: ATB 库路径（例如: `/usr/local/Ascend/nnal/nnal/atb/latest/atb/cxx_abi_0`）
+
+在项目根目录下：
+
+```bash
+# 1. 创建构建目录
+mkdir -p build-ascend-demo && cd build-ascend-demo
+
+# 2. 配置 CMake
+cmake .. \
+    -DMLLM_BUILD_ASCEND_BACKEND=ON \
+    -DMLLM_ENABLE_EXAMPLE=ON \
+    -DCMAKE_BUILD_TYPE=Release
+
+# 3. 编译
+make ascend_add_demo -j$(nproc)
+
+# 4. 运行
+./examples/ascend_add_demo/ascend_add_demo
+```
+
+## 预期输出
+
+```
+=== Ascend Add Op Demo ===
+1. Initializing Ascend backend...
+   ✓ Ascend backend initialized
+
+2. Creating input tensors...
+   Input x shape: [2, 3]
+   Input y shape: [2, 3]
+
+3. Transferring tensors to Ascend device...
+   ✓ Tensors transferred to Ascend
+
+4. Executing Add operation on Ascend...
+   ✓ Add operation completed
+
+5. Transferring result back to CPU and verifying...
+   Expected result: [11, 22, 33, 44, 55, 66]
+   Actual result:   [11, 22, 33, 44, 55, 66]
+
+✓ Test PASSED! All values match expected results.
+```
+
+## 注意事项
+
+- 当前实现使用 float16 数据类型
+- 需要 Ascend NPU 设备可用
+- 确保已正确安装 Ascend SDK 和 ATB 库
+
diff --git a/examples/ascend_add_demo/build_and_run.sh b/examples/ascend_add_demo/build_and_run.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Build-and-run script for the Ascend Add demo
+
+set -e  # exit immediately on error
+
+# ANSI color escape sequences used by the echo -e messages below
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+echo -e "${GREEN}=== Ascend Add Demo 编译和运行脚本 ===${NC}\n"
+
+# Require ASCEND_HOME_PATH and ATB_HOME_PATH to be set and non-empty
+echo -e "${YELLOW}检查环境变量...${NC}"
+if [ -z "$ASCEND_HOME_PATH" ]; then
+    echo -e "${RED}错误: ASCEND_HOME_PATH 未设置${NC}"
+    exit 1
+fi
+if [ -z "$ATB_HOME_PATH" ]; then
+    echo -e "${RED}错误: ATB_HOME_PATH 未设置${NC}"
+    exit 1
+fi
+echo -e "${GREEN}✓ ASCEND_HOME_PATH: $ASCEND_HOME_PATH${NC}"
+echo -e "${GREEN}✓ ATB_HOME_PATH: $ATB_HOME_PATH${NC}\n"
+
+# Resolve the project root (two directories above this script) and the build directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+BUILD_DIR="$PROJECT_ROOT/build-ascend-demo"
+
+echo -e "${YELLOW}项目根目录: $PROJECT_ROOT${NC}"
+echo -e "${YELLOW}构建目录: $BUILD_DIR${NC}\n"
+
+# Create the build directory if it does not exist yet
+if [ ! -d "$BUILD_DIR" ]; then
+    echo -e "${YELLOW}创建构建目录...${NC}"
+    mkdir -p "$BUILD_DIR"
+fi
+
+cd "$BUILD_DIR"
+
+# Configure CMake with the Ascend backend and examples enabled (Release build)
+echo -e "\n${YELLOW}配置 CMake...${NC}"
+cmake "$PROJECT_ROOT" \
+    -DMLLM_BUILD_ASCEND_BACKEND=ON \
+    -DMLLM_ENABLE_EXAMPLE=ON \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+
+# Build the demo target.
+# NOTE: `set -e` is active, so a bare `make` followed by a `[ $? -eq 0 ]` test would
+# abort the script on failure before the error branch could print anything.
+# Testing the command directly in `if` keeps both branches reachable.
+echo -e "\n${YELLOW}开始编译...${NC}"
+if make ascend_add_demo -j"$(nproc)"; then
+    echo -e "\n${GREEN}✓ 编译成功！${NC}\n"
+
+    # Run the demo binary; the path is relative to the build directory entered above.
+    echo -e "${YELLOW}运行 demo...${NC}\n"
+    if ./examples/ascend_add_demo/ascend_add_demo; then
+        echo -e "\n${GREEN}✓ Demo 运行成功！${NC}"
+    else
+        # Report the demo failure explicitly and exit with a non-zero status.
+        echo -e "\n${RED}✗ Demo 运行失败${NC}"
+        exit 1
+    fi
+else
+    # Report the build failure explicitly and exit with a non-zero status.
+    echo -e "\n${RED}✗ 编译失败${NC}"
+    exit 1
+fi
+
diff --git a/mllm/CMakeLists.txt b/mllm/CMakeLists.txt
@@ -24,7 +24,6 @@ add_library(
   ${MLLM_RT_MODELS_SRC}
   ${MLLM_RT_COMPILE_SRC}
   ${MLLM_RT_AUTO_TUNE_SRC}
-  ${MLLM_QUALCOMM_AOT_SRC}
   ${WENET_AUDIO_SOURCES}
 )
 

diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp
@@ -8,12 +8,18 @@
 
 #include "mllm/backends/ascend/ops/AscendElewiseOps.hpp"
 #include "mllm/backends/ascend/ops/AscendX2XOp.hpp"
+#include "mllm/backends/ascend/ops/AscendSiLUOp.hpp"
+#include "mllm/backends/ascend/ops/AscendLinearOp.hpp"
+#include "mllm/backends/ascend/ops/AscendRMSNormOp.hpp"
+#include "mllm/backends/ascend/ops/AscendViewOp.hpp"
+#include "mllm/backends/ascend/ops/AscendMatMulOp.hpp"
+#include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp"
 
 namespace mllm::ascend {
 
 AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) {
-  regOpFactory<AscendAddOpFactory>();
-  regOpFactory<AscendX2XOpFactory>();
+  regOpFactory<AscendAddOpFactory,AscendSubOpFactory,AscendMulOpFactory,AscendX2XOpFactory,AscendSiLUOpFactory,
+              AscendLinearOpFactory,AscendRMSNormOpFactory,AscendViewOpFactory,AscendMatMulOpFactory,AscendSoftmaxOpFactory>();
   auto& devices = AscendDeviceMetaInfo::instance().devices;
   for (const auto& device : devices) {
     const auto bytes_to_mb = [](size_t bytes) { return bytes / (1024.0 * 1024.0); };

diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp
@@ -207,6 +207,13 @@ void syncGlobalAtbStream() {
 }
 
 void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) {
+  // Validate that the tensor is FP16
+  if (t.dtype() != MLLM_TYPE_F16) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError,
+                    "fillAtbTensorDesc: Tensor must be FP16, but got dtype={}",
+                    static_cast<int>(t.dtype()));
+  }
+
   desc.dtype = ACL_FLOAT16; // Currently hardcoded as per demo, can be expanded later
   desc.format = ACL_FORMAT_ND;
 
@@ -217,6 +224,13 @@ void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) {
   }
 }
 
+void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor) {
+  // Fill desc via fillAtbTensorDesc, then bind the tensor's data pointer and byte size.
+  fillAtbTensorDesc(t, atb_tensor.desc);
+  atb_tensor.deviceData = reinterpret_cast<uint8_t*>(t.ptr<void>());
+  atb_tensor.dataSize = t.bytes();  // MLLM tensor's actual byte size (meant to match the allocation)
+}
+
 AscendDeviceMetaInfo::AscendDeviceMetaInfo() {
 #ifndef ASCENDC_CPU_DEBUG
   // Initialize ACL to query devices
@@ -231,7 +245,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() {
   ret = aclrtGetDeviceCount(&device_count);
   if (ret != ACL_SUCCESS) {
     MLLM_ERROR("Failed to get Ascend device count: {}", ret);
-    aclFinalize();
     return;
   }
 
@@ -265,9 +278,6 @@ AscendDeviceMetaInfo::AscendDeviceMetaInfo() {
 
     devices.push_back(info);
   }
-
-  // Finalize ACL after enumeration
-  aclFinalize();
 #else
   // In CPU debug mode, add a dummy device
   AscendDeviceInfo info;

diff --git a/mllm/backends/ascend/AscendCommon.hpp b/mllm/backends/ascend/AscendCommon.hpp
@@ -41,6 +41,9 @@ void syncGlobalAtbStream();
 // Convert MLLM Tensor metadata to ATB TensorDesc
 void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc);
 
+// Fill an ATB Tensor from an MLLM Tensor: desc via fillAtbTensorDesc, dataSize from t.bytes()
+void fillAtbTensor(const Tensor& t, atb::Tensor& atb_tensor);
+
 // Ascend device information structure
 struct AscendDeviceInfo {
   std::string name;