From 0ffe938196dfb1f6aeb27b684cc79b94a31d0816 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:06:25 +0800 Subject: [PATCH 01/16] feat(ascend): add simple Ascend add demo --- examples/CMakeLists.txt | 1 + examples/ascend_add_demo/CMakeLists.txt | 22 ++++ examples/ascend_add_demo/main.cpp | 128 ++++++++++++++++++++++++ 3 files changed, 151 insertions(+) create mode 100644 examples/ascend_add_demo/CMakeLists.txt create mode 100644 examples/ascend_add_demo/main.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 6c49cfb22..a143a7989 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,3 +20,4 @@ endif() if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE) add_subdirectory(qwen3_qnn_aot) endif() + diff --git a/examples/ascend_add_demo/CMakeLists.txt b/examples/ascend_add_demo/CMakeLists.txt new file mode 100644 index 000000000..15eaefd7c --- /dev/null +++ b/examples/ascend_add_demo/CMakeLists.txt @@ -0,0 +1,22 @@ +add_executable(ascend_add_demo main.cpp) + +if(DEFINED ENV{ASCEND_HOME_PATH}) + target_include_directories(ascend_add_demo PRIVATE $ENV{ASCEND_HOME_PATH}/include) + target_link_directories(ascend_add_demo PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) +endif() + +target_link_libraries(ascend_add_demo PRIVATE + MllmRT + MllmAscendBackend + ascendcl # 添加 ACL 库,因为 main.cpp 中直接使用了 aclrtMemcpy +) + +set_target_properties(ascend_add_demo PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON +) + +target_include_directories(ascend_add_demo PRIVATE + ${CMAKE_SOURCE_DIR} +) + diff --git a/examples/ascend_add_demo/main.cpp b/examples/ascend_add_demo/main.cpp new file mode 100644 index 000000000..6e439d388 --- /dev/null +++ b/examples/ascend_add_demo/main.cpp @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include "mllm/mllm.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/engine/Context.hpp" +#include "mllm/core/aops/ElewiseOps.hpp" +#include 
"mllm/core/OpTypes.hpp" + +using namespace mllm; + +int main() { + std::cout << "=== Ascend Add Op Demo ===" << std::endl; + + try { + std::cout << "1. Initializing Ascend backend..." << std::endl; + initAscendBackend(); + std::cout << " ✓ Ascend backend initialized\n" << std::endl; + + std::cout << "2. Preparing test data..." << std::endl; + const int batch = 2; + const int size = 3; + std::vector data_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector data_y = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f}; + std::vector expected = {11.0f, 22.0f, 33.0f, 44.0f, 55.0f, 66.0f}; + + std::cout << " Input X: ["; + for (size_t i = 0; i < data_x.size(); ++i) { + std::cout << data_x[i]; + if (i < data_x.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + std::cout << " Input Y: ["; + for (size_t i = 0; i < data_y.size(); ++i) { + std::cout << data_y[i]; + if (i < data_y.size() - 1) std::cout << ", "; + } + std::cout << "]\n" << std::endl; + + std::cout << "3. Preparing tensors on Ascend..." << std::endl; + auto x_handle = ascend::prepareAscendTensor(data_x, batch, size); + auto y_handle = ascend::prepareAscendTensor(data_y, batch, size); + auto& x_ascend = x_handle.tensor(); + auto& y_ascend = y_handle.tensor(); + std::cout << " ✓ Tensors ready on Ascend device\n" << std::endl; + + std::cout << "4. Executing Add operation on Ascend..." 
<< std::endl; + auto& ctx = Context::instance(); + std::cout << "context over" < z_data_fp16(batch * size); + + auto ret = aclrtMemcpy( + z_data_fp16.data(), batch * size * sizeof(half_float::half), + z_ascend.ptr(), z_ascend.bytes(), + ACL_MEMCPY_DEVICE_TO_HOST + ); + if (ret != ACL_SUCCESS) { + std::cerr << " ✗ Failed to copy result back to CPU: ACL error " << ret << std::endl; + x_handle.release(); + y_handle.release(); + return 1; + } + + std::vector result(batch * size); + for (size_t i = 0; i < result.size(); ++i) { + result[i] = static_cast(z_data_fp16[i]); + } + + std::cout << " ✓ Result copied to CPU\n" << std::endl; + + std::cout << "6. Verifying results..." << std::endl; + std::cout << " Actual result: ["; + for (size_t i = 0; i < result.size(); ++i) { + std::cout << result[i]; + if (i < result.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + std::cout << " Expected result: ["; + for (size_t i = 0; i < expected.size(); ++i) { + std::cout << expected[i]; + if (i < expected.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + bool correct = true; + const float tolerance = 0.1f; + + for (size_t i = 0; i < result.size(); ++i) { + float diff = std::abs(result[i] - expected[i]); + if (diff > tolerance) { + correct = false; + std::cout << " ✗ Mismatch at index " << i + << ": expected " << expected[i] + << ", got " << result[i] + << " (diff: " << diff << ")" << std::endl; + } + } + + if (correct) { + std::cout << "\n✓✓✓ Test PASSED! All values match expected results. ✓✓✓" << std::endl; + } else { + std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. ✗✗✗" << std::endl; + } + + // 清理内存池中的块 + x_handle.release(); + y_handle.release(); + + return correct ? 
0 : 1; + + } catch (const std::exception& e) { + std::cerr << "\n✗ Error: " << e.what() << std::endl; + return 1; + } +} + From 70d98a9c97e8b94c14bf7febf6b287ffe1d38403 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:08:03 +0800 Subject: [PATCH 02/16] feat(ascend memory): introduce memory pool to Ascend backend --- .../ascend/memory/AscendMemoryManager.cpp | 67 +++++++++++ .../ascend/memory/AscendMemoryManager.hpp | 36 ++++++ .../ascend/memory/AscendMemoryPool.cpp | 106 ++++++++++++++++++ .../ascend/memory/AscendMemoryPool.hpp | 40 +++++++ mllm/backends/ascend/memory/MemoryBlock.hpp | 17 +++ 5 files changed, 266 insertions(+) create mode 100644 mllm/backends/ascend/memory/AscendMemoryManager.cpp create mode 100644 mllm/backends/ascend/memory/AscendMemoryManager.hpp create mode 100644 mllm/backends/ascend/memory/AscendMemoryPool.cpp create mode 100644 mllm/backends/ascend/memory/AscendMemoryPool.hpp create mode 100644 mllm/backends/ascend/memory/MemoryBlock.hpp diff --git a/mllm/backends/ascend/memory/AscendMemoryManager.cpp b/mllm/backends/ascend/memory/AscendMemoryManager.cpp new file mode 100644 index 000000000..b1d4920c3 --- /dev/null +++ b/mllm/backends/ascend/memory/AscendMemoryManager.cpp @@ -0,0 +1,67 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include + +#include "AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/utils/Log.hpp" + +namespace mllm::ascend { + +static AscendMemoryManager g_ascendMemoryManager; + +AscendMemoryManager::AscendMemoryManager() {} + +AscendMemoryManager &getAscendMemoryManager() { + return g_ascendMemoryManager; +} + +void AscendMemoryManager::createMemoryPool(size_t pool_size) +{ + uint32_t device_count = 0; + auto ret = aclrtGetDeviceCount(&device_count); + MLLM_ACL_CHECK(ret); + for (size_t i = 0; i < device_count; i++) { + + aclrtSetDevice(i); + + std::shared_ptr memory_pool = std::make_shared(pool_size); + memory_pools_.push_back(memory_pool); + MLLM_INFO("create mempool for device {} success", i); + } +} + +int32_t AscendMemoryManager::getDeviceId() +{ + int32_t device_id = -1; + auto ret = aclrtGetDevice(&device_id); + MLLM_ACL_CHECK(ret); + return device_id; +} + +std::shared_ptr &AscendMemoryManager::getMemoryPool() +{ + size_t device_id = static_cast(getDeviceId()); + if (device_id >= memory_pools_.size()) { + MLLM_ERROR_EXIT(::mllm::ExitCode::kAscendError, "Invalid device id {}", device_id); + } + return memory_pools_[device_id]; +} + +void AscendMemoryManager::allocateBlock(uint32_t size, int &block_id) +{ + getMemoryPool()->allocateBlock(size, block_id); +} + +void AscendMemoryManager::freeBlock(int block_id) +{ + getMemoryPool()->freeBlock(block_id); +} + +void AscendMemoryManager::getBlockPtr(int block_id, void *&addr) +{ + getMemoryPool()->getBlockPtr(block_id, addr); +} +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/memory/AscendMemoryManager.hpp b/mllm/backends/ascend/memory/AscendMemoryManager.hpp new file mode 100644 index 000000000..ef3007c3f --- /dev/null +++ b/mllm/backends/ascend/memory/AscendMemoryManager.hpp @@ -0,0 +1,36 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include +#include +#include "AscendMemoryPool.hpp" + +namespace mllm::ascend { + + +class AscendMemoryManager { +public: + AscendMemoryManager(); + + void createMemoryPool(size_t pool_size); + + int32_t getDeviceId(); + + std::shared_ptr &getMemoryPool(); + + void allocateBlock(uint32_t size, int &block_id); + + void freeBlock(int block_id); + + void getBlockPtr(int block_id, void *&addr); + +private: + std::vector> memory_pools_; +}; + +AscendMemoryManager &getAscendMemoryManager(); + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/memory/AscendMemoryPool.cpp b/mllm/backends/ascend/memory/AscendMemoryPool.cpp new file mode 100644 index 000000000..a5234d96b --- /dev/null +++ b/mllm/backends/ascend/memory/AscendMemoryPool.cpp @@ -0,0 +1,106 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include +#include +#include "AscendMemoryPool.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/utils/Log.hpp" + +namespace mllm::ascend { + +constexpr size_t POOL_SIZE = 104857600; // 100 MiB, + +AscendMemoryPool::AscendMemoryPool(size_t pool_size = POOL_SIZE) { + auto ret = aclrtMalloc(&base_mem_ptr_, pool_size, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_SUCCESS) { + MLLM_ERROR("Failed to allocate Ascend memory pool of size {} bytes: ACL error {}", + pool_size, int(ret)); + base_mem_ptr_ = nullptr; + cur_mem_ptr_ = nullptr; + remain_size_ = 0; + return; + } + cur_mem_ptr_ = base_mem_ptr_; + remain_size_ = pool_size; +} + +AscendMemoryPool::~AscendMemoryPool() { + if (base_mem_ptr_ != nullptr) { + auto ret = aclrtFree(base_mem_ptr_); + if (ret != ACL_SUCCESS) { + MLLM_ERROR("Failed to free Ascend memory pool: ACL error {}", int(ret)); + } + } + MLLM_INFO("release MemoryPool success"); +} + +uint64_t AscendMemoryPool::generateBlocksId() { + return static_cast(id_.fetch_add(1, std::memory_order_relaxed)); +} + +void AscendMemoryPool::allocateBlock(uint32_t size, int &block_id) { + std::unique_lock 
lock(block_mutex_); + + size_t align_size = ((size + 31) & ~31) + 32; + + for (auto it = free_blocks_.begin(); it != free_blocks_.end(); it++) { + if (it->second.block_size_ >= align_size) { + block_id = it->second.block_id_; + used_blocks_.insert(*it); + free_blocks_.erase(it); + MLLM_INFO("find free block id {} to allocate", block_id); + return; + } + } + + if (remain_size_ > align_size) { + block_id = generateBlocksId(); + uint64_t cur_mem_ptr_align = (reinterpret_cast(cur_mem_ptr_) + 63) & ~63; + remain_size_ -= (cur_mem_ptr_align - reinterpret_cast(cur_mem_ptr_)); + cur_mem_ptr_ = reinterpret_cast(cur_mem_ptr_align); + + MemoryBlock block = {block_id, align_size, cur_mem_ptr_}; + used_blocks_.insert({block_id, block}); + remain_size_ -= align_size; + cur_mem_ptr_ = reinterpret_cast(cur_mem_ptr_) + align_size; + MLLM_INFO("allocate block id {} for size {}", block_id, align_size); + return; + } + MLLM_ERROR("allocate block fail"); +} + +void AscendMemoryPool::freeBlock(int block_id) { + std::unique_lock lock(block_mutex_); + + if (block_id < 0) { + MLLM_INFO("skip over the invalid block id {}", block_id); + return; + } + + auto it = used_blocks_.find(block_id); + if (it != used_blocks_.end()) { + free_blocks_.insert(*it); + used_blocks_.erase(it); + } else { + MLLM_ERROR("Double free block id {}", block_id); + } +} + +void AscendMemoryPool::getBlockPtr(int block_id, void *&addr) { + std::unique_lock lock(block_mutex_); + + if (block_id < 0) { + MLLM_INFO("Invalid block id {} to get ptr", block_id); + return; + } + + auto it = used_blocks_.find(block_id); + if (it != used_blocks_.end()) { + addr = it->second.address_; + } else { + MLLM_ERROR("Get block address error, block id {}", block_id); + } +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/memory/AscendMemoryPool.hpp b/mllm/backends/ascend/memory/AscendMemoryPool.hpp new file mode 100644 index 000000000..1e41fc041 --- /dev/null +++ b/mllm/backends/ascend/memory/AscendMemoryPool.hpp @@ -0,0 
+1,40 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include +#include "MemoryBlock.hpp" + +namespace mllm::ascend { + +class AscendMemoryPool { +public: + explicit AscendMemoryPool(size_t pool_size); + ~AscendMemoryPool(); + + void allocateBlock(uint32_t size, int& block_id); + + void freeBlock(int block_id); + + void getBlockPtr(int block_id, void*& addr); + +private: + uint64_t generateBlocksId(); + + std::atomic id_ = 0; + std::mutex block_mutex_; + + void* base_mem_ptr_ = nullptr; + void* cur_mem_ptr_ = nullptr; + int64_t remain_size_ = 0; + + std::unordered_map used_blocks_; + std::unordered_map free_blocks_; +}; + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/memory/MemoryBlock.hpp b/mllm/backends/ascend/memory/MemoryBlock.hpp new file mode 100644 index 000000000..eda57bf09 --- /dev/null +++ b/mllm/backends/ascend/memory/MemoryBlock.hpp @@ -0,0 +1,17 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include + +namespace mllm::ascend { + +struct MemoryBlock { + int64_t block_id_; + size_t block_size_; + void* address_ = nullptr; +}; + +} // namespace mllm::ascend From 1d6bd24155d0555a2135890dbdb6a0bd2194c130 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:09:07 +0800 Subject: [PATCH 03/16] feat(ascend backend): create Ascend backend runtime, allocator and dispatcher --- mllm/backends/ascend/AscendAllocator.cpp | 121 ++++++++------ mllm/backends/ascend/AscendAllocator.hpp | 12 ++ mllm/backends/ascend/AscendBackend.cpp | 13 +- mllm/backends/ascend/AscendCommon.cpp | 187 ++++++++++++++++++++++ mllm/backends/ascend/AscendCommon.hpp | 96 +++++++++++ mllm/backends/ascend/AscendDispatcher.cpp | 91 +++++++++++ mllm/backends/ascend/AscendDispatcher.hpp | 41 +++++ mllm/backends/ascend/CMakeLists.txt | 57 ++++--- mllm/backends/ascend/Register.cpp | 41 +++++ 9 files changed, 587 insertions(+), 72 deletions(-) create mode 100644 mllm/backends/ascend/Register.cpp diff --git a/mllm/backends/ascend/AscendAllocator.cpp b/mllm/backends/ascend/AscendAllocator.cpp index 641550b52..f41ba3612 100644 --- a/mllm/backends/ascend/AscendAllocator.cpp +++ b/mllm/backends/ascend/AscendAllocator.cpp @@ -1,84 +1,105 @@ // Copyright (c) MLLM Team. // Licensed under the MIT License. 
-#ifndef ASCENDC_CPU_DEBUG -#include -#else -#include -#endif - #include "mllm/backends/ascend/AscendAllocator.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" + +#include "mllm/utils/Common.hpp" namespace mllm::ascend { +AscendAllocator::AscendAllocator() { + MLLM_INFO("AscendAllocator created with memory pool support"); +} + +AscendAllocator::~AscendAllocator() { + std::lock_guard lock(block_map_mutex_); + if (!storage_to_block_id_.empty()) { + MLLM_WARN("AscendAllocator destroyed with {} storage blocks still allocated", + storage_to_block_id_.size()); + } +} + bool AscendAllocator::alloc(Storage* storage) { -#ifdef ASCENDC_CPU_DEBUG - storage->ptr_ = AscendC::GmAlloc(storage->size_); -#else - aclrtMalloc((void**)&(storage->ptr_), storage->size_, ACL_MEM_MALLOC_HUGE_FIRST); -#endif - return storage->ptr_ != nullptr; + auto& mem_manager = getAscendMemoryManager(); + int block_id = -1; + mem_manager.allocateBlock(storage->size_, block_id); + if (block_id < 0) { + MLLM_ERROR("Failed to allocate block of size {} bytes from memory pool", storage->size_); + return false; + } + + mem_manager.getBlockPtr(block_id, storage->ptr_); + if (storage->ptr_ == nullptr) { + MLLM_ERROR("Failed to get pointer for block ID {}", block_id); + mem_manager.freeBlock(block_id); + return false; + } + + { + std::lock_guard lock(block_map_mutex_); + storage_to_block_id_[storage->ptr_] = block_id; + } + + MLLM_INFO("Allocated storage: size={} bytes, block ID={}, ptr={}", + storage->size_, block_id, storage->ptr_); + return true; } bool AscendAllocator::alloc(const Storage::ptr_t& storage) { -#ifdef ASCENDC_CPU_DEBUG - storage->ptr_ = AscendC::GmAlloc(storage->size_); -#else - aclrtMalloc((void**)&(storage->ptr_), storage->size_, ACL_MEM_MALLOC_HUGE_FIRST); -#endif - return storage->ptr_ != nullptr; + return alloc(storage.get()); } void AscendAllocator::free(const Storage::ptr_t& storage) { -#ifdef ASCENDC_CPU_DEBUG - AscendC::GmFree((void*)storage->ptr_); -#else - 
aclrtFree(storage->ptr_); -#endif + free(storage.get()); } void AscendAllocator::free(Storage* storage) { -#ifdef ASCENDC_CPU_DEBUG - AscendC::GmFree((void*)storage->ptr_); -#else - aclrtFree(storage->ptr_); -#endif + if (storage->ptr_ == nullptr) { + return; + } + + int block_id = -1; + { + std::lock_guard lock(block_map_mutex_); + auto it = storage_to_block_id_.find(storage->ptr_); + if (it != storage_to_block_id_.end()) { + block_id = it->second; + storage_to_block_id_.erase(it); + } + } + + if (block_id >= 0) { + getAscendMemoryManager().freeBlock(block_id); + MLLM_INFO("Freed storage: block ID={}, ptr={}", block_id, storage->ptr_); + } else { + MLLM_WARN("Attempted to free storage with no block ID mapping: ptr={}", storage->ptr_); + } + + storage->ptr_ = nullptr; } bool AscendAllocator::generalAlloc(void** ptr, size_t cap, size_t align) { -#ifdef ASCENDC_CPU_DEBUG - *ptr = AscendC::GmAlloc(cap); -#else - aclrtMalloc((void**)ptr, cap, ACL_MEM_MALLOC_HUGE_FIRST); -#endif - return *ptr != nullptr; + return true; } void AscendAllocator::generalFree(void* ptr) { -#ifdef ASCENDC_CPU_DEBUG - AscendC::GmFree((void*)ptr); -#else - aclrtFree(ptr); -#endif + } size_t AscendAllocator::allocSize(const Storage::ptr_t& storage) { - // remember that alloc size should be aligned - size_t align_size = alignSize(); - size_t required_size = storage->size_; - size_t aligned_size = (required_size + align_size - 1) & ~(align_size - 1); - return aligned_size; + // Ascend allocations don't require manual alignment padding + // since AscendMemoryPool already provides proper alignment + return storage->size_; } size_t AscendAllocator::allocSize(Storage* storage) { - // remember that alloc size should be aligned - size_t align_size = alignSize(); - size_t required_size = storage->size_; - size_t aligned_size = (required_size + align_size - 1) & ~(align_size - 1); - return aligned_size; + // Ascend allocations don't require manual alignment padding + // since AscendMemoryPool already 
provides proper alignment + return storage->size_; } -size_t AscendAllocator::alignSize() const { return 128; } +size_t AscendAllocator::alignSize() const { return 64; } std::shared_ptr createAscendAllocator() { return std::make_shared(); } diff --git a/mllm/backends/ascend/AscendAllocator.hpp b/mllm/backends/ascend/AscendAllocator.hpp index 82f000fab..d3c7390c2 100644 --- a/mllm/backends/ascend/AscendAllocator.hpp +++ b/mllm/backends/ascend/AscendAllocator.hpp @@ -6,10 +6,17 @@ #include "mllm/backends/base/Allocator.hpp" #include "mllm/core/Storage.hpp" +#include +#include + + namespace mllm::ascend { class AscendAllocator final : public Allocator { public: + AscendAllocator(); + ~AscendAllocator(); + inline bool ctrlByMemManager() override { return false; } bool alloc(Storage* storage) override; @@ -29,6 +36,11 @@ class AscendAllocator final : public Allocator { size_t allocSize(const Storage::ptr_t& storage) override; [[nodiscard]] size_t alignSize() const override; + +private: + std::mutex block_map_mutex_; + std::unordered_map storage_to_block_id_; // Storage ptr -> block ID + }; std::shared_ptr createAscendAllocator(); diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp index e30db6d69..408cb2518 100644 --- a/mllm/backends/ascend/AscendBackend.cpp +++ b/mllm/backends/ascend/AscendBackend.cpp @@ -3,11 +3,22 @@ #include "mllm/backends/ascend/AscendBackend.hpp" #include "mllm/backends/ascend/AscendAllocator.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" #include "mllm/core/DeviceTypes.hpp" +#include "mllm/backends/ascend/ops/AscendElewiseOps.hpp" + namespace mllm::ascend { -AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) {} +AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) { + regOpFactory(); + auto& devices = AscendDeviceMetaInfo::instance().devices; + for (const auto& device : devices) { + const auto bytes_to_mb = [](size_t bytes) { return bytes / (1024.0 * 
1024.0); }; + MLLM_INFO("Found Ascend device {} (ID: {}, SOC: {}, Memory: {:.2f} MB free / {:.2f} MB total)", device.name, + device.id, device.soc_version, bytes_to_mb(device.free_memory), bytes_to_mb(device.total_memory)); + } +} std::shared_ptr createAscendBackend() { return std::make_shared(); } diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index e69de29bb..dabb9412c 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -0,0 +1,187 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/ascend/AscendCommon.hpp" + +#include + +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" + +namespace mllm::ascend { + +static aclrtStream g_atb_stream = nullptr; + +AscendTensorHandle::AscendTensorHandle(Tensor tensor, int block_id) + : tensor_(std::move(tensor)), block_id_(block_id) {} + +AscendTensorHandle::~AscendTensorHandle() { release(); } + +AscendTensorHandle::AscendTensorHandle(AscendTensorHandle&& other) noexcept + : tensor_(std::move(other.tensor_)), block_id_(other.block_id_) { + other.block_id_ = -1; +} + +AscendTensorHandle& AscendTensorHandle::operator=(AscendTensorHandle&& other) noexcept { + if (this != &other) { + release(); + tensor_ = std::move(other.tensor_); + block_id_ = other.block_id_; + other.block_id_ = -1; + } + return *this; +} + +void AscendTensorHandle::release() { + if (block_id_ >= 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(block_id_); + block_id_ = -1; + tensor_.impl()->storage()->ptr_ = nullptr; + } +} + +AscendTensorHandle prepareAscendTensor(const std::vector& host_data, + int batch, + int size) { + const size_t expected_elements = static_cast(batch) * static_cast(size); + MLLM_RT_ASSERT_EQ(host_data.size(), expected_elements); + + std::vector fp16_data(expected_elements); + for (size_t i = 0; i < expected_elements; ++i) { + fp16_data[i] = half_float::half(host_data[i]); + } + 
+ auto tensor = Tensor::empty({batch, size}, kFloat16, kAscend); + + auto& mem_mgr = getAscendMemoryManager(); + int block_id = -1; + const uint32_t bytes = static_cast(expected_elements * sizeof(half_float::half)); + + mem_mgr.allocateBlock(bytes, block_id); + + void* device_ptr = nullptr; + mem_mgr.getBlockPtr(block_id, device_ptr); + tensor.impl()->storage()->ptr_ = device_ptr; + + auto ret = aclrtMemcpy( + device_ptr, bytes, + fp16_data.data(), bytes, + ACL_MEMCPY_HOST_TO_DEVICE); + + if (ret != ACL_SUCCESS) { + mem_mgr.freeBlock(block_id); + MLLM_ACL_CHECK(ret); + } + + return AscendTensorHandle(std::move(tensor), block_id); +} + +atb::Context* getGlobalAtbContext() { + static atb::Context* ctx = nullptr; + + if (ctx == nullptr) { + // 1. Set Device + auto acl_ret = aclrtSetDevice(0); + MLLM_ACL_CHECK(acl_ret); + + // 2. Create Context + auto ret = atb::CreateContext(&ctx); + MLLM_ATB_CHECK(ret); + + // 3. Create Stream + acl_ret = aclrtCreateStream(&g_atb_stream); + MLLM_ACL_CHECK(acl_ret); + + // 4. 
Set Stream + ctx->SetExecuteStream(g_atb_stream); + } + return ctx; +} + +aclrtStream getGlobalAtbStream() { + getGlobalAtbContext(); // Ensure initialized + return g_atb_stream; +} + +void syncGlobalAtbStream() { + if (g_atb_stream != nullptr) { + auto ret = aclrtSynchronizeStream(g_atb_stream); + MLLM_ACL_CHECK(ret); + } +} + +void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) { + desc.dtype = ACL_FLOAT16; // Currently hardcoded as per demo, can be expanded later + desc.format = ACL_FORMAT_ND; + + auto shape = t.shape(); + desc.shape.dimNum = static_cast(shape.size()); + for (uint64_t i = 0; i < desc.shape.dimNum; ++i) { + desc.shape.dims[i] = static_cast(shape[i]); + } +} + +AscendDeviceMetaInfo::AscendDeviceMetaInfo() { +#ifndef ASCENDC_CPU_DEBUG + // Initialize ACL to query devices + auto ret = aclInit(nullptr); + if (ret != ACL_SUCCESS) { + MLLM_ERROR("Failed to initialize ACL for device enumeration: {}", ret); + return; + } + + // Get device count + uint32_t device_count = 0; + ret = aclrtGetDeviceCount(&device_count); + if (ret != ACL_SUCCESS) { + MLLM_ERROR("Failed to get Ascend device count: {}", ret); + aclFinalize(); + return; + } + + // Collect info for each device + for (uint32_t i = 0; i < device_count; ++i) { + AscendDeviceInfo info; + info.id = i; + info.name = "Ascend Device " + std::to_string(i); + + // Set device to query its properties + ret = aclrtSetDevice(i); + if (ret == ACL_SUCCESS) { + // Get memory information + size_t free_mem = 0, total_mem = 0; + ret = aclrtGetMemInfo(ACL_HBM_MEM, &free_mem, &total_mem); + if (ret == ACL_SUCCESS) { + info.total_memory = total_mem; + info.free_memory = free_mem; + } else { + info.total_memory = 0; + info.free_memory = 0; + } + + // SOC version - platform specific, set to unknown for now + info.soc_version = "Unknown"; + } else { + info.total_memory = 0; + info.free_memory = 0; + info.soc_version = "Unknown"; + } + + devices.push_back(info); + } + + // Finalize ACL after enumeration + 
aclFinalize(); +#else + // In CPU debug mode, add a dummy device + AscendDeviceInfo info; + info.id = 0; + info.name = "Ascend CPU Debug Device"; + info.total_memory = 0; + info.free_memory = 0; + info.soc_version = "CPU_DEBUG"; + devices.push_back(info); +#endif +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/AscendCommon.hpp b/mllm/backends/ascend/AscendCommon.hpp index e69de29bb..7b7cea8ec 100644 --- a/mllm/backends/ascend/AscendCommon.hpp +++ b/mllm/backends/ascend/AscendCommon.hpp @@ -0,0 +1,96 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include +#include + +#include +#include +#include + +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/utils/Common.hpp" + +// Ascend ACL error checking macro +#define MLLM_ACL_CHECK(err) \ + if (err != ACL_SUCCESS) { \ + MLLM_ERROR_EXIT(::mllm::ExitCode::kAscendError, "ACL error code {}: {}", int(err), aclGetRecentErrMsg()); \ + } + +// Ascend ATB error checking macro +#define MLLM_ATB_CHECK(err) \ + if (err != atb::NO_ERROR) { \ + MLLM_ERROR_EXIT(::mllm::ExitCode::kAscendError, "ATB error code {}", int(err)); \ + } + +namespace mllm::ascend { + +// Get global ATB Context (Lazy Initialization: aclrtSetDevice, atb::CreateContext, aclrtCreateStream, SetExecuteStream) +atb::Context* getGlobalAtbContext(); + +// Get global ATB Stream +aclrtStream getGlobalAtbStream(); + +// Sync global ATB Stream +void syncGlobalAtbStream(); + +// Convert MLLM Tensor metadata to ATB TensorDesc +void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc); + +// Ascend device information structure +struct AscendDeviceInfo { + std::string name; + unsigned int id; + size_t total_memory; // bytes + size_t free_memory; // bytes + std::string soc_version; +}; + +// Ascend device metadata collector (singleton) +class AscendDeviceMetaInfo { + public: + AscendDeviceMetaInfo(); + + static AscendDeviceMetaInfo& instance() { + static AscendDeviceMetaInfo 
instance; + return instance; + } + + AscendDeviceMetaInfo(const AscendDeviceMetaInfo&) = delete; + AscendDeviceMetaInfo& operator=(const AscendDeviceMetaInfo&) = delete; + + std::vector devices; +}; + +// RAII handle for Ascend tensor with automatic memory block management +struct AscendTensorHandle { + AscendTensorHandle() = default; + AscendTensorHandle(Tensor tensor, int block_id); // Construct with tensor and memory block ID + ~AscendTensorHandle(); // Auto-release memory block + + AscendTensorHandle(const AscendTensorHandle&) = delete; + AscendTensorHandle& operator=(const AscendTensorHandle&) = delete; + AscendTensorHandle(AscendTensorHandle&& other) noexcept; // Move constructor + AscendTensorHandle& operator=(AscendTensorHandle&& other) noexcept; // Move assignment + + void release(); // Manually release memory block and invalidate handle + bool valid() const { return block_id_ >= 0; } // Check if handle owns a valid memory block + + Tensor& tensor() { return tensor_; } // Access tensor + const Tensor& tensor() const { return tensor_; } // Access tensor (const) + int blockId() const { return block_id_; } // Get memory block ID + + private: + Tensor tensor_; + int block_id_{-1}; +}; + +// Prepare Ascend tensor from host float data (converts to FP16, allocates device memory, copies data) +AscendTensorHandle prepareAscendTensor(const std::vector& host_data, + int batch, + int size); + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/AscendDispatcher.cpp b/mllm/backends/ascend/AscendDispatcher.cpp index e69de29bb..0ceae8f9e 100644 --- a/mllm/backends/ascend/AscendDispatcher.cpp +++ b/mllm/backends/ascend/AscendDispatcher.cpp @@ -0,0 +1,91 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/AscendDispatcher.hpp" +#include "mllm/backends/ascend/AscendBackend.hpp" +#include "mllm/core/OpTypes.hpp" +#include "mllm/engine/Dispatcher.hpp" +#include "mllm/engine/Context.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/nn/Module.hpp" +#include "mllm/tracy_perf/Tracy.hpp" + +#ifdef MLLM_PERFETTO_ENABLE +#include "mllm/engine/Perf.hpp" +#endif + +namespace mllm::ascend { + +AscendDispatcher::AscendDispatcher(exec::static_thread_pool& thread_pool, dispatcher_id_t id, + const AscendDispatcherOptions& options) + : Dispatcher(thread_pool, id), options_(options) {} + +void AscendDispatcher::receive(const Task::ptr_t& task) { + switch (task->type) { + case TaskTypes::kExecuteModule: + case TaskTypes::kExecuteOp: { + process(task); + break; + } + default: NYI("Only execute op/module task is supported in AscendDispatcher::receive"); + } +} + +TaskResult::sender_t AscendDispatcher::asyncReceive(const Task::ptr_t& task) { + switch (task->type) { + case TaskTypes::kExecuteModule: { + MLLM_EMPTY_SCOPE; + break; + } + default: NYI("Only execute module task is supported in AscendDispatcher::asyncReceive"); + } + auto scheduler = thread_pool_.get_scheduler(); + return stdexec::schedule(scheduler) | stdexec::then([this, task] { process(task); }); +} + +void AscendDispatcher::process(const Task::ptr_t& task) { + MLLM_TRACY_ZONE_SCOPED; + switch (task->type) { + case TaskTypes::kExecuteOp: { + task->op->reshape(task->inputs, task->outputs); + task->op->setup(task->inputs, task->outputs); + task->op->forward(task->inputs, task->outputs); + + break; + } + case TaskTypes::kExecuteModule: { + auto moduleName = static_cast(task->custom_context_ptr)->getModuleName(); +#ifdef MLLM_PERFETTO_ENABLE + MLLM_PERF_TRACE_EVENT("mllm.ascend.execute.", perfetto::DynamicString{moduleName}, + [&](perfetto::EventContext ctx) { + int cnt = 0; + for (auto& i : task->inputs) { + ctx.AddDebugAnnotation(perfetto::DynamicString{"inputs-" + std::to_string(cnt++)}, 
+ i.shape()); + } + }); +#endif + auto ascendBackend = std::static_pointer_cast(Context::instance().getBackend(kAscend)); + + task->outputs = ((nn::Module*)(task->custom_context_ptr))->forward(task->inputs, task->args); + + // TODO: + // ascendBackend->graphExecute(moduleName, task->inputs, task->outputs); + break; + } + default: NYI("AscendDispatcher::process not supported task type"); + } +} + +void AscendDispatcher::syncWait() { + // TODO +} + +AscendDispatcher::ptr_t createAscendDispatcher(exec::static_thread_pool& thread_pool, + const AscendDispatcherOptions& options) { + return std::make_shared(thread_pool, Dispatcher::ascend_dispatcher_id, options); +} + +} // namespace mllm::ascend + + diff --git a/mllm/backends/ascend/AscendDispatcher.hpp b/mllm/backends/ascend/AscendDispatcher.hpp index e69de29bb..7b8ad5943 100644 --- a/mllm/backends/ascend/AscendDispatcher.hpp +++ b/mllm/backends/ascend/AscendDispatcher.hpp @@ -0,0 +1,41 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include + +#include "mllm/engine/Dispatcher.hpp" +#include "mllm/utils/Common.hpp" + +namespace mllm::ascend { + +struct AscendDispatcherOptions { + MLLM_EMPTY_SCOPE; +}; + +class AscendDispatcher final : public Dispatcher { + public: + using ptr_t = std::shared_ptr; + + explicit AscendDispatcher(exec::static_thread_pool& thread_pool, dispatcher_id_t id, + const AscendDispatcherOptions& options); + + void receive(const Task::ptr_t& task) override; + + TaskResult::sender_t asyncReceive(const Task::ptr_t& task) override; + + void process(const Task::ptr_t& task) override; + + void syncWait() override; + + private: + AscendDispatcherOptions options_; +}; + +AscendDispatcher::ptr_t createAscendDispatcher(exec::static_thread_pool& thread_pool, + const AscendDispatcherOptions& options); + +} // namespace mllm::ascend + + diff --git a/mllm/backends/ascend/CMakeLists.txt b/mllm/backends/ascend/CMakeLists.txt index b8653bfbf..6386948fd 100644 --- a/mllm/backends/ascend/CMakeLists.txt +++ b/mllm/backends/ascend/CMakeLists.txt @@ -1,10 +1,6 @@ -file(GLOB MLLM_ASCEND_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/kernels/*.cpp) - set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") -set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") -set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" - CACHE STRING "ASCEND CANN package installation directory" -) +set(SOC_VERSION "Ascend310B1" CACHE STRING "system on chip type") + if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) endif() @@ -13,35 +9,54 @@ if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) endif() message(STATUS "SOC_VERSION is ${SOC_VERSION}, RUN_MODE is ${RUN_MODE}") -if("${RUN_MODE}" STREQUAL "cpu") - include(cmake/cpu_lib.cmake) -else() - include(cmake/npu_lib.cmake) -endif() # ============ The CATLASS Code ============ # TODO add catlass # ============ MLLM Code ============ -add_library( - MllmAscendBackend SHARED - 
AscendAllocator.cpp - AscendBackend.cpp - AscendCommon.cpp - AscendDispatcher.cpp +file(GLOB MLLM_ASCEND_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Ascend*.cpp) +file(GLOB MLLM_ASCEND_MEMORY_FILES ${CMAKE_CURRENT_SOURCE_DIR}/memory/*.cpp) +file(GLOB MLLM_ASCEND_OPS_FILES ${CMAKE_CURRENT_SOURCE_DIR}/ops/*.cpp) + +set(MLLM_ASCEND_SOURCES + ${MLLM_ASCEND_CORE_FILES} + ${MLLM_ASCEND_MEMORY_FILES} + ${MLLM_ASCEND_OPS_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/Register.cpp ) + +add_library(MllmAscendBackend SHARED ${MLLM_ASCEND_SOURCES}) + +if(DEFINED ENV{ASCEND_HOME_PATH}) + target_include_directories(MllmAscendBackend PUBLIC $ENV{ASCEND_HOME_PATH}/include) + target_link_directories(MllmAscendBackend PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) +else() + message(WARNING "ASCEND_HOME_PATH is not set, Ascend headers and libs may not be found") +endif() + +if(DEFINED ENV{ATB_HOME_PATH}) + target_include_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/include) + target_link_directories(MllmAscendBackend PRIVATE $ENV{ATB_HOME_PATH}/lib) +else() + message(WARNING "ATB_HOME_PATH not defined, ATB library will not be linked") +endif() + + target_link_libraries(MllmAscendBackend PRIVATE - $:host_intf_pub>> - $:tikicpulib::${SOC_VERSION}>> ascendcl - $:c_sec>> - $:MllmAscendKernel>> + opapi + nnopbase + atb MllmRT ) target_compile_definitions(MllmAscendBackend PUBLIC ASCENDC_DUMP=0) + +target_compile_definitions(MllmAscendBackend PUBLIC MLLM_USE_ASCEND_MEMPOOL) set_target_properties(MllmAscendBackend PROPERTIES CXX_STANDARD 20) target_compile_options(MllmAscendBackend PUBLIC $:-g>> -O2 ) target_include_directories(MllmAscendBackend PUBLIC "${MLLM_ASCEND_INSTALL_PATH}/include/") + + diff --git a/mllm/backends/ascend/Register.cpp b/mllm/backends/ascend/Register.cpp new file mode 100644 index 000000000..2cdca6a03 --- /dev/null +++ b/mllm/backends/ascend/Register.cpp @@ -0,0 +1,41 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/DeviceTypes.hpp" +#include "mllm/engine/Context.hpp" +#include "mllm/mllm.hpp" +#include "mllm/backends/ascend/AscendBackend.hpp" +#include "mllm/backends/ascend/AscendDispatcher.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" + +namespace mllm { + +void initAscendBackend() { + auto& ctx = Context::instance(); + + // 1. Create memory pool + size_t pool_size = 100 * 1024 * 1024; // 100MB, can be adjusted as needed + ascend::getAscendMemoryManager().createMemoryPool(pool_size); + MLLM_INFO("Ascend memory pool initialized"); + + // 2. Register backend + auto backend = std::make_shared(); + ctx.registerBackend(backend); + + // 3. Register allocator + ctx.memoryManager()->registerAllocator(kAscend, backend->allocator(), MemoryManagerOptions()); + + // 4. Register dispatcher + auto dispatcher = ascend::createAscendDispatcher(ctx.dispatcherManager()->getExecutor(), + ascend::AscendDispatcherOptions{}); + ctx.dispatcherManager()->registerDispatcher(dispatcher); + MLLM_INFO("Ascend dispatcher registered"); + + // 5. Register custom ops + // ctx.registerCustomizedOp(kAscend, "CustomOpName", + // std::make_shared()); +} + +} // namespace mllm From bf060a9660b11f950da60211d51a766959f285df Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:09:42 +0800 Subject: [PATCH 04/16] feat(ascend): add Ascend elementwise ops --- mllm/backends/ascend/ops/AscendElewiseOps.hpp | 27 ++++ mllm/backends/ascend/ops/AscnedElewiseOps.cpp | 118 ++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 mllm/backends/ascend/ops/AscendElewiseOps.hpp create mode 100644 mllm/backends/ascend/ops/AscnedElewiseOps.cpp diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.hpp b/mllm/backends/ascend/ops/AscendElewiseOps.hpp new file mode 100644 index 000000000..26117cbc2 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendElewiseOps.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. 
+// Licensed under the MIT License. + +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/ElewiseOps.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendAddOp final : public aops::AddOp { + public: + explicit AscendAddOp(const aops::AddOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendAddOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::AddOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp new file mode 100644 index 000000000..fc6fad429 --- /dev/null +++ b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp @@ -0,0 +1,118 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/ops/AscendElewiseOps.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendAddOp::AscendAddOp(const aops::AddOpOptions& options) : aops::AddOp(options) {} + +void AscendAddOp::setup(const std::vector& inputs, std::vector& outputs) { + for (auto& t : outputs) { + if (!t.isNil()) { + auto& mem_mgr = getAscendMemoryManager(); + int block_id = -1; + void* device_ptr = nullptr; + + mem_mgr.allocateBlock(static_cast(t.bytes()), block_id); + mem_mgr.getBlockPtr(block_id, device_ptr); + + t.impl()->storage()->ptr_ = device_ptr; + } + } +} + +void AscendAddOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 2); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + const auto& y = inputs[1]; + auto& z = outputs[0]; + + if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) { + NYI("AscendAddOp currently requires x/y/z have same dtype"); + } + if (x.numel() != y.numel() || x.numel() != z.numel()) { + NYI("AscendAddOp demo only supports no-broadcast case (numel equal)"); + } + + atb::infer::ElewiseParam addParam; + addParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_ADD; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(addParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_ADD) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_y; + atb::Tensor atb_z; + + fillAtbTensorDesc(x, atb_x.desc); + fillAtbTensorDesc(y, atb_y.desc); + fillAtbTensorDesc(z, atb_z.desc); + + atb_x.deviceData = reinterpret_cast(x.ptr()); + atb_x.dataSize = x.bytes(); + 
atb_y.deviceData = reinterpret_cast(y.ptr()); + atb_y.dataSize = y.bytes(); + atb_z.deviceData = reinterpret_cast(z.ptr()); + atb_z.dataSize = z.bytes(); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + inTensors.push_back(atb_y); + outTensors.push_back(atb_z); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Execute failed, status={}", static_cast(st)); + } + + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend \ No newline at end of file From a618fd8785a149d64d676303e2eb2ab198584319 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:12:31 +0800 Subject: [PATCH 05/16] fix(ascend):add enum for Ascend --- mllm/core/DeviceTypes.hpp | 6 ++++++ mllm/engine/Dispatcher.hpp | 1 + mllm/utils/Common.hpp | 1 + 3 files changed, 8 insertions(+) diff --git a/mllm/core/DeviceTypes.hpp b/mllm/core/DeviceTypes.hpp index 0ccd1c345..3c6a98ac4 100644 --- a/mllm/core/DeviceTypes.hpp +++ b/mllm/core/DeviceTypes.hpp @@ -33,6 +33,8 @@ inline const char* deviceTypes2Str(DeviceTypes type) { case DeviceTypes::kCUDA: return "CUDA"; case DeviceTypes::kOpenCL: return "OpenCL"; case DeviceTypes::kQNN: return "QNN"; + case 
DeviceTypes::kAscend: return "Ascend"; + case DeviceTypes::kAscendHost: return "AscendHost"; case DeviceTypes::kDeviceTypes_End: return "DeviceTypes_End"; default: return "Unknown"; } @@ -47,6 +49,10 @@ inline DeviceTypes str2DeviceType(const std::string& type_str) { return DeviceTypes::kOpenCL; } else if (type_str == "QNN") { return DeviceTypes::kQNN; + } else if (type_str == "Ascend") { + return DeviceTypes::kAscend; + } else if (type_str == "AscendHost") { + return DeviceTypes::kAscendHost; } else { return DeviceTypes::kDeviceTypes_End; } diff --git a/mllm/engine/Dispatcher.hpp b/mllm/engine/Dispatcher.hpp index 7ac5b8597..8ed7044fe 100644 --- a/mllm/engine/Dispatcher.hpp +++ b/mllm/engine/Dispatcher.hpp @@ -25,6 +25,7 @@ class Dispatcher { static constexpr int32_t cuda_dispatcher_id = static_cast(DeviceTypes::kCUDA); static constexpr int32_t opencl_dispatcher_id = static_cast(DeviceTypes::kOpenCL); static constexpr int32_t qnn_dispatcher_id = static_cast(DeviceTypes::kQNN); + static constexpr int32_t ascend_dispatcher_id = static_cast(DeviceTypes::kAscend); static constexpr int32_t trace_dispatcher_id = static_cast(DeviceTypes::kDeviceTypes_End) + 1; static constexpr int32_t cpu_memory_disk_io_dispatcher_id = static_cast(DeviceTypes::kDeviceTypes_End) + 2; static constexpr int32_t custom_dispatcher_start_id = static_cast(DeviceTypes::kDeviceTypes_End) + 3; diff --git a/mllm/utils/Common.hpp b/mllm/utils/Common.hpp index 1df4de265..abcd7d169 100644 --- a/mllm/utils/Common.hpp +++ b/mllm/utils/Common.hpp @@ -31,6 +31,7 @@ enum class ExitCode : int32_t { // NOLINT kCudaError, kQnnError, kOpenCLError, + kAscendError, kIOError, kShapeError, kCPUKernelError, From eb52ca84adcef37bbfa77cea00875beb7d0b17d9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:14:44 +0800 Subject: [PATCH 06/16] feat(ascend): create for Ascend --- tasks/build_arm_ascend.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 
tasks/build_arm_ascend.yaml diff --git a/tasks/build_arm_ascend.yaml b/tasks/build_arm_ascend.yaml new file mode 100644 index 000000000..4546f2d81 --- /dev/null +++ b/tasks/build_arm_ascend.yaml @@ -0,0 +1,18 @@ +Tasks: + - CMakeConfigTask: + cmake_cfg_path: "build-arm-ascend" + cmake_build_type: "Release" + cmake_extra_args: + - "-DMLLM_CROSS_COMPILE=ON" + - "-DMLLM_BUILD_ARM_BACKEND=ON" + - "-DMLLM_BUILD_ASCEND_BACKEND=ON" + - "-DANDROID_PLATFORM=android-28" + - "-DANDROID_ABI=arm64-v8a" + - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=armv8.2-a+fp16+fp16fml+dotprod+i8mm;-ffast-math;-Wno-nan-infinity-disabled"' + - "-DCMAKE_INSTALL_PREFIX=/root/mllm-install-android-arm64-v8a" + - "-DMLLM_KERNEL_USE_THREADS=ON" + - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=ON" + - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF" + + - CMakeBuildTask: + cmake_cfg_path: "build-arm-ascend" From d44f81caa046a54086108fe83e1dd7411c4031ce Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 13 Dec 2025 17:43:11 +0800 Subject: [PATCH 07/16] fix(ascend): fix critical issues from CodeRabbit review --- examples/ascend_add_demo/main.cpp | 1 - mllm/backends/ascend/AscendAllocator.cpp | 9 ++-- mllm/backends/ascend/AscendCommon.cpp | 51 ++++++++++--------- mllm/backends/ascend/AscendDispatcher.cpp | 5 +- mllm/backends/ascend/AscendDispatcher.hpp | 2 +- .../ascend/memory/AscendMemoryManager.cpp | 7 ++- .../ascend/memory/AscendMemoryPool.hpp | 10 ++-- 7 files changed, 44 insertions(+), 41 deletions(-) diff --git a/examples/ascend_add_demo/main.cpp b/examples/ascend_add_demo/main.cpp index 6e439d388..b9704fb5b 100644 --- a/examples/ascend_add_demo/main.cpp +++ b/examples/ascend_add_demo/main.cpp @@ -114,7 +114,6 @@ int main() { std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. 
✗✗✗" << std::endl; } - // 清理内存池中的块 x_handle.release(); y_handle.release(); diff --git a/mllm/backends/ascend/AscendAllocator.cpp b/mllm/backends/ascend/AscendAllocator.cpp index f41ba3612..56dc9db37 100644 --- a/mllm/backends/ascend/AscendAllocator.cpp +++ b/mllm/backends/ascend/AscendAllocator.cpp @@ -3,7 +3,7 @@ #include "mllm/backends/ascend/AscendAllocator.hpp" #include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" - +#include #include "mllm/utils/Common.hpp" namespace mllm::ascend { @@ -80,11 +80,14 @@ void AscendAllocator::free(Storage* storage) { } bool AscendAllocator::generalAlloc(void** ptr, size_t cap, size_t align) { - return true; + //we don't support generalAlloc , therefore return false + std::cout << "generalAlloc is not supported in AscendAllocator" << std::endl; + return false; } void AscendAllocator::generalFree(void* ptr) { - + //we don't support generalFree , therefore do nothing + std::cout << "generalFree is not supported in AscendAllocator" << std::endl; } size_t AscendAllocator::allocSize(const Storage::ptr_t& storage) { diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index dabb9412c..d98eb29de 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -4,12 +4,17 @@ #include "mllm/backends/ascend/AscendCommon.hpp" #include - +#include #include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" namespace mllm::ascend { -static aclrtStream g_atb_stream = nullptr; +namespace { +aclrtStream& globalAtbStream() { + static aclrtStream stream = nullptr; + return stream; +} +} // namespace AscendTensorHandle::AscendTensorHandle(Tensor tensor, int block_id) : tensor_(std::move(tensor)), block_id_(block_id) {} @@ -37,6 +42,8 @@ void AscendTensorHandle::release() { mem_mgr.freeBlock(block_id_); block_id_ = -1; tensor_.impl()->storage()->ptr_ = nullptr; + } else if (tensor_.impl() != nullptr) { + tensor_.delete_(); } } @@ -52,16 +59,10 @@ AscendTensorHandle 
prepareAscendTensor(const std::vector& host_data, } auto tensor = Tensor::empty({batch, size}, kFloat16, kAscend); + tensor.alloc(); - auto& mem_mgr = getAscendMemoryManager(); - int block_id = -1; - const uint32_t bytes = static_cast(expected_elements * sizeof(half_float::half)); - - mem_mgr.allocateBlock(bytes, block_id); - - void* device_ptr = nullptr; - mem_mgr.getBlockPtr(block_id, device_ptr); - tensor.impl()->storage()->ptr_ = device_ptr; + void* device_ptr = tensor.ptr(); + const size_t bytes = tensor.bytes(); auto ret = aclrtMemcpy( device_ptr, bytes, @@ -69,43 +70,45 @@ AscendTensorHandle prepareAscendTensor(const std::vector& host_data, ACL_MEMCPY_HOST_TO_DEVICE); if (ret != ACL_SUCCESS) { - mem_mgr.freeBlock(block_id); MLLM_ACL_CHECK(ret); } - return AscendTensorHandle(std::move(tensor), block_id); + return AscendTensorHandle(std::move(tensor), -1); } atb::Context* getGlobalAtbContext() { static atb::Context* ctx = nullptr; - - if (ctx == nullptr) { + static std::once_flag init_flag; + + std::call_once(init_flag, [&] { // 1. Set Device auto acl_ret = aclrtSetDevice(0); MLLM_ACL_CHECK(acl_ret); - + // 2. Create Context auto ret = atb::CreateContext(&ctx); MLLM_ATB_CHECK(ret); - + // 3. Create Stream - acl_ret = aclrtCreateStream(&g_atb_stream); + auto& stream = globalAtbStream(); + acl_ret = aclrtCreateStream(&stream); MLLM_ACL_CHECK(acl_ret); - + // 4. 
Set Stream - ctx->SetExecuteStream(g_atb_stream); - } + ctx->SetExecuteStream(stream); + }); return ctx; } aclrtStream getGlobalAtbStream() { getGlobalAtbContext(); // Ensure initialized - return g_atb_stream; + return globalAtbStream(); } void syncGlobalAtbStream() { - if (g_atb_stream != nullptr) { - auto ret = aclrtSynchronizeStream(g_atb_stream); + auto stream = globalAtbStream(); + if (stream != nullptr) { + auto ret = aclrtSynchronizeStream(stream); MLLM_ACL_CHECK(ret); } } diff --git a/mllm/backends/ascend/AscendDispatcher.cpp b/mllm/backends/ascend/AscendDispatcher.cpp index 0ceae8f9e..8960ece77 100644 --- a/mllm/backends/ascend/AscendDispatcher.cpp +++ b/mllm/backends/ascend/AscendDispatcher.cpp @@ -50,10 +50,9 @@ void AscendDispatcher::process(const Task::ptr_t& task) { task->op->reshape(task->inputs, task->outputs); task->op->setup(task->inputs, task->outputs); task->op->forward(task->inputs, task->outputs); - break; } - case TaskTypes::kExecuteModule: { + case TaskTypes::kExecuteModule: { //TODO: execute module auto moduleName = static_cast(task->custom_context_ptr)->getModuleName(); #ifdef MLLM_PERFETTO_ENABLE MLLM_PERF_TRACE_EVENT("mllm.ascend.execute.", perfetto::DynamicString{moduleName}, @@ -78,7 +77,7 @@ void AscendDispatcher::process(const Task::ptr_t& task) { } void AscendDispatcher::syncWait() { - // TODO + // TODO: Implement synchronization behavior for outstanding scheduled tasks (if required by engine). 
} AscendDispatcher::ptr_t createAscendDispatcher(exec::static_thread_pool& thread_pool, diff --git a/mllm/backends/ascend/AscendDispatcher.hpp b/mllm/backends/ascend/AscendDispatcher.hpp index 7b8ad5943..71852da12 100644 --- a/mllm/backends/ascend/AscendDispatcher.hpp +++ b/mllm/backends/ascend/AscendDispatcher.hpp @@ -18,7 +18,7 @@ class AscendDispatcher final : public Dispatcher { public: using ptr_t = std::shared_ptr; - explicit AscendDispatcher(exec::static_thread_pool& thread_pool, dispatcher_id_t id, + explicit AscendDispatcher(exec::static_thread_pool& thread_pool, dispatcher_id_t dispatcher_id, const AscendDispatcherOptions& options); void receive(const Task::ptr_t& task) override; diff --git a/mllm/backends/ascend/memory/AscendMemoryManager.cpp b/mllm/backends/ascend/memory/AscendMemoryManager.cpp index b1d4920c3..ed9122b02 100644 --- a/mllm/backends/ascend/memory/AscendMemoryManager.cpp +++ b/mllm/backends/ascend/memory/AscendMemoryManager.cpp @@ -10,12 +10,11 @@ namespace mllm::ascend { -static AscendMemoryManager g_ascendMemoryManager; - -AscendMemoryManager::AscendMemoryManager() {} +AscendMemoryManager::AscendMemoryManager() = default; AscendMemoryManager &getAscendMemoryManager() { - return g_ascendMemoryManager; + static AscendMemoryManager instance; + return instance; } void AscendMemoryManager::createMemoryPool(size_t pool_size) diff --git a/mllm/backends/ascend/memory/AscendMemoryPool.hpp b/mllm/backends/ascend/memory/AscendMemoryPool.hpp index 1e41fc041..d3b01d22f 100644 --- a/mllm/backends/ascend/memory/AscendMemoryPool.hpp +++ b/mllm/backends/ascend/memory/AscendMemoryPool.hpp @@ -26,12 +26,12 @@ class AscendMemoryPool { private: uint64_t generateBlocksId(); - std::atomic id_ = 0; - std::mutex block_mutex_; + std::atomic id_ = 0; + std::mutex block_mutex_; - void* base_mem_ptr_ = nullptr; - void* cur_mem_ptr_ = nullptr; - int64_t remain_size_ = 0; + void* base_mem_ptr_ = nullptr; + void* cur_mem_ptr_ = nullptr; + int64_t remain_size_ = 0; 
std::unordered_map used_blocks_; std::unordered_map free_blocks_; From e25dd341b4fc64c188913e0de8e6d5816a141998 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 16 Dec 2025 16:26:34 +0800 Subject: [PATCH 08/16] feat(ascned):add core design document of ascend backend --- docs/ascend_backend/core_design.rst | 187 ++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 docs/ascend_backend/core_design.rst diff --git a/docs/ascend_backend/core_design.rst b/docs/ascend_backend/core_design.rst new file mode 100644 index 000000000..592089736 --- /dev/null +++ b/docs/ascend_backend/core_design.rst @@ -0,0 +1,187 @@ +Ascend Backend 设计概述 +==================== + +总览 +---- +Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端到端的调度、内存管理与算子生命周期管理,使模型在 Ascend 上高效运行。 + +设计目标 +-------- +- 统一后端:作为 mLLM 原生后端,统一接口与调度流程。 +- ATB 单算子验证:打通算子从框架到 NPU 的完整链路。 +- 生命周期管理:算子创建、准备、执行、销毁的统一抽象。 +- 内存管理:专用 Ascend 设备内存池,减少反复申请释放。 +- 扩展性:便于新增算子、执行模式和性能优化。 + +架构组件 +-------- +.. code-block:: text + +┌─────────────────────────────────────────────────────────────┐ +│ MLLM 框架 │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ 模块 │ │ 层 │ │ 调度器 │ │ +│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ +└─────────┼─────────────────┼─────────────────┼───────────────┘ + │ │ │ + └─────────────────┴─────────────────┘ + │ +┌──────────────────────────────────────────────────────────────┐ +│ Ascend 后端基础设施 │ +│ │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ AscendBackend(核心管理) │ │ +│ │ - 设备/算子注册 - 分配器绑定 │ │ +│ │ - 设备信息日志 │ │ +│ └─────────┬──────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────┴──────────┬──────────────┬─────────────────┐ │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ AscendDispatcher AscendAllocator Ascend Ops AscendCommon │ +│ (执行:算子/ MemoryManager (目前是add (共用代码) │ +│ 模块任务) (内存池) 未来图执行) │ +│ │ +│ │ +│ │ +└────────────────────────────┬─────────────────────────────────┘ + │ +┌────────────────────────────▼──────────────────────────────────┐ 
+│ Ascend runtime │ +│ ┌──────────────┐ ┌──────────────┐ ┌─────────────────┐ │ +│ │ ATB 上下文 │ │ ACL 流 │ │ ATB/ACL 接口 │ │ +│ └──────────────┘ └──────────────┘ └─────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────┐ │ +│ │ Ascend NPU 硬件(orangepi ai pro) │ │ +│ └──────────────────────────────────────────────────────┘ │ +└───────────────────────────────────────────────────────────────┘ + +关键模块 +-------- + +1. mLLM 框架层 + +框架层负责算子抽象、计算任务构建以及统一调度接口的提供。不依赖任何具体设备实现,仅通过 Backend 接口与底层后端交互。算子在该层被封装为可调度的任务(Task),并通过 DispatcherManager 提交给对应设备后端执行。 + +2. Ascend 后端基础设施层 + +该层是 Ascend Backend 的核心实现,负责承接来自框架层的算子任务,并将其映射到 Ascend 运行时执行。主要组成包括: + +**AscendBackend** + +- 后端入口与核心管理模块,负责后端注册、算子工厂管理、分配器与调度器绑定等。 + +**AscendDispatcher** + +- 任务调度与执行模块,负责驱动算子按照统一的生命周期(reshape / setup / forward)执行。 + +**AscendAllocator / AscendMemoryManager** + +- Ascend 设备内存管理模块,负责 Tensor 与 workspace 的分配、回收及内存池管理。 + +**Ascend Ops / AscendCommon** + +- Ascend 专用算子实现及 ATB / ACL 公共工具封装,屏蔽底层运行时细节。 + +3. Ascend Runtime 层 + +运行时层由 Ascend CANN 提供,包含 ATB 算子库、ACL 执行接口以及执行上下文与流管理。 + +4. Ascend 硬件层 + +最底层为 Ascend NPU 硬件,负责实际的计算执行。 + +执行流程(单算子路径) +---------------------- + +.. code-block:: text +1. Ascend Backend 初始化 + - Context 注册 Backend、Allocator、Dispatcher + +2. 输入 Tensor 准备 + - Ascend Tensor 分配 + - Host → Device 拷贝 + +3. 构建并提交算子任务 + - 创建 Ascend Op、Task + - 提交至 Dispatcher + +4. Ascend 上执行算子 + - reshape、setup、forward + - → ATB Operation Execute + +5. 
结果回传与资源释放 + - Device → Host 拷贝验证 + - Tensor 资源释放 + +算子支持与映射 +-------------------------- + +支持的算子 +~~~~~~~~~~~~~~~~~~~ + +当前版本的 Ascend Backend 以验证端到端执行链路为目标,实现了基于 ATB 的 **Add 算子** 支持。 +算子映射策略 +~~~~~~~~~~~~ + +在 Ascend Backend 中,框架算子并不直接依赖底层运行时实现,而是通过后端算子层进行统一映射。 +后续扩展 +~~~~~~~~~~~~~~~~~ + +在当前单算子执行路径稳定的基础上,Ascend Backend 将逐步扩展算子支持范围与执行模式。 + +添加新算子的方法 +~~~~~~~~~~~~~~~~~ +Step 1:确认 ATB 支持与算子约束 +-------------------------------- +- 确认 ATB 是否支持目标算子类型及对应参数结构 + +Step 2:实现 Ascend 算子类 +-------------------------------- +- 在 Ascend 后端中定义算子类 +- 实现统一的算子生命周期:reshape → setup → forward +- 在 forward 阶段调用 ATB 单算子完成执行 + +Step 3:注册算子并接入调度链路 +-------------------------------- +- 将新算子注册到 AscendBackend并完成相关适配 + +Step 4:测试与验证 +-------------------------------- +- 构建最小示例在 Ascend 设备上运行新算子 +- 与 CPU 参考结果对比,验证计算正确性 + +算子计算结果测试 +~~~~~~~~~~~~~~~~~ +- 基准结果:在 CPU/参考实现上运行同一输入,获得期望输出。 +- 输入构造:覆盖典型维度与边界尺寸;固定随机种子,避免非确定性。 +- 误差度量:按 dtype 选择误差标准,如浮点用相对/绝对误差(rtol/atol),整型用全相等。 +- 数据搬运:确保 Host→Device / Device→Host 拷贝后再次比对,排查搬运或对齐问题。 + +内存与数据管理 +-------------- +- AscendMemoryManager(单例):按设备创建独立内存池,当前通过 `aclrtGetDeviceCount` 为每个 device 分配池。 +- AscendMemoryPool:预分配一个较大的空间,`aclrtMalloc(..., ACL_MEM_MALLOC_HUGE_FIRST)` 获取内存,维护 base/cur 指针与剩余空间。 +- 块分配策略:首先从 32B 对齐,优先从有空间 的 free_blocks 复用,若没有可用的则在池内线性切分(64B 对齐),并返回递增 block id。 +- 线程安全:分配/释放/取指针均持锁。 +- 多设备调度:通过当前 device id 选择对应内存池,确保多卡环境下内存隔离。 + +性能与调优 +---------- +- 内存池复用:通过 AscendMemoryPool 预分配大块显存,并在线性切分/复用 block,减少频繁 aclrtMalloc/aclrtFree 带来的碎片与性能开销。 +- 对齐与访问:按 32B/64B 对齐划分内存块,兼顾 ATB/ACL 对齐需求与访存效率。 +- 执行路径简化:当前以单算子执行链路为主,重点验证端到端正确性与内存/数据通路的稳定性,为后续多算子/多流并行奠定基础。 +- 日志观测:通过 AscendCommon 与统一日志系统记录内存分配、算子执行等关键行为,用于简单的性能和资源使用分析。 + +测试与验证 +---------- +- 结果正确性测试:在 CPU 或参考实现上计算结果,在 Ascend Backend 上运行相同输入,对比数值。 +- 时间测试:针对算子构不同步骤,记录执行时间。 +- 端到端验证:在示例工程中跑通完整链路,同时观察输出结果与耗时,确保调度、内存池和运行时组合下行为稳定。 + +后续扩展 +-------- +- 支持更多算子。 +- 更完整的 profiling 与可视化。 +- 上下文/图级缓存,减少重复创建。 \ No newline at end of file From e9e684256c427e486e387640a3fbbc049d2e8a08 Mon Sep 17 00:00:00 2001 From: Your 
Name Date: Tue, 16 Dec 2025 16:31:25 +0800 Subject: [PATCH 09/16] fix(ascend): add result validation and timing measurement --- examples/ascend_add_demo/main.cpp | 87 ++++++---------- mllm/backends/ascend/AscendCommon.cpp | 98 ++++++++++++++++++- mllm/backends/ascend/AscendCommon.hpp | 41 +++++++- mllm/backends/ascend/ops/AscnedElewiseOps.cpp | 6 +- 4 files changed, 169 insertions(+), 63 deletions(-) diff --git a/examples/ascend_add_demo/main.cpp b/examples/ascend_add_demo/main.cpp index b9704fb5b..216591b2d 100644 --- a/examples/ascend_add_demo/main.cpp +++ b/examples/ascend_add_demo/main.cpp @@ -57,63 +57,36 @@ int main() { )[0]; std::cout << " ✓ Add operation completed\n" << std::endl; - std::cout << "\n5. Copying result from NPU to CPU for verification..." << std::endl; - std::vector z_data_fp16(batch * size); - - auto ret = aclrtMemcpy( - z_data_fp16.data(), batch * size * sizeof(half_float::half), - z_ascend.ptr(), z_ascend.bytes(), - ACL_MEMCPY_DEVICE_TO_HOST - ); - if (ret != ACL_SUCCESS) { - std::cerr << " ✗ Failed to copy result back to CPU: ACL error " << ret << std::endl; - x_handle.release(); - y_handle.release(); - return 1; - } - - std::vector result(batch * size); - for (size_t i = 0; i < result.size(); ++i) { - result[i] = static_cast(z_data_fp16[i]); - } - - std::cout << " ✓ Result copied to CPU\n" << std::endl; - - std::cout << "6. Verifying results..." 
<< std::endl; - std::cout << " Actual result: ["; - for (size_t i = 0; i < result.size(); ++i) { - std::cout << result[i]; - if (i < result.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - std::cout << " Expected result: ["; - for (size_t i = 0; i < expected.size(); ++i) { - std::cout << expected[i]; - if (i < expected.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - bool correct = true; - const float tolerance = 0.1f; - - for (size_t i = 0; i < result.size(); ++i) { - float diff = std::abs(result[i] - expected[i]); - if (diff > tolerance) { - correct = false; - std::cout << " ✗ Mismatch at index " << i - << ": expected " << expected[i] - << ", got " << result[i] - << " (diff: " << diff << ")" << std::endl; - } - } - - if (correct) { - std::cout << "\n✓✓✓ Test PASSED! All values match expected results. ✓✓✓" << std::endl; - } else { - std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. ✗✗✗" << std::endl; - } - + std::cout << "\n5. Copying result from NPU to CPU for verification..." << std::endl; + std::vector actual; + bool correct = ascend::verifyAscendTensor( + z_ascend, + expected, + /*atol=*/1e-2f, + /*rtol=*/1e-2f, + /*verbose=*/true, + &actual); + + std::cout << " Actual result: ["; + for (size_t i = 0; i < actual.size(); ++i) { + std::cout << actual[i]; + if (i < actual.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + std::cout << " Expected result: ["; + for (size_t i = 0; i < expected.size(); ++i) { + std::cout << expected[i]; + if (i < expected.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + if (correct) { + std::cout << "\n✓✓✓ Test PASSED! All values match expected results. ✓✓✓" << std::endl; + } else { + std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. 
✗✗✗" << std::endl; + } + x_handle.release(); y_handle.release(); diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index d98eb29de..98d4d82f0 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -3,9 +3,13 @@ #include "mllm/backends/ascend/AscendCommon.hpp" -#include +#include +#include +#include #include +#include #include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/core/DataTypes.hpp" namespace mllm::ascend { @@ -73,7 +77,97 @@ AscendTensorHandle prepareAscendTensor(const std::vector& host_data, MLLM_ACL_CHECK(ret); } - return AscendTensorHandle(std::move(tensor), -1); + return {std::move(tensor), -1}; +} + +std::vector copyAscendTensorToHost(const Tensor& t) { + MLLM_RT_ASSERT(t.dtype() == kFloat16); + syncGlobalAtbStream(); + + const size_t elem_cnt = t.numel(); + std::vector device_fp16(elem_cnt); + + auto ret = aclrtMemcpy( + device_fp16.data(), elem_cnt * sizeof(half_float::half), + t.ptr(), t.bytes(), + ACL_MEMCPY_DEVICE_TO_HOST); + MLLM_ACL_CHECK(ret); + + std::vector host(elem_cnt); + for (size_t i = 0; i < elem_cnt; ++i) { + host[i] = static_cast(device_fp16[i]); + } + return host; +} + +bool verifyAscendTensor(const Tensor& t, + const std::vector& expected, + float atol, + float rtol, + bool verbose, + std::vector* actual_out) { + auto actual = copyAscendTensorToHost(t); + if (actual_out != nullptr) { + *actual_out = actual; + } + + if (actual.size() != expected.size()) { + if (verbose) { + std::cout << "[AscendVerify] size mismatch: actual " << actual.size() + << " vs expected " << expected.size() << "\n"; + } + return false; + } + + bool ok = true; + for (size_t i = 0; i < actual.size(); ++i) { + const float diff = std::abs(actual[i] - expected[i]); + const float thr = atol + rtol * std::abs(expected[i]); + if (diff > thr) { + ok = false; + if (verbose) { + std::cout << "[AscendVerify] idx " << i + << " expected " << expected[i] + << " got " 
<< actual[i] + << " diff " << diff + << " thr " << thr << "\n"; + } + } + } + + if (verbose) { + std::cout << (ok ? "[AscendVerify] OK" : "[AscendVerify] FAIL") << "\n"; + } + return ok; +} + +bool verifyAscendTensor(const Tensor& t, + const RefFn& ref_fn, + float atol, + float rtol, + bool verbose, + std::vector* actual_out) { + auto expected = ref_fn(); + return verifyAscendTensor(t, expected, atol, rtol, verbose, actual_out); +} + +AscendTimer::AscendTimer(const char* tag, bool sync_before, bool sync_after) + : tag_(tag), + sync_before_(sync_before), + sync_after_(sync_after) { + if (sync_before_) { + syncGlobalAtbStream(); + } + start_ = std::chrono::high_resolution_clock::now(); +} + +AscendTimer::~AscendTimer() { + if (sync_after_) { + syncGlobalAtbStream(); + } + const auto end = std::chrono::high_resolution_clock::now(); + const double ms = std::chrono::duration(end - start_).count(); + std::cout << "[AscendTimer] " << tag_ << " : " << ms << " ms\n"; } atb::Context* getGlobalAtbContext() { diff --git a/mllm/backends/ascend/AscendCommon.hpp b/mllm/backends/ascend/AscendCommon.hpp index 7b7cea8ec..8d74c8707 100644 --- a/mllm/backends/ascend/AscendCommon.hpp +++ b/mllm/backends/ascend/AscendCommon.hpp @@ -3,16 +3,17 @@ #pragma once +#include #include #include +#include #include #include #include -#include "mllm/core/DataTypes.hpp" #include "mllm/core/Tensor.hpp" -#include "mllm/utils/Common.hpp" +#include "mllm/utils/Common.hpp" // IWYU pragma: keep // Ascend ACL error checking macro #define MLLM_ACL_CHECK(err) \ @@ -93,4 +94,40 @@ AscendTensorHandle prepareAscendTensor(const std::vector& host_data, int batch, int size); +// Copy Ascend tensor to host as float (currently assumes FP16 tensor data). +std::vector copyAscendTensorToHost(const Tensor& t); + +// Verify Ascend tensor against expected values. 
+bool verifyAscendTensor(const Tensor& t, + const std::vector& expected, + float atol = 1e-2f, + float rtol = 1e-2f, + bool verbose = true, + std::vector* actual_out = nullptr); + +using RefFn = std::function()>; +bool verifyAscendTensor(const Tensor& t, + const RefFn& ref_fn, + float atol = 1e-2f, + float rtol = 1e-2f, + bool verbose = true, + std::vector* actual_out = nullptr); + +// RAII timer for measuring scoped durations (optionally syncs the global stream). +class AscendTimer { + public: + explicit AscendTimer(const char* tag, bool sync_before = true, bool sync_after = true); + ~AscendTimer(); + + private: + const char* tag_; + bool sync_before_; + bool sync_after_; + std::chrono::high_resolution_clock::time_point start_; +}; + +// Convenience macros for scoped timing. +#define ASCEND_TIME_SCOPE(tag) ::mllm::ascend::AscendTimer timer_scope_##__LINE__(tag, true, true) +#define ASCEND_TIME_SCOPE_NOSYNC(tag) ::mllm::ascend::AscendTimer timer_scope_##__LINE__(tag, false, false) + } // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp index fc6fad429..f5a504dd2 100644 --- a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp +++ b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp @@ -98,8 +98,10 @@ void AscendAddOp::forward(const std::vector& inputs, std::vector mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); mem_mgr.getBlockPtr(workspace_block_id, workspace); } - - st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + { + ASCEND_TIME_SCOPE("AscendAddOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } if (st != atb::NO_ERROR) { MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Execute failed, status={}", static_cast(st)); } From 852406a68ebdf2d02d244b9d55fdb7196267de6a Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 16 Dec 2025 17:53:54 +0800 Subject: [PATCH 10/16] fix(ascend): use the common code 
path of setup --- mllm/backends/ascend/ops/AscnedElewiseOps.cpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp index f5a504dd2..4972e0cb5 100644 --- a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp +++ b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp @@ -20,18 +20,19 @@ namespace mllm::ascend { AscendAddOp::AscendAddOp(const aops::AddOpOptions& options) : aops::AddOp(options) {} void AscendAddOp::setup(const std::vector& inputs, std::vector& outputs) { - for (auto& t : outputs) { - if (!t.isNil()) { - auto& mem_mgr = getAscendMemoryManager(); - int block_id = -1; - void* device_ptr = nullptr; + // for (auto& t : outputs) { + // if (!t.isNil()) { + // auto& mem_mgr = getAscendMemoryManager(); + // int block_id = -1; + // void* device_ptr = nullptr; - mem_mgr.allocateBlock(static_cast(t.bytes()), block_id); - mem_mgr.getBlockPtr(block_id, device_ptr); + // mem_mgr.allocateBlock(static_cast(t.bytes()), block_id); + // mem_mgr.getBlockPtr(block_id, device_ptr); - t.impl()->storage()->ptr_ = device_ptr; - } - } + // t.impl()->storage()->ptr_ = device_ptr; + // } + // } + BaseOp::setup(inputs, outputs); } void AscendAddOp::forward(const std::vector& inputs, std::vector& outputs) { From f82e1acdaf321cb3b4557ddd9d4e48aac49c05ff Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 20 Dec 2025 15:53:13 +0800 Subject: [PATCH 11/16] fix(ascend): fix some problem of document --- docs/ascend_backend/core_design.rst | 102 +++++++++++++++------------- docs/ascend_backend/index.rst | 9 +++ docs/index.rst | 5 ++ 3 files changed, 67 insertions(+), 49 deletions(-) create mode 100644 docs/ascend_backend/index.rst diff --git a/docs/ascend_backend/core_design.rst b/docs/ascend_backend/core_design.rst index 592089736..db835823e 100644 --- a/docs/ascend_backend/core_design.rst +++ b/docs/ascend_backend/core_design.rst @@ -1,5 +1,5 @@ -Ascend Backend 设计概述 
-==================== +Ascend Backend +======================== 总览 ---- @@ -15,47 +15,50 @@ Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端 架构组件 -------- -.. code-block:: text - -┌─────────────────────────────────────────────────────────────┐ -│ MLLM 框架 │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ 模块 │ │ 层 │ │ 调度器 │ │ -│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ -└─────────┼─────────────────┼─────────────────┼───────────────┘ - │ │ │ - └─────────────────┴─────────────────┘ - │ -┌──────────────────────────────────────────────────────────────┐ -│ Ascend 后端基础设施 │ -│ │ -│ ┌────────────────────────────────────────────────────────┐ │ -│ │ AscendBackend(核心管理) │ │ -│ │ - 设备/算子注册 - 分配器绑定 │ │ -│ │ - 设备信息日志 │ │ -│ └─────────┬──────────────────────────────────────────────┘ │ -│ │ │ -│ ┌─────────┴──────────┬──────────────┬─────────────────┐ │ -│ │ │ │ │ │ -│ ▼ ▼ ▼ ▼ │ -│ AscendDispatcher AscendAllocator Ascend Ops AscendCommon │ -│ (执行:算子/ MemoryManager (目前是add (共用代码) │ -│ 模块任务) (内存池) 未来图执行) │ -│ │ -│ │ -│ │ -└────────────────────────────┬─────────────────────────────────┘ - │ -┌────────────────────────────▼──────────────────────────────────┐ -│ Ascend runtime │ -│ ┌──────────────┐ ┌──────────────┐ ┌─────────────────┐ │ -│ │ ATB 上下文 │ │ ACL 流 │ │ ATB/ACL 接口 │ │ -│ └──────────────┘ └──────────────┘ └─────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────────────────────┐ │ -│ │ Ascend NPU 硬件(orangepi ai pro) │ │ -│ └──────────────────────────────────────────────────────┘ │ -└───────────────────────────────────────────────────────────────┘ + +架构图如下: + +:: + + ┌─────────────────────────────────────────────────────────────┐ + │ MLLM 框架 │ + │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ + │ │ 模块 │ │ 层 │ │ 调度器 │ │ + │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ + └─────────┼─────────────────┼─────────────────┼───────────────┘ + │ │ │ + └─────────────────┴─────────────────┘ + │ + 
┌──────────────────────────────────────────────────────────────┐ + │ Ascend 后端基础设施 │ + │ │ + │ ┌────────────────────────────────────────────────────────┐ │ + │ │ AscendBackend(核心管理) │ │ + │ │ - 设备/算子注册 - 分配器绑定 │ │ + │ │ - 设备信息日志 │ │ + │ └─────────┬──────────────────────────────────────────────┘ │ + │ │ │ + │ ┌─────────┴──────────┬──────────────┬─────────────────┐ │ + │ │ │ │ │ │ + │ ▼ ▼ ▼ ▼ │ + │ AscendDispatcher AscendAllocator Ascend Ops AscendCommon │ + │ (执行:算子/ MemoryManager (目前是add (共用代码) │ + │ 模块任务) (内存池) 未来图执行) │ + │ │ + │ │ + │ │ + └────────────────────────────┬─────────────────────────────────┘ + │ + ┌────────────────────────────▼──────────────────────────────────┐ + │ Ascend runtime │ + │ ┌──────────────┐ ┌──────────────┐ ┌─────────────────┐ │ + │ │ ATB 上下文 │ │ ACL 流 │ │ ATB/ACL 接口 │ │ + │ └──────────────┘ └──────────────┘ └─────────────────┘ │ + │ │ + │ ┌──────────────────────────────────────────────────────┐ │ + │ │ Ascend NPU 硬件(orangepi ai pro) │ │ + │ └──────────────────────────────────────────────────────┘ │ + └───────────────────────────────────────────────────────────────┘ 关键模块 -------- @@ -95,7 +98,6 @@ Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端 执行流程(单算子路径) ---------------------- -.. code-block:: text 1. 
Ascend Backend 初始化 - Context 注册 Backend、Allocator、Dispatcher @@ -116,16 +118,18 @@ Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端 - Tensor 资源释放 算子支持与映射 --------------------------- +-------------- 支持的算子 ~~~~~~~~~~~~~~~~~~~ 当前版本的 Ascend Backend 以验证端到端执行链路为目标,实现了基于 ATB 的 **Add 算子** 支持。 + 算子映射策略 ~~~~~~~~~~~~ 在 Ascend Backend 中,框架算子并不直接依赖底层运行时实现,而是通过后端算子层进行统一映射。 + 后续扩展 ~~~~~~~~~~~~~~~~~ @@ -134,21 +138,21 @@ Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端 添加新算子的方法 ~~~~~~~~~~~~~~~~~ Step 1:确认 ATB 支持与算子约束 --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - 确认 ATB 是否支持目标算子类型及对应参数结构 Step 2:实现 Ascend 算子类 --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - 在 Ascend 后端中定义算子类 - 实现统一的算子生命周期:reshape → setup → forward - 在 forward 阶段调用 ATB 单算子完成执行 Step 3:注册算子并接入调度链路 --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - 将新算子注册到 AscendBackend并完成相关适配 Step 4:测试与验证 --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^ - 构建最小示例在 Ascend 设备上运行新算子 - 与 CPU 参考结果对比,验证计算正确性 diff --git a/docs/ascend_backend/index.rst b/docs/ascend_backend/index.rst new file mode 100644 index 000000000..fa97f15a1 --- /dev/null +++ b/docs/ascend_backend/index.rst @@ -0,0 +1,9 @@ +Ascend Backend +==================== + +.. toctree:: + :maxdepth: 2 + + core_design + + diff --git a/docs/index.rst b/docs/index.rst index 8cdcc187f..1f06ef487 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -351,6 +351,11 @@ Documents cpu_backend/index +.. toctree:: + :maxdepth: 2 + + ascend_backend/index + .. 
toctree:: :maxdepth: 2 From beeabedc6daad0908bd106b0c4bfd92e674ab75b Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 20 Dec 2025 15:55:20 +0800 Subject: [PATCH 12/16] feat(ascend): add a X2X op for transmitting tensor from cpu to npu or from npu to cpu --- mllm/backends/ascend/ops/AscendX2XOp.cpp | 94 ++++++++++++++++++++++++ mllm/backends/ascend/ops/AscendX2XOp.hpp | 27 +++++++ 2 files changed, 121 insertions(+) create mode 100644 mllm/backends/ascend/ops/AscendX2XOp.cpp create mode 100644 mllm/backends/ascend/ops/AscendX2XOp.hpp diff --git a/mllm/backends/ascend/ops/AscendX2XOp.cpp b/mllm/backends/ascend/ops/AscendX2XOp.cpp new file mode 100644 index 000000000..daa875a60 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendX2XOp.cpp @@ -0,0 +1,94 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/ascend/ops/AscendX2XOp.hpp" + +#include +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/core/DeviceTypes.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendX2XOp::AscendX2XOp(const aops::X2XOpOptions& options) : aops::X2XOp(options) {} + +void AscendX2XOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 1); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& input = inputs[0]; + auto& output = outputs[0]; + + const DeviceTypes input_device = input.device(); + const DeviceTypes output_device = output.device(); + + // Case 1: CPU -> Ascend + if (input_device == kCPU && output_device == kAscend) { + const size_t data_size = input.bytes(); + const void* src_data = input.ptr(); + void* dst_data = output.ptr(); + + // Copy data from CPU to Ascend device + auto ret = aclrtMemcpy( + dst_data, data_size, + src_data, data_size, + ACL_MEMCPY_HOST_TO_DEVICE); + + if (ret != ACL_SUCCESS) { + MLLM_ACL_CHECK(ret); + } + + syncGlobalAtbStream(); + return; + } + + // Case 2: Ascend 
-> CPU + if (input_device == kAscend && output_device == kCPU) { + const size_t data_size = input.bytes(); + const void* src_data = input.ptr(); + void* dst_data = output.ptr(); + + // Copy data from Ascend device to CPU + auto ret = aclrtMemcpy( + dst_data, data_size, + src_data, data_size, + ACL_MEMCPY_DEVICE_TO_HOST); + + if (ret != ACL_SUCCESS) { + MLLM_ACL_CHECK(ret); + } + + syncGlobalAtbStream(); + return; + } + + // Case 3: Ascend -> Ascend (same device, just copy pointer or do memcpy) + if (input_device == kAscend && output_device == kAscend) { + const size_t data_size = input.bytes(); + const void* src_data = input.ptr(); + void* dst_data = output.ptr(); + + if (src_data != dst_data) { + auto ret = aclrtMemcpy( + dst_data, data_size, + src_data, data_size, + ACL_MEMCPY_DEVICE_TO_DEVICE); + + if (ret != ACL_SUCCESS) { + MLLM_ACL_CHECK(ret); + } + + syncGlobalAtbStream(); + } + return; + } + + MLLM_ERROR("AscendX2XOp only supports transform between CPU and Ascend devices. " + "Input device: {}, Output device: {}", + static_cast(input_device), static_cast(output_device)); +} + +} // namespace mllm::ascend + diff --git a/mllm/backends/ascend/ops/AscendX2XOp.hpp b/mllm/backends/ascend/ops/AscendX2XOp.hpp new file mode 100644 index 000000000..bfe9364a3 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendX2XOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/X2XOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendX2XOp final : public aops::X2XOp { + public: + explicit AscendX2XOp(const aops::X2XOpOptions& options); + + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendX2XOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::X2XOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend + From add14d177d916cf57469d8aa5f809e783fd6e45d Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 20 Dec 2025 15:59:39 +0800 Subject: [PATCH 13/16] fix(ascend): create the test part --- mllm/backends/ascend/AscendBackend.cpp | 2 + mllm/backends/ascend/AscendCommon.cpp | 19 +++++----- tests/CMakeLists.txt | 3 ++ tests/ascend/AscendKernelTest.hpp | 52 ++++++++++++++++++++++++++ tests/ascend/CMakeLists.txt | 26 +++++++++++++ tests/ascend/KernelTest.cpp | 45 ++++++++++++++++++++++ tests/ascend/KernelTestHelper.hpp | 18 +++++++++ 7 files changed, 155 insertions(+), 10 deletions(-) create mode 100644 tests/ascend/AscendKernelTest.hpp create mode 100644 tests/ascend/CMakeLists.txt create mode 100644 tests/ascend/KernelTest.cpp create mode 100644 tests/ascend/KernelTestHelper.hpp diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp index 408cb2518..5ec76413a 100644 --- a/mllm/backends/ascend/AscendBackend.cpp +++ b/mllm/backends/ascend/AscendBackend.cpp @@ -7,11 +7,13 @@ #include "mllm/core/DeviceTypes.hpp" #include "mllm/backends/ascend/ops/AscendElewiseOps.hpp" +#include "mllm/backends/ascend/ops/AscendX2XOp.hpp" namespace mllm::ascend { AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) { regOpFactory(); + regOpFactory(); auto& devices = AscendDeviceMetaInfo::instance().devices; for (const auto& device : devices) { const auto bytes_to_mb = 
[](size_t bytes) { return bytes / (1024.0 * 1024.0); }; diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index 98d4d82f0..140a5a31e 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -81,21 +81,20 @@ AscendTensorHandle prepareAscendTensor(const std::vector& host_data, } std::vector copyAscendTensorToHost(const Tensor& t) { + // Current implementation assumes FP16 tensor on Ascend. MLLM_RT_ASSERT(t.dtype() == kFloat16); - syncGlobalAtbStream(); - const size_t elem_cnt = t.numel(); - std::vector device_fp16(elem_cnt); - - auto ret = aclrtMemcpy( - device_fp16.data(), elem_cnt * sizeof(half_float::half), - t.ptr(), t.bytes(), - ACL_MEMCPY_DEVICE_TO_HOST); - MLLM_ACL_CHECK(ret); + // Use generic .to(kCPU) + CPU-side cast instead of raw aclrtMemcpy. + // This goes through the X2X op we implemented for Ascend, keeping + // all device transfer logic in one place. + auto cpu_tensor = const_cast(t).to(::mllm::kCPU); + const size_t elem_cnt = cpu_tensor.numel(); std::vector host(elem_cnt); + + auto* src = cpu_tensor.ptr(); for (size_t i = 0; i < elem_cnt; ++i) { - host[i] = static_cast(device_fp16[i]); + host[i] = static_cast(src[i]); } return host; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a83123e25..89eaeb49c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,3 +11,6 @@ add_subdirectory(cpu) if(MLLM_BUILD_CUDA_BACKEND) add_subdirectory(cuda) endif() +if(DEFINED ENV{ASCEND_HOME_PATH}) + add_subdirectory(ascend) +endif() diff --git a/tests/ascend/AscendKernelTest.hpp b/tests/ascend/AscendKernelTest.hpp new file mode 100644 index 000000000..138ee5ae8 --- /dev/null +++ b/tests/ascend/AscendKernelTest.hpp @@ -0,0 +1,52 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "KernelTestHelper.hpp" + +#include + +class AscendKernelTest : public KernelTest { + public: + AscendKernelTest() = default; + ~AscendKernelTest() override = default; + + // Test Add operation with different shapes + bool AddFloat16Test(const std::vector& shapes) { + using namespace mllm; // NOLINT + for (auto& shape : shapes) { + // 1. Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + + // 2. Compute reference result (FP16) on CPU + Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* y_ptr = y_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + auto num_elements = x_cpu.numel(); + for (size_t i = 0; i < num_elements; ++i) { + r_ptr[i] = x_ptr[i] + y_ptr[i]; + } + } + + // 3. Move inputs to Ascend and run Add (z = x + y) + auto x_ascend = x_cpu.to(kAscend); + auto y_ascend = y_cpu.to(kAscend); + auto z_ascend = x_ascend + y_ascend; + + // 4. 
Move result back to CPU and compare with reference using allClose + auto z_cpu = z_ascend.to(kCPU); + auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + return false; + } + } + return true; + } +}; + diff --git a/tests/ascend/CMakeLists.txt b/tests/ascend/CMakeLists.txt new file mode 100644 index 000000000..0025fb40b --- /dev/null +++ b/tests/ascend/CMakeLists.txt @@ -0,0 +1,26 @@ +add_executable(Mllm-Test-AscendKernel KernelTest.cpp) + +if(DEFINED ENV{ASCEND_HOME_PATH}) + target_include_directories(Mllm-Test-AscendKernel PRIVATE $ENV{ASCEND_HOME_PATH}/include) + target_link_directories(Mllm-Test-AscendKernel PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) +endif() + +target_link_libraries(Mllm-Test-AscendKernel PRIVATE + gtest_main + MllmRT + MllmCPUBackend + MllmAscendBackend + ascendcl +) + +target_include_directories(Mllm-Test-AscendKernel PRIVATE + ${CMAKE_SOURCE_DIR} +) + +set_target_properties(Mllm-Test-AscendKernel PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON +) + +include(GoogleTest) + diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp new file mode 100644 index 000000000..025c89994 --- /dev/null +++ b/tests/ascend/KernelTest.cpp @@ -0,0 +1,45 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include + +#include "mllm/mllm.hpp" + +/// Kernel tests +#include "AscendKernelTest.hpp" + +//===----------------------------------------------------------------------===// +// Element wise ADD. 
+// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +TEST_F(AscendKernelTest, AddFloat16) { + EXPECT_EQ(AddFloat16Test({ + {2, 3}, + {1, 1}, + {4, 4}, + {8, 8}, + {16, 16}, + {32, 32}, + }), + true); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // Initialize Ascend backend + mllm::initAscendBackend(); + + // Initialize context + mllm::initializeContext(); + + auto ret = RUN_ALL_TESTS(); + + // Cleanup + mllm::memoryReport(); + mllm::shutdownContext(); + + return ret; +} + diff --git a/tests/ascend/KernelTestHelper.hpp b/tests/ascend/KernelTestHelper.hpp new file mode 100644 index 000000000..03a9f86f2 --- /dev/null +++ b/tests/ascend/KernelTestHelper.hpp @@ -0,0 +1,18 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. +#pragma once + +#include + +class KernelTest : public testing::Test { + public: + KernelTest() = default; + ~KernelTest() override = default; + + // If the constructor and destructor are not enough for setting up + // and cleaning up each test, you can define the following methods: + void SetUp() override {} + + void TearDown() override {} +}; + From 6b88b8d36b7db038954732a57b9c8f4c7c4e88b8 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 20 Dec 2025 16:12:27 +0800 Subject: [PATCH 14/16] fix(ascend): move to the test folder --- examples/CMakeLists.txt | 1 - examples/ascend_add_demo/CMakeLists.txt | 22 ------ examples/ascend_add_demo/main.cpp | 100 ------------------------ 3 files changed, 123 deletions(-) delete mode 100644 examples/ascend_add_demo/CMakeLists.txt delete mode 100644 examples/ascend_add_demo/main.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a143a7989..6c49cfb22 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,4 +20,3 @@ endif() if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE) add_subdirectory(qwen3_qnn_aot) endif() - diff --git 
a/examples/ascend_add_demo/CMakeLists.txt b/examples/ascend_add_demo/CMakeLists.txt deleted file mode 100644 index 15eaefd7c..000000000 --- a/examples/ascend_add_demo/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -add_executable(ascend_add_demo main.cpp) - -if(DEFINED ENV{ASCEND_HOME_PATH}) - target_include_directories(ascend_add_demo PRIVATE $ENV{ASCEND_HOME_PATH}/include) - target_link_directories(ascend_add_demo PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) -endif() - -target_link_libraries(ascend_add_demo PRIVATE - MllmRT - MllmAscendBackend - ascendcl # 添加 ACL 库,因为 main.cpp 中直接使用了 aclrtMemcpy -) - -set_target_properties(ascend_add_demo PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON -) - -target_include_directories(ascend_add_demo PRIVATE - ${CMAKE_SOURCE_DIR} -) - diff --git a/examples/ascend_add_demo/main.cpp b/examples/ascend_add_demo/main.cpp deleted file mode 100644 index 216591b2d..000000000 --- a/examples/ascend_add_demo/main.cpp +++ /dev/null @@ -1,100 +0,0 @@ -#include -#include -#include -#include -#include "mllm/mllm.hpp" -#include "mllm/backends/ascend/AscendCommon.hpp" -#include "mllm/core/Tensor.hpp" -#include "mllm/engine/Context.hpp" -#include "mllm/core/aops/ElewiseOps.hpp" -#include "mllm/core/OpTypes.hpp" - -using namespace mllm; - -int main() { - std::cout << "=== Ascend Add Op Demo ===" << std::endl; - - try { - std::cout << "1. Initializing Ascend backend..." << std::endl; - initAscendBackend(); - std::cout << " ✓ Ascend backend initialized\n" << std::endl; - - std::cout << "2. Preparing test data..." 
<< std::endl; - const int batch = 2; - const int size = 3; - std::vector data_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - std::vector data_y = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f}; - std::vector expected = {11.0f, 22.0f, 33.0f, 44.0f, 55.0f, 66.0f}; - - std::cout << " Input X: ["; - for (size_t i = 0; i < data_x.size(); ++i) { - std::cout << data_x[i]; - if (i < data_x.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - std::cout << " Input Y: ["; - for (size_t i = 0; i < data_y.size(); ++i) { - std::cout << data_y[i]; - if (i < data_y.size() - 1) std::cout << ", "; - } - std::cout << "]\n" << std::endl; - - std::cout << "3. Preparing tensors on Ascend..." << std::endl; - auto x_handle = ascend::prepareAscendTensor(data_x, batch, size); - auto y_handle = ascend::prepareAscendTensor(data_y, batch, size); - auto& x_ascend = x_handle.tensor(); - auto& y_ascend = y_handle.tensor(); - std::cout << " ✓ Tensors ready on Ascend device\n" << std::endl; - - std::cout << "4. Executing Add operation on Ascend..." << std::endl; - auto& ctx = Context::instance(); - std::cout << "context over" < actual; - bool correct = ascend::verifyAscendTensor( - z_ascend, - expected, - /*atol=*/1e-2f, - /*rtol=*/1e-2f, - /*verbose=*/true, - &actual); - - std::cout << " Actual result: ["; - for (size_t i = 0; i < actual.size(); ++i) { - std::cout << actual[i]; - if (i < actual.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - std::cout << " Expected result: ["; - for (size_t i = 0; i < expected.size(); ++i) { - std::cout << expected[i]; - if (i < expected.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - if (correct) { - std::cout << "\n✓✓✓ Test PASSED! All values match expected results. ✓✓✓" << std::endl; - } else { - std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. ✗✗✗" << std::endl; - } - - x_handle.release(); - y_handle.release(); - - return correct ? 
0 : 1; - - } catch (const std::exception& e) { - std::cerr << "\n✗ Error: " << e.what() << std::endl; - return 1; - } -} - From 2d25eda619121baf475f67db7f49cdbda7d7db2e Mon Sep 17 00:00:00 2001 From: Chenghua <68260701+chenghuaWang@users.noreply.github.com> Date: Tue, 23 Dec 2025 13:56:44 +0800 Subject: [PATCH 15/16] Update build_arm_ascend.yaml --- tasks/build_arm_ascend.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/build_arm_ascend.yaml b/tasks/build_arm_ascend.yaml index 4546f2d81..17ffd3f10 100644 --- a/tasks/build_arm_ascend.yaml +++ b/tasks/build_arm_ascend.yaml @@ -1,7 +1,7 @@ Tasks: - CMakeConfigTask: cmake_cfg_path: "build-arm-ascend" - cmake_build_type: "Release" + cmake_build_type: "RelWithDebInfo" cmake_extra_args: - "-DMLLM_CROSS_COMPILE=ON" - "-DMLLM_BUILD_ARM_BACKEND=ON" @@ -9,7 +9,7 @@ Tasks: - "-DANDROID_PLATFORM=android-28" - "-DANDROID_ABI=arm64-v8a" - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=armv8.2-a+fp16+fp16fml+dotprod+i8mm;-ffast-math;-Wno-nan-infinity-disabled"' - - "-DCMAKE_INSTALL_PREFIX=/root/mllm-install-android-arm64-v8a" + - "-DCMAKE_INSTALL_PREFIX=./mllm-install-android-arm64-v8a" - "-DMLLM_KERNEL_USE_THREADS=ON" - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=ON" - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF" From 2fa8eed23877644f62dfd69ea847e3f482529828 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 23 Dec 2025 16:56:02 +0800 Subject: [PATCH 16/16] fix(ascend): address review comments --- CMakeLists.txt | 4 ++++ mllm/backends/ascend/CMakeLists.txt | 13 +++++-------- .../{AscnedElewiseOps.cpp => AscendElewiseOps.cpp} | 12 ------------ mllm/backends/cpu/kernels/arm/rmsnorm.cpp | 4 ++-- mllm/backends/cpu/kernels/arm/softmax.cpp | 8 ++++---- tests/CMakeLists.txt | 2 +- tests/ascend/CMakeLists.txt | 1 - tests/ascend/KernelTest.cpp | 6 +++--- 8 files changed, 19 insertions(+), 31 deletions(-) rename mllm/backends/ascend/ops/{AscnedElewiseOps.cpp => AscendElewiseOps.cpp} (89%) diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 221e956d5..7d167435e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,10 @@ option(MLLM_CPU_BACKEND_USE_SME2 "Enable SME2" OFF) # Ascend Backend: Options option(MLLM_ASCEND_CPU_DEBUG_MODE "Enable CPU Debug mode in ascend" OFF) +# run mode and SOC version for Ascend backend +set(MLLM_RUN_MODE "npu" CACHE STRING "Run mode for mLLM backends: cpu/sim/npu") +set(MLLM_SOC_VERSION "Ascend310B1" CACHE STRING "SOC version for Ascend backend") + # Threads option(MLLM_KERNEL_USE_THREADS "Enable Threads" ON) option(MLLM_KERNEL_USE_THREADS_VENDOR_MLLM "Enable mllm's thread pool" ON) diff --git a/mllm/backends/ascend/CMakeLists.txt b/mllm/backends/ascend/CMakeLists.txt index 6386948fd..bb0feac46 100644 --- a/mllm/backends/ascend/CMakeLists.txt +++ b/mllm/backends/ascend/CMakeLists.txt @@ -1,11 +1,8 @@ -set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") -set(SOC_VERSION "Ascend310B1" CACHE STRING "system on chip type") - if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) endif() if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) - set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) endif() message(STATUS "SOC_VERSION is ${SOC_VERSION}, RUN_MODE is ${RUN_MODE}") @@ -29,20 +26,20 @@ add_library(MllmAscendBackend SHARED ${MLLM_ASCEND_SOURCES}) if(DEFINED ENV{ASCEND_HOME_PATH}) target_include_directories(MllmAscendBackend PUBLIC $ENV{ASCEND_HOME_PATH}/include) - target_link_directories(MllmAscendBackend PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) + target_link_directories(MllmAscendBackend PUBLIC $ENV{ASCEND_HOME_PATH}/lib64) else() message(WARNING "ASCEND_HOME_PATH is not set, Ascend headers and libs may not be found") endif() if(DEFINED ENV{ATB_HOME_PATH}) 
target_include_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/include) - target_link_directories(MllmAscendBackend PRIVATE $ENV{ATB_HOME_PATH}/lib) + target_link_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/lib) else() message(WARNING "ATB_HOME_PATH not defined, ATB library will not be linked") endif() -target_link_libraries(MllmAscendBackend PRIVATE +target_link_libraries(MllmAscendBackend PUBLIC ascendcl opapi nnopbase diff --git a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp b/mllm/backends/ascend/ops/AscendElewiseOps.cpp similarity index 89% rename from mllm/backends/ascend/ops/AscnedElewiseOps.cpp rename to mllm/backends/ascend/ops/AscendElewiseOps.cpp index 4972e0cb5..762ef1dfe 100644 --- a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp +++ b/mllm/backends/ascend/ops/AscendElewiseOps.cpp @@ -20,18 +20,6 @@ namespace mllm::ascend { AscendAddOp::AscendAddOp(const aops::AddOpOptions& options) : aops::AddOp(options) {} void AscendAddOp::setup(const std::vector& inputs, std::vector& outputs) { - // for (auto& t : outputs) { - // if (!t.isNil()) { - // auto& mem_mgr = getAscendMemoryManager(); - // int block_id = -1; - // void* device_ptr = nullptr; - - // mem_mgr.allocateBlock(static_cast(t.bytes()), block_id); - // mem_mgr.getBlockPtr(block_id, device_ptr); - - // t.impl()->storage()->ptr_ = device_ptr; - // } - // } BaseOp::setup(inputs, outputs); } diff --git a/mllm/backends/cpu/kernels/arm/rmsnorm.cpp b/mllm/backends/cpu/kernels/arm/rmsnorm.cpp index 564bdd301..2c0f5e853 100644 --- a/mllm/backends/cpu/kernels/arm/rmsnorm.cpp +++ b/mllm/backends/cpu/kernels/arm/rmsnorm.cpp @@ -17,7 +17,7 @@ void rmsnorm_fp32(const mllm_fp32_t* __restrict X, const mllm_fp32_t* __restrict auto w_ptr = W; // pass 1 - const float rms = 1.f / std::sqrtf(vsquare_mean_fp32(x_ptr, D) + epsilon); + const float rms = 1.f / std::sqrt(vsquare_mean_fp32(x_ptr, D) + epsilon); // pass 2 if (add_unit_offset) { @@ -106,7 +106,7 @@ void rmsnorm_fp16(const mllm_fp16_t* 
__restrict X, const mllm_fp16_t* __restrict // pass 1: compute RMS scaling factor float mean_square = vsquare_mean_fp16(x_ptr, D); - const float rms_float = 1.f / std::sqrtf(mean_square + epsilon); + const float rms_float = 1.f / std::sqrt(mean_square + epsilon); float16_t rms_fp16 = static_cast(rms_float); float16x8_t rms_vec = vdupq_n_f16(rms_fp16); diff --git a/mllm/backends/cpu/kernels/arm/softmax.cpp b/mllm/backends/cpu/kernels/arm/softmax.cpp index d8423b943..dd9f39d56 100644 --- a/mllm/backends/cpu/kernels/arm/softmax.cpp +++ b/mllm/backends/cpu/kernels/arm/softmax.cpp @@ -25,7 +25,7 @@ void softmax_v1_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y, // Pass 2: minus max_value and calculate exp float sum = 0.f; for (int i = 0; i < len; ++i) { - auto tmp = std::expf(X[i * stride] - max_value); + auto tmp = std::exp(X[i * stride] - max_value); Y[i * stride] = tmp; sum += tmp; } @@ -112,7 +112,7 @@ void softmax_v1_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y, } float sum_value = vaddvq_f32(sum_vec_0); for (; i < len; ++i) { - float tmp = std::expf(X[i] - max_value); + float tmp = std::exp(X[i] - max_value); Y[i] = tmp; sum_value += tmp; } @@ -163,7 +163,7 @@ void softmax_v1_fp16(const mllm_fp16_t* __restrict X, mllm_fp16_t* __restrict Y, // Pass 2: minus max_value and calculate exp float sum = 0.f; for (int i = 0; i < len; ++i) { - auto tmp = std::expf(X[i * stride] - max_value); + auto tmp = std::exp(X[i * stride] - max_value); Y[i * stride] = static_cast(tmp); sum += tmp; } @@ -229,7 +229,7 @@ void softmax_v1_fp16(const mllm_fp16_t* __restrict X, mllm_fp16_t* __restrict Y, sum_vec_0 = vaddq_f32(sum_vec_0, sum_vec_2); float sum_value = vaddvq_f32(sum_vec_0); for (; i < len; ++i) { - float tmp = std::expf(X[i] - max_value); + float tmp = std::exp(X[i] - max_value); Y[i] = static_cast(tmp); sum_value += tmp; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 89eaeb49c..7e4e3642c 100644 --- a/tests/CMakeLists.txt 
+++ b/tests/CMakeLists.txt @@ -11,6 +11,6 @@ add_subdirectory(cpu) if(MLLM_BUILD_CUDA_BACKEND) add_subdirectory(cuda) endif() -if(DEFINED ENV{ASCEND_HOME_PATH}) +if(MLLM_BUILD_ASCEND_BACKEND) add_subdirectory(ascend) endif() diff --git a/tests/ascend/CMakeLists.txt b/tests/ascend/CMakeLists.txt index 0025fb40b..24944bf0b 100644 --- a/tests/ascend/CMakeLists.txt +++ b/tests/ascend/CMakeLists.txt @@ -10,7 +10,6 @@ target_link_libraries(Mllm-Test-AscendKernel PRIVATE MllmRT MllmCPUBackend MllmAscendBackend - ascendcl ) target_include_directories(Mllm-Test-AscendKernel PRIVATE diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp index 025c89994..b0489f545 100644 --- a/tests/ascend/KernelTest.cpp +++ b/tests/ascend/KernelTest.cpp @@ -28,11 +28,11 @@ TEST_F(AscendKernelTest, AddFloat16) { int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); - // Initialize Ascend backend - mllm::initAscendBackend(); - // Initialize context mllm::initializeContext(); + + // Initialize Ascend backend + mllm::initAscendBackend(); auto ret = RUN_ALL_TESTS();