UbiquitousLearning · chenghuaWang · Dec 23, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -45,6 +45,10 @@ option(MLLM_CPU_BACKEND_USE_SME2 "Enable SME2" OFF)
 # Ascend Backend: Options
 option(MLLM_ASCEND_CPU_DEBUG_MODE "Enable CPU Debug mode in ascend" OFF)
 
+# Run mode and SoC version for the Ascend backend.
+# MLLM_RUN_MODE accepts the values listed in its help string (cpu / sim / npu); the STRINGS
+# property exposes them as a drop-down in cmake-gui / ccmake instead of a free-form field.
+set(MLLM_RUN_MODE "npu" CACHE STRING "Run mode for mLLM backends: cpu/sim/npu")
+set_property(CACHE MLLM_RUN_MODE PROPERTY STRINGS cpu sim npu)
+set(MLLM_SOC_VERSION "Ascend310B1" CACHE STRING "SOC version for Ascend backend")
+
 # Threads
 option(MLLM_KERNEL_USE_THREADS "Enable Threads" ON)
 option(MLLM_KERNEL_USE_THREADS_VENDOR_MLLM "Enable mllm's thread pool" ON)

@@ -0,0 +1,191 @@
+Ascend Backend 
+========================
+
+总览
+----
+Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU，提供端到端的调度、内存管理与算子生命周期管理，使模型在 Ascend 上高效运行。
+
+设计目标
+--------
+- 统一后端：作为 mLLM 原生后端，统一接口与调度流程。
+- ATB 单算子验证：打通算子从框架到 NPU 的完整链路。
+- 生命周期管理：算子创建、准备、执行、销毁的统一抽象。
+- 内存管理：专用 Ascend 设备内存池，减少反复申请释放。
+- 扩展性：便于新增算子、执行模式和性能优化。
+
+架构组件
+--------
+
+架构图如下：
+
+::
+
+  ┌─────────────────────────────────────────────────────────────┐
+  │                    MLLM 框架                                │
+  │  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐       │
+  │  │   模块       │  │    层         │  │  调度器      │       │
+  │  └──────┬───────┘  └──────┬───────┘  └──────┬───────┘       │
+  └─────────┼─────────────────┼─────────────────┼───────────────┘
+            │                 │                 │
+            └─────────────────┴─────────────────┘
+                              │
+  ┌──────────────────────────────────────────────────────────────┐
+  │              Ascend 后端基础设施                              │
+  │                                                              │
+  │  ┌────────────────────────────────────────────────────────┐  │
+  │  │              AscendBackend（核心管理）                  │  │
+  │  │  - 设备/算子注册  - 分配器绑定                           │  │
+  │  │  - 设备信息日志                                         │  │
+  │  └─────────┬──────────────────────────────────────────────┘  │
+  │            │                                                 │
+  │  ┌─────────┴──────────┬──────────────┬─────────────────┐     │
+  │  │                    │              │                 │     │
+  │  ▼                    ▼              ▼                 ▼     │
+  │  AscendDispatcher  AscendAllocator Ascend Ops  AscendCommon  │
+  │  （执行：算子/      MemoryManager  （目前是add     （共用代码） │
+  │   模块任务）          （内存池）     未来图执行）               │
+  │                                                              │
+  │                                                              │
+  │                                                              │
+  └────────────────────────────┬─────────────────────────────────┘
+                               │
+  ┌────────────────────────────▼──────────────────────────────────┐
+  │                     Ascend runtime                            │
+  │  ┌──────────────┐  ┌──────────────┐  ┌─────────────────┐      │
+  │  │  ATB 上下文   │  │  ACL 流      │  │  ATB/ACL 接口   │      │
+  │  └──────────────┘  └──────────────┘  └─────────────────┘      │
+  │                                                               │
+  │  ┌──────────────────────────────────────────────────────┐     │
+  │  │           Ascend NPU 硬件（orangepi ai pro）          │     │
+  │  └──────────────────────────────────────────────────────┘     │
+  └───────────────────────────────────────────────────────────────┘
+
+关键模块
+--------
+
+1. mLLM 框架层
+
+框架层负责算子抽象、计算任务构建以及统一调度接口的提供。该层不依赖任何具体设备实现，仅通过 Backend 接口与底层后端交互。算子在该层被封装为可调度的任务（Task），并通过 DispatcherManager 提交给对应设备后端执行。
+
+2. Ascend 后端基础设施层
+
+该层是 Ascend Backend 的核心实现，负责承接来自框架层的算子任务，并将其映射到 Ascend 运行时执行。主要组成包括：
+
+**AscendBackend**
+
+- 后端入口与核心管理模块，负责后端注册、算子工厂管理、分配器与调度器绑定等。
+
+**AscendDispatcher**
+
+- 任务调度与执行模块，负责驱动算子按照统一的生命周期（reshape / setup / forward）执行。
+
+**AscendAllocator / AscendMemoryManager**
+
+- Ascend 设备内存管理模块，负责 Tensor 与 workspace 的分配、回收及内存池管理。
+
+**Ascend Ops / AscendCommon**
+
+- Ascend 专用算子实现及 ATB / ACL 公共工具封装，屏蔽底层运行时细节。
+
+3. Ascend Runtime 层
+
+运行时层由 Ascend CANN 提供，包含 ATB 算子库、ACL 执行接口以及执行上下文与流管理。
+
+4. Ascend 硬件层
+
+最底层为 Ascend NPU 硬件，负责实际的计算执行。
+
+执行流程（单算子路径）
+----------------------
+
+1. Ascend Backend 初始化
+
+   - Context 注册 Backend、Allocator、Dispatcher
+
+2. 输入 Tensor 准备
+
+   - Ascend Tensor 分配
+   - Host → Device 拷贝
+
+3. 构建并提交算子任务
+
+   - 创建 Ascend Op、Task
+   - 提交至 Dispatcher
+
+4. Ascend 上执行算子
+
+   - reshape、setup、forward
+   - → ATB Operation Execute
+
+5. 结果回传与资源释放
+
+   - Device → Host 拷贝验证
+   - Tensor 资源释放
+
+算子支持与映射
+--------------
+
+支持的算子
+~~~~~~~~~~~~~~~~~~~
+
+当前版本的 Ascend Backend 以验证端到端执行链路为目标，实现了基于 ATB 的 **Add 算子** 支持。
+
+算子映射策略
+~~~~~~~~~~~~
+
+在 Ascend Backend 中，框架算子并不直接依赖底层运行时实现，而是通过后端算子层进行统一映射。
+
+后续扩展
+~~~~~~~~~~~~~~~~~
+
+在当前单算子执行路径稳定的基础上，Ascend Backend 将逐步扩展算子支持范围与执行模式。
+
+添加新算子的方法
+~~~~~~~~~~~~~~~~~
+Step 1：确认 ATB 支持与算子约束
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- 确认 ATB 是否支持目标算子类型及对应参数结构
+
+Step 2：实现 Ascend 算子类
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- 在 Ascend 后端中定义算子类
+- 实现统一的算子生命周期：reshape → setup → forward
+- 在 forward 阶段调用 ATB 单算子完成执行
+
+Step 3：注册算子并接入调度链路
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- 将新算子注册到 AscendBackend 并完成相关适配
+
+Step 4：测试与验证
+^^^^^^^^^^^^^^^^^^^^^^^^
+- 构建最小示例在 Ascend 设备上运行新算子
+- 与 CPU 参考结果对比，验证计算正确性
+
+算子计算结果测试
+~~~~~~~~~~~~~~~~~
+- 基准结果：在 CPU/参考实现上运行同一输入，获得期望输出。
+- 输入构造：覆盖典型维度与边界尺寸；固定随机种子，避免非确定性。
+- 误差度量：按 dtype 选择误差标准，如浮点用相对/绝对误差（rtol/atol），整型用全相等。
+- 数据搬运：确保 Host→Device / Device→Host 拷贝后再次比对，排查搬运或对齐问题。
+
+内存与数据管理
+--------------
+- AscendMemoryManager（单例）：按设备创建独立内存池，当前通过 ``aclrtGetDeviceCount`` 获取设备数量，并为每个 device 分配一个内存池。
+- AscendMemoryPool：预分配一块较大的空间，通过 ``aclrtMalloc(..., ACL_MEM_MALLOC_HUGE_FIRST)`` 获取内存，维护 base/cur 指针与剩余空间。
+- 块分配策略：首先按 32B 对齐，优先从有空间的 free_blocks 复用，若没有可用的则在池内线性切分（64B 对齐），并返回递增的 block id。
+- 线程安全：分配/释放/取指针均持锁。
+- 多设备调度：通过当前 device id 选择对应内存池，确保多卡环境下内存隔离。
+
+性能与调优
+----------
+- 内存池复用：通过 AscendMemoryPool 预分配大块显存，并在线性切分/复用 block，减少频繁 aclrtMalloc/aclrtFree 带来的碎片与性能开销。
+- 对齐与访问：按 32B/64B 对齐划分内存块，兼顾 ATB/ACL 对齐需求与访存效率。
+- 执行路径简化：当前以单算子执行链路为主，重点验证端到端正确性与内存/数据通路的稳定性，为后续多算子/多流并行奠定基础。
+- 日志观测：通过 AscendCommon 与统一日志系统记录内存分配、算子执行等关键行为，用于简单的性能和资源使用分析。
+
+测试与验证
+----------
+- 结果正确性测试：在 CPU 或参考实现上计算结果，在 Ascend Backend 上运行相同输入，对比数值。
+- 时间测试：针对算子的不同步骤，记录执行时间。
+- 端到端验证：在示例工程中跑通完整链路，同时观察输出结果与耗时，确保调度、内存池和运行时组合下行为稳定。
+
+后续扩展
+--------
+- 支持更多算子。
+- 更完整的 profiling 与可视化。
+- 上下文/图级缓存，减少重复创建。
@@ -0,0 +1,9 @@
+Ascend Backend
+====================
+
+.. toctree::
+   :maxdepth: 2
+
+   core_design
+
+
@@ -351,6 +351,11 @@ Documents
 
    cpu_backend/index
 
+.. toctree::
+   :maxdepth: 2
+
+   ascend_backend/index
+
 .. toctree::
    :maxdepth: 2
 

diff --git a/mllm/backends/ascend/AscendAllocator.cpp b/mllm/backends/ascend/AscendAllocator.cpp
@@ -1,84 +1,108 @@
 // Copyright (c) MLLM Team.
 // Licensed under the MIT License.
 
-#ifndef ASCENDC_CPU_DEBUG
-#include <acl/acl.h>
-#else
-#include <tikicpulib.h>
-#endif
-
 #include "mllm/backends/ascend/AscendAllocator.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include <iostream>
+#include "mllm/utils/Common.hpp"
 
 namespace mllm::ascend {
 
+/// Constructs the allocator; the only work done here is emitting an informational log line.
+AscendAllocator::AscendAllocator() { MLLM_INFO("AscendAllocator created with memory pool support"); }
+
+/// Warns if any storage -> block-id mappings are still registered when the allocator is destroyed.
+/// The remaining entries are only reported; this destructor does not release them.
+AscendAllocator::~AscendAllocator() {
+  std::lock_guard<std::mutex> guard(block_map_mutex_);
+  const auto outstanding = storage_to_block_id_.size();
+  if (outstanding != 0) {
+    MLLM_WARN("AscendAllocator destroyed with {} storage blocks still allocated", outstanding);
+  }
+}
+
 bool AscendAllocator::alloc(Storage* storage) {
-#ifdef ASCENDC_CPU_DEBUG
-  storage->ptr_ = AscendC::GmAlloc(storage->size_);
-#else
-  aclrtMalloc((void**)&(storage->ptr_), storage->size_, ACL_MEM_MALLOC_HUGE_FIRST);
-#endif
-  return storage->ptr_ != nullptr;
+    auto& mem_manager = getAscendMemoryManager();
+    int block_id = -1;
+    mem_manager.allocateBlock(storage->size_, block_id);
+    if (block_id < 0) {
+        MLLM_ERROR("Failed to allocate block of size {} bytes from memory pool", storage->size_);
+        return false;
+    }
+
+    mem_manager.getBlockPtr(block_id, storage->ptr_);
+    if (storage->ptr_ == nullptr) {
+        MLLM_ERROR("Failed to get pointer for block ID {}", block_id);
+        mem_manager.freeBlock(block_id);
+        return false;
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(block_map_mutex_);
+        storage_to_block_id_[storage->ptr_] = block_id;
+    }
+
+    MLLM_INFO("Allocated storage: size={} bytes, block ID={}, ptr={}",
+               storage->size_, block_id, storage->ptr_);
+    return true;
 }
 
 bool AscendAllocator::alloc(const Storage::ptr_t& storage) {
-#ifdef ASCENDC_CPU_DEBUG
-  storage->ptr_ = AscendC::GmAlloc(storage->size_);
-#else
-  aclrtMalloc((void**)&(storage->ptr_), storage->size_, ACL_MEM_MALLOC_HUGE_FIRST);
-#endif
-  return storage->ptr_ != nullptr;
+    return alloc(storage.get());
 }
 
 void AscendAllocator::free(const Storage::ptr_t& storage) {
-#ifdef ASCENDC_CPU_DEBUG
-  AscendC::GmFree((void*)storage->ptr_);
-#else
-  aclrtFree(storage->ptr_);
-#endif
+    free(storage.get());
 }
 
 void AscendAllocator::free(Storage* storage) {
-#ifdef ASCENDC_CPU_DEBUG
-  AscendC::GmFree((void*)storage->ptr_);
-#else
-  aclrtFree(storage->ptr_);
-#endif
+    if (storage->ptr_ == nullptr) {
+        return;
+    }
+
+    int block_id = -1;
+    {
+        std::lock_guard<std::mutex> lock(block_map_mutex_);
+        auto it = storage_to_block_id_.find(storage->ptr_);
+        if (it != storage_to_block_id_.end()) {
+            block_id = it->second;
+            storage_to_block_id_.erase(it);
+        }
+    }
+
+    if (block_id >= 0) {
+        getAscendMemoryManager().freeBlock(block_id);
+        MLLM_INFO("Freed storage: block ID={}, ptr={}", block_id, storage->ptr_);
+    } else {
+        MLLM_WARN("Attempted to free storage with no block ID mapping: ptr={}", storage->ptr_);
+    }
+
+    storage->ptr_ = nullptr;
 }
 
 bool AscendAllocator::generalAlloc(void** ptr, size_t cap, size_t align) {
-#ifdef ASCENDC_CPU_DEBUG
-  *ptr = AscendC::GmAlloc(cap);
-#else
-  aclrtMalloc((void**)ptr, cap, ACL_MEM_MALLOC_HUGE_FIRST);
-#endif
-  return *ptr != nullptr;
+    //we don't support generalAlloc , therefore return false
+    std::cout << "generalAlloc is not supported in AscendAllocator" << std::endl;
+    return false;
 }
 
 void AscendAllocator::generalFree(void* ptr) {
-#ifdef ASCENDC_CPU_DEBUG
-  AscendC::GmFree((void*)ptr);
-#else
-  aclrtFree(ptr);
-#endif
+    //we don't support generalFree , therefore do nothing
+    std::cout << "generalFree is not supported in AscendAllocator" << std::endl;
 }
 
 size_t AscendAllocator::allocSize(const Storage::ptr_t& storage) {
-  // remember that alloc size should be aligned
-  size_t align_size = alignSize();
-  size_t required_size = storage->size_;
-  size_t aligned_size = (required_size + align_size - 1) & ~(align_size - 1);
-  return aligned_size;
+  // Ascend allocations don't require manual alignment padding
+  // since AscendMemoryPool already provides proper alignment
+  return storage->size_;
 }
 
 size_t AscendAllocator::allocSize(Storage* storage) {
-  // remember that alloc size should be aligned
-  size_t align_size = alignSize();
-  size_t required_size = storage->size_;
-  size_t aligned_size = (required_size + align_size - 1) & ~(align_size - 1);
-  return aligned_size;
+  // Report the requested byte count unchanged; no alignment padding is added here.
+  // NOTE(review): relies on AscendMemoryPool aligning blocks itself - confirm against the pool.
+  return storage->size_;
 }
 
-size_t AscendAllocator::alignSize() const { return 128; }
+size_t AscendAllocator::alignSize() const { return 64; }  // Alignment reported by this allocator; presumably bytes - TODO confirm unit.
 
 std::shared_ptr<AscendAllocator> createAscendAllocator() { return std::make_shared<AscendAllocator>(); }  // Factory: returns a newly constructed, shared AscendAllocator.