From 0ffe938196dfb1f6aeb27b684cc79b94a31d0816 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:06:25 +0800 Subject: [PATCH 01/16] feat(ascend): add simple Ascend add demo --- examples/CMakeLists.txt | 1 + examples/ascend_add_demo/CMakeLists.txt | 22 ++++ examples/ascend_add_demo/main.cpp | 128 ++++++++++++++++++++++++ 3 files changed, 151 insertions(+) create mode 100644 examples/ascend_add_demo/CMakeLists.txt create mode 100644 examples/ascend_add_demo/main.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 6c49cfb22..a143a7989 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,3 +20,4 @@ endif() if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE) add_subdirectory(qwen3_qnn_aot) endif() + diff --git a/examples/ascend_add_demo/CMakeLists.txt b/examples/ascend_add_demo/CMakeLists.txt new file mode 100644 index 000000000..15eaefd7c --- /dev/null +++ b/examples/ascend_add_demo/CMakeLists.txt @@ -0,0 +1,22 @@ +add_executable(ascend_add_demo main.cpp) + +if(DEFINED ENV{ASCEND_HOME_PATH}) + target_include_directories(ascend_add_demo PRIVATE $ENV{ASCEND_HOME_PATH}/include) + target_link_directories(ascend_add_demo PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) +endif() + +target_link_libraries(ascend_add_demo PRIVATE + MllmRT + MllmAscendBackend + ascendcl # 添加 ACL 库,因为 main.cpp 中直接使用了 aclrtMemcpy +) + +set_target_properties(ascend_add_demo PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON +) + +target_include_directories(ascend_add_demo PRIVATE + ${CMAKE_SOURCE_DIR} +) + diff --git a/examples/ascend_add_demo/main.cpp b/examples/ascend_add_demo/main.cpp new file mode 100644 index 000000000..6e439d388 --- /dev/null +++ b/examples/ascend_add_demo/main.cpp @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include "mllm/mllm.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/engine/Context.hpp" +#include "mllm/core/aops/ElewiseOps.hpp" +#include 
"mllm/core/OpTypes.hpp" + +using namespace mllm; + +int main() { + std::cout << "=== Ascend Add Op Demo ===" << std::endl; + + try { + std::cout << "1. Initializing Ascend backend..." << std::endl; + initAscendBackend(); + std::cout << " ✓ Ascend backend initialized\n" << std::endl; + + std::cout << "2. Preparing test data..." << std::endl; + const int batch = 2; + const int size = 3; + std::vector data_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector data_y = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f}; + std::vector expected = {11.0f, 22.0f, 33.0f, 44.0f, 55.0f, 66.0f}; + + std::cout << " Input X: ["; + for (size_t i = 0; i < data_x.size(); ++i) { + std::cout << data_x[i]; + if (i < data_x.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + std::cout << " Input Y: ["; + for (size_t i = 0; i < data_y.size(); ++i) { + std::cout << data_y[i]; + if (i < data_y.size() - 1) std::cout << ", "; + } + std::cout << "]\n" << std::endl; + + std::cout << "3. Preparing tensors on Ascend..." << std::endl; + auto x_handle = ascend::prepareAscendTensor(data_x, batch, size); + auto y_handle = ascend::prepareAscendTensor(data_y, batch, size); + auto& x_ascend = x_handle.tensor(); + auto& y_ascend = y_handle.tensor(); + std::cout << " ✓ Tensors ready on Ascend device\n" << std::endl; + + std::cout << "4. Executing Add operation on Ascend..." 
<< std::endl; + auto& ctx = Context::instance(); + std::cout << "context over" < z_data_fp16(batch * size); + + auto ret = aclrtMemcpy( + z_data_fp16.data(), batch * size * sizeof(half_float::half), + z_ascend.ptr(), z_ascend.bytes(), + ACL_MEMCPY_DEVICE_TO_HOST + ); + if (ret != ACL_SUCCESS) { + std::cerr << " ✗ Failed to copy result back to CPU: ACL error " << ret << std::endl; + x_handle.release(); + y_handle.release(); + return 1; + } + + std::vector result(batch * size); + for (size_t i = 0; i < result.size(); ++i) { + result[i] = static_cast(z_data_fp16[i]); + } + + std::cout << " ✓ Result copied to CPU\n" << std::endl; + + std::cout << "6. Verifying results..." << std::endl; + std::cout << " Actual result: ["; + for (size_t i = 0; i < result.size(); ++i) { + std::cout << result[i]; + if (i < result.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + std::cout << " Expected result: ["; + for (size_t i = 0; i < expected.size(); ++i) { + std::cout << expected[i]; + if (i < expected.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + bool correct = true; + const float tolerance = 0.1f; + + for (size_t i = 0; i < result.size(); ++i) { + float diff = std::abs(result[i] - expected[i]); + if (diff > tolerance) { + correct = false; + std::cout << " ✗ Mismatch at index " << i + << ": expected " << expected[i] + << ", got " << result[i] + << " (diff: " << diff << ")" << std::endl; + } + } + + if (correct) { + std::cout << "\n✓✓✓ Test PASSED! All values match expected results. ✓✓✓" << std::endl; + } else { + std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. ✗✗✗" << std::endl; + } + + // 清理内存池中的块 + x_handle.release(); + y_handle.release(); + + return correct ? 
0 : 1; + + } catch (const std::exception& e) { + std::cerr << "\n✗ Error: " << e.what() << std::endl; + return 1; + } +} + From 70d98a9c97e8b94c14bf7febf6b287ffe1d38403 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:08:03 +0800 Subject: [PATCH 02/16] feat(ascend memory): introduce memory pool to Ascend backend --- .../ascend/memory/AscendMemoryManager.cpp | 67 +++++++++++ .../ascend/memory/AscendMemoryManager.hpp | 36 ++++++ .../ascend/memory/AscendMemoryPool.cpp | 106 ++++++++++++++++++ .../ascend/memory/AscendMemoryPool.hpp | 40 +++++++ mllm/backends/ascend/memory/MemoryBlock.hpp | 17 +++ 5 files changed, 266 insertions(+) create mode 100644 mllm/backends/ascend/memory/AscendMemoryManager.cpp create mode 100644 mllm/backends/ascend/memory/AscendMemoryManager.hpp create mode 100644 mllm/backends/ascend/memory/AscendMemoryPool.cpp create mode 100644 mllm/backends/ascend/memory/AscendMemoryPool.hpp create mode 100644 mllm/backends/ascend/memory/MemoryBlock.hpp diff --git a/mllm/backends/ascend/memory/AscendMemoryManager.cpp b/mllm/backends/ascend/memory/AscendMemoryManager.cpp new file mode 100644 index 000000000..b1d4920c3 --- /dev/null +++ b/mllm/backends/ascend/memory/AscendMemoryManager.cpp @@ -0,0 +1,67 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include + +#include "AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/utils/Log.hpp" + +namespace mllm::ascend { + +static AscendMemoryManager g_ascendMemoryManager; + +AscendMemoryManager::AscendMemoryManager() {} + +AscendMemoryManager &getAscendMemoryManager() { + return g_ascendMemoryManager; +} + +void AscendMemoryManager::createMemoryPool(size_t pool_size) +{ + uint32_t device_count = 0; + auto ret = aclrtGetDeviceCount(&device_count); + MLLM_ACL_CHECK(ret); + for (size_t i = 0; i < device_count; i++) { + + aclrtSetDevice(i); + + std::shared_ptr memory_pool = std::make_shared(pool_size); + memory_pools_.push_back(memory_pool); + MLLM_INFO("create mempool for device {} success", i); + } +} + +int32_t AscendMemoryManager::getDeviceId() +{ + int32_t device_id = -1; + auto ret = aclrtGetDevice(&device_id); + MLLM_ACL_CHECK(ret); + return device_id; +} + +std::shared_ptr &AscendMemoryManager::getMemoryPool() +{ + size_t device_id = static_cast(getDeviceId()); + if (device_id >= memory_pools_.size()) { + MLLM_ERROR_EXIT(::mllm::ExitCode::kAscendError, "Invalid device id {}", device_id); + } + return memory_pools_[device_id]; +} + +void AscendMemoryManager::allocateBlock(uint32_t size, int &block_id) +{ + getMemoryPool()->allocateBlock(size, block_id); +} + +void AscendMemoryManager::freeBlock(int block_id) +{ + getMemoryPool()->freeBlock(block_id); +} + +void AscendMemoryManager::getBlockPtr(int block_id, void *&addr) +{ + getMemoryPool()->getBlockPtr(block_id, addr); +} +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/memory/AscendMemoryManager.hpp b/mllm/backends/ascend/memory/AscendMemoryManager.hpp new file mode 100644 index 000000000..ef3007c3f --- /dev/null +++ b/mllm/backends/ascend/memory/AscendMemoryManager.hpp @@ -0,0 +1,36 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include +#include +#include "AscendMemoryPool.hpp" + +namespace mllm::ascend { + + +class AscendMemoryManager { +public: + AscendMemoryManager(); + + void createMemoryPool(size_t pool_size); + + int32_t getDeviceId(); + + std::shared_ptr &getMemoryPool(); + + void allocateBlock(uint32_t size, int &block_id); + + void freeBlock(int block_id); + + void getBlockPtr(int block_id, void *&addr); + +private: + std::vector> memory_pools_; +}; + +AscendMemoryManager &getAscendMemoryManager(); + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/memory/AscendMemoryPool.cpp b/mllm/backends/ascend/memory/AscendMemoryPool.cpp new file mode 100644 index 000000000..a5234d96b --- /dev/null +++ b/mllm/backends/ascend/memory/AscendMemoryPool.cpp @@ -0,0 +1,106 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include +#include +#include "AscendMemoryPool.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/utils/Log.hpp" + +namespace mllm::ascend { + +constexpr size_t POOL_SIZE = 104857600; // 100 MiB, + +AscendMemoryPool::AscendMemoryPool(size_t pool_size = POOL_SIZE) { + auto ret = aclrtMalloc(&base_mem_ptr_, pool_size, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_SUCCESS) { + MLLM_ERROR("Failed to allocate Ascend memory pool of size {} bytes: ACL error {}", + pool_size, int(ret)); + base_mem_ptr_ = nullptr; + cur_mem_ptr_ = nullptr; + remain_size_ = 0; + return; + } + cur_mem_ptr_ = base_mem_ptr_; + remain_size_ = pool_size; +} + +AscendMemoryPool::~AscendMemoryPool() { + if (base_mem_ptr_ != nullptr) { + auto ret = aclrtFree(base_mem_ptr_); + if (ret != ACL_SUCCESS) { + MLLM_ERROR("Failed to free Ascend memory pool: ACL error {}", int(ret)); + } + } + MLLM_INFO("release MemoryPool success"); +} + +uint64_t AscendMemoryPool::generateBlocksId() { + return static_cast(id_.fetch_add(1, std::memory_order_relaxed)); +} + +void AscendMemoryPool::allocateBlock(uint32_t size, int &block_id) { + std::unique_lock 
lock(block_mutex_); + + size_t align_size = ((size + 31) & ~31) + 32; + + for (auto it = free_blocks_.begin(); it != free_blocks_.end(); it++) { + if (it->second.block_size_ >= align_size) { + block_id = it->second.block_id_; + used_blocks_.insert(*it); + free_blocks_.erase(it); + MLLM_INFO("find free block id {} to allocate", block_id); + return; + } + } + + if (remain_size_ > align_size) { + block_id = generateBlocksId(); + uint64_t cur_mem_ptr_align = (reinterpret_cast(cur_mem_ptr_) + 63) & ~63; + remain_size_ -= (cur_mem_ptr_align - reinterpret_cast(cur_mem_ptr_)); + cur_mem_ptr_ = reinterpret_cast(cur_mem_ptr_align); + + MemoryBlock block = {block_id, align_size, cur_mem_ptr_}; + used_blocks_.insert({block_id, block}); + remain_size_ -= align_size; + cur_mem_ptr_ = reinterpret_cast(cur_mem_ptr_) + align_size; + MLLM_INFO("allocate block id {} for size {}", block_id, align_size); + return; + } + MLLM_ERROR("allocate block fail"); +} + +void AscendMemoryPool::freeBlock(int block_id) { + std::unique_lock lock(block_mutex_); + + if (block_id < 0) { + MLLM_INFO("skip over the invalid block id {}", block_id); + return; + } + + auto it = used_blocks_.find(block_id); + if (it != used_blocks_.end()) { + free_blocks_.insert(*it); + used_blocks_.erase(it); + } else { + MLLM_ERROR("Double free block id {}", block_id); + } +} + +void AscendMemoryPool::getBlockPtr(int block_id, void *&addr) { + std::unique_lock lock(block_mutex_); + + if (block_id < 0) { + MLLM_INFO("Invalid block id {} to get ptr", block_id); + return; + } + + auto it = used_blocks_.find(block_id); + if (it != used_blocks_.end()) { + addr = it->second.address_; + } else { + MLLM_ERROR("Get block address error, block id {}", block_id); + } +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/memory/AscendMemoryPool.hpp b/mllm/backends/ascend/memory/AscendMemoryPool.hpp new file mode 100644 index 000000000..1e41fc041 --- /dev/null +++ b/mllm/backends/ascend/memory/AscendMemoryPool.hpp @@ -0,0 
+1,40 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include +#include "MemoryBlock.hpp" + +namespace mllm::ascend { + +class AscendMemoryPool { +public: + explicit AscendMemoryPool(size_t pool_size); + ~AscendMemoryPool(); + + void allocateBlock(uint32_t size, int& block_id); + + void freeBlock(int block_id); + + void getBlockPtr(int block_id, void*& addr); + +private: + uint64_t generateBlocksId(); + + std::atomic id_ = 0; + std::mutex block_mutex_; + + void* base_mem_ptr_ = nullptr; + void* cur_mem_ptr_ = nullptr; + int64_t remain_size_ = 0; + + std::unordered_map used_blocks_; + std::unordered_map free_blocks_; +}; + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/memory/MemoryBlock.hpp b/mllm/backends/ascend/memory/MemoryBlock.hpp new file mode 100644 index 000000000..eda57bf09 --- /dev/null +++ b/mllm/backends/ascend/memory/MemoryBlock.hpp @@ -0,0 +1,17 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include + +namespace mllm::ascend { + +struct MemoryBlock { + int64_t block_id_; + size_t block_size_; + void* address_ = nullptr; +}; + +} // namespace mllm::ascend From 1d6bd24155d0555a2135890dbdb6a0bd2194c130 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:09:07 +0800 Subject: [PATCH 03/16] feat(ascend backend): create Ascend backend runtime, allocator and dispatcher --- mllm/backends/ascend/AscendAllocator.cpp | 121 ++++++++------ mllm/backends/ascend/AscendAllocator.hpp | 12 ++ mllm/backends/ascend/AscendBackend.cpp | 13 +- mllm/backends/ascend/AscendCommon.cpp | 187 ++++++++++++++++++++++ mllm/backends/ascend/AscendCommon.hpp | 96 +++++++++++ mllm/backends/ascend/AscendDispatcher.cpp | 91 +++++++++++ mllm/backends/ascend/AscendDispatcher.hpp | 41 +++++ mllm/backends/ascend/CMakeLists.txt | 57 ++++--- mllm/backends/ascend/Register.cpp | 41 +++++ 9 files changed, 587 insertions(+), 72 deletions(-) create mode 100644 mllm/backends/ascend/Register.cpp diff --git a/mllm/backends/ascend/AscendAllocator.cpp b/mllm/backends/ascend/AscendAllocator.cpp index 641550b52..f41ba3612 100644 --- a/mllm/backends/ascend/AscendAllocator.cpp +++ b/mllm/backends/ascend/AscendAllocator.cpp @@ -1,84 +1,105 @@ // Copyright (c) MLLM Team. // Licensed under the MIT License. 
-#ifndef ASCENDC_CPU_DEBUG -#include -#else -#include -#endif - #include "mllm/backends/ascend/AscendAllocator.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" + +#include "mllm/utils/Common.hpp" namespace mllm::ascend { +AscendAllocator::AscendAllocator() { + MLLM_INFO("AscendAllocator created with memory pool support"); +} + +AscendAllocator::~AscendAllocator() { + std::lock_guard lock(block_map_mutex_); + if (!storage_to_block_id_.empty()) { + MLLM_WARN("AscendAllocator destroyed with {} storage blocks still allocated", + storage_to_block_id_.size()); + } +} + bool AscendAllocator::alloc(Storage* storage) { -#ifdef ASCENDC_CPU_DEBUG - storage->ptr_ = AscendC::GmAlloc(storage->size_); -#else - aclrtMalloc((void**)&(storage->ptr_), storage->size_, ACL_MEM_MALLOC_HUGE_FIRST); -#endif - return storage->ptr_ != nullptr; + auto& mem_manager = getAscendMemoryManager(); + int block_id = -1; + mem_manager.allocateBlock(storage->size_, block_id); + if (block_id < 0) { + MLLM_ERROR("Failed to allocate block of size {} bytes from memory pool", storage->size_); + return false; + } + + mem_manager.getBlockPtr(block_id, storage->ptr_); + if (storage->ptr_ == nullptr) { + MLLM_ERROR("Failed to get pointer for block ID {}", block_id); + mem_manager.freeBlock(block_id); + return false; + } + + { + std::lock_guard lock(block_map_mutex_); + storage_to_block_id_[storage->ptr_] = block_id; + } + + MLLM_INFO("Allocated storage: size={} bytes, block ID={}, ptr={}", + storage->size_, block_id, storage->ptr_); + return true; } bool AscendAllocator::alloc(const Storage::ptr_t& storage) { -#ifdef ASCENDC_CPU_DEBUG - storage->ptr_ = AscendC::GmAlloc(storage->size_); -#else - aclrtMalloc((void**)&(storage->ptr_), storage->size_, ACL_MEM_MALLOC_HUGE_FIRST); -#endif - return storage->ptr_ != nullptr; + return alloc(storage.get()); } void AscendAllocator::free(const Storage::ptr_t& storage) { -#ifdef ASCENDC_CPU_DEBUG - AscendC::GmFree((void*)storage->ptr_); -#else - 
aclrtFree(storage->ptr_); -#endif + free(storage.get()); } void AscendAllocator::free(Storage* storage) { -#ifdef ASCENDC_CPU_DEBUG - AscendC::GmFree((void*)storage->ptr_); -#else - aclrtFree(storage->ptr_); -#endif + if (storage->ptr_ == nullptr) { + return; + } + + int block_id = -1; + { + std::lock_guard lock(block_map_mutex_); + auto it = storage_to_block_id_.find(storage->ptr_); + if (it != storage_to_block_id_.end()) { + block_id = it->second; + storage_to_block_id_.erase(it); + } + } + + if (block_id >= 0) { + getAscendMemoryManager().freeBlock(block_id); + MLLM_INFO("Freed storage: block ID={}, ptr={}", block_id, storage->ptr_); + } else { + MLLM_WARN("Attempted to free storage with no block ID mapping: ptr={}", storage->ptr_); + } + + storage->ptr_ = nullptr; } bool AscendAllocator::generalAlloc(void** ptr, size_t cap, size_t align) { -#ifdef ASCENDC_CPU_DEBUG - *ptr = AscendC::GmAlloc(cap); -#else - aclrtMalloc((void**)ptr, cap, ACL_MEM_MALLOC_HUGE_FIRST); -#endif - return *ptr != nullptr; + return true; } void AscendAllocator::generalFree(void* ptr) { -#ifdef ASCENDC_CPU_DEBUG - AscendC::GmFree((void*)ptr); -#else - aclrtFree(ptr); -#endif + } size_t AscendAllocator::allocSize(const Storage::ptr_t& storage) { - // remember that alloc size should be aligned - size_t align_size = alignSize(); - size_t required_size = storage->size_; - size_t aligned_size = (required_size + align_size - 1) & ~(align_size - 1); - return aligned_size; + // Ascend allocations don't require manual alignment padding + // since AscendMemoryPool already provides proper alignment + return storage->size_; } size_t AscendAllocator::allocSize(Storage* storage) { - // remember that alloc size should be aligned - size_t align_size = alignSize(); - size_t required_size = storage->size_; - size_t aligned_size = (required_size + align_size - 1) & ~(align_size - 1); - return aligned_size; + // Ascend allocations don't require manual alignment padding + // since AscendMemoryPool already 
provides proper alignment + return storage->size_; } -size_t AscendAllocator::alignSize() const { return 128; } +size_t AscendAllocator::alignSize() const { return 64; } std::shared_ptr createAscendAllocator() { return std::make_shared(); } diff --git a/mllm/backends/ascend/AscendAllocator.hpp b/mllm/backends/ascend/AscendAllocator.hpp index 82f000fab..d3c7390c2 100644 --- a/mllm/backends/ascend/AscendAllocator.hpp +++ b/mllm/backends/ascend/AscendAllocator.hpp @@ -6,10 +6,17 @@ #include "mllm/backends/base/Allocator.hpp" #include "mllm/core/Storage.hpp" +#include +#include + + namespace mllm::ascend { class AscendAllocator final : public Allocator { public: + AscendAllocator(); + ~AscendAllocator(); + inline bool ctrlByMemManager() override { return false; } bool alloc(Storage* storage) override; @@ -29,6 +36,11 @@ class AscendAllocator final : public Allocator { size_t allocSize(const Storage::ptr_t& storage) override; [[nodiscard]] size_t alignSize() const override; + +private: + std::mutex block_map_mutex_; + std::unordered_map storage_to_block_id_; // Storage ptr -> block ID + }; std::shared_ptr createAscendAllocator(); diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp index e30db6d69..408cb2518 100644 --- a/mllm/backends/ascend/AscendBackend.cpp +++ b/mllm/backends/ascend/AscendBackend.cpp @@ -3,11 +3,22 @@ #include "mllm/backends/ascend/AscendBackend.hpp" #include "mllm/backends/ascend/AscendAllocator.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" #include "mllm/core/DeviceTypes.hpp" +#include "mllm/backends/ascend/ops/AscendElewiseOps.hpp" + namespace mllm::ascend { -AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) {} +AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) { + regOpFactory(); + auto& devices = AscendDeviceMetaInfo::instance().devices; + for (const auto& device : devices) { + const auto bytes_to_mb = [](size_t bytes) { return bytes / (1024.0 * 
1024.0); }; + MLLM_INFO("Found Ascend device {} (ID: {}, SOC: {}, Memory: {:.2f} MB free / {:.2f} MB total)", device.name, + device.id, device.soc_version, bytes_to_mb(device.free_memory), bytes_to_mb(device.total_memory)); + } +} std::shared_ptr createAscendBackend() { return std::make_shared(); } diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index e69de29bb..dabb9412c 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -0,0 +1,187 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/ascend/AscendCommon.hpp" + +#include + +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" + +namespace mllm::ascend { + +static aclrtStream g_atb_stream = nullptr; + +AscendTensorHandle::AscendTensorHandle(Tensor tensor, int block_id) + : tensor_(std::move(tensor)), block_id_(block_id) {} + +AscendTensorHandle::~AscendTensorHandle() { release(); } + +AscendTensorHandle::AscendTensorHandle(AscendTensorHandle&& other) noexcept + : tensor_(std::move(other.tensor_)), block_id_(other.block_id_) { + other.block_id_ = -1; +} + +AscendTensorHandle& AscendTensorHandle::operator=(AscendTensorHandle&& other) noexcept { + if (this != &other) { + release(); + tensor_ = std::move(other.tensor_); + block_id_ = other.block_id_; + other.block_id_ = -1; + } + return *this; +} + +void AscendTensorHandle::release() { + if (block_id_ >= 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(block_id_); + block_id_ = -1; + tensor_.impl()->storage()->ptr_ = nullptr; + } +} + +AscendTensorHandle prepareAscendTensor(const std::vector& host_data, + int batch, + int size) { + const size_t expected_elements = static_cast(batch) * static_cast(size); + MLLM_RT_ASSERT_EQ(host_data.size(), expected_elements); + + std::vector fp16_data(expected_elements); + for (size_t i = 0; i < expected_elements; ++i) { + fp16_data[i] = half_float::half(host_data[i]); + } + 
+ auto tensor = Tensor::empty({batch, size}, kFloat16, kAscend); + + auto& mem_mgr = getAscendMemoryManager(); + int block_id = -1; + const uint32_t bytes = static_cast(expected_elements * sizeof(half_float::half)); + + mem_mgr.allocateBlock(bytes, block_id); + + void* device_ptr = nullptr; + mem_mgr.getBlockPtr(block_id, device_ptr); + tensor.impl()->storage()->ptr_ = device_ptr; + + auto ret = aclrtMemcpy( + device_ptr, bytes, + fp16_data.data(), bytes, + ACL_MEMCPY_HOST_TO_DEVICE); + + if (ret != ACL_SUCCESS) { + mem_mgr.freeBlock(block_id); + MLLM_ACL_CHECK(ret); + } + + return AscendTensorHandle(std::move(tensor), block_id); +} + +atb::Context* getGlobalAtbContext() { + static atb::Context* ctx = nullptr; + + if (ctx == nullptr) { + // 1. Set Device + auto acl_ret = aclrtSetDevice(0); + MLLM_ACL_CHECK(acl_ret); + + // 2. Create Context + auto ret = atb::CreateContext(&ctx); + MLLM_ATB_CHECK(ret); + + // 3. Create Stream + acl_ret = aclrtCreateStream(&g_atb_stream); + MLLM_ACL_CHECK(acl_ret); + + // 4. 
Set Stream + ctx->SetExecuteStream(g_atb_stream); + } + return ctx; +} + +aclrtStream getGlobalAtbStream() { + getGlobalAtbContext(); // Ensure initialized + return g_atb_stream; +} + +void syncGlobalAtbStream() { + if (g_atb_stream != nullptr) { + auto ret = aclrtSynchronizeStream(g_atb_stream); + MLLM_ACL_CHECK(ret); + } +} + +void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc) { + desc.dtype = ACL_FLOAT16; // Currently hardcoded as per demo, can be expanded later + desc.format = ACL_FORMAT_ND; + + auto shape = t.shape(); + desc.shape.dimNum = static_cast(shape.size()); + for (uint64_t i = 0; i < desc.shape.dimNum; ++i) { + desc.shape.dims[i] = static_cast(shape[i]); + } +} + +AscendDeviceMetaInfo::AscendDeviceMetaInfo() { +#ifndef ASCENDC_CPU_DEBUG + // Initialize ACL to query devices + auto ret = aclInit(nullptr); + if (ret != ACL_SUCCESS) { + MLLM_ERROR("Failed to initialize ACL for device enumeration: {}", ret); + return; + } + + // Get device count + uint32_t device_count = 0; + ret = aclrtGetDeviceCount(&device_count); + if (ret != ACL_SUCCESS) { + MLLM_ERROR("Failed to get Ascend device count: {}", ret); + aclFinalize(); + return; + } + + // Collect info for each device + for (uint32_t i = 0; i < device_count; ++i) { + AscendDeviceInfo info; + info.id = i; + info.name = "Ascend Device " + std::to_string(i); + + // Set device to query its properties + ret = aclrtSetDevice(i); + if (ret == ACL_SUCCESS) { + // Get memory information + size_t free_mem = 0, total_mem = 0; + ret = aclrtGetMemInfo(ACL_HBM_MEM, &free_mem, &total_mem); + if (ret == ACL_SUCCESS) { + info.total_memory = total_mem; + info.free_memory = free_mem; + } else { + info.total_memory = 0; + info.free_memory = 0; + } + + // SOC version - platform specific, set to unknown for now + info.soc_version = "Unknown"; + } else { + info.total_memory = 0; + info.free_memory = 0; + info.soc_version = "Unknown"; + } + + devices.push_back(info); + } + + // Finalize ACL after enumeration + 
aclFinalize(); +#else + // In CPU debug mode, add a dummy device + AscendDeviceInfo info; + info.id = 0; + info.name = "Ascend CPU Debug Device"; + info.total_memory = 0; + info.free_memory = 0; + info.soc_version = "CPU_DEBUG"; + devices.push_back(info); +#endif +} + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/AscendCommon.hpp b/mllm/backends/ascend/AscendCommon.hpp index e69de29bb..7b7cea8ec 100644 --- a/mllm/backends/ascend/AscendCommon.hpp +++ b/mllm/backends/ascend/AscendCommon.hpp @@ -0,0 +1,96 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include +#include + +#include +#include +#include + +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/utils/Common.hpp" + +// Ascend ACL error checking macro +#define MLLM_ACL_CHECK(err) \ + if (err != ACL_SUCCESS) { \ + MLLM_ERROR_EXIT(::mllm::ExitCode::kAscendError, "ACL error code {}: {}", int(err), aclGetRecentErrMsg()); \ + } + +// Ascend ATB error checking macro +#define MLLM_ATB_CHECK(err) \ + if (err != atb::NO_ERROR) { \ + MLLM_ERROR_EXIT(::mllm::ExitCode::kAscendError, "ATB error code {}", int(err)); \ + } + +namespace mllm::ascend { + +// Get global ATB Context (Lazy Initialization: aclrtSetDevice, atb::CreateContext, aclrtCreateStream, SetExecuteStream) +atb::Context* getGlobalAtbContext(); + +// Get global ATB Stream +aclrtStream getGlobalAtbStream(); + +// Sync global ATB Stream +void syncGlobalAtbStream(); + +// Convert MLLM Tensor metadata to ATB TensorDesc +void fillAtbTensorDesc(const Tensor& t, atb::TensorDesc& desc); + +// Ascend device information structure +struct AscendDeviceInfo { + std::string name; + unsigned int id; + size_t total_memory; // bytes + size_t free_memory; // bytes + std::string soc_version; +}; + +// Ascend device metadata collector (singleton) +class AscendDeviceMetaInfo { + public: + AscendDeviceMetaInfo(); + + static AscendDeviceMetaInfo& instance() { + static AscendDeviceMetaInfo 
instance; + return instance; + } + + AscendDeviceMetaInfo(const AscendDeviceMetaInfo&) = delete; + AscendDeviceMetaInfo& operator=(const AscendDeviceMetaInfo&) = delete; + + std::vector devices; +}; + +// RAII handle for Ascend tensor with automatic memory block management +struct AscendTensorHandle { + AscendTensorHandle() = default; + AscendTensorHandle(Tensor tensor, int block_id); // Construct with tensor and memory block ID + ~AscendTensorHandle(); // Auto-release memory block + + AscendTensorHandle(const AscendTensorHandle&) = delete; + AscendTensorHandle& operator=(const AscendTensorHandle&) = delete; + AscendTensorHandle(AscendTensorHandle&& other) noexcept; // Move constructor + AscendTensorHandle& operator=(AscendTensorHandle&& other) noexcept; // Move assignment + + void release(); // Manually release memory block and invalidate handle + bool valid() const { return block_id_ >= 0; } // Check if handle owns a valid memory block + + Tensor& tensor() { return tensor_; } // Access tensor + const Tensor& tensor() const { return tensor_; } // Access tensor (const) + int blockId() const { return block_id_; } // Get memory block ID + + private: + Tensor tensor_; + int block_id_{-1}; +}; + +// Prepare Ascend tensor from host float data (converts to FP16, allocates device memory, copies data) +AscendTensorHandle prepareAscendTensor(const std::vector& host_data, + int batch, + int size); + +} // namespace mllm::ascend diff --git a/mllm/backends/ascend/AscendDispatcher.cpp b/mllm/backends/ascend/AscendDispatcher.cpp index e69de29bb..0ceae8f9e 100644 --- a/mllm/backends/ascend/AscendDispatcher.cpp +++ b/mllm/backends/ascend/AscendDispatcher.cpp @@ -0,0 +1,91 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/AscendDispatcher.hpp" +#include "mllm/backends/ascend/AscendBackend.hpp" +#include "mllm/core/OpTypes.hpp" +#include "mllm/engine/Dispatcher.hpp" +#include "mllm/engine/Context.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/nn/Module.hpp" +#include "mllm/tracy_perf/Tracy.hpp" + +#ifdef MLLM_PERFETTO_ENABLE +#include "mllm/engine/Perf.hpp" +#endif + +namespace mllm::ascend { + +AscendDispatcher::AscendDispatcher(exec::static_thread_pool& thread_pool, dispatcher_id_t id, + const AscendDispatcherOptions& options) + : Dispatcher(thread_pool, id), options_(options) {} + +void AscendDispatcher::receive(const Task::ptr_t& task) { + switch (task->type) { + case TaskTypes::kExecuteModule: + case TaskTypes::kExecuteOp: { + process(task); + break; + } + default: NYI("Only execute op/module task is supported in AscendDispatcher::receive"); + } +} + +TaskResult::sender_t AscendDispatcher::asyncReceive(const Task::ptr_t& task) { + switch (task->type) { + case TaskTypes::kExecuteModule: { + MLLM_EMPTY_SCOPE; + break; + } + default: NYI("Only execute module task is supported in AscendDispatcher::asyncReceive"); + } + auto scheduler = thread_pool_.get_scheduler(); + return stdexec::schedule(scheduler) | stdexec::then([this, task] { process(task); }); +} + +void AscendDispatcher::process(const Task::ptr_t& task) { + MLLM_TRACY_ZONE_SCOPED; + switch (task->type) { + case TaskTypes::kExecuteOp: { + task->op->reshape(task->inputs, task->outputs); + task->op->setup(task->inputs, task->outputs); + task->op->forward(task->inputs, task->outputs); + + break; + } + case TaskTypes::kExecuteModule: { + auto moduleName = static_cast(task->custom_context_ptr)->getModuleName(); +#ifdef MLLM_PERFETTO_ENABLE + MLLM_PERF_TRACE_EVENT("mllm.ascend.execute.", perfetto::DynamicString{moduleName}, + [&](perfetto::EventContext ctx) { + int cnt = 0; + for (auto& i : task->inputs) { + ctx.AddDebugAnnotation(perfetto::DynamicString{"inputs-" + std::to_string(cnt++)}, 
+ i.shape()); + } + }); +#endif + auto ascendBackend = std::static_pointer_cast(Context::instance().getBackend(kAscend)); + + task->outputs = ((nn::Module*)(task->custom_context_ptr))->forward(task->inputs, task->args); + + // TODO: + // ascendBackend->graphExecute(moduleName, task->inputs, task->outputs); + break; + } + default: NYI("AscendDispatcher::process not supported task type"); + } +} + +void AscendDispatcher::syncWait() { + // TODO +} + +AscendDispatcher::ptr_t createAscendDispatcher(exec::static_thread_pool& thread_pool, + const AscendDispatcherOptions& options) { + return std::make_shared(thread_pool, Dispatcher::ascend_dispatcher_id, options); +} + +} // namespace mllm::ascend + + diff --git a/mllm/backends/ascend/AscendDispatcher.hpp b/mllm/backends/ascend/AscendDispatcher.hpp index e69de29bb..7b8ad5943 100644 --- a/mllm/backends/ascend/AscendDispatcher.hpp +++ b/mllm/backends/ascend/AscendDispatcher.hpp @@ -0,0 +1,41 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include + +#include "mllm/engine/Dispatcher.hpp" +#include "mllm/utils/Common.hpp" + +namespace mllm::ascend { + +struct AscendDispatcherOptions { + MLLM_EMPTY_SCOPE; +}; + +class AscendDispatcher final : public Dispatcher { + public: + using ptr_t = std::shared_ptr; + + explicit AscendDispatcher(exec::static_thread_pool& thread_pool, dispatcher_id_t id, + const AscendDispatcherOptions& options); + + void receive(const Task::ptr_t& task) override; + + TaskResult::sender_t asyncReceive(const Task::ptr_t& task) override; + + void process(const Task::ptr_t& task) override; + + void syncWait() override; + + private: + AscendDispatcherOptions options_; +}; + +AscendDispatcher::ptr_t createAscendDispatcher(exec::static_thread_pool& thread_pool, + const AscendDispatcherOptions& options); + +} // namespace mllm::ascend + + diff --git a/mllm/backends/ascend/CMakeLists.txt b/mllm/backends/ascend/CMakeLists.txt index b8653bfbf..6386948fd 100644 --- a/mllm/backends/ascend/CMakeLists.txt +++ b/mllm/backends/ascend/CMakeLists.txt @@ -1,10 +1,6 @@ -file(GLOB MLLM_ASCEND_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/kernels/*.cpp) - set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") -set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") -set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" - CACHE STRING "ASCEND CANN package installation directory" -) +set(SOC_VERSION "Ascend310B1" CACHE STRING "system on chip type") + if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) endif() @@ -13,35 +9,54 @@ if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) endif() message(STATUS "SOC_VERSION is ${SOC_VERSION}, RUN_MODE is ${RUN_MODE}") -if("${RUN_MODE}" STREQUAL "cpu") - include(cmake/cpu_lib.cmake) -else() - include(cmake/npu_lib.cmake) -endif() # ============ The CATLASS Code ============ # TODO add catlass # ============ MLLM Code ============ -add_library( - MllmAscendBackend SHARED - 
AscendAllocator.cpp - AscendBackend.cpp - AscendCommon.cpp - AscendDispatcher.cpp +file(GLOB MLLM_ASCEND_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Ascend*.cpp) +file(GLOB MLLM_ASCEND_MEMORY_FILES ${CMAKE_CURRENT_SOURCE_DIR}/memory/*.cpp) +file(GLOB MLLM_ASCEND_OPS_FILES ${CMAKE_CURRENT_SOURCE_DIR}/ops/*.cpp) + +set(MLLM_ASCEND_SOURCES + ${MLLM_ASCEND_CORE_FILES} + ${MLLM_ASCEND_MEMORY_FILES} + ${MLLM_ASCEND_OPS_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/Register.cpp ) + +add_library(MllmAscendBackend SHARED ${MLLM_ASCEND_SOURCES}) + +if(DEFINED ENV{ASCEND_HOME_PATH}) + target_include_directories(MllmAscendBackend PUBLIC $ENV{ASCEND_HOME_PATH}/include) + target_link_directories(MllmAscendBackend PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) +else() + message(WARNING "ASCEND_HOME_PATH is not set, Ascend headers and libs may not be found") +endif() + +if(DEFINED ENV{ATB_HOME_PATH}) + target_include_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/include) + target_link_directories(MllmAscendBackend PRIVATE $ENV{ATB_HOME_PATH}/lib) +else() + message(WARNING "ATB_HOME_PATH not defined, ATB library will not be linked") +endif() + + target_link_libraries(MllmAscendBackend PRIVATE - $:host_intf_pub>> - $:tikicpulib::${SOC_VERSION}>> ascendcl - $:c_sec>> - $:MllmAscendKernel>> + opapi + nnopbase + atb MllmRT ) target_compile_definitions(MllmAscendBackend PUBLIC ASCENDC_DUMP=0) + +target_compile_definitions(MllmAscendBackend PUBLIC MLLM_USE_ASCEND_MEMPOOL) set_target_properties(MllmAscendBackend PROPERTIES CXX_STANDARD 20) target_compile_options(MllmAscendBackend PUBLIC $:-g>> -O2 ) target_include_directories(MllmAscendBackend PUBLIC "${MLLM_ASCEND_INSTALL_PATH}/include/") + + diff --git a/mllm/backends/ascend/Register.cpp b/mllm/backends/ascend/Register.cpp new file mode 100644 index 000000000..2cdca6a03 --- /dev/null +++ b/mllm/backends/ascend/Register.cpp @@ -0,0 +1,41 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/DeviceTypes.hpp" +#include "mllm/engine/Context.hpp" +#include "mllm/mllm.hpp" +#include "mllm/backends/ascend/AscendBackend.hpp" +#include "mllm/backends/ascend/AscendDispatcher.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" + +namespace mllm { + +void initAscendBackend() { + auto& ctx = Context::instance(); + + // 1. Create memory pool + size_t pool_size = 100 * 1024 * 1024; // 100MB, can be adjusted as needed + ascend::getAscendMemoryManager().createMemoryPool(pool_size); + MLLM_INFO("Ascend memory pool initialized"); + + // 2. Register backend + auto backend = std::make_shared(); + ctx.registerBackend(backend); + + // 3. Register allocator + ctx.memoryManager()->registerAllocator(kAscend, backend->allocator(), MemoryManagerOptions()); + + // 4. Register dispatcher + auto dispatcher = ascend::createAscendDispatcher(ctx.dispatcherManager()->getExecutor(), + ascend::AscendDispatcherOptions{}); + ctx.dispatcherManager()->registerDispatcher(dispatcher); + MLLM_INFO("Ascend dispatcher registered"); + + // 5. Register custom ops + // ctx.registerCustomizedOp(kAscend, "CustomOpName", + // std::make_shared()); +} + +} // namespace mllm From bf060a9660b11f950da60211d51a766959f285df Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:09:42 +0800 Subject: [PATCH 04/16] feat(ascend): add Ascend elementwise ops --- mllm/backends/ascend/ops/AscendElewiseOps.hpp | 27 ++++ mllm/backends/ascend/ops/AscnedElewiseOps.cpp | 118 ++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 mllm/backends/ascend/ops/AscendElewiseOps.hpp create mode 100644 mllm/backends/ascend/ops/AscnedElewiseOps.cpp diff --git a/mllm/backends/ascend/ops/AscendElewiseOps.hpp b/mllm/backends/ascend/ops/AscendElewiseOps.hpp new file mode 100644 index 000000000..26117cbc2 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendElewiseOps.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. 
+// Licensed under the MIT License. + +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/ElewiseOps.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendAddOp final : public aops::AddOp { + public: + explicit AscendAddOp(const aops::AddOpOptions& options); + + void setup(const std::vector& inputs, std::vector& outputs) override; + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendAddOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::AddOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend \ No newline at end of file diff --git a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp new file mode 100644 index 000000000..fc6fad429 --- /dev/null +++ b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp @@ -0,0 +1,118 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/ascend/ops/AscendElewiseOps.hpp" + +#include +#include +#include +#include +#include + +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendAddOp::AscendAddOp(const aops::AddOpOptions& options) : aops::AddOp(options) {} + +void AscendAddOp::setup(const std::vector& inputs, std::vector& outputs) { + for (auto& t : outputs) { + if (!t.isNil()) { + auto& mem_mgr = getAscendMemoryManager(); + int block_id = -1; + void* device_ptr = nullptr; + + mem_mgr.allocateBlock(static_cast(t.bytes()), block_id); + mem_mgr.getBlockPtr(block_id, device_ptr); + + t.impl()->storage()->ptr_ = device_ptr; + } + } +} + +void AscendAddOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 2); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& x = inputs[0]; + const auto& y = inputs[1]; + auto& z = outputs[0]; + + if (x.dtype() != y.dtype() || x.dtype() != z.dtype()) { + NYI("AscendAddOp currently requires x/y/z have same dtype"); + } + if (x.numel() != y.numel() || x.numel() != z.numel()) { + NYI("AscendAddOp demo only supports no-broadcast case (numel equal)"); + } + + atb::infer::ElewiseParam addParam; + addParam.elewiseType = atb::infer::ElewiseParam::ELEWISE_ADD; + + atb::Operation* op = nullptr; + auto st = atb::CreateOperation(addParam, &op); + if (st != atb::NO_ERROR || op == nullptr) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(ELEWISE_ADD) failed, status={}", static_cast(st)); + } + + atb::Context* atb_ctx = getGlobalAtbContext(); + + atb::Tensor atb_x; + atb::Tensor atb_y; + atb::Tensor atb_z; + + fillAtbTensorDesc(x, atb_x.desc); + fillAtbTensorDesc(y, atb_y.desc); + fillAtbTensorDesc(z, atb_z.desc); + + atb_x.deviceData = reinterpret_cast(x.ptr()); + atb_x.dataSize = x.bytes(); + 
atb_y.deviceData = reinterpret_cast(y.ptr()); + atb_y.dataSize = y.bytes(); + atb_z.deviceData = reinterpret_cast(z.ptr()); + atb_z.dataSize = z.bytes(); + + atb::SVector inTensors; + atb::SVector outTensors; + inTensors.push_back(atb_x); + inTensors.push_back(atb_y); + outTensors.push_back(atb_z); + + atb::VariantPack vp; + vp.inTensors = inTensors; + vp.outTensors = outTensors; + + uint64_t workspaceSize = 0; + st = op->Setup(vp, workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Setup failed, status={}", static_cast(st)); + } + + void* workspace = nullptr; + int workspace_block_id = -1; + if (workspaceSize > 0) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); + mem_mgr.getBlockPtr(workspace_block_id, workspace); + } + + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + if (st != atb::NO_ERROR) { + MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Execute failed, status={}", static_cast(st)); + } + + + syncGlobalAtbStream(); + + if (workspace_block_id != -1) { + auto& mem_mgr = getAscendMemoryManager(); + mem_mgr.freeBlock(workspace_block_id); + } + + atb::DestroyOperation(op); +} + +} // namespace mllm::ascend \ No newline at end of file From a618fd8785a149d64d676303e2eb2ab198584319 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:12:31 +0800 Subject: [PATCH 05/16] fix(ascend):add enum for Ascend --- mllm/core/DeviceTypes.hpp | 6 ++++++ mllm/engine/Dispatcher.hpp | 1 + mllm/utils/Common.hpp | 1 + 3 files changed, 8 insertions(+) diff --git a/mllm/core/DeviceTypes.hpp b/mllm/core/DeviceTypes.hpp index 0ccd1c345..3c6a98ac4 100644 --- a/mllm/core/DeviceTypes.hpp +++ b/mllm/core/DeviceTypes.hpp @@ -33,6 +33,8 @@ inline const char* deviceTypes2Str(DeviceTypes type) { case DeviceTypes::kCUDA: return "CUDA"; case DeviceTypes::kOpenCL: return "OpenCL"; case DeviceTypes::kQNN: return "QNN"; + case 
DeviceTypes::kAscend: return "Ascend"; + case DeviceTypes::kAscendHost: return "AscendHost"; case DeviceTypes::kDeviceTypes_End: return "DeviceTypes_End"; default: return "Unknown"; } @@ -47,6 +49,10 @@ inline DeviceTypes str2DeviceType(const std::string& type_str) { return DeviceTypes::kOpenCL; } else if (type_str == "QNN") { return DeviceTypes::kQNN; + } else if (type_str == "Ascend") { + return DeviceTypes::kAscend; + } else if (type_str == "AscendHost") { + return DeviceTypes::kAscendHost; } else { return DeviceTypes::kDeviceTypes_End; } diff --git a/mllm/engine/Dispatcher.hpp b/mllm/engine/Dispatcher.hpp index 7ac5b8597..8ed7044fe 100644 --- a/mllm/engine/Dispatcher.hpp +++ b/mllm/engine/Dispatcher.hpp @@ -25,6 +25,7 @@ class Dispatcher { static constexpr int32_t cuda_dispatcher_id = static_cast(DeviceTypes::kCUDA); static constexpr int32_t opencl_dispatcher_id = static_cast(DeviceTypes::kOpenCL); static constexpr int32_t qnn_dispatcher_id = static_cast(DeviceTypes::kQNN); + static constexpr int32_t ascend_dispatcher_id = static_cast(DeviceTypes::kAscend); static constexpr int32_t trace_dispatcher_id = static_cast(DeviceTypes::kDeviceTypes_End) + 1; static constexpr int32_t cpu_memory_disk_io_dispatcher_id = static_cast(DeviceTypes::kDeviceTypes_End) + 2; static constexpr int32_t custom_dispatcher_start_id = static_cast(DeviceTypes::kDeviceTypes_End) + 3; diff --git a/mllm/utils/Common.hpp b/mllm/utils/Common.hpp index 1df4de265..abcd7d169 100644 --- a/mllm/utils/Common.hpp +++ b/mllm/utils/Common.hpp @@ -31,6 +31,7 @@ enum class ExitCode : int32_t { // NOLINT kCudaError, kQnnError, kOpenCLError, + kAscendError, kIOError, kShapeError, kCPUKernelError, From eb52ca84adcef37bbfa77cea00875beb7d0b17d9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Dec 2025 01:14:44 +0800 Subject: [PATCH 06/16] feat(ascend): create for Ascend --- tasks/build_arm_ascend.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 
tasks/build_arm_ascend.yaml diff --git a/tasks/build_arm_ascend.yaml b/tasks/build_arm_ascend.yaml new file mode 100644 index 000000000..4546f2d81 --- /dev/null +++ b/tasks/build_arm_ascend.yaml @@ -0,0 +1,18 @@ +Tasks: + - CMakeConfigTask: + cmake_cfg_path: "build-arm-ascend" + cmake_build_type: "Release" + cmake_extra_args: + - "-DMLLM_CROSS_COMPILE=ON" + - "-DMLLM_BUILD_ARM_BACKEND=ON" + - "-DMLLM_BUILD_ASCEND_BACKEND=ON" + - "-DANDROID_PLATFORM=android-28" + - "-DANDROID_ABI=arm64-v8a" + - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=armv8.2-a+fp16+fp16fml+dotprod+i8mm;-ffast-math;-Wno-nan-infinity-disabled"' + - "-DCMAKE_INSTALL_PREFIX=/root/mllm-install-android-arm64-v8a" + - "-DMLLM_KERNEL_USE_THREADS=ON" + - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=ON" + - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF" + + - CMakeBuildTask: + cmake_cfg_path: "build-arm-ascend" From d44f81caa046a54086108fe83e1dd7411c4031ce Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 13 Dec 2025 17:43:11 +0800 Subject: [PATCH 07/16] fix(ascend): fix critical issues from CodeRabbit review --- examples/ascend_add_demo/main.cpp | 1 - mllm/backends/ascend/AscendAllocator.cpp | 9 ++-- mllm/backends/ascend/AscendCommon.cpp | 51 ++++++++++--------- mllm/backends/ascend/AscendDispatcher.cpp | 5 +- mllm/backends/ascend/AscendDispatcher.hpp | 2 +- .../ascend/memory/AscendMemoryManager.cpp | 7 ++- .../ascend/memory/AscendMemoryPool.hpp | 10 ++-- 7 files changed, 44 insertions(+), 41 deletions(-) diff --git a/examples/ascend_add_demo/main.cpp b/examples/ascend_add_demo/main.cpp index 6e439d388..b9704fb5b 100644 --- a/examples/ascend_add_demo/main.cpp +++ b/examples/ascend_add_demo/main.cpp @@ -114,7 +114,6 @@ int main() { std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. 
✗✗✗" << std::endl; } - // 清理内存池中的块 x_handle.release(); y_handle.release(); diff --git a/mllm/backends/ascend/AscendAllocator.cpp b/mllm/backends/ascend/AscendAllocator.cpp index f41ba3612..56dc9db37 100644 --- a/mllm/backends/ascend/AscendAllocator.cpp +++ b/mllm/backends/ascend/AscendAllocator.cpp @@ -3,7 +3,7 @@ #include "mllm/backends/ascend/AscendAllocator.hpp" #include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" - +#include #include "mllm/utils/Common.hpp" namespace mllm::ascend { @@ -80,11 +80,14 @@ void AscendAllocator::free(Storage* storage) { } bool AscendAllocator::generalAlloc(void** ptr, size_t cap, size_t align) { - return true; + //we don't support generalAlloc , therefore return false + std::cout << "generalAlloc is not supported in AscendAllocator" << std::endl; + return false; } void AscendAllocator::generalFree(void* ptr) { - + //we don't support generalFree , therefore do nothing + std::cout << "generalFree is not supported in AscendAllocator" << std::endl; } size_t AscendAllocator::allocSize(const Storage::ptr_t& storage) { diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index dabb9412c..d98eb29de 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -4,12 +4,17 @@ #include "mllm/backends/ascend/AscendCommon.hpp" #include - +#include #include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" namespace mllm::ascend { -static aclrtStream g_atb_stream = nullptr; +namespace { +aclrtStream& globalAtbStream() { + static aclrtStream stream = nullptr; + return stream; +} +} // namespace AscendTensorHandle::AscendTensorHandle(Tensor tensor, int block_id) : tensor_(std::move(tensor)), block_id_(block_id) {} @@ -37,6 +42,8 @@ void AscendTensorHandle::release() { mem_mgr.freeBlock(block_id_); block_id_ = -1; tensor_.impl()->storage()->ptr_ = nullptr; + } else if (tensor_.impl() != nullptr) { + tensor_.delete_(); } } @@ -52,16 +59,10 @@ AscendTensorHandle 
prepareAscendTensor(const std::vector& host_data, } auto tensor = Tensor::empty({batch, size}, kFloat16, kAscend); + tensor.alloc(); - auto& mem_mgr = getAscendMemoryManager(); - int block_id = -1; - const uint32_t bytes = static_cast(expected_elements * sizeof(half_float::half)); - - mem_mgr.allocateBlock(bytes, block_id); - - void* device_ptr = nullptr; - mem_mgr.getBlockPtr(block_id, device_ptr); - tensor.impl()->storage()->ptr_ = device_ptr; + void* device_ptr = tensor.ptr(); + const size_t bytes = tensor.bytes(); auto ret = aclrtMemcpy( device_ptr, bytes, @@ -69,43 +70,45 @@ AscendTensorHandle prepareAscendTensor(const std::vector& host_data, ACL_MEMCPY_HOST_TO_DEVICE); if (ret != ACL_SUCCESS) { - mem_mgr.freeBlock(block_id); MLLM_ACL_CHECK(ret); } - return AscendTensorHandle(std::move(tensor), block_id); + return AscendTensorHandle(std::move(tensor), -1); } atb::Context* getGlobalAtbContext() { static atb::Context* ctx = nullptr; - - if (ctx == nullptr) { + static std::once_flag init_flag; + + std::call_once(init_flag, [&] { // 1. Set Device auto acl_ret = aclrtSetDevice(0); MLLM_ACL_CHECK(acl_ret); - + // 2. Create Context auto ret = atb::CreateContext(&ctx); MLLM_ATB_CHECK(ret); - + // 3. Create Stream - acl_ret = aclrtCreateStream(&g_atb_stream); + auto& stream = globalAtbStream(); + acl_ret = aclrtCreateStream(&stream); MLLM_ACL_CHECK(acl_ret); - + // 4. 
Set Stream - ctx->SetExecuteStream(g_atb_stream); - } + ctx->SetExecuteStream(stream); + }); return ctx; } aclrtStream getGlobalAtbStream() { getGlobalAtbContext(); // Ensure initialized - return g_atb_stream; + return globalAtbStream(); } void syncGlobalAtbStream() { - if (g_atb_stream != nullptr) { - auto ret = aclrtSynchronizeStream(g_atb_stream); + auto stream = globalAtbStream(); + if (stream != nullptr) { + auto ret = aclrtSynchronizeStream(stream); MLLM_ACL_CHECK(ret); } } diff --git a/mllm/backends/ascend/AscendDispatcher.cpp b/mllm/backends/ascend/AscendDispatcher.cpp index 0ceae8f9e..8960ece77 100644 --- a/mllm/backends/ascend/AscendDispatcher.cpp +++ b/mllm/backends/ascend/AscendDispatcher.cpp @@ -50,10 +50,9 @@ void AscendDispatcher::process(const Task::ptr_t& task) { task->op->reshape(task->inputs, task->outputs); task->op->setup(task->inputs, task->outputs); task->op->forward(task->inputs, task->outputs); - break; } - case TaskTypes::kExecuteModule: { + case TaskTypes::kExecuteModule: { //TODO: execute module auto moduleName = static_cast(task->custom_context_ptr)->getModuleName(); #ifdef MLLM_PERFETTO_ENABLE MLLM_PERF_TRACE_EVENT("mllm.ascend.execute.", perfetto::DynamicString{moduleName}, @@ -78,7 +77,7 @@ void AscendDispatcher::process(const Task::ptr_t& task) { } void AscendDispatcher::syncWait() { - // TODO + // TODO: Implement synchronization behavior for outstanding scheduled tasks (if required by engine). 
} AscendDispatcher::ptr_t createAscendDispatcher(exec::static_thread_pool& thread_pool, diff --git a/mllm/backends/ascend/AscendDispatcher.hpp b/mllm/backends/ascend/AscendDispatcher.hpp index 7b8ad5943..71852da12 100644 --- a/mllm/backends/ascend/AscendDispatcher.hpp +++ b/mllm/backends/ascend/AscendDispatcher.hpp @@ -18,7 +18,7 @@ class AscendDispatcher final : public Dispatcher { public: using ptr_t = std::shared_ptr; - explicit AscendDispatcher(exec::static_thread_pool& thread_pool, dispatcher_id_t id, + explicit AscendDispatcher(exec::static_thread_pool& thread_pool, dispatcher_id_t dispatcher_id, const AscendDispatcherOptions& options); void receive(const Task::ptr_t& task) override; diff --git a/mllm/backends/ascend/memory/AscendMemoryManager.cpp b/mllm/backends/ascend/memory/AscendMemoryManager.cpp index b1d4920c3..ed9122b02 100644 --- a/mllm/backends/ascend/memory/AscendMemoryManager.cpp +++ b/mllm/backends/ascend/memory/AscendMemoryManager.cpp @@ -10,12 +10,11 @@ namespace mllm::ascend { -static AscendMemoryManager g_ascendMemoryManager; - -AscendMemoryManager::AscendMemoryManager() {} +AscendMemoryManager::AscendMemoryManager() = default; AscendMemoryManager &getAscendMemoryManager() { - return g_ascendMemoryManager; + static AscendMemoryManager instance; + return instance; } void AscendMemoryManager::createMemoryPool(size_t pool_size) diff --git a/mllm/backends/ascend/memory/AscendMemoryPool.hpp b/mllm/backends/ascend/memory/AscendMemoryPool.hpp index 1e41fc041..d3b01d22f 100644 --- a/mllm/backends/ascend/memory/AscendMemoryPool.hpp +++ b/mllm/backends/ascend/memory/AscendMemoryPool.hpp @@ -26,12 +26,12 @@ class AscendMemoryPool { private: uint64_t generateBlocksId(); - std::atomic id_ = 0; - std::mutex block_mutex_; + std::atomic id_ = 0; + std::mutex block_mutex_; - void* base_mem_ptr_ = nullptr; - void* cur_mem_ptr_ = nullptr; - int64_t remain_size_ = 0; + void* base_mem_ptr_ = nullptr; + void* cur_mem_ptr_ = nullptr; + int64_t remain_size_ = 0; 
std::unordered_map used_blocks_; std::unordered_map free_blocks_; From e25dd341b4fc64c188913e0de8e6d5816a141998 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 16 Dec 2025 16:26:34 +0800 Subject: [PATCH 08/16] feat(ascned):add core design document of ascend backend --- docs/ascend_backend/core_design.rst | 187 ++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 docs/ascend_backend/core_design.rst diff --git a/docs/ascend_backend/core_design.rst b/docs/ascend_backend/core_design.rst new file mode 100644 index 000000000..592089736 --- /dev/null +++ b/docs/ascend_backend/core_design.rst @@ -0,0 +1,187 @@ +Ascend Backend 设计概述 +==================== + +总览 +---- +Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端到端的调度、内存管理与算子生命周期管理,使模型在 Ascend 上高效运行。 + +设计目标 +-------- +- 统一后端:作为 mLLM 原生后端,统一接口与调度流程。 +- ATB 单算子验证:打通算子从框架到 NPU 的完整链路。 +- 生命周期管理:算子创建、准备、执行、销毁的统一抽象。 +- 内存管理:专用 Ascend 设备内存池,减少反复申请释放。 +- 扩展性:便于新增算子、执行模式和性能优化。 + +架构组件 +-------- +.. code-block:: text + +┌─────────────────────────────────────────────────────────────┐ +│ MLLM 框架 │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ 模块 │ │ 层 │ │ 调度器 │ │ +│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ +└─────────┼─────────────────┼─────────────────┼───────────────┘ + │ │ │ + └─────────────────┴─────────────────┘ + │ +┌──────────────────────────────────────────────────────────────┐ +│ Ascend 后端基础设施 │ +│ │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ AscendBackend(核心管理) │ │ +│ │ - 设备/算子注册 - 分配器绑定 │ │ +│ │ - 设备信息日志 │ │ +│ └─────────┬──────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────┴──────────┬──────────────┬─────────────────┐ │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ AscendDispatcher AscendAllocator Ascend Ops AscendCommon │ +│ (执行:算子/ MemoryManager (目前是add (共用代码) │ +│ 模块任务) (内存池) 未来图执行) │ +│ │ +│ │ +│ │ +└────────────────────────────┬─────────────────────────────────┘ + │ +┌────────────────────────────▼──────────────────────────────────┐ 
+│ Ascend runtime │ +│ ┌──────────────┐ ┌──────────────┐ ┌─────────────────┐ │ +│ │ ATB 上下文 │ │ ACL 流 │ │ ATB/ACL 接口 │ │ +│ └──────────────┘ └──────────────┘ └─────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────┐ │ +│ │ Ascend NPU 硬件(orangepi ai pro) │ │ +│ └──────────────────────────────────────────────────────┘ │ +└───────────────────────────────────────────────────────────────┘ + +关键模块 +-------- + +1. mLLM 框架层 + +框架层负责算子抽象、计算任务构建以及统一调度接口的提供。不依赖任何具体设备实现,仅通过 Backend 接口与底层后端交互。算子在该层被封装为可调度的任务(Task),并通过 DispatcherManager 提交给对应设备后端执行。 + +2. Ascend 后端基础设施层 + +该层是 Ascend Backend 的核心实现,负责承接来自框架层的算子任务,并将其映射到 Ascend 运行时执行。主要组成包括: + +**AscendBackend** + +- 后端入口与核心管理模块,负责后端注册、算子工厂管理、分配器与调度器绑定等。 + +**AscendDispatcher** + +- 任务调度与执行模块,负责驱动算子按照统一的生命周期(reshape / setup / forward)执行。 + +**AscendAllocator / AscendMemoryManager** + +- Ascend 设备内存管理模块,负责 Tensor 与 workspace 的分配、回收及内存池管理。 + +**Ascend Ops / AscendCommon** + +- Ascend 专用算子实现及 ATB / ACL 公共工具封装,屏蔽底层运行时细节。 + +3. Ascend Runtime 层 + +运行时层由 Ascend CANN 提供,包含 ATB 算子库、ACL 执行接口以及执行上下文与流管理。 + +4. Ascend 硬件层 + +最底层为 Ascend NPU 硬件,负责实际的计算执行。 + +执行流程(单算子路径) +---------------------- + +.. code-block:: text +1. Ascend Backend 初始化 + - Context 注册 Backend、Allocator、Dispatcher + +2. 输入 Tensor 准备 + - Ascend Tensor 分配 + - Host → Device 拷贝 + +3. 构建并提交算子任务 + - 创建 Ascend Op、Task + - 提交至 Dispatcher + +4. Ascend 上执行算子 + - reshape、setup、forward + - → ATB Operation Execute + +5. 
结果回传与资源释放 + - Device → Host 拷贝验证 + - Tensor 资源释放 + +算子支持与映射 +-------------------------- + +支持的算子 +~~~~~~~~~~~~~~~~~~~ + +当前版本的 Ascend Backend 以验证端到端执行链路为目标,实现了基于 ATB 的 **Add 算子** 支持。 +算子映射策略 +~~~~~~~~~~~~ + +在 Ascend Backend 中,框架算子并不直接依赖底层运行时实现,而是通过后端算子层进行统一映射。 +后续扩展 +~~~~~~~~~~~~~~~~~ + +在当前单算子执行路径稳定的基础上,Ascend Backend 将逐步扩展算子支持范围与执行模式。 + +添加新算子的方法 +~~~~~~~~~~~~~~~~~ +Step 1:确认 ATB 支持与算子约束 +-------------------------------- +- 确认 ATB 是否支持目标算子类型及对应参数结构 + +Step 2:实现 Ascend 算子类 +-------------------------------- +- 在 Ascend 后端中定义算子类 +- 实现统一的算子生命周期:reshape → setup → forward +- 在 forward 阶段调用 ATB 单算子完成执行 + +Step 3:注册算子并接入调度链路 +-------------------------------- +- 将新算子注册到 AscendBackend并完成相关适配 + +Step 4:测试与验证 +-------------------------------- +- 构建最小示例在 Ascend 设备上运行新算子 +- 与 CPU 参考结果对比,验证计算正确性 + +算子计算结果测试 +~~~~~~~~~~~~~~~~~ +- 基准结果:在 CPU/参考实现上运行同一输入,获得期望输出。 +- 输入构造:覆盖典型维度与边界尺寸;固定随机种子,避免非确定性。 +- 误差度量:按 dtype 选择误差标准,如浮点用相对/绝对误差(rtol/atol),整型用全相等。 +- 数据搬运:确保 Host→Device / Device→Host 拷贝后再次比对,排查搬运或对齐问题。 + +内存与数据管理 +-------------- +- AscendMemoryManager(单例):按设备创建独立内存池,当前通过 `aclrtGetDeviceCount` 为每个 device 分配池。 +- AscendMemoryPool:预分配一个较大的空间,`aclrtMalloc(..., ACL_MEM_MALLOC_HUGE_FIRST)` 获取内存,维护 base/cur 指针与剩余空间。 +- 块分配策略:首先从 32B 对齐,优先从有空间 的 free_blocks 复用,若没有可用的则在池内线性切分(64B 对齐),并返回递增 block id。 +- 线程安全:分配/释放/取指针均持锁。 +- 多设备调度:通过当前 device id 选择对应内存池,确保多卡环境下内存隔离。 + +性能与调优 +---------- +- 内存池复用:通过 AscendMemoryPool 预分配大块显存,并在线性切分/复用 block,减少频繁 aclrtMalloc/aclrtFree 带来的碎片与性能开销。 +- 对齐与访问:按 32B/64B 对齐划分内存块,兼顾 ATB/ACL 对齐需求与访存效率。 +- 执行路径简化:当前以单算子执行链路为主,重点验证端到端正确性与内存/数据通路的稳定性,为后续多算子/多流并行奠定基础。 +- 日志观测:通过 AscendCommon 与统一日志系统记录内存分配、算子执行等关键行为,用于简单的性能和资源使用分析。 + +测试与验证 +---------- +- 结果正确性测试:在 CPU 或参考实现上计算结果,在 Ascend Backend 上运行相同输入,对比数值。 +- 时间测试:针对算子构不同步骤,记录执行时间。 +- 端到端验证:在示例工程中跑通完整链路,同时观察输出结果与耗时,确保调度、内存池和运行时组合下行为稳定。 + +后续扩展 +-------- +- 支持更多算子。 +- 更完整的 profiling 与可视化。 +- 上下文/图级缓存,减少重复创建。 \ No newline at end of file From e9e684256c427e486e387640a3fbbc049d2e8a08 Mon Sep 17 00:00:00 2001 From: Your 
Name Date: Tue, 16 Dec 2025 16:31:25 +0800 Subject: [PATCH 09/16] fix(ascend): add result validation and timing measurement --- examples/ascend_add_demo/main.cpp | 87 ++++++---------- mllm/backends/ascend/AscendCommon.cpp | 98 ++++++++++++++++++- mllm/backends/ascend/AscendCommon.hpp | 41 +++++++- mllm/backends/ascend/ops/AscnedElewiseOps.cpp | 6 +- 4 files changed, 169 insertions(+), 63 deletions(-) diff --git a/examples/ascend_add_demo/main.cpp b/examples/ascend_add_demo/main.cpp index b9704fb5b..216591b2d 100644 --- a/examples/ascend_add_demo/main.cpp +++ b/examples/ascend_add_demo/main.cpp @@ -57,63 +57,36 @@ int main() { )[0]; std::cout << " ✓ Add operation completed\n" << std::endl; - std::cout << "\n5. Copying result from NPU to CPU for verification..." << std::endl; - std::vector z_data_fp16(batch * size); - - auto ret = aclrtMemcpy( - z_data_fp16.data(), batch * size * sizeof(half_float::half), - z_ascend.ptr(), z_ascend.bytes(), - ACL_MEMCPY_DEVICE_TO_HOST - ); - if (ret != ACL_SUCCESS) { - std::cerr << " ✗ Failed to copy result back to CPU: ACL error " << ret << std::endl; - x_handle.release(); - y_handle.release(); - return 1; - } - - std::vector result(batch * size); - for (size_t i = 0; i < result.size(); ++i) { - result[i] = static_cast(z_data_fp16[i]); - } - - std::cout << " ✓ Result copied to CPU\n" << std::endl; - - std::cout << "6. Verifying results..." 
<< std::endl; - std::cout << " Actual result: ["; - for (size_t i = 0; i < result.size(); ++i) { - std::cout << result[i]; - if (i < result.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - std::cout << " Expected result: ["; - for (size_t i = 0; i < expected.size(); ++i) { - std::cout << expected[i]; - if (i < expected.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - bool correct = true; - const float tolerance = 0.1f; - - for (size_t i = 0; i < result.size(); ++i) { - float diff = std::abs(result[i] - expected[i]); - if (diff > tolerance) { - correct = false; - std::cout << " ✗ Mismatch at index " << i - << ": expected " << expected[i] - << ", got " << result[i] - << " (diff: " << diff << ")" << std::endl; - } - } - - if (correct) { - std::cout << "\n✓✓✓ Test PASSED! All values match expected results. ✓✓✓" << std::endl; - } else { - std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. ✗✗✗" << std::endl; - } - + std::cout << "\n5. Copying result from NPU to CPU for verification..." << std::endl; + std::vector actual; + bool correct = ascend::verifyAscendTensor( + z_ascend, + expected, + /*atol=*/1e-2f, + /*rtol=*/1e-2f, + /*verbose=*/true, + &actual); + + std::cout << " Actual result: ["; + for (size_t i = 0; i < actual.size(); ++i) { + std::cout << actual[i]; + if (i < actual.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + std::cout << " Expected result: ["; + for (size_t i = 0; i < expected.size(); ++i) { + std::cout << expected[i]; + if (i < expected.size() - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + if (correct) { + std::cout << "\n✓✓✓ Test PASSED! All values match expected results. ✓✓✓" << std::endl; + } else { + std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. 
✗✗✗" << std::endl; + } + x_handle.release(); y_handle.release(); diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index d98eb29de..98d4d82f0 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -3,9 +3,13 @@ #include "mllm/backends/ascend/AscendCommon.hpp" -#include +#include +#include +#include #include +#include #include "mllm/backends/ascend/memory/AscendMemoryManager.hpp" +#include "mllm/core/DataTypes.hpp" namespace mllm::ascend { @@ -73,7 +77,97 @@ AscendTensorHandle prepareAscendTensor(const std::vector& host_data, MLLM_ACL_CHECK(ret); } - return AscendTensorHandle(std::move(tensor), -1); + return {std::move(tensor), -1}; +} + +std::vector copyAscendTensorToHost(const Tensor& t) { + MLLM_RT_ASSERT(t.dtype() == kFloat16); + syncGlobalAtbStream(); + + const size_t elem_cnt = t.numel(); + std::vector device_fp16(elem_cnt); + + auto ret = aclrtMemcpy( + device_fp16.data(), elem_cnt * sizeof(half_float::half), + t.ptr(), t.bytes(), + ACL_MEMCPY_DEVICE_TO_HOST); + MLLM_ACL_CHECK(ret); + + std::vector host(elem_cnt); + for (size_t i = 0; i < elem_cnt; ++i) { + host[i] = static_cast(device_fp16[i]); + } + return host; +} + +bool verifyAscendTensor(const Tensor& t, + const std::vector& expected, + float atol, + float rtol, + bool verbose, + std::vector* actual_out) { + auto actual = copyAscendTensorToHost(t); + if (actual_out != nullptr) { + *actual_out = actual; + } + + if (actual.size() != expected.size()) { + if (verbose) { + std::cout << "[AscendVerify] size mismatch: actual " << actual.size() + << " vs expected " << expected.size() << "\n"; + } + return false; + } + + bool ok = true; + for (size_t i = 0; i < actual.size(); ++i) { + const float diff = std::abs(actual[i] - expected[i]); + const float thr = atol + rtol * std::abs(expected[i]); + if (diff > thr) { + ok = false; + if (verbose) { + std::cout << "[AscendVerify] idx " << i + << " expected " << expected[i] + << " got " 
<< actual[i] + << " diff " << diff + << " thr " << thr << "\n"; + } + } + } + + if (verbose) { + std::cout << (ok ? "[AscendVerify] OK" : "[AscendVerify] FAIL") << "\n"; + } + return ok; +} + +bool verifyAscendTensor(const Tensor& t, + const RefFn& ref_fn, + float atol, + float rtol, + bool verbose, + std::vector* actual_out) { + auto expected = ref_fn(); + return verifyAscendTensor(t, expected, atol, rtol, verbose, actual_out); +} + +AscendTimer::AscendTimer(const char* tag, bool sync_before, bool sync_after) + : tag_(tag), + sync_before_(sync_before), + sync_after_(sync_after) { + if (sync_before_) { + syncGlobalAtbStream(); + } + start_ = std::chrono::high_resolution_clock::now(); +} + +AscendTimer::~AscendTimer() { + if (sync_after_) { + syncGlobalAtbStream(); + } + const auto end = std::chrono::high_resolution_clock::now(); + const double ms = std::chrono::duration(end - start_).count(); + std::cout << "[AscendTimer] " << tag_ << " : " << ms << " ms\n"; } atb::Context* getGlobalAtbContext() { diff --git a/mllm/backends/ascend/AscendCommon.hpp b/mllm/backends/ascend/AscendCommon.hpp index 7b7cea8ec..8d74c8707 100644 --- a/mllm/backends/ascend/AscendCommon.hpp +++ b/mllm/backends/ascend/AscendCommon.hpp @@ -3,16 +3,17 @@ #pragma once +#include #include #include +#include #include #include #include -#include "mllm/core/DataTypes.hpp" #include "mllm/core/Tensor.hpp" -#include "mllm/utils/Common.hpp" +#include "mllm/utils/Common.hpp" // IWYU pragma: keep // Ascend ACL error checking macro #define MLLM_ACL_CHECK(err) \ @@ -93,4 +94,40 @@ AscendTensorHandle prepareAscendTensor(const std::vector& host_data, int batch, int size); +// Copy Ascend tensor to host as float (currently assumes FP16 tensor data). +std::vector copyAscendTensorToHost(const Tensor& t); + +// Verify Ascend tensor against expected values. 
+bool verifyAscendTensor(const Tensor& t, + const std::vector& expected, + float atol = 1e-2f, + float rtol = 1e-2f, + bool verbose = true, + std::vector* actual_out = nullptr); + +using RefFn = std::function()>; +bool verifyAscendTensor(const Tensor& t, + const RefFn& ref_fn, + float atol = 1e-2f, + float rtol = 1e-2f, + bool verbose = true, + std::vector* actual_out = nullptr); + +// RAII timer for measuring scoped durations (optionally syncs the global stream). +class AscendTimer { + public: + explicit AscendTimer(const char* tag, bool sync_before = true, bool sync_after = true); + ~AscendTimer(); + + private: + const char* tag_; + bool sync_before_; + bool sync_after_; + std::chrono::high_resolution_clock::time_point start_; +}; + +// Convenience macros for scoped timing. +#define ASCEND_TIME_SCOPE(tag) ::mllm::ascend::AscendTimer timer_scope_##__LINE__(tag, true, true) +#define ASCEND_TIME_SCOPE_NOSYNC(tag) ::mllm::ascend::AscendTimer timer_scope_##__LINE__(tag, false, false) + } // namespace mllm::ascend diff --git a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp index fc6fad429..f5a504dd2 100644 --- a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp +++ b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp @@ -98,8 +98,10 @@ void AscendAddOp::forward(const std::vector& inputs, std::vector mem_mgr.allocateBlock(static_cast(workspaceSize), workspace_block_id); mem_mgr.getBlockPtr(workspace_block_id, workspace); } - - st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + { + ASCEND_TIME_SCOPE("AscendAddOp::forward"); + st = op->Execute(vp, reinterpret_cast(workspace), workspaceSize, atb_ctx); + } if (st != atb::NO_ERROR) { MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB AddOp Execute failed, status={}", static_cast(st)); } From 852406a68ebdf2d02d244b9d55fdb7196267de6a Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 16 Dec 2025 17:53:54 +0800 Subject: [PATCH 10/16] fix(ascend): use the common code 
path of setup --- mllm/backends/ascend/ops/AscnedElewiseOps.cpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp index f5a504dd2..4972e0cb5 100644 --- a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp +++ b/mllm/backends/ascend/ops/AscnedElewiseOps.cpp @@ -20,18 +20,19 @@ namespace mllm::ascend { AscendAddOp::AscendAddOp(const aops::AddOpOptions& options) : aops::AddOp(options) {} void AscendAddOp::setup(const std::vector& inputs, std::vector& outputs) { - for (auto& t : outputs) { - if (!t.isNil()) { - auto& mem_mgr = getAscendMemoryManager(); - int block_id = -1; - void* device_ptr = nullptr; + // for (auto& t : outputs) { + // if (!t.isNil()) { + // auto& mem_mgr = getAscendMemoryManager(); + // int block_id = -1; + // void* device_ptr = nullptr; - mem_mgr.allocateBlock(static_cast(t.bytes()), block_id); - mem_mgr.getBlockPtr(block_id, device_ptr); + // mem_mgr.allocateBlock(static_cast(t.bytes()), block_id); + // mem_mgr.getBlockPtr(block_id, device_ptr); - t.impl()->storage()->ptr_ = device_ptr; - } - } + // t.impl()->storage()->ptr_ = device_ptr; + // } + // } + BaseOp::setup(inputs, outputs); } void AscendAddOp::forward(const std::vector& inputs, std::vector& outputs) { From f82e1acdaf321cb3b4557ddd9d4e48aac49c05ff Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 20 Dec 2025 15:53:13 +0800 Subject: [PATCH 11/16] fix(ascend): fix some problem of document --- docs/ascend_backend/core_design.rst | 102 +++++++++++++++------------- docs/ascend_backend/index.rst | 9 +++ docs/index.rst | 5 ++ 3 files changed, 67 insertions(+), 49 deletions(-) create mode 100644 docs/ascend_backend/index.rst diff --git a/docs/ascend_backend/core_design.rst b/docs/ascend_backend/core_design.rst index 592089736..db835823e 100644 --- a/docs/ascend_backend/core_design.rst +++ b/docs/ascend_backend/core_design.rst @@ -1,5 +1,5 @@ -Ascend Backend 设计概述 
-==================== +Ascend Backend +======================== 总览 ---- @@ -15,47 +15,50 @@ Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端 架构组件 -------- -.. code-block:: text - -┌─────────────────────────────────────────────────────────────┐ -│ MLLM 框架 │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ 模块 │ │ 层 │ │ 调度器 │ │ -│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ -└─────────┼─────────────────┼─────────────────┼───────────────┘ - │ │ │ - └─────────────────┴─────────────────┘ - │ -┌──────────────────────────────────────────────────────────────┐ -│ Ascend 后端基础设施 │ -│ │ -│ ┌────────────────────────────────────────────────────────┐ │ -│ │ AscendBackend(核心管理) │ │ -│ │ - 设备/算子注册 - 分配器绑定 │ │ -│ │ - 设备信息日志 │ │ -│ └─────────┬──────────────────────────────────────────────┘ │ -│ │ │ -│ ┌─────────┴──────────┬──────────────┬─────────────────┐ │ -│ │ │ │ │ │ -│ ▼ ▼ ▼ ▼ │ -│ AscendDispatcher AscendAllocator Ascend Ops AscendCommon │ -│ (执行:算子/ MemoryManager (目前是add (共用代码) │ -│ 模块任务) (内存池) 未来图执行) │ -│ │ -│ │ -│ │ -└────────────────────────────┬─────────────────────────────────┘ - │ -┌────────────────────────────▼──────────────────────────────────┐ -│ Ascend runtime │ -│ ┌──────────────┐ ┌──────────────┐ ┌─────────────────┐ │ -│ │ ATB 上下文 │ │ ACL 流 │ │ ATB/ACL 接口 │ │ -│ └──────────────┘ └──────────────┘ └─────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────────────────────┐ │ -│ │ Ascend NPU 硬件(orangepi ai pro) │ │ -│ └──────────────────────────────────────────────────────┘ │ -└───────────────────────────────────────────────────────────────┘ + +架构图如下: + +:: + + ┌─────────────────────────────────────────────────────────────┐ + │ MLLM 框架 │ + │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ + │ │ 模块 │ │ 层 │ │ 调度器 │ │ + │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ + └─────────┼─────────────────┼─────────────────┼───────────────┘ + │ │ │ + └─────────────────┴─────────────────┘ + │ + 
┌──────────────────────────────────────────────────────────────┐ + │ Ascend 后端基础设施 │ + │ │ + │ ┌────────────────────────────────────────────────────────┐ │ + │ │ AscendBackend(核心管理) │ │ + │ │ - 设备/算子注册 - 分配器绑定 │ │ + │ │ - 设备信息日志 │ │ + │ └─────────┬──────────────────────────────────────────────┘ │ + │ │ │ + │ ┌─────────┴──────────┬──────────────┬─────────────────┐ │ + │ │ │ │ │ │ + │ ▼ ▼ ▼ ▼ │ + │ AscendDispatcher AscendAllocator Ascend Ops AscendCommon │ + │ (执行:算子/ MemoryManager (目前是add (共用代码) │ + │ 模块任务) (内存池) 未来图执行) │ + │ │ + │ │ + │ │ + └────────────────────────────┬─────────────────────────────────┘ + │ + ┌────────────────────────────▼──────────────────────────────────┐ + │ Ascend runtime │ + │ ┌──────────────┐ ┌──────────────┐ ┌─────────────────┐ │ + │ │ ATB 上下文 │ │ ACL 流 │ │ ATB/ACL 接口 │ │ + │ └──────────────┘ └──────────────┘ └─────────────────┘ │ + │ │ + │ ┌──────────────────────────────────────────────────────┐ │ + │ │ Ascend NPU 硬件(orangepi ai pro) │ │ + │ └──────────────────────────────────────────────────────┘ │ + └───────────────────────────────────────────────────────────────┘ 关键模块 -------- @@ -95,7 +98,6 @@ Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端 执行流程(单算子路径) ---------------------- -.. code-block:: text 1. 
Ascend Backend 初始化 - Context 注册 Backend、Allocator、Dispatcher @@ -116,16 +118,18 @@ Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端 - Tensor 资源释放 算子支持与映射 --------------------------- +-------------- 支持的算子 ~~~~~~~~~~~~~~~~~~~ 当前版本的 Ascend Backend 以验证端到端执行链路为目标,实现了基于 ATB 的 **Add 算子** 支持。 + 算子映射策略 ~~~~~~~~~~~~ 在 Ascend Backend 中,框架算子并不直接依赖底层运行时实现,而是通过后端算子层进行统一映射。 + 后续扩展 ~~~~~~~~~~~~~~~~~ @@ -134,21 +138,21 @@ Ascend Backend 将 mLLM 的算子执行能力接入华为 Ascend NPU,提供端 添加新算子的方法 ~~~~~~~~~~~~~~~~~ Step 1:确认 ATB 支持与算子约束 --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - 确认 ATB 是否支持目标算子类型及对应参数结构 Step 2:实现 Ascend 算子类 --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - 在 Ascend 后端中定义算子类 - 实现统一的算子生命周期:reshape → setup → forward - 在 forward 阶段调用 ATB 单算子完成执行 Step 3:注册算子并接入调度链路 --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - 将新算子注册到 AscendBackend并完成相关适配 Step 4:测试与验证 --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^ - 构建最小示例在 Ascend 设备上运行新算子 - 与 CPU 参考结果对比,验证计算正确性 diff --git a/docs/ascend_backend/index.rst b/docs/ascend_backend/index.rst new file mode 100644 index 000000000..fa97f15a1 --- /dev/null +++ b/docs/ascend_backend/index.rst @@ -0,0 +1,9 @@ +Ascend Backend +==================== + +.. toctree:: + :maxdepth: 2 + + core_design + + diff --git a/docs/index.rst b/docs/index.rst index 8cdcc187f..1f06ef487 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -351,6 +351,11 @@ Documents cpu_backend/index +.. toctree:: + :maxdepth: 2 + + ascend_backend/index + .. 
toctree:: :maxdepth: 2 From beeabedc6daad0908bd106b0c4bfd92e674ab75b Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 20 Dec 2025 15:55:20 +0800 Subject: [PATCH 12/16] feat(ascend): add a X2X op for transmitting tensor from cpu to npu or from npu to cpu --- mllm/backends/ascend/ops/AscendX2XOp.cpp | 94 ++++++++++++++++++++++++ mllm/backends/ascend/ops/AscendX2XOp.hpp | 27 +++++++ 2 files changed, 121 insertions(+) create mode 100644 mllm/backends/ascend/ops/AscendX2XOp.cpp create mode 100644 mllm/backends/ascend/ops/AscendX2XOp.hpp diff --git a/mllm/backends/ascend/ops/AscendX2XOp.cpp b/mllm/backends/ascend/ops/AscendX2XOp.cpp new file mode 100644 index 000000000..daa875a60 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendX2XOp.cpp @@ -0,0 +1,94 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/ascend/ops/AscendX2XOp.hpp" + +#include +#include "mllm/utils/Common.hpp" +#include "mllm/core/DataTypes.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/core/DeviceTypes.hpp" +#include "mllm/backends/ascend/AscendCommon.hpp" + +namespace mllm::ascend { + +AscendX2XOp::AscendX2XOp(const aops::X2XOpOptions& options) : aops::X2XOp(options) {} + +void AscendX2XOp::forward(const std::vector& inputs, std::vector& outputs) { + MLLM_RT_ASSERT_EQ(inputs.size(), 1); + MLLM_RT_ASSERT_EQ(outputs.size(), 1); + + const auto& input = inputs[0]; + auto& output = outputs[0]; + + const DeviceTypes input_device = input.device(); + const DeviceTypes output_device = output.device(); + + // Case 1: CPU -> Ascend + if (input_device == kCPU && output_device == kAscend) { + const size_t data_size = input.bytes(); + const void* src_data = input.ptr(); + void* dst_data = output.ptr(); + + // Copy data from CPU to Ascend device + auto ret = aclrtMemcpy( + dst_data, data_size, + src_data, data_size, + ACL_MEMCPY_HOST_TO_DEVICE); + + if (ret != ACL_SUCCESS) { + MLLM_ACL_CHECK(ret); + } + + syncGlobalAtbStream(); + return; + } + + // Case 2: Ascend 
-> CPU + if (input_device == kAscend && output_device == kCPU) { + const size_t data_size = input.bytes(); + const void* src_data = input.ptr(); + void* dst_data = output.ptr(); + + // Copy data from Ascend device to CPU + auto ret = aclrtMemcpy( + dst_data, data_size, + src_data, data_size, + ACL_MEMCPY_DEVICE_TO_HOST); + + if (ret != ACL_SUCCESS) { + MLLM_ACL_CHECK(ret); + } + + syncGlobalAtbStream(); + return; + } + + // Case 3: Ascend -> Ascend (same device, just copy pointer or do memcpy) + if (input_device == kAscend && output_device == kAscend) { + const size_t data_size = input.bytes(); + const void* src_data = input.ptr(); + void* dst_data = output.ptr(); + + if (src_data != dst_data) { + auto ret = aclrtMemcpy( + dst_data, data_size, + src_data, data_size, + ACL_MEMCPY_DEVICE_TO_DEVICE); + + if (ret != ACL_SUCCESS) { + MLLM_ACL_CHECK(ret); + } + + syncGlobalAtbStream(); + } + return; + } + + MLLM_ERROR("AscendX2XOp only supports transform between CPU and Ascend devices. " + "Input device: {}, Output device: {}", + static_cast(input_device), static_cast(output_device)); +} + +} // namespace mllm::ascend + diff --git a/mllm/backends/ascend/ops/AscendX2XOp.hpp b/mllm/backends/ascend/ops/AscendX2XOp.hpp new file mode 100644 index 000000000..bfe9364a3 --- /dev/null +++ b/mllm/backends/ascend/ops/AscendX2XOp.hpp @@ -0,0 +1,27 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/X2XOp.hpp" +#include "mllm/core/OpTypes.hpp" + +namespace mllm::ascend { + +class AscendX2XOp final : public aops::X2XOp { + public: + explicit AscendX2XOp(const aops::X2XOpOptions& options); + + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class AscendX2XOpFactory final : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::X2XOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::ascend + From add14d177d916cf57469d8aa5f809e783fd6e45d Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 20 Dec 2025 15:59:39 +0800 Subject: [PATCH 13/16] fix(ascend): create the test part --- mllm/backends/ascend/AscendBackend.cpp | 2 + mllm/backends/ascend/AscendCommon.cpp | 19 +++++----- tests/CMakeLists.txt | 3 ++ tests/ascend/AscendKernelTest.hpp | 52 ++++++++++++++++++++++++++ tests/ascend/CMakeLists.txt | 26 +++++++++++++ tests/ascend/KernelTest.cpp | 45 ++++++++++++++++++++++ tests/ascend/KernelTestHelper.hpp | 18 +++++++++ 7 files changed, 155 insertions(+), 10 deletions(-) create mode 100644 tests/ascend/AscendKernelTest.hpp create mode 100644 tests/ascend/CMakeLists.txt create mode 100644 tests/ascend/KernelTest.cpp create mode 100644 tests/ascend/KernelTestHelper.hpp diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp index 408cb2518..5ec76413a 100644 --- a/mllm/backends/ascend/AscendBackend.cpp +++ b/mllm/backends/ascend/AscendBackend.cpp @@ -7,11 +7,13 @@ #include "mllm/core/DeviceTypes.hpp" #include "mllm/backends/ascend/ops/AscendElewiseOps.hpp" +#include "mllm/backends/ascend/ops/AscendX2XOp.hpp" namespace mllm::ascend { AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) { regOpFactory(); + regOpFactory(); auto& devices = AscendDeviceMetaInfo::instance().devices; for (const auto& device : devices) { const auto bytes_to_mb = 
[](size_t bytes) { return bytes / (1024.0 * 1024.0); }; diff --git a/mllm/backends/ascend/AscendCommon.cpp b/mllm/backends/ascend/AscendCommon.cpp index 98d4d82f0..140a5a31e 100644 --- a/mllm/backends/ascend/AscendCommon.cpp +++ b/mllm/backends/ascend/AscendCommon.cpp @@ -81,21 +81,20 @@ AscendTensorHandle prepareAscendTensor(const std::vector& host_data, } std::vector copyAscendTensorToHost(const Tensor& t) { + // Current implementation assumes FP16 tensor on Ascend. MLLM_RT_ASSERT(t.dtype() == kFloat16); - syncGlobalAtbStream(); - const size_t elem_cnt = t.numel(); - std::vector device_fp16(elem_cnt); - - auto ret = aclrtMemcpy( - device_fp16.data(), elem_cnt * sizeof(half_float::half), - t.ptr(), t.bytes(), - ACL_MEMCPY_DEVICE_TO_HOST); - MLLM_ACL_CHECK(ret); + // Use generic .to(kCPU) + CPU-side cast instead of raw aclrtMemcpy. + // This goes through the X2X op we implemented for Ascend, keeping + // all device transfer logic in one place. + auto cpu_tensor = const_cast(t).to(::mllm::kCPU); + const size_t elem_cnt = cpu_tensor.numel(); std::vector host(elem_cnt); + + auto* src = cpu_tensor.ptr(); for (size_t i = 0; i < elem_cnt; ++i) { - host[i] = static_cast(device_fp16[i]); + host[i] = static_cast(src[i]); } return host; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a83123e25..89eaeb49c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,3 +11,6 @@ add_subdirectory(cpu) if(MLLM_BUILD_CUDA_BACKEND) add_subdirectory(cuda) endif() +if(DEFINED ENV{ASCEND_HOME_PATH}) + add_subdirectory(ascend) +endif() diff --git a/tests/ascend/AscendKernelTest.hpp b/tests/ascend/AscendKernelTest.hpp new file mode 100644 index 000000000..138ee5ae8 --- /dev/null +++ b/tests/ascend/AscendKernelTest.hpp @@ -0,0 +1,52 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/mllm.hpp" +#include "mllm/core/Tensor.hpp" +#include "KernelTestHelper.hpp" + +#include + +class AscendKernelTest : public KernelTest { + public: + AscendKernelTest() = default; + ~AscendKernelTest() override = default; + + // Test Add operation with different shapes + bool AddFloat16Test(const std::vector& shapes) { + using namespace mllm; // NOLINT + for (auto& shape : shapes) { + // 1. Construct random FP16 inputs on CPU + Tensor x_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + Tensor y_cpu = Tensor::random(shape, -3, 3, kFloat16, kCPU); + + // 2. Compute reference result (FP16) on CPU + Tensor ref_cpu = Tensor::zeros(shape, kFloat16, kCPU); + { + auto* x_ptr = x_cpu.ptr(); + auto* y_ptr = y_cpu.ptr(); + auto* r_ptr = ref_cpu.ptr(); + auto num_elements = x_cpu.numel(); + for (size_t i = 0; i < num_elements; ++i) { + r_ptr[i] = x_ptr[i] + y_ptr[i]; + } + } + + // 3. Move inputs to Ascend and run Add (z = x + y) + auto x_ascend = x_cpu.to(kAscend); + auto y_ascend = y_cpu.to(kAscend); + auto z_ascend = x_ascend + y_ascend; + + // 4. 
Move result back to CPU and compare with reference using allClose + auto z_cpu = z_ascend.to(kCPU); + auto result = mllm::test::allClose(z_cpu, ref_cpu, 1e-2f, 1e-2f); + if (!result.is_close) { + return false; + } + } + return true; + } +}; + diff --git a/tests/ascend/CMakeLists.txt b/tests/ascend/CMakeLists.txt new file mode 100644 index 000000000..0025fb40b --- /dev/null +++ b/tests/ascend/CMakeLists.txt @@ -0,0 +1,26 @@ +add_executable(Mllm-Test-AscendKernel KernelTest.cpp) + +if(DEFINED ENV{ASCEND_HOME_PATH}) + target_include_directories(Mllm-Test-AscendKernel PRIVATE $ENV{ASCEND_HOME_PATH}/include) + target_link_directories(Mllm-Test-AscendKernel PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) +endif() + +target_link_libraries(Mllm-Test-AscendKernel PRIVATE + gtest_main + MllmRT + MllmCPUBackend + MllmAscendBackend + ascendcl +) + +target_include_directories(Mllm-Test-AscendKernel PRIVATE + ${CMAKE_SOURCE_DIR} +) + +set_target_properties(Mllm-Test-AscendKernel PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON +) + +include(GoogleTest) + diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp new file mode 100644 index 000000000..025c89994 --- /dev/null +++ b/tests/ascend/KernelTest.cpp @@ -0,0 +1,45 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include + +#include "mllm/mllm.hpp" + +/// Kernel tests +#include "AscendKernelTest.hpp" + +//===----------------------------------------------------------------------===// +// Element wise ADD. 
+// +// FP16 (Ascend currently uses FP16) +//===----------------------------------------------------------------------===// +TEST_F(AscendKernelTest, AddFloat16) { + EXPECT_EQ(AddFloat16Test({ + {2, 3}, + {1, 1}, + {4, 4}, + {8, 8}, + {16, 16}, + {32, 32}, + }), + true); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // Initialize Ascend backend + mllm::initAscendBackend(); + + // Initialize context + mllm::initializeContext(); + + auto ret = RUN_ALL_TESTS(); + + // Cleanup + mllm::memoryReport(); + mllm::shutdownContext(); + + return ret; +} + diff --git a/tests/ascend/KernelTestHelper.hpp b/tests/ascend/KernelTestHelper.hpp new file mode 100644 index 000000000..03a9f86f2 --- /dev/null +++ b/tests/ascend/KernelTestHelper.hpp @@ -0,0 +1,18 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. +#pragma once + +#include + +class KernelTest : public testing::Test { + public: + KernelTest() = default; + ~KernelTest() override = default; + + // If the constructor and destructor are not enough for setting up + // and cleaning up each test, you can define the following methods: + void SetUp() override {} + + void TearDown() override {} +}; + From 6b88b8d36b7db038954732a57b9c8f4c7c4e88b8 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 20 Dec 2025 16:12:27 +0800 Subject: [PATCH 14/16] fix(ascend): move to the test folder --- examples/CMakeLists.txt | 1 - examples/ascend_add_demo/CMakeLists.txt | 22 ------ examples/ascend_add_demo/main.cpp | 100 ------------------------ 3 files changed, 123 deletions(-) delete mode 100644 examples/ascend_add_demo/CMakeLists.txt delete mode 100644 examples/ascend_add_demo/main.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a143a7989..6c49cfb22 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,4 +20,3 @@ endif() if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE) add_subdirectory(qwen3_qnn_aot) endif() - diff --git 
a/examples/ascend_add_demo/CMakeLists.txt b/examples/ascend_add_demo/CMakeLists.txt deleted file mode 100644 index 15eaefd7c..000000000 --- a/examples/ascend_add_demo/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -add_executable(ascend_add_demo main.cpp) - -if(DEFINED ENV{ASCEND_HOME_PATH}) - target_include_directories(ascend_add_demo PRIVATE $ENV{ASCEND_HOME_PATH}/include) - target_link_directories(ascend_add_demo PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) -endif() - -target_link_libraries(ascend_add_demo PRIVATE - MllmRT - MllmAscendBackend - ascendcl # 添加 ACL 库,因为 main.cpp 中直接使用了 aclrtMemcpy -) - -set_target_properties(ascend_add_demo PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON -) - -target_include_directories(ascend_add_demo PRIVATE - ${CMAKE_SOURCE_DIR} -) - diff --git a/examples/ascend_add_demo/main.cpp b/examples/ascend_add_demo/main.cpp deleted file mode 100644 index 216591b2d..000000000 --- a/examples/ascend_add_demo/main.cpp +++ /dev/null @@ -1,100 +0,0 @@ -#include -#include -#include -#include -#include "mllm/mllm.hpp" -#include "mllm/backends/ascend/AscendCommon.hpp" -#include "mllm/core/Tensor.hpp" -#include "mllm/engine/Context.hpp" -#include "mllm/core/aops/ElewiseOps.hpp" -#include "mllm/core/OpTypes.hpp" - -using namespace mllm; - -int main() { - std::cout << "=== Ascend Add Op Demo ===" << std::endl; - - try { - std::cout << "1. Initializing Ascend backend..." << std::endl; - initAscendBackend(); - std::cout << " ✓ Ascend backend initialized\n" << std::endl; - - std::cout << "2. Preparing test data..." 
<< std::endl; - const int batch = 2; - const int size = 3; - std::vector data_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - std::vector data_y = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f}; - std::vector expected = {11.0f, 22.0f, 33.0f, 44.0f, 55.0f, 66.0f}; - - std::cout << " Input X: ["; - for (size_t i = 0; i < data_x.size(); ++i) { - std::cout << data_x[i]; - if (i < data_x.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - std::cout << " Input Y: ["; - for (size_t i = 0; i < data_y.size(); ++i) { - std::cout << data_y[i]; - if (i < data_y.size() - 1) std::cout << ", "; - } - std::cout << "]\n" << std::endl; - - std::cout << "3. Preparing tensors on Ascend..." << std::endl; - auto x_handle = ascend::prepareAscendTensor(data_x, batch, size); - auto y_handle = ascend::prepareAscendTensor(data_y, batch, size); - auto& x_ascend = x_handle.tensor(); - auto& y_ascend = y_handle.tensor(); - std::cout << " ✓ Tensors ready on Ascend device\n" << std::endl; - - std::cout << "4. Executing Add operation on Ascend..." << std::endl; - auto& ctx = Context::instance(); - std::cout << "context over" < actual; - bool correct = ascend::verifyAscendTensor( - z_ascend, - expected, - /*atol=*/1e-2f, - /*rtol=*/1e-2f, - /*verbose=*/true, - &actual); - - std::cout << " Actual result: ["; - for (size_t i = 0; i < actual.size(); ++i) { - std::cout << actual[i]; - if (i < actual.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - std::cout << " Expected result: ["; - for (size_t i = 0; i < expected.size(); ++i) { - std::cout << expected[i]; - if (i < expected.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - - if (correct) { - std::cout << "\n✓✓✓ Test PASSED! All values match expected results. ✓✓✓" << std::endl; - } else { - std::cout << "\n✗✗✗ Test FAILED! Results don't match expected values. ✗✗✗" << std::endl; - } - - x_handle.release(); - y_handle.release(); - - return correct ? 
0 : 1; - - } catch (const std::exception& e) { - std::cerr << "\n✗ Error: " << e.what() << std::endl; - return 1; - } -} - From 2d25eda619121baf475f67db7f49cdbda7d7db2e Mon Sep 17 00:00:00 2001 From: Chenghua <68260701+chenghuaWang@users.noreply.github.com> Date: Tue, 23 Dec 2025 13:56:44 +0800 Subject: [PATCH 15/16] Update build_arm_ascend.yaml --- tasks/build_arm_ascend.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/build_arm_ascend.yaml b/tasks/build_arm_ascend.yaml index 4546f2d81..17ffd3f10 100644 --- a/tasks/build_arm_ascend.yaml +++ b/tasks/build_arm_ascend.yaml @@ -1,7 +1,7 @@ Tasks: - CMakeConfigTask: cmake_cfg_path: "build-arm-ascend" - cmake_build_type: "Release" + cmake_build_type: "RelWithDebInfo" cmake_extra_args: - "-DMLLM_CROSS_COMPILE=ON" - "-DMLLM_BUILD_ARM_BACKEND=ON" @@ -9,7 +9,7 @@ Tasks: - "-DANDROID_PLATFORM=android-28" - "-DANDROID_ABI=arm64-v8a" - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=armv8.2-a+fp16+fp16fml+dotprod+i8mm;-ffast-math;-Wno-nan-infinity-disabled"' - - "-DCMAKE_INSTALL_PREFIX=/root/mllm-install-android-arm64-v8a" + - "-DCMAKE_INSTALL_PREFIX=./mllm-install-android-arm64-v8a" - "-DMLLM_KERNEL_USE_THREADS=ON" - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=ON" - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF" From 2fa8eed23877644f62dfd69ea847e3f482529828 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 23 Dec 2025 16:56:02 +0800 Subject: [PATCH 16/16] fix(ascend): address review comments --- CMakeLists.txt | 4 ++++ mllm/backends/ascend/CMakeLists.txt | 13 +++++-------- .../{AscnedElewiseOps.cpp => AscendElewiseOps.cpp} | 12 ------------ mllm/backends/cpu/kernels/arm/rmsnorm.cpp | 4 ++-- mllm/backends/cpu/kernels/arm/softmax.cpp | 8 ++++---- tests/CMakeLists.txt | 2 +- tests/ascend/CMakeLists.txt | 1 - tests/ascend/KernelTest.cpp | 6 +++--- 8 files changed, 19 insertions(+), 31 deletions(-) rename mllm/backends/ascend/ops/{AscnedElewiseOps.cpp => AscendElewiseOps.cpp} (89%) diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 221e956d5..7d167435e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,10 @@ option(MLLM_CPU_BACKEND_USE_SME2 "Enable SME2" OFF) # Ascend Backend: Options option(MLLM_ASCEND_CPU_DEBUG_MODE "Enable CPU Debug mode in ascend" OFF) +# run mode and SOC version for Ascend backend +set(MLLM_RUN_MODE "npu" CACHE STRING "Run mode for mLLM backends: cpu/sim/npu") +set(MLLM_SOC_VERSION "Ascend310B1" CACHE STRING "SOC version for Ascend backend") + # Threads option(MLLM_KERNEL_USE_THREADS "Enable Threads" ON) option(MLLM_KERNEL_USE_THREADS_VENDOR_MLLM "Enable mllm's thread pool" ON) diff --git a/mllm/backends/ascend/CMakeLists.txt b/mllm/backends/ascend/CMakeLists.txt index 6386948fd..bb0feac46 100644 --- a/mllm/backends/ascend/CMakeLists.txt +++ b/mllm/backends/ascend/CMakeLists.txt @@ -1,11 +1,8 @@ -set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") -set(SOC_VERSION "Ascend310B1" CACHE STRING "system on chip type") - if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) endif() if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) - set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) endif() message(STATUS "SOC_VERSION is ${SOC_VERSION}, RUN_MODE is ${RUN_MODE}") @@ -29,20 +26,20 @@ add_library(MllmAscendBackend SHARED ${MLLM_ASCEND_SOURCES}) if(DEFINED ENV{ASCEND_HOME_PATH}) target_include_directories(MllmAscendBackend PUBLIC $ENV{ASCEND_HOME_PATH}/include) - target_link_directories(MllmAscendBackend PRIVATE $ENV{ASCEND_HOME_PATH}/lib64) + target_link_directories(MllmAscendBackend PUBLIC $ENV{ASCEND_HOME_PATH}/lib64) else() message(WARNING "ASCEND_HOME_PATH is not set, Ascend headers and libs may not be found") endif() if(DEFINED ENV{ATB_HOME_PATH}) 
target_include_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/include) - target_link_directories(MllmAscendBackend PRIVATE $ENV{ATB_HOME_PATH}/lib) + target_link_directories(MllmAscendBackend PUBLIC $ENV{ATB_HOME_PATH}/lib) else() message(WARNING "ATB_HOME_PATH not defined, ATB library will not be linked") endif() -target_link_libraries(MllmAscendBackend PRIVATE +target_link_libraries(MllmAscendBackend PUBLIC ascendcl opapi nnopbase diff --git a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp b/mllm/backends/ascend/ops/AscendElewiseOps.cpp similarity index 89% rename from mllm/backends/ascend/ops/AscnedElewiseOps.cpp rename to mllm/backends/ascend/ops/AscendElewiseOps.cpp index 4972e0cb5..762ef1dfe 100644 --- a/mllm/backends/ascend/ops/AscnedElewiseOps.cpp +++ b/mllm/backends/ascend/ops/AscendElewiseOps.cpp @@ -20,18 +20,6 @@ namespace mllm::ascend { AscendAddOp::AscendAddOp(const aops::AddOpOptions& options) : aops::AddOp(options) {} void AscendAddOp::setup(const std::vector& inputs, std::vector& outputs) { - // for (auto& t : outputs) { - // if (!t.isNil()) { - // auto& mem_mgr = getAscendMemoryManager(); - // int block_id = -1; - // void* device_ptr = nullptr; - - // mem_mgr.allocateBlock(static_cast(t.bytes()), block_id); - // mem_mgr.getBlockPtr(block_id, device_ptr); - - // t.impl()->storage()->ptr_ = device_ptr; - // } - // } BaseOp::setup(inputs, outputs); } diff --git a/mllm/backends/cpu/kernels/arm/rmsnorm.cpp b/mllm/backends/cpu/kernels/arm/rmsnorm.cpp index 564bdd301..2c0f5e853 100644 --- a/mllm/backends/cpu/kernels/arm/rmsnorm.cpp +++ b/mllm/backends/cpu/kernels/arm/rmsnorm.cpp @@ -17,7 +17,7 @@ void rmsnorm_fp32(const mllm_fp32_t* __restrict X, const mllm_fp32_t* __restrict auto w_ptr = W; // pass 1 - const float rms = 1.f / std::sqrtf(vsquare_mean_fp32(x_ptr, D) + epsilon); + const float rms = 1.f / std::sqrt(vsquare_mean_fp32(x_ptr, D) + epsilon); // pass 2 if (add_unit_offset) { @@ -106,7 +106,7 @@ void rmsnorm_fp16(const mllm_fp16_t* 
__restrict X, const mllm_fp16_t* __restrict // pass 1: compute RMS scaling factor float mean_square = vsquare_mean_fp16(x_ptr, D); - const float rms_float = 1.f / std::sqrtf(mean_square + epsilon); + const float rms_float = 1.f / std::sqrt(mean_square + epsilon); float16_t rms_fp16 = static_cast(rms_float); float16x8_t rms_vec = vdupq_n_f16(rms_fp16); diff --git a/mllm/backends/cpu/kernels/arm/softmax.cpp b/mllm/backends/cpu/kernels/arm/softmax.cpp index d8423b943..dd9f39d56 100644 --- a/mllm/backends/cpu/kernels/arm/softmax.cpp +++ b/mllm/backends/cpu/kernels/arm/softmax.cpp @@ -25,7 +25,7 @@ void softmax_v1_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y, // Pass 2: minus max_value and calculate exp float sum = 0.f; for (int i = 0; i < len; ++i) { - auto tmp = std::expf(X[i * stride] - max_value); + auto tmp = std::exp(X[i * stride] - max_value); Y[i * stride] = tmp; sum += tmp; } @@ -112,7 +112,7 @@ void softmax_v1_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y, } float sum_value = vaddvq_f32(sum_vec_0); for (; i < len; ++i) { - float tmp = std::expf(X[i] - max_value); + float tmp = std::exp(X[i] - max_value); Y[i] = tmp; sum_value += tmp; } @@ -163,7 +163,7 @@ void softmax_v1_fp16(const mllm_fp16_t* __restrict X, mllm_fp16_t* __restrict Y, // Pass 2: minus max_value and calculate exp float sum = 0.f; for (int i = 0; i < len; ++i) { - auto tmp = std::expf(X[i * stride] - max_value); + auto tmp = std::exp(X[i * stride] - max_value); Y[i * stride] = static_cast(tmp); sum += tmp; } @@ -229,7 +229,7 @@ void softmax_v1_fp16(const mllm_fp16_t* __restrict X, mllm_fp16_t* __restrict Y, sum_vec_0 = vaddq_f32(sum_vec_0, sum_vec_2); float sum_value = vaddvq_f32(sum_vec_0); for (; i < len; ++i) { - float tmp = std::expf(X[i] - max_value); + float tmp = std::exp(X[i] - max_value); Y[i] = static_cast(tmp); sum_value += tmp; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 89eaeb49c..7e4e3642c 100644 --- a/tests/CMakeLists.txt 
+++ b/tests/CMakeLists.txt @@ -11,6 +11,6 @@ add_subdirectory(cpu) if(MLLM_BUILD_CUDA_BACKEND) add_subdirectory(cuda) endif() -if(DEFINED ENV{ASCEND_HOME_PATH}) +if(MLLM_BUILD_ASCEND_BACKEND) add_subdirectory(ascend) endif() diff --git a/tests/ascend/CMakeLists.txt b/tests/ascend/CMakeLists.txt index 0025fb40b..24944bf0b 100644 --- a/tests/ascend/CMakeLists.txt +++ b/tests/ascend/CMakeLists.txt @@ -10,7 +10,6 @@ target_link_libraries(Mllm-Test-AscendKernel PRIVATE MllmRT MllmCPUBackend MllmAscendBackend - ascendcl ) target_include_directories(Mllm-Test-AscendKernel PRIVATE diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp index 025c89994..b0489f545 100644 --- a/tests/ascend/KernelTest.cpp +++ b/tests/ascend/KernelTest.cpp @@ -28,11 +28,11 @@ TEST_F(AscendKernelTest, AddFloat16) { int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); - // Initialize Ascend backend - mllm::initAscendBackend(); - // Initialize context mllm::initializeContext(); + + // Initialize Ascend backend + mllm::initAscendBackend(); auto ret = RUN_ALL_TESTS();