From 289bf043f5ac3bd2f4376e7853e417503fab3242 Mon Sep 17 00:00:00 2001 From: HayzelHan Date: Thu, 4 Dec 2025 05:49:14 +0000 Subject: [PATCH 1/8] fix(x86): update qwen3 model runtime - Updated qwen3 model runtime using `modeling_qwen3_fa2.hpp` - Enabled qwen3 compilation options - Fixed rmsnorm and softmax operators - Fixed parameters for quantize - Added `NYI` messages for some unsupported ops --- examples/CMakeLists.txt | 4 ++-- examples/qwen3/main.cpp | 2 +- .../kernels/common/ggml/quantize/quantize.hpp | 2 +- mllm/backends/cpu/kernels/x86/rmsnorm.cpp | 12 +++++----- mllm/backends/cpu/kernels/x86/softmax.cpp | 10 ++++---- mllm/backends/cpu/ops/LinearOp.cpp | 4 +++- mllm/backends/cpu/ops/TransposeOp.cpp | 24 ++++++++++++++----- mllm/models/qwen3/modeling_qwen3_fa2.hpp | 2 +- 8 files changed, 37 insertions(+), 23 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9e1887dfc..8b5578f62 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -5,8 +5,8 @@ #add_subdirectory(llama) add_subdirectory(minicpm_o) add_subdirectory(minicpm4) -#add_subdirectory(qwen3) -#add_subdirectory(qwen3_service) +add_subdirectory(qwen3) +add_subdirectory(qwen3_service) #add_subdirectory(deepseek_ocr) if(MLLM_BUILD_QNN_BACKEND) add_subdirectory(qwen_npu) diff --git a/examples/qwen3/main.cpp b/examples/qwen3/main.cpp index 6652b2d4d..acfc90092 100644 --- a/examples/qwen3/main.cpp +++ b/examples/qwen3/main.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include diff --git a/mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp b/mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp index 3bc417c57..f70b35f85 100644 --- a/mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp +++ b/mllm/backends/cpu/kernels/common/ggml/quantize/quantize.hpp @@ -111,7 +111,7 @@ namespace mllm::cpu { static float table_f32_f16[1 << 16]; static bool table_f32_f16_init = false; -inline static float lookup_fp16_to_fp32(uint16_t f) { +inline static float lookup_fp16_to_fp32(mllm_fp16_t f) { if (!table_f32_f16_init) { uint16_t ii; for (int i = 0; i < (1 << 16); ++i) { diff --git a/mllm/backends/cpu/kernels/x86/rmsnorm.cpp b/mllm/backends/cpu/kernels/x86/rmsnorm.cpp index 6af304331..b07e35fc9 100644 --- a/mllm/backends/cpu/kernels/x86/rmsnorm.cpp +++ b/mllm/backends/cpu/kernels/x86/rmsnorm.cpp @@ -27,22 +27,22 @@ void rmsnorm_fp32(const float* __restrict X, const float* __restrict W, float* _ const auto ones = hn::Set(d, 1.0f); int i = 0; for (; i + hn::Lanes(d) <= D; i += hn::Lanes(d)) { - auto x_val = hn::Load(d, x_ptr + i); - auto w_val = hn::Load(d, w_ptr + i); + auto x_val = hn::LoadU(d, x_ptr + i); + auto w_val = hn::LoadU(d, w_ptr + i); auto multiplier = hn::Add(w_val, ones); multiplier = hn::Mul(multiplier, rms_vec); auto result = hn::Mul(x_val, multiplier); - hn::Store(result, d, y_ptr + i); + hn::StoreU(result, d, y_ptr + i); } for (; i < D; ++i) { y_ptr[i] = x_ptr[i] * rms * (w_ptr[i] + 1.0f); } } else { int i = 0; for (; i + hn::Lanes(d) <= D; i += hn::Lanes(d)) { - auto x_val = hn::Load(d, x_ptr + i); - auto w_val = hn::Load(d, w_ptr + i); + auto x_val = hn::LoadU(d, x_ptr + i); + auto w_val = hn::LoadU(d, w_ptr + i); auto multiplier = hn::Mul(w_val, rms_vec); auto result = hn::Mul(x_val, multiplier); - hn::Store(result, d, y_ptr + i); + hn::StoreU(result, d, y_ptr + i); } for (; i < D; ++i) { y_ptr[i] = x_ptr[i] * rms * w_ptr[i]; } } diff --git a/mllm/backends/cpu/kernels/x86/softmax.cpp b/mllm/backends/cpu/kernels/x86/softmax.cpp index 
3ea229e54..57a5f92a2 100644
--- a/mllm/backends/cpu/kernels/x86/softmax.cpp
+++ b/mllm/backends/cpu/kernels/x86/softmax.cpp
@@ -35,7 +35,7 @@ void softmax_v1_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y,
   int i = 0;
   V max_vec = hn::Set(d, std::numeric_limits<float>::lowest());
   for (; i + hn::Lanes(d) <= len; i += hn::Lanes(d)) {
-    const V x_vec = hn::Load(d, X + i);
+    const V x_vec = hn::LoadU(d, X + i);
     max_vec = hn::Max(max_vec, x_vec);
   }
   float max_value = hn::ReduceMax(d, max_vec);
@@ -44,10 +44,10 @@ void softmax_v1_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y,
   const V max_vec_broadcast = hn::Set(d, max_value);
   i = 0;
   for (; i + hn::Lanes(d) <= len; i += hn::Lanes(d)) {
-    const V x_vec = hn::Load(d, X + i);
+    const V x_vec = hn::LoadU(d, X + i);
     const V normalized = hn::Sub(x_vec, max_vec_broadcast);
     const V exp_vec = mllm::cpu::x86::vexpq_fast_f32(d, normalized);
-    hn::Store(exp_vec, d, Y + i);
+    hn::StoreU(exp_vec, d, Y + i);
     sum_vec = hn::Add(sum_vec, exp_vec);
   }
   float sum_value = hn::ReduceSum(d, sum_vec);
@@ -60,9 +60,9 @@ void softmax_v1_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y,
   const V inv_sum_vec = hn::Set(d, sum_value);
   i = 0;
   for (; i + hn::Lanes(d) <= len; i += hn::Lanes(d)) {
-    const V y_vec = hn::Load(d, Y + i);
+    const V y_vec = hn::LoadU(d, Y + i);
     const V result = hn::Mul(y_vec, inv_sum_vec);
-    hn::Store(result, d, Y + i);
+    hn::StoreU(result, d, Y + i);
   }
   for (; i < len; ++i) { Y[i] *= sum_value; }
 }
diff --git a/mllm/backends/cpu/ops/LinearOp.cpp b/mllm/backends/cpu/ops/LinearOp.cpp
index d24c23c9b..fdb7e574d 100644
--- a/mllm/backends/cpu/ops/LinearOp.cpp
+++ b/mllm/backends/cpu/ops/LinearOp.cpp
@@ -290,8 +290,10 @@ void CPULinearOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>
   MLLM_RT_ASSERT_EQ(weight_.dtype(), kFloat32);
   MLLM_RT_ASSERT_EQ(o.dtype(), kFloat32);
   if (bias_) { MLLM_RT_ASSERT_EQ(bias_.dtype(), kFloat32); }
+#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86)
+  NYI("LinearImplTypes not supported in x86 yet");
-#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
+#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
   if (batch_count == 1) {
     arm::mllm_blas_matmul_fp32(M, K, N, o.ptr(), input.ptr(), weight_.ptr(), options_.bias ?
bias_.ptr() : nullptr, false, true, options_.getThreads()); diff --git a/mllm/backends/cpu/ops/TransposeOp.cpp b/mllm/backends/cpu/ops/TransposeOp.cpp index 0dd63c72d..b5e1ae939 100644 --- a/mllm/backends/cpu/ops/TransposeOp.cpp +++ b/mllm/backends/cpu/ops/TransposeOp.cpp @@ -27,13 +27,17 @@ void CPUTransposeOp::forward(const std::vector& inputs, std::vector WH) fp32 not supported in x86"); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::transpose_hw_wh_fp32(input.ptr(), output.ptr(), input_shape[0], input_shape[1]); #endif break; } case kFloat16: { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + NYI("Transpose op(HW -> WH) fp16 not supported in x86"); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::transpose_hw_wh_fp16(input.ptr(), output.ptr(), input_shape[0], input_shape[1]); #endif break; @@ -52,14 +56,18 @@ void CPUTransposeOp::forward(const std::vector& inputs, std::vector BHSD) fp32 not supported in x86"); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::transpose_bshd_bhsd_fp32(input.ptr(), output.ptr(), input_shape[0], input_shape[1], input_shape[2], input_shape[3]); #endif break; } case kFloat16: { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + NYI("Transpose op(BSHD -> BHSD) fp16 not supported in x86"); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::transpose_bshd_bhsd_fp16(input.ptr(), output.ptr(), input_shape[0], input_shape[1], input_shape[2], input_shape[3]); #endif @@ -84,14 +92,18 @@ void CPUTransposeOp::forward(const std::vector& inputs, std::vector BSDH) fp32 not supported in x86"); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::transpose_last_dims_fp32(input.ptr(), output.ptr(), batch, input_shape[0], input_shape[1]); #endif break; } case kFloat16: { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + NYI("Transpose op(BSHD -> BSDH) fp16 not supported in x86"); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::transpose_last_dims_fp16(input.ptr(), output.ptr(), batch, input_shape[0], input_shape[1]); #endif diff --git a/mllm/models/qwen3/modeling_qwen3_fa2.hpp b/mllm/models/qwen3/modeling_qwen3_fa2.hpp index a309159f9..4b3004a99 100644 --- a/mllm/models/qwen3/modeling_qwen3_fa2.hpp +++ b/mllm/models/qwen3/modeling_qwen3_fa2.hpp @@ -262,7 +262,7 @@ class Qwen3ForCausalLM : public ARGeneration, public nn::Module { if (cfg.tie_word_embeddings) { // NOTE: // model.lm_head.weight is quantization weights of model.embed_tokens.weight - lm_head_ = reg("lm_head_out", cfg.hidden_size, cfg.vocab_size, false, cfg.linear_impl_type); + lm_head_ = reg("lm_head", cfg.hidden_size, cfg.vocab_size, false, cfg.linear_impl_type); } // Init inv freq From 00c7534445897c62d464d5d9dee6bc72d7e13ee6 Mon Sep 17 00:00:00 2001 From: HayzelHan Date: Fri, 5 Dec 2025 10:29:08 +0000 Subject: [PATCH 2/8] feat(x86): add support for elementwise operators Added support for common CPU elementwise operators (fp32) --- mllm/backends/cpu/kernels/Kernels.hpp | 1 + .../cpu/kernels/common/elewise-inl.hpp | 86 +++++++++++-- mllm/backends/cpu/ops/ElewiseOps.cpp | 119 +++++++++++++++--- 3 files changed, 178 insertions(+), 28 deletions(-) diff --git a/mllm/backends/cpu/kernels/Kernels.hpp 
b/mllm/backends/cpu/kernels/Kernels.hpp
index e99ffdd73..5f6cd2f53 100644
--- a/mllm/backends/cpu/kernels/Kernels.hpp
+++ b/mllm/backends/cpu/kernels/Kernels.hpp
@@ -37,6 +37,7 @@
 #else
 #include "mllm/backends/cpu/kernels/common/gelu-inl.hpp"    // IWYU pragma: export
 #include "mllm/backends/cpu/kernels/common/permute-inl.hpp" // IWYU pragma: export
+#include "mllm/backends/cpu/kernels/common/elewise-inl.hpp" // IWYU pragma: export
 #endif

 // Platform free Kernels.
diff --git a/mllm/backends/cpu/kernels/common/elewise-inl.hpp b/mllm/backends/cpu/kernels/common/elewise-inl.hpp
index a97113f8e..92ffc8474 100644
--- a/mllm/backends/cpu/kernels/common/elewise-inl.hpp
+++ b/mllm/backends/cpu/kernels/common/elewise-inl.hpp
@@ -65,30 +65,98 @@ struct DivOp {
 };

 template <typename T>
-HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_add(T* x, const T* y, size_t n) {
-  __elementwise(x, y, n, AddOp{});
+HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_add(T* out, const T* x, const T* y, size_t n) {
+  __elementwise(x, y, out, n, AddOp{});
 }

 template <typename T>
-HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_sub(T* x, const T* y, size_t n) {
-  __elementwise(x, y, n, SubOp{});
+HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_sub(T* out, const T* x, const T* y, size_t n) {
+  __elementwise(x, y, out, n, SubOp{});
 }

 template <typename T>
-HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_mul(T* x, const T* y, size_t n) {
-  __elementwise(x, y, n, MulOp{});
+HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_mul(T* out, const T* x, const T* y, size_t n) {
+  __elementwise(x, y, out, n, MulOp{});
 }

 template <typename T>
-HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_div(T* x, const T* y, size_t n) {
-  __elementwise(x, y, n, DivOp{});
+HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_div(T* out, const T* x, const T* y, size_t n) {
+  __elementwise(x, y, out, n, DivOp{});
 }

 //===----------------------------------------------------------------------===//
 // Elementwise + - * / By Const
 //===----------------------------------------------------------------------===//
-// TODO
+template <typename T, typename Op>
+HWY_INLINE void __elementwise_scalar(T* HWY_RESTRICT out, const T* HWY_RESTRICT x, const T* HWY_RESTRICT y_ptr, size_t count, Op&& op) {
+  const hn::ScalableTag<T> d;
+  const size_t N = hn::Lanes(d);
+  size_t idx = 0;
+
+  const T scalar = *y_ptr;
+  const hn::Vec<decltype(d)> sVec = hn::Set(d, scalar);
+
+  for (; idx + N <= count; idx += N) {
+    const hn::Vec<decltype(d)> vx = hn::LoadU(d, x + idx);
+    const hn::Vec<decltype(d)> result = op(d, vx, sVec);
+    hn::StoreU(result, d, out + idx);
+  }
+
+  if (idx < count) {
+    const hn::Vec<decltype(d)> vx = hn::LoadN(d, x + idx, count - idx);
+    const hn::Vec<decltype(d)> result = op(d, vx, sVec);
+    hn::StoreN(result, d, out + idx, count - idx);
+  }
+}
+
+struct AddScalarOp {
+  template <typename D, typename V>
+  HWY_INLINE V operator()(D d, V a, V b) const {
+    return hn::Add(a, b);
+  }
+};
+
+struct SubScalarOp {
+  template <typename D, typename V>
+  HWY_INLINE V operator()(D d, V a, V b) const {
+    return hn::Sub(a, b);
+  }
+};
+
+struct MulScalarOp {
+  template <typename D, typename V>
+  HWY_INLINE V operator()(D d, V a, V b) const {
+    return hn::Mul(a, b);
+  }
+};
+
+struct DivScalarOp {
+  template <typename D, typename V>
+  HWY_INLINE V operator()(D d, V a, V b) const {
+    return hn::Div(a, b);
+  }
+};
+
+template <typename T>
+HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_add_scalar(T* out, const T* x, const T* y, size_t n) {
+  __elementwise_scalar(out, x, y, n, AddScalarOp{});
+}
+
+template <typename T>
+HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_sub_scalar(T* out, const T* x, const T* y, size_t n) {
+  __elementwise_scalar(out, x, y, n, SubScalarOp{});
+}
+
+template <typename T>
+HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_mul_scalar(T*
out, const T* x, const T* y, size_t n) { + __elementwise_scalar(out, x, y, n, MulScalarOp{}); +} + +template +HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_div_scalar(T* out, const T* x, const T* y, size_t n) { + __elementwise_scalar(out, x, y, n, DivScalarOp{}); +} //===----------------------------------------------------------------------===// // Inplace Elementwise + - * / diff --git a/mllm/backends/cpu/ops/ElewiseOps.cpp b/mllm/backends/cpu/ops/ElewiseOps.cpp index 1393c9408..3feb671cd 100644 --- a/mllm/backends/cpu/ops/ElewiseOps.cpp +++ b/mllm/backends/cpu/ops/ElewiseOps.cpp @@ -140,12 +140,18 @@ void CPUAddOp::forward(const std::vector& inputs, std::vector& o switch (dtype) { case kFloat32: { if (input0.numel() == input1.numel()) { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + cpu::common::HWY_NAMESPACE::element_wise_add(output.ptr(), input0.ptr(), input1.ptr(), + output.numel()); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_add_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), options_.getThreads()); #endif } else if (input1.numel() == 1) { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + cpu::common::HWY_NAMESPACE::element_wise_add_scalar(output.ptr(), input0.ptr(), input1.ptr(), + output.numel()); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_add_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), output.numel(), options_.getThreads()); #endif @@ -153,8 +159,20 @@ void CPUAddOp::forward(const std::vector& inputs, std::vector& o const float* a = input0.ptr(); const float* b = input1.ptr(); float* out = output.ptr(); +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + // Process each batch separately + for (int batch = 0; batch < batch_dims; ++batch) { + // Each batch processes broadcast_naive_loops iterations of vector_size elements + for (int l = 0; l < broadcast_naive_loops; ++l) { + int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) + cpu::common::HWY_NAMESPACE::element_wise_add(out + out_offset, a + a_offset, b + b_offset, vector_size); + } + } + +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) // Process each batch separately for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements @@ -257,7 +275,7 @@ void CPUAddOp::forward(const std::vector& inputs, std::vector& o const float* a = input0.ptr(); const mllm_complex_fp32_t* b = input1.ptr(); mllm_complex_fp32_t* out = output.ptr(); - + #if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) // Process each batch separately for (int batch = 0; batch < batch_dims; ++batch) { @@ -299,12 +317,18 @@ void CPUSubOp::forward(const std::vector& inputs, std::vector& o switch (dtype) { case kFloat32: { if (input0.numel() == input1.numel()) { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + cpu::common::HWY_NAMESPACE::element_wise_sub(output.ptr(), input0.ptr(), input1.ptr(), + output.numel()); +#elif 
defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_sub_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), options_.getThreads()); #endif } else if (input1.numel() == 1) { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + cpu::common::HWY_NAMESPACE::element_wise_sub_scalar(output.ptr(), input0.ptr(), input1.ptr(), + output.numel()); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_sub_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), output.numel(), options_.getThreads()); #endif @@ -312,8 +336,19 @@ void CPUSubOp::forward(const std::vector& inputs, std::vector& o const float* a = input0.ptr(); const float* b = input1.ptr(); float* out = output.ptr(); +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + // Process each batch separately + for (int batch = 0; batch < batch_dims; ++batch) { + // Each batch processes broadcast_naive_loops iterations of vector_size elements + for (int l = 0; l < broadcast_naive_loops; ++l) { + int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) + cpu::common::HWY_NAMESPACE::element_wise_sub(out + out_offset, a + a_offset, b + b_offset, vector_size); + } + } +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) // Process each batch separately for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements @@ -458,12 +493,18 @@ void CPUMulOp::forward(const std::vector& inputs, std::vector& o switch (dtype) { case kFloat32: { if (input0.numel() == input1.numel()) { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + cpu::common::HWY_NAMESPACE::element_wise_mul(output.ptr(), input0.ptr(), input1.ptr(), + output.numel()); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_mul_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), options_.getThreads()); #endif } else if (input1.numel() == 1) { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + cpu::common::HWY_NAMESPACE::element_wise_mul_scalar(output.ptr(), input0.ptr(), input1.ptr(), + output.numel()); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_mul_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), output.numel(), options_.getThreads()); #endif @@ -471,8 +512,19 @@ void CPUMulOp::forward(const std::vector& inputs, std::vector& o const float* a = input0.ptr(); const float* b = input1.ptr(); float* out = output.ptr(); +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + // Process each batch separately + for (int batch = 0; batch < batch_dims; ++batch) { + // Each batch processes broadcast_naive_loops iterations of vector_size elements + for (int l = 0; l < broadcast_naive_loops; ++l) { + int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; -#if 
defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) + cpu::common::HWY_NAMESPACE::element_wise_mul(out + out_offset, a + a_offset, b + b_offset, vector_size); + } + } +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) // Process each batch separately for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements @@ -617,12 +669,18 @@ void CPUDivOp::forward(const std::vector& inputs, std::vector& o switch (dtype) { case kFloat32: { if (input0.numel() == input1.numel()) { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + cpu::common::HWY_NAMESPACE::element_wise_div(output.ptr(), input0.ptr(), input1.ptr(), + output.numel()); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_div_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), options_.getThreads()); #endif } else if (input1.numel() == 1) { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + cpu::common::HWY_NAMESPACE::element_wise_div_scalar(output.ptr(), input0.ptr(), input1.ptr(), + output.numel()); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_div_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), output.numel(), options_.getThreads()); #endif @@ -630,8 +688,19 @@ void CPUDivOp::forward(const std::vector& inputs, std::vector& o const float* a = input0.ptr(); const float* b = input1.ptr(); float* out = output.ptr(); +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + // Process each batch separately + for (int batch = 0; batch < batch_dims; ++batch) { + // Each batch processes broadcast_naive_loops iterations of vector_size elements + for (int l = 0; l < broadcast_naive_loops; ++l) { + int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) + cpu::common::HWY_NAMESPACE::element_wise_div(out + out_offset, a + a_offset, b + b_offset, vector_size); + } + } +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) // Process each batch separately for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements @@ -775,7 +844,9 @@ void CPUAbsOp::forward(const std::vector& inputs, std::vector& o switch (dtype) { case kFloat32: { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + NYI("AbsOp not supported in x86."); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_abs_fp32(output.ptr(), input.ptr(), output.numel(), options_.getThreads()); #endif break; @@ -841,7 +912,9 @@ void CPULogOp::forward(const std::vector& inputs, std::vector& o switch (dtype) { case kFloat32: { -#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + NYI("LogOp not supported in x86."); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_log_fp32(output.ptr(), input.ptr(), output.numel(), options_.getThreads()); #endif break; @@ -869,7 +942,9 @@ void CPUExpOp::forward(const 
std::vector<Tensor>& inputs, std::vector<Tensor>& o
   auto dtype = output.dtype();
   switch (dtype) {
     case kFloat32: {
-#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
+#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86)
+      NYI("ExpOp not supported in x86.");
+#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
       cpu::arm::ew_exp_fp32(output.ptr(), input.ptr(), output.numel(), options_.getThreads());
 #endif
       break;
@@ -894,7 +969,9 @@ void CPUClipOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>&
   switch (dtype) {
     case kFloat32: {
-#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
+#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86)
+      NYI("ClipOp not supported in x86.");
+#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
       cpu::arm::clip_fp32(output.ptr(), input.ptr(), static_cast<float>(options_.min_val),
                           static_cast<float>(options_.max_val), output.numel(), options_.getThreads());
 #endif
@@ -948,7 +1025,9 @@ void CPUSinOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& o
   auto dtype = output.dtype();
   switch (dtype) {
     case kFloat32: {
-#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
+#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86)
+      NYI("SinOp not supported in x86.");
+#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
       cpu::arm::ew_sin_fp32(output.ptr(), input.ptr(), output.numel(), options_.getThreads());
 #endif
       break;
@@ -972,7 +1051,9 @@ void CPUCosOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& o
   auto dtype = output.dtype();
   switch (dtype) {
     case kFloat32: {
-#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
+#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86)
+      NYI("CosOp not supported in x86.");
+#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
       cpu::arm::ew_cos_fp32(output.ptr(), input.ptr(), output.numel(), options_.getThreads());
 #endif
       break;

From 3078f5a9c97fd46ece6644f45186949da77e0686 Mon Sep 17 00:00:00 2001
From: HayzelHan
Date: Wed, 10 Dec 2025 03:08:45 +0000
Subject: [PATCH 3/8] refactor(ops): adopt Highway dynamic dispatch for
 elementwise operators

- Replace direct cpu::common::HWY_NAMESPACE calls with Highway's dynamic
  dispatcher
- Create a general kernel_dispatch.hpp/.cpp following the HWY_DYNAMIC_DISPATCH
  pattern
---
 mllm/backends/cpu/kernels/Kernels.hpp      |  2 +-
 .../cpu/kernels/common/elewise-inl.hpp     | 33 +++-----
 .../cpu/kernels/common/kernel_dispatch.cpp | 75 ++++++++++++++++++-
 .../cpu/kernels/common/kernel_dispatch.hpp | 28 +++++++
 mllm/backends/cpu/ops/ElewiseOps.cpp       | 24 +++---
 5 files changed, 123 insertions(+), 39 deletions(-)
 create mode 100644 mllm/backends/cpu/kernels/common/kernel_dispatch.hpp

diff --git a/mllm/backends/cpu/kernels/Kernels.hpp b/mllm/backends/cpu/kernels/Kernels.hpp
index 5f6cd2f53..3d3ee9c8e 100644
--- a/mllm/backends/cpu/kernels/Kernels.hpp
+++ b/mllm/backends/cpu/kernels/Kernels.hpp
@@ -37,13 +37,13 @@
 #else
 #include "mllm/backends/cpu/kernels/common/gelu-inl.hpp"    // IWYU pragma: export
 #include "mllm/backends/cpu/kernels/common/permute-inl.hpp" // IWYU pragma: export
-#include "mllm/backends/cpu/kernels/common/elewise-inl.hpp" // IWYU pragma: export
 #endif

 // Platform free Kernels.
 // NOTE: common/blas.hpp should be included after all kernels, because on the Apple platform
 // Tensor::nil()'s nil keyword is already defined in Apple's system headers.
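+// kernel_dispatch.hpp only declares architecture-neutral call_* wrappers; the
+// SIMD bodies behind them are chosen at runtime by Highway's dynamic dispatch
+// (see kernel_dispatch.cpp).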
+#include "mllm/backends/cpu/kernels/common/kernel_dispatch.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/common/ggml/matmul.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/common/fa2/fwd_bshd.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/common/paged_attn/fwd_bshd.hpp" // IWYU pragma: export diff --git a/mllm/backends/cpu/kernels/common/elewise-inl.hpp b/mllm/backends/cpu/kernels/common/elewise-inl.hpp index 92ffc8474..da3130569 100644 --- a/mllm/backends/cpu/kernels/common/elewise-inl.hpp +++ b/mllm/backends/cpu/kernels/common/elewise-inl.hpp @@ -1,11 +1,8 @@ // Copyright (c) MLLM Team. // Licensed under the MIT License. -#pragma once - #include -#include -#include +#include "mllm/core/DataTypes.hpp" HWY_BEFORE_NAMESPACE(); namespace mllm::cpu::common { // NOLINT @@ -64,23 +61,19 @@ struct DivOp { } }; -template -HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_add(T* out, const T* x, const T* y, size_t n) { +HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_add_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { __elementwise(x, y, out, n, AddOp{}); } -template -HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_sub(T* out, const T* x, const T* y, size_t n) { +HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_sub_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { __elementwise(x, y, out, n, SubOp{}); } -template -HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_mul(T* out, const T* x, const T* y, size_t n) { +HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_mul_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { __elementwise(x, y, out, n, MulOp{}); } -template -HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_div(T* out, const T* x, const T* y, size_t n) { +HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_div_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { __elementwise(x, y, out, n, DivOp{}); } @@ -89,12 +82,12 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_div(T* out, const T* x, const T* //===----------------------------------------------------------------------===// template -HWY_INLINE void __elementwise_scalar(T* HWY_RESTRICT out, const T* HWY_RESTRICT x, const T* HWY_RESTRICT y_ptr, size_t count, Op&& op) { +HWY_INLINE void __elementwise_scalar(T* HWY_RESTRICT out, const T* HWY_RESTRICT x, const T y, size_t count, Op&& op) { const hn::ScalableTag d; const size_t N = hn::Lanes(d); size_t idx = 0; - const T scalar = *y_ptr; + const T scalar = y; const hn::Vec sVec = hn::Set(d, scalar); for (; idx + N <= count; idx += N) { @@ -138,23 +131,19 @@ struct DivScalarOp { } }; -template -HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_add_scalar(T* out, const T* x, const T* y, size_t n) { +HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_add_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t y, size_t n) { __elementwise_scalar(out, x, y, n, AddScalarOp{}); } -template -HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_sub_scalar(T* out, const T* x, const T* y, size_t n) { +HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_sub_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t y, size_t n) { __elementwise_scalar(out, x, y, n, SubScalarOp{}); } -template -HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_mul_scalar(T* out, const T* x, const T* y, size_t n) { +HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_mul_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t y, size_t n) { __elementwise_scalar(out, x, y, n, 
MulScalarOp{}); } -template -HWY_NOINLINE HWY_MAYBE_UNUSED void element_wise_div_scalar(T* out, const T* x, const T* y, size_t n) { +HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_div_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t y, size_t n) { __elementwise_scalar(out, x, y, n, DivScalarOp{}); } diff --git a/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp b/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp index a9c07bcd3..86f909ce2 100644 --- a/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp +++ b/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp @@ -1,12 +1,79 @@ // Copyright (c) MLLM Team. // Licensed under the MIT License. +#include "mllm/backends/cpu/kernels/common/kernel_dispatch.hpp" #include "mllm/utils/CPUArchHelper.hpp" -#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) +// >>>> for dynamic dispatch only, skip if you want static dispatch +// First undef to prevent error when re-included. +#undef HWY_TARGET_INCLUDE +// For dynamic dispatch, specify the name of the current file (unfortunately +// __FILE__ is not reliable) so that foreach_target.h can re-include it. +#define HWY_TARGET_INCLUDE "mllm/backends/cpu/kernels/common/kernel_dispatch.cpp" +// Generates code for each enabled target by re-including this source file. +#include // IWYU pragma: keep +// <<<< end of dynamic dispatch -#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) +// Include all inline implementations here +#include "mllm/backends/cpu/kernels/common/elewise-inl.hpp" -#else +#if HWY_ONCE +namespace mllm::cpu::common { -#endif +//===----------------------------------------------------------------------===// +// Element-wise +//===----------------------------------------------------------------------===// +HWY_EXPORT(elewise_add_fp32); +HWY_EXPORT(elewise_sub_fp32); +HWY_EXPORT(elewise_mul_fp32); +HWY_EXPORT(elewise_div_fp32); +HWY_EXPORT(elewise_add_scalar_fp32); +HWY_EXPORT(elewise_sub_scalar_fp32); +HWY_EXPORT(elewise_mul_scalar_fp32); +HWY_EXPORT(elewise_div_scalar_fp32); + +HWY_DLLEXPORT void call_elewise_add_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { + HWY_DYNAMIC_DISPATCH(elewise_add_fp32)(out, x, y, n); +} + +HWY_DLLEXPORT void call_elewise_sub_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { + HWY_DYNAMIC_DISPATCH(elewise_sub_fp32)(out, x, y, n); +} + +HWY_DLLEXPORT void call_elewise_mul_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { + HWY_DYNAMIC_DISPATCH(elewise_mul_fp32)(out, x, y, n); +} + +HWY_DLLEXPORT void call_elewise_div_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { + HWY_DYNAMIC_DISPATCH(elewise_div_fp32)(out, x, y, n); +} + +HWY_DLLEXPORT void call_elewise_add_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n) { + HWY_DYNAMIC_DISPATCH(elewise_add_scalar_fp32)(out, x, y, n); +} + +HWY_DLLEXPORT void call_elewise_sub_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n) { + HWY_DYNAMIC_DISPATCH(elewise_sub_scalar_fp32)(out, x, y, n); +} + +HWY_DLLEXPORT void call_elewise_mul_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n) { + HWY_DYNAMIC_DISPATCH(elewise_mul_scalar_fp32)(out, x, y, n); +} + +HWY_DLLEXPORT void call_elewise_div_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n) { + HWY_DYNAMIC_DISPATCH(elewise_div_scalar_fp32)(out, x, y, n); +} + 
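+// Illustrative usage sketch (assumes only the wrappers declared in
+// kernel_dispatch.hpp): op code stays architecture-neutral, e.g.
+//
+//   float x[8], y[8], out[8];
+//   // ... fill x and y ...
+//   mllm::cpu::common::call_elewise_add_fp32(out, x, y, 8);
+//
+// The first call through HWY_DYNAMIC_DISPATCH picks the best enabled target
+// and caches the chosen function pointer, so subsequent calls are direct.
+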
+//===----------------------------------------------------------------------===// +// GELU +//===----------------------------------------------------------------------===// +// HWY_EXPORT(gelu_fp32); +// +// HWY_DLLEXPORT void call_gelu_fp32(mllm_fp32_t* out, const mllm_fp32_t* in, size_t n) { +// HWY_DYNAMIC_DISPATCH(gelu_fp32)(out, in, n); +// } + + +} // namespace mllm::cpu::common + +#endif // HWY_ONCE diff --git a/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp b/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp new file mode 100644 index 000000000..5271077e6 --- /dev/null +++ b/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp @@ -0,0 +1,28 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include +#include // HWY_DLLEXPORT +#include "mllm/core/DataTypes.hpp" + +namespace mllm::cpu::common { + +//===----------------------------------------------------------------------===// +// Elementwise + - * / By Matrix +//===----------------------------------------------------------------------===// +HWY_DLLEXPORT void call_elewise_add_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n); +HWY_DLLEXPORT void call_elewise_sub_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n); +HWY_DLLEXPORT void call_elewise_mul_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n); +HWY_DLLEXPORT void call_elewise_div_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n); + +//===----------------------------------------------------------------------===// +// Elementwise + - * / By Const +//===----------------------------------------------------------------------===// +HWY_DLLEXPORT void call_elewise_add_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n); +HWY_DLLEXPORT void call_elewise_sub_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n); +HWY_DLLEXPORT void call_elewise_mul_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n); +HWY_DLLEXPORT void call_elewise_div_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n); + +} // namespace mllm::cpu::common diff --git a/mllm/backends/cpu/ops/ElewiseOps.cpp b/mllm/backends/cpu/ops/ElewiseOps.cpp index 3feb671cd..d2ab6df53 100644 --- a/mllm/backends/cpu/ops/ElewiseOps.cpp +++ b/mllm/backends/cpu/ops/ElewiseOps.cpp @@ -141,7 +141,7 @@ void CPUAddOp::forward(const std::vector& inputs, std::vector& o case kFloat32: { if (input0.numel() == input1.numel()) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - cpu::common::HWY_NAMESPACE::element_wise_add(output.ptr(), input0.ptr(), input1.ptr(), + cpu::common::call_elewise_add_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_add_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), @@ -149,7 +149,7 @@ void CPUAddOp::forward(const std::vector& inputs, std::vector& o #endif } else if (input1.numel() == 1) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - cpu::common::HWY_NAMESPACE::element_wise_add_scalar(output.ptr(), input0.ptr(), input1.ptr(), + cpu::common::call_elewise_add_scalar_fp32(output.ptr(), input0.ptr(), *input1.ptr(), output.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_add_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), @@ -168,7 +168,7 @@ void CPUAddOp::forward(const std::vector& 
inputs, std::vector& o int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - cpu::common::HWY_NAMESPACE::element_wise_add(out + out_offset, a + a_offset, b + b_offset, vector_size); + cpu::common::call_elewise_add_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size); } } @@ -318,7 +318,7 @@ void CPUSubOp::forward(const std::vector& inputs, std::vector& o case kFloat32: { if (input0.numel() == input1.numel()) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - cpu::common::HWY_NAMESPACE::element_wise_sub(output.ptr(), input0.ptr(), input1.ptr(), + cpu::common::call_elewise_sub_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_sub_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), @@ -326,7 +326,7 @@ void CPUSubOp::forward(const std::vector& inputs, std::vector& o #endif } else if (input1.numel() == 1) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - cpu::common::HWY_NAMESPACE::element_wise_sub_scalar(output.ptr(), input0.ptr(), input1.ptr(), + cpu::common::call_elewise_sub_scalar_fp32(output.ptr(), input0.ptr(), *input1.ptr(), output.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_sub_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), @@ -345,7 +345,7 @@ void CPUSubOp::forward(const std::vector& inputs, std::vector& o int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - cpu::common::HWY_NAMESPACE::element_wise_sub(out + out_offset, a + a_offset, b + b_offset, vector_size); + cpu::common::call_elewise_sub_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size); } } #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) @@ -494,7 +494,7 @@ void CPUMulOp::forward(const std::vector& inputs, std::vector& o case kFloat32: { if (input0.numel() == input1.numel()) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - cpu::common::HWY_NAMESPACE::element_wise_mul(output.ptr(), input0.ptr(), input1.ptr(), + cpu::common::call_elewise_mul_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_mul_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), @@ -502,7 +502,7 @@ void CPUMulOp::forward(const std::vector& inputs, std::vector& o #endif } else if (input1.numel() == 1) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - cpu::common::HWY_NAMESPACE::element_wise_mul_scalar(output.ptr(), input0.ptr(), input1.ptr(), + cpu::common::call_elewise_mul_scalar_fp32(output.ptr(), input0.ptr(), *input1.ptr(), output.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_mul_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), @@ -521,7 +521,7 @@ void CPUMulOp::forward(const std::vector& inputs, std::vector& o int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - cpu::common::HWY_NAMESPACE::element_wise_mul(out + out_offset, a + a_offset, b + b_offset, vector_size); + cpu::common::call_elewise_mul_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size); } } #elif defined(MLLM_HOST_ARCH_ARM64) || 
defined(MLLM_HOST_ARCH_ARM) @@ -670,7 +670,7 @@ void CPUDivOp::forward(const std::vector& inputs, std::vector& o case kFloat32: { if (input0.numel() == input1.numel()) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - cpu::common::HWY_NAMESPACE::element_wise_div(output.ptr(), input0.ptr(), input1.ptr(), + cpu::common::call_elewise_div_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_div_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), @@ -678,7 +678,7 @@ void CPUDivOp::forward(const std::vector& inputs, std::vector& o #endif } else if (input1.numel() == 1) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - cpu::common::HWY_NAMESPACE::element_wise_div_scalar(output.ptr(), input0.ptr(), input1.ptr(), + cpu::common::call_elewise_div_scalar_fp32(output.ptr(), input0.ptr(), *input1.ptr(), output.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_div_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), @@ -697,7 +697,7 @@ void CPUDivOp::forward(const std::vector& inputs, std::vector& o int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - cpu::common::HWY_NAMESPACE::element_wise_div(out + out_offset, a + a_offset, b + b_offset, vector_size); + cpu::common::call_elewise_div_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size); } } #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) From 1f8a4a6c7e34428074e49a4b08c7da975dc617f3 Mon Sep 17 00:00:00 2001 From: HayzelHan Date: Wed, 10 Dec 2025 03:21:07 +0000 Subject: [PATCH 4/8] feat(qwen3): align lm_head_out usage and enable all compilation targets - Use lm_head_out in `modeling_qwen3_fa2.hpp` - Uncomment all compilation targets in `examples/CMakeLists.txt` --- examples/CMakeLists.txt | 12 ++++++------ mllm/models/qwen3/modeling_qwen3_fa2.hpp | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 8b5578f62..5ce568e96 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,13 +1,13 @@ -#add_subdirectory(qwen2vl) -#add_subdirectory(qwen2vl_tracer) -#add_subdirectory(qwen2_5vl) -#add_subdirectory(qwen2_5vl_tracer) -#add_subdirectory(llama) +add_subdirectory(qwen2vl) +add_subdirectory(qwen2vl_tracer) +add_subdirectory(qwen2_5vl) +add_subdirectory(qwen2_5vl_tracer) +add_subdirectory(llama) add_subdirectory(minicpm_o) add_subdirectory(minicpm4) add_subdirectory(qwen3) add_subdirectory(qwen3_service) -#add_subdirectory(deepseek_ocr) +add_subdirectory(deepseek_ocr) if(MLLM_BUILD_QNN_BACKEND) add_subdirectory(qwen_npu) endif() diff --git a/mllm/models/qwen3/modeling_qwen3_fa2.hpp b/mllm/models/qwen3/modeling_qwen3_fa2.hpp index 4b3004a99..a309159f9 100644 --- a/mllm/models/qwen3/modeling_qwen3_fa2.hpp +++ b/mllm/models/qwen3/modeling_qwen3_fa2.hpp @@ -262,7 +262,7 @@ class Qwen3ForCausalLM : public ARGeneration, public nn::Module { if (cfg.tie_word_embeddings) { // NOTE: // model.lm_head.weight is quantization weights of model.embed_tokens.weight - lm_head_ = reg("lm_head", cfg.hidden_size, cfg.vocab_size, false, cfg.linear_impl_type); + lm_head_ = reg("lm_head_out", cfg.hidden_size, cfg.vocab_size, false, cfg.linear_impl_type); } // Init inv freq From ab4a1d3bd1211917e7d8a18d428b111add831219 Mon Sep 17 00:00:00 2001 From: HayzelHan Date: Wed, 10 Dec 2025 
05:24:35 +0000 Subject: [PATCH 5/8] ci(x86): add GitHub Actions workflow for x86 backend build - Add GitHub Actions workflow file for Linux (Ubuntu) x86_64 build --- .github/workflows/build-x86.yml | 51 +++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 .github/workflows/build-x86.yml diff --git a/.github/workflows/build-x86.yml b/.github/workflows/build-x86.yml new file mode 100644 index 000000000..cb45e3ac9 --- /dev/null +++ b/.github/workflows/build-x86.yml @@ -0,0 +1,51 @@ +name: Test Linux x86_64 Compilation + +on: + push: + branches: + - main + paths: + - "requirements.txt" + - "task.py" + - "tasks/build_x86.yaml" + - ".github/workflows/build-x86.yml" + - "mllm/**" + pull_request: + branches: + - main + paths: + - "requirements.txt" + - "task.py" + - "tasks/build_x86.yaml" + - ".github/workflows/build-x86.yml" + - "mllm/**" + +jobs: + build-x86: + if: github.repository == 'UbiquitousLearning/mllm' + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Initialize and update submodules + run: | + git submodule init + git submodule update --init --recursive + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Execute build task + run: | + python task.py tasks/build_x86.yaml From 11c0caa5021012bf61c413616b05b92fec64b1d4 Mon Sep 17 00:00:00 2001 From: HayzelHan Date: Wed, 10 Dec 2025 08:22:28 +0000 Subject: [PATCH 6/8] fix(cpu): add conditional compilation for common kernel dispatch --- .../cpu/kernels/common/kernel_dispatch.cpp | 5 +++-- .../cpu/kernels/common/kernel_dispatch.hpp | 14 +++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp b/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp index 86f909ce2..1ad3cee93 100644 --- a/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp +++ b/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp @@ -1,8 +1,9 @@ // Copyright (c) MLLM Team. // Licensed under the MIT License. +#include "mllm/utils/CPUArchHelper.hpp" +#if !(defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)) #include "mllm/backends/cpu/kernels/common/kernel_dispatch.hpp" -#include "mllm/utils/CPUArchHelper.hpp" // >>>> for dynamic dispatch only, skip if you want static dispatch // First undef to prevent error when re-included. @@ -73,7 +74,7 @@ HWY_DLLEXPORT void call_elewise_div_scalar_fp32(mllm_fp32_t* out, const mllm_fp3 // HWY_DYNAMIC_DISPATCH(gelu_fp32)(out, in, n); // } - } // namespace mllm::cpu::common #endif // HWY_ONCE +#endif diff --git a/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp b/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp index 5271077e6..eb100ac43 100644 --- a/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp +++ b/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp @@ -1,12 +1,17 @@ // Copyright (c) MLLM Team. // Licensed under the MIT License. -#pragma once +#ifndef MLLM_BACKENDS_CPU_KERNELS_COMMON_KERNEL_DISPATCH_HPP_ +#define MLLM_BACKENDS_CPU_KERNELS_COMMON_KERNEL_DISPATCH_HPP_ + +#include "mllm/utils/CPUArchHelper.hpp" +#if !(defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)) -#include -#include // HWY_DLLEXPORT #include "mllm/core/DataTypes.hpp" +// Platform-specific definitions used for declaring an interface, independent of +// the SIMD instruction set. 
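+// On ARM builds the surrounding #if compiles this header away entirely, so it
+// can stay unconditionally included from Kernels.hpp.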
+#include // HWY_RESTRICT namespace mllm::cpu::common { //===----------------------------------------------------------------------===// @@ -26,3 +31,6 @@ HWY_DLLEXPORT void call_elewise_mul_scalar_fp32(mllm_fp32_t* out, const mllm_fp3 HWY_DLLEXPORT void call_elewise_div_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n); } // namespace mllm::cpu::common + +#endif +#endif // MLLM_BACKENDS_CPU_KERNELS_COMMON_KERNEL_DISPATCH_HPP_ From 877e17bd333250c994b9f43994dd1aece3f9bdcb Mon Sep 17 00:00:00 2001 From: HayzelHan Date: Wed, 10 Dec 2025 10:13:33 +0000 Subject: [PATCH 7/8] fix(ci, ops): fix build issues and improve code robustness - Fix python version in `.github/workflows/build-x86.yml` - Rename function name to avoid using reserved identifiers - Add #else fallback for unsupported architectures - Fix potential integer overflow in offset calculations --- .github/workflows/build-x86.yml | 2 +- .../cpu/kernels/common/elewise-inl.hpp | 20 +++--- mllm/backends/cpu/ops/ElewiseOps.cpp | 72 ++++++++++++------- 3 files changed, 59 insertions(+), 35 deletions(-) diff --git a/.github/workflows/build-x86.yml b/.github/workflows/build-x86.yml index cb45e3ac9..f648d3507 100644 --- a/.github/workflows/build-x86.yml +++ b/.github/workflows/build-x86.yml @@ -36,7 +36,7 @@ jobs: git submodule update --init --recursive - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.12" cache: "pip" diff --git a/mllm/backends/cpu/kernels/common/elewise-inl.hpp b/mllm/backends/cpu/kernels/common/elewise-inl.hpp index da3130569..a2f2ee429 100644 --- a/mllm/backends/cpu/kernels/common/elewise-inl.hpp +++ b/mllm/backends/cpu/kernels/common/elewise-inl.hpp @@ -13,7 +13,7 @@ namespace hn = hwy::HWY_NAMESPACE; // Elementwise + - * / By Matrix //===----------------------------------------------------------------------===// template -HWY_INLINE void __elementwise(const T* HWY_RESTRICT x, const T* HWY_RESTRICT y, T* HWY_RESTRICT out, size_t count, Op&& op) { +HWY_INLINE void elementwise_impl(const T* HWY_RESTRICT x, const T* HWY_RESTRICT y, T* HWY_RESTRICT out, size_t count, Op&& op) { const hn::ScalableTag d; const size_t N = hn::Lanes(d); size_t idx = 0; @@ -62,19 +62,19 @@ struct DivOp { }; HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_add_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { - __elementwise(x, y, out, n, AddOp{}); + elementwise_impl(x, y, out, n, AddOp{}); } HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_sub_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { - __elementwise(x, y, out, n, SubOp{}); + elementwise_impl(x, y, out, n, SubOp{}); } HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_mul_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { - __elementwise(x, y, out, n, MulOp{}); + elementwise_impl(x, y, out, n, MulOp{}); } HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_div_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t* y, size_t n) { - __elementwise(x, y, out, n, DivOp{}); + elementwise_impl(x, y, out, n, DivOp{}); } //===----------------------------------------------------------------------===// @@ -82,7 +82,7 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_div_fp32(mllm_fp32_t* out, const mllm //===----------------------------------------------------------------------===// template -HWY_INLINE void __elementwise_scalar(T* HWY_RESTRICT out, const T* HWY_RESTRICT x, const T y, size_t count, Op&& op) { +HWY_INLINE void elementwise_scalar_impl(T* 
HWY_RESTRICT out, const T* HWY_RESTRICT x, const T y, size_t count, Op&& op) { const hn::ScalableTag d; const size_t N = hn::Lanes(d); size_t idx = 0; @@ -132,19 +132,19 @@ struct DivScalarOp { }; HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_add_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t y, size_t n) { - __elementwise_scalar(out, x, y, n, AddScalarOp{}); + elementwise_scalar_impl(out, x, y, n, AddScalarOp{}); } HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_sub_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t y, size_t n) { - __elementwise_scalar(out, x, y, n, SubScalarOp{}); + elementwise_scalar_impl(out, x, y, n, SubScalarOp{}); } HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_mul_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t y, size_t n) { - __elementwise_scalar(out, x, y, n, MulScalarOp{}); + elementwise_scalar_impl(out, x, y, n, MulScalarOp{}); } HWY_NOINLINE HWY_MAYBE_UNUSED void elewise_div_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, const mllm_fp32_t y, size_t n) { - __elementwise_scalar(out, x, y, n, DivScalarOp{}); + elementwise_scalar_impl(out, x, y, n, DivScalarOp{}); } //===----------------------------------------------------------------------===// diff --git a/mllm/backends/cpu/ops/ElewiseOps.cpp b/mllm/backends/cpu/ops/ElewiseOps.cpp index d2ab6df53..a3e1f7dd1 100644 --- a/mllm/backends/cpu/ops/ElewiseOps.cpp +++ b/mllm/backends/cpu/ops/ElewiseOps.cpp @@ -146,6 +146,8 @@ void CPUAddOp::forward(const std::vector& inputs, std::vector& o #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_add_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), options_.getThreads()); +#else + NYI("AddOp not supported on this architecture."); #endif } else if (input1.numel() == 1) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) @@ -154,6 +156,8 @@ void CPUAddOp::forward(const std::vector& inputs, std::vector& o #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_add_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), output.numel(), options_.getThreads()); +#else + NYI("AddOp not supported on this architecture."); #endif } else if (can_be_broadcast_naive) { const float* a = input0.ptr(); @@ -164,9 +168,9 @@ void CPUAddOp::forward(const std::vector& inputs, std::vector& o for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements for (int l = 0; l < broadcast_naive_loops; ++l) { - int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension - int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + size_t out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; cpu::common::call_elewise_add_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size); } @@ -177,13 +181,15 @@ void CPUAddOp::forward(const std::vector& inputs, std::vector& o for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements for (int l = 0; l < broadcast_naive_loops; ++l) { - int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - int b_offset = batch * vector_size; // b doesn't broadcast over 
loops dimension - int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + size_t out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; cpu::arm::ew_add_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size, options_.getThreads()); } } +#else + NYI("AddOp not supported on this architecture."); #endif } else { NYI("AddOp broadcast not supported."); @@ -323,6 +329,8 @@ void CPUSubOp::forward(const std::vector& inputs, std::vector& o #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_sub_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), options_.getThreads()); +#else + NYI("SubOp not supported on this architecture."); #endif } else if (input1.numel() == 1) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) @@ -331,6 +339,8 @@ void CPUSubOp::forward(const std::vector& inputs, std::vector& o #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_sub_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), output.numel(), options_.getThreads()); +#else + NYI("SubOp not supported on this architecture."); #endif } else if (can_be_broadcast_naive) { const float* a = input0.ptr(); @@ -341,9 +351,9 @@ void CPUSubOp::forward(const std::vector& inputs, std::vector& o for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements for (int l = 0; l < broadcast_naive_loops; ++l) { - int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension - int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + size_t out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; cpu::common::call_elewise_sub_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size); } @@ -353,13 +363,15 @@ void CPUSubOp::forward(const std::vector& inputs, std::vector& o for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements for (int l = 0; l < broadcast_naive_loops; ++l) { - int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension - int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + size_t out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; cpu::arm::ew_sub_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size, options_.getThreads()); } } +#else + NYI("SubOp not supported on this architecture."); #endif } else { NYI("SubOp broadcast not supported."); @@ -499,6 +511,8 @@ void CPUMulOp::forward(const std::vector& inputs, std::vector& o #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_mul_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), options_.getThreads()); +#else + NYI("MulOp not supported on this 
architecture."); #endif } else if (input1.numel() == 1) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) @@ -507,6 +521,8 @@ void CPUMulOp::forward(const std::vector& inputs, std::vector& o #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_mul_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), output.numel(), options_.getThreads()); +#else + NYI("MulOp not supported on this architecture."); #endif } else if (can_be_broadcast_naive) { const float* a = input0.ptr(); @@ -517,9 +533,9 @@ void CPUMulOp::forward(const std::vector& inputs, std::vector& o for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements for (int l = 0; l < broadcast_naive_loops; ++l) { - int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension - int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + size_t out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; cpu::common::call_elewise_mul_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size); } @@ -529,13 +545,15 @@ void CPUMulOp::forward(const std::vector& inputs, std::vector& o for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements for (int l = 0; l < broadcast_naive_loops; ++l) { - int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension - int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + size_t out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; cpu::arm::ew_mul_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size, options_.getThreads()); } } +#else + NYI("MulOp not supported on this architecture."); #endif } else { NYI("MulOp broadcast not supported."); @@ -675,6 +693,8 @@ void CPUDivOp::forward(const std::vector& inputs, std::vector& o #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_div_fp32(output.ptr(), input0.ptr(), input1.ptr(), output.numel(), options_.getThreads()); +#else + NYI("DivOp not supported on this architecture."); #endif } else if (input1.numel() == 1) { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) @@ -683,6 +703,8 @@ void CPUDivOp::forward(const std::vector& inputs, std::vector& o #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) cpu::arm::ew_div_fp32_scalar(output.ptr(), input0.ptr(), *input1.ptr(), output.numel(), options_.getThreads()); +#else + NYI("DivOp not supported on this architecture."); #endif } else if (can_be_broadcast_naive) { const float* a = input0.ptr(); @@ -693,9 +715,9 @@ void CPUDivOp::forward(const std::vector& inputs, std::vector& o for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements for (int l = 0; l < broadcast_naive_loops; ++l) { - int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - int b_offset 
= batch * vector_size; // b doesn't broadcast over loops dimension - int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + size_t out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; cpu::common::call_elewise_div_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size); } @@ -705,13 +727,15 @@ void CPUDivOp::forward(const std::vector& inputs, std::vector& o for (int batch = 0; batch < batch_dims; ++batch) { // Each batch processes broadcast_naive_loops iterations of vector_size elements for (int l = 0; l < broadcast_naive_loops; ++l) { - int a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; - int b_offset = batch * vector_size; // b doesn't broadcast over loops dimension - int out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t a_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; + size_t b_offset = batch * vector_size; // b doesn't broadcast over loops dimension + size_t out_offset = batch * broadcast_naive_loops * vector_size + l * vector_size; cpu::arm::ew_div_fp32(out + out_offset, a + a_offset, b + b_offset, vector_size, options_.getThreads()); } } +#else + NYI("DivOp not supported on this architecture."); #endif } else { NYI("DivOp broadcast not supported.");
From bd046b90be9c975ae675e8452de519509cea28c1 Mon Sep 17 00:00:00 2001 From: HayzelHan Date: Fri, 12 Dec 2025 09:56:15 +0000 Subject: [PATCH 8/8] docs(x86): add docs for x86 CPU backend --- docs/cpu_backend/x86/index.rst | 191 ++++++++++++++++++++++++++++++++- 1 file changed, 190 insertions(+), 1 deletion(-) diff --git a/docs/cpu_backend/x86/index.rst b/docs/cpu_backend/x86/index.rst index 781e12b9b..03291853f 100644 --- a/docs/cpu_backend/x86/index.rst +++ b/docs/cpu_backend/x86/index.rst @@ -1,4 +1,193 @@ CPU X86 Backend =============== - \ No newline at end of file +Overview +-------- + +The MLLM X86 CPU backend provides optimized inference on x86 processors using Highway's cross-platform SIMD abstractions. +`Google Highway <https://github.com/google/highway>`_ is a C++ library that delivers portable SIMD +(Single Instruction, Multiple Data) operations, allowing high-performance neural network computation +while maintaining compatibility across various x86 microarchitectures. + +Key Features: + +- **Portable SIMD Operations**: Highway abstracts platform-specific instructions, supporting multiple + x86 targets (SSE4, AVX2, AVX-512) with the same codebase +- **Runtime Dispatch**: Automatically selects the best available instruction set for the target CPU +- **Quantized Inference**: Optimized kernels for Q4 and other quantization formats +- **Cross-Platform Compatibility**: Maintains backward compatibility from older CPUs (SSE4) to modern + processors (AVX-512)
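To make the Highway style concrete, the sketch below shows what an elementwise kernel looks like in the idiom used throughout this backend: a vector main loop that processes ``hn::Lanes(d)`` floats per iteration, followed by a scalar tail loop. The function is illustrative only and not an actual MLLM symbol; it uses the same Highway operations (``Set``, ``LoadU``, ``Mul``, ``Add``, ``StoreU``) as the kernels in this patch series.

.. code-block:: cpp

    #include <cstddef>
    #include <hwy/highway.h>

    namespace hn = hwy::HWY_NAMESPACE;

    // Hypothetical kernel: y[i] = x[i] * scale + bias.
    // ScalableTag<float> selects the widest float vector of the compiled
    // target, so Lanes(d) is 4 on SSE4, 8 on AVX2, and 16 on AVX-512.
    void scale_add_fp32(float* y, const float* x, float scale, float bias, size_t n) {
      const hn::ScalableTag<float> d;
      const size_t lanes = hn::Lanes(d);
      const auto scale_vec = hn::Set(d, scale);
      const auto bias_vec = hn::Set(d, bias);
      size_t i = 0;
      // Vector loop: unaligned loads/stores, `lanes` elements per step.
      for (; i + lanes <= n; i += lanes) {
        const auto x_vec = hn::LoadU(d, x + i);
        hn::StoreU(hn::Add(hn::Mul(x_vec, scale_vec), bias_vec), d, y + i);
      }
      // Scalar tail for the remaining n % lanes elements.
      for (; i < n; ++i) { y[i] = x[i] * scale + bias; }
    }

The same source compiles to any of the supported x86 targets; only the lane count changes.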
Running MLLM on X86 Architecture -------------------------------- This guide explains how to build and run the **mllm** inference framework on x86 processors. It covers environment setup, building the backend, obtaining model assets, and running inference.
Prerequisites ~~~~~~~~~~~~~ Before building the x86 backend, install the required build toolchain and Python runtime. Install system dependencies: .. code-block:: bash sudo apt update sudo apt install build-essential cmake ninja-build python3 python3-pip Install Python dependencies: .. code-block:: bash pip install -r requirements.txt Recommended Versions (Used in Testing): - **Operating System**: Linux (Ubuntu 22.04 LTS) - **CPU**: x86-64 processor with at least SSE4 support (better performance with AVX2 or AVX-512) - **C/C++ compiler**: GCC/G++ (validated with GCC 11.4.0) - **Python**: validated with Python 3.10 - **Build tools**: CMake + Ninja (validated with CMake 3.31.2 and Ninja 1.11.1) .. note:: The default toolchain on Ubuntu 22.04 LTS is sufficient to build the project. Ubuntu 24.04 LTS has not been validated yet, but it is expected to work as well.
Step 1: Build the X86 Backend ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Before building, ensure you have completed the environment setup described above. Run the provided build task to compile MLLM with X86-optimized kernels: .. code-block:: bash python task.py tasks/build_x86.yaml This command configures and builds the project with Highway SIMD operations enabled for the x86 architecture.
Step 2: Acquire Model Assets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1. Download the original model from Hugging Face (or any other reputable source). Typical files you need: * ``config.json`` * ``tokenizer.json`` / ``tokenizer.model`` * PyTorch / Safetensors checkpoints (``.bin``, ``.safetensors``) 2. Place everything under a single directory, e.g. ``~/models/Qwen3-0.6B``. .. note:: Models obtained from hosting platforms such as Hugging Face or ModelScope (via ``git clone`` or their official CLI) are already organized in a single directory that contains ``config.json``, ``tokenizer.json``, ``tokenizer.model``, checkpoint shards, etc. You can download Qwen3-0.6B from ModelScope with the following command: .. code-block:: bash git clone https://www.modelscope.cn/Qwen/Qwen3-0.6B.git 3. Alternatively, download a pre-converted model from **our HuggingFace organization** (recommended on x86). Due to current compatibility issues with the mllm-converter on the x86 architecture, we recommend downloading pre-converted quantized models from our HuggingFace organization `mllmTeam <https://huggingface.co/mllmTeam>`_: Example command: .. code-block:: bash wget https://huggingface.co/mllmTeam/qwen-3-0.6b-mllm/resolve/main/qwen-3-0.6b-q4_k.mllm .. note:: If you prefer to convert models yourself, please refer to :doc:`How to Support a New LLM: Step-by-Step <../../quick_start/how_to_model>`, specifically **Step 2**, to download and convert the model.
Step 3: Run Inference ~~~~~~~~~~~~~~~~~~~~~ Once you have the model assets, run inference using the compiled binary. Command Parameters: - ``-m``: Path to the quantized MLLM model file - ``-mv``: Model version (``v1`` or ``v2``) - ``-t``: Path to the tokenizer file - ``-c``: Path to the model configuration file Example Command: .. code-block:: bash /path/to/build/bin/mllm-qwen3-runner \ -m /path/to/model/qwen-3-0.6b-q4_k.mllm \ -mv v1 \ -t /path/to/tokenizer/tokenizer.json \ -c /path/to/config/config_0.6B_w4a32_kai.json .. caution:: You can use ``mllm/examples/qwen3/config_0.6B_w4a32_kai.json`` as the configuration file for Qwen3-0.6B quantized with Q4_K, but remember to change ``linear_impl_type`` to ``Default``, since the x86 backend uses the default linear implementation.
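The ``Default`` requirement reflects how the CPU ops in this patch series dispatch per architecture: each op selects an x86 path or an ARM path at compile time and otherwise falls back to an ``NYI`` message. A minimal sketch of that pattern is shown below; the architecture macros and kernel names are taken from the patch, while the wrapper function itself is illustrative and assumes MLLM's internal headers:

.. code-block:: cpp

    // Illustrative wrapper, not an actual MLLM op. The #if/#elif/#else shape
    // and the NYI fallback follow the elementwise ops in this patch series.
    void example_add_forward(float* out, const float* a, const float* b,
                             size_t n, int threads) {
    #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86)
      // x86 path: Highway-based common kernel (single-threaded signature)
      cpu::common::call_elewise_add_fp32(out, a, b, n);
    #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
      // ARM path: NEON kernel that also takes a thread count
      cpu::arm::ew_add_fp32(out, a, b, n, threads);
    #else
      NYI("AddOp not supported on this architecture.");
    #endif
    }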
Performance ~~~~~~~~~~~ After inference completes, the program automatically prints a performance summary. The following metrics were measured on an **Intel Core i9-14900K** with a **Qwen3-0.6B** model (Q4 quantization). Example output: .. code-block:: text ============== Performance Summary =============== Total time : 667525.00 μs Prefill time : 123295.00 μs (194.66 tokens/s) Decode time : 544230.00 μs ( 49.61 tokens/s) TTFT : 123443.00 μs Prefill tokens : 24 Decode steps : 27 Avg decode time : 20156.67 μs/token ================================================== - **Prefill throughput**: 194.66 tokens/s — The rate at which input tokens are processed during the prefill phase (24 tokens / 0.123 s) - **Decode throughput**: 49.61 tokens/s — The rate at which output tokens are generated during decoding (27 steps / 0.544 s) - **Time-to-first-token (TTFT)**: 123.4 ms — Time from request submission to the first generated token - **Average decode latency**: 20.16 ms/token — Average time to generate each subsequent token (544230 μs / 27 steps) .. note:: The x86 PC platform is not currently the primary optimization focus of the MLLM framework, so inference is comparatively slow there. Further optimizations are planned for future releases.
Factors Affecting Performance ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Performance may vary depending on: - CPU model and supported SIMD instruction set (SSE4, AVX2, AVX-512) - System load and thermal conditions - Model size and quantization method - Input prompt length and output length
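To check the first factor (the SIMD instruction sets Highway can use on your CPU), you can build a small standalone program against Highway. This is a diagnostic sketch, not part of MLLM, and assumes the ``hwy/targets.h`` API of recent Highway releases:

.. code-block:: cpp

    #include <cstdint>
    #include <cstdio>

    #include "hwy/targets.h"

    // Prints every SIMD target Highway considers usable on this CPU
    // (e.g. AVX3, AVX2, SSE4, plus Highway's portable fallbacks).
    int main() {
      const int64_t supported = hwy::SupportedTargets();
      for (uint64_t bit = 1; bit != 0; bit <<= 1) {
        if (supported & static_cast<int64_t>(bit)) {
          std::printf("%s\n", hwy::TargetName(static_cast<int64_t>(bit)));
        }
      }
      return 0;
    }

Compile it with, for example, ``g++ -O2 list_targets.cpp -lhwy`` on a system where Highway is installed.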