From d58146ba6d2b8949a69f73526afc6a9c55a411e5 Mon Sep 17 00:00:00 2001 From: EdVince <2456510228@qq.com> Date: Tue, 10 Jan 2023 17:31:28 +0800 Subject: [PATCH 01/11] add:mha arm --- src/layer/arm/multiheadattention_arm.cpp | 923 ++++++++++++++++------- src/layer/arm/multiheadattention_arm.h | 18 + tests/testutil.h | 12 + 3 files changed, 683 insertions(+), 270 deletions(-) diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp index 81046a3f0dfc..6993ea027af3 100644 --- a/src/layer/arm/multiheadattention_arm.cpp +++ b/src/layer/arm/multiheadattention_arm.cpp @@ -14,327 +14,710 @@ #include "multiheadattention_arm.h" -#include -#include +#include "cpu.h" +#include "layer_type.h" -#if __ARM_NEON -#include -#include "neon_mathfun.h" -#endif // __ARM_NEON +#include namespace ncnn { MultiHeadAttention_arm::MultiHeadAttention_arm() { + #if __ARM_NEON support_packing = true; +#if NCNN_ARM82 + support_fp16_storage = cpu_support_arm_asimdhp(); +#endif #endif // __ARM_NEON -} -int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const Mat& q_blob = bottom_blobs[0]; - const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; - const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? k_blob : bottom_blobs[2]; + cvtfp16_to_fp32 = 0; + cvtfp32_to_fp16 = 0; - size_t src_elemsize = q_blob.elemsize; - int src_elempack = q_blob.elempack; - size_t dst_elemsize = k_blob.elemsize; - int dst_elempack = k_blob.elempack; + q_gemm = 0; + k_gemm = 0; + v_gemm = 0; + o_gemm = 0; - const int src_seqlen = q_blob.h; - const int dst_seqlen = k_blob.h; - const int embed_dim_per_head = embed_dim / num_head; - const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + qk_gemm = 0; + qkv_gemm = 0; -#if __ARM_NEON - if (src_elempack == 4) - { - Mat& top_blob = top_blobs[0]; - top_blob.create(embed_dim, src_seqlen, src_elemsize, src_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -1; + qk_softmax = 0; + permute_wch = 0; +} - Mat xq(embed_dim_per_head, src_seqlen, num_head, src_elemsize, src_elempack, opt.workspace_allocator); - Mat xk(embed_dim_per_head, dst_seqlen, num_head, dst_elemsize, dst_elempack, opt.workspace_allocator); - Mat xv(dst_seqlen, embed_dim_per_head, num_head, dst_elemsize, dst_elempack, opt.workspace_allocator); +int MultiHeadAttention_arm::create_pipeline(const Option& opt) +{ + Option opt32 = opt; + opt32.use_bf16_storage = false; + opt32.use_fp16_arithmetic = false; + opt32.use_fp16_packed = false; + opt32.use_fp16_storage = false; - Mat xqk(dst_seqlen * dst_elempack, src_seqlen, num_head, src_elemsize, src_elempack, opt.workspace_allocator); + { + cvtfp16_to_fp32 = ncnn::create_layer(ncnn::LayerType::Cast); + ncnn::ParamDict pd; + pd.set(0, 2); // from fp16 + pd.set(1, 1); // from fp32 + cvtfp16_to_fp32->load_param(pd); + cvtfp16_to_fp32->load_model(ModelBinFromMatArray(0)); + cvtfp16_to_fp32->create_pipeline(opt); + } + { + cvtfp32_to_fp16 = ncnn::create_layer(ncnn::LayerType::Cast); + ncnn::ParamDict pd; + pd.set(0, 1); // from fp32 + pd.set(1, 2); // from fp16 + cvtfp32_to_fp16->load_param(pd); + cvtfp32_to_fp16->load_model(ModelBinFromMatArray(0)); + cvtfp32_to_fp16->create_pipeline(opt); + } - Mat xqkv(embed_dim_per_head, num_head, src_seqlen, src_elemsize, src_elempack, opt.workspace_allocator); + { + qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + ncnn::ParamDict pd; + pd.set(0, -1); + pd.set(1, 1); + 
qk_softmax->load_param(pd); + qk_softmax->load_model(ModelBinFromMatArray(0)); + qk_softmax->create_pipeline(opt32); + } + { + permute_wch = ncnn::create_layer(ncnn::LayerType::Permute); + ncnn::ParamDict pd; + pd.set(0, 2); // wch + permute_wch->load_param(pd); + permute_wch->load_model(ModelBinFromMatArray(0)); + permute_wch->create_pipeline(opt32); + } + +#if NCNN_ARM82 + if(support_fp16_storage && opt.use_fp16_packed) + { + Option optopt = opt; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_head; q++) { - // xq = affine(q) * inv_sqrt_embed_dim_per_head + const int embed_dim_per_head = embed_dim / num_head; + const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + + q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(0, inv_sqrt_embed_dim_per_head); + pd.set(1, 1.f); + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 1); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, embed_dim); // M + pd.set(8, 0); // N + pd.set(9, embed_dim); // K + pd.set(10, 1); // constant_broadcast_type_C + pd.set(11, 0); // output_N1M + pd.set(12, 1); // output_elempack + q_gemm->load_param(pd); + Mat weights[2]; + weights[0] = q_weight_data; + weights[1] = q_bias_data; + q_gemm->load_model(ModelBinFromMatArray(weights)); + q_gemm->create_pipeline(optopt); + + if (optopt.lightmode) { - Mat outm = xq.channel(q); - - for (int i = 0; i < src_seqlen; i++) - { - float* outptr = outm.row(i); - - for (int j = 0; j < embed_dim_per_head; j++) - { - const float* ptr = q_blob.row(i); - const float* kptr = (const float*)q_weight_data + embed_dim * (q * embed_dim_per_head + j); - - float32x4_t _sum = vdupq_n_f32(q_bias_data[q * embed_dim_per_head + j]); - for (int k = 0; k < embed_dim; k++) - { - float32x4_t _val = vld1q_f32(ptr); - float32x4_t _k = vdupq_n_f32(kptr[0]); - _sum = vmlaq_f32(_sum, _val, _k); - ptr += 4; - kptr += 1; - } - - float32x4_t _slope = vdupq_n_f32(inv_sqrt_embed_dim_per_head); - _sum = vmulq_f32(_sum, _slope); - - vst1q_f32(outptr, _sum); - outptr += 4; - } - } + q_weight_data.release(); + q_bias_data.release(); } + } - // xk = affine(k) + { + k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 1); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, embed_dim); // M + pd.set(8, 0); // N + pd.set(9, kdim); // K + pd.set(10, 1); // constant_broadcast_type_C + pd.set(11, 0); // output_N1M + pd.set(12, 1); // output_elempack + k_gemm->load_param(pd); + Mat weights[2]; + weights[0] = k_weight_data; + weights[1] = k_bias_data; + k_gemm->load_model(ModelBinFromMatArray(weights)); + k_gemm->create_pipeline(optopt); + + if (optopt.lightmode) { - Mat outm = xk.channel(q); - - for (int i = 0; i < dst_seqlen; i++) - { - float* outptr = outm.row(i); - - for (int j = 0; j < embed_dim_per_head; j++) - { - const float* ptr = k_blob.row(i); - const float* kptr = (const float*)k_weight_data + kdim * (q * embed_dim_per_head + j); - - if (dst_elempack == 4) - { - float32x4_t _sum = vdupq_n_f32(k_bias_data[q * embed_dim_per_head + j]); - for (int k = 0; k < kdim; k++) - { - float32x4_t _val = vld1q_f32(ptr); - float32x4_t _k = vdupq_n_f32(kptr[0]); - _sum = vmlaq_f32(_sum, _val, _k); - ptr += 4; - kptr += 1; - } - - vst1q_f32(outptr, _sum); - outptr += 4; - } - if (dst_elempack == 1) - { - float sum = k_bias_data[q * embed_dim_per_head + j]; - for (int k = 0; k < kdim; k++) - 
{ - sum += ptr[0] * kptr[0]; - ptr += 1; - kptr += 1; - } - - outptr[0] = sum; - outptr += 1; - } - } - } + k_weight_data.release(); + k_bias_data.release(); } + } - // xv = affine(v) + { + v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 1); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, embed_dim); // M + pd.set(8, 0); // N + pd.set(9, vdim); // K + pd.set(10, 1); // constant_broadcast_type_C + pd.set(11, 0); // output_N1M + pd.set(12, 1); // output_elempack + v_gemm->load_param(pd); + Mat weights[2]; + weights[0] = v_weight_data; + weights[1] = v_bias_data; + v_gemm->load_model(ModelBinFromMatArray(weights)); + v_gemm->create_pipeline(optopt); + + if (optopt.lightmode) { - Mat outm = xv.channel(q); - - for (int i = 0; i < embed_dim_per_head; i++) - { - float* outptr = outm.row(i); - - for (int j = 0; j < dst_seqlen; j++) - { - const float* ptr = v_blob.row(j); - const float* kptr = (const float*)v_weight_data + vdim * (q * embed_dim_per_head + i); - - if (dst_elempack == 4) - { - float32x4_t _sum = vdupq_n_f32(v_bias_data[q * embed_dim_per_head + i]); - for (int k = 0; k < vdim; k++) - { - float32x4_t _val = vld1q_f32(ptr); - float32x4_t _k = vdupq_n_f32(kptr[0]); - _sum = vmlaq_f32(_sum, _val, _k); - ptr += 4; - kptr += 1; - } - - vst1q_f32(outptr, _sum); - outptr += 4; - } - if (dst_elempack == 1) - { - float sum = v_bias_data[q * embed_dim_per_head + i]; - for (int k = 0; k < vdim; k++) - { - sum += ptr[0] * kptr[0]; - ptr += 1; - kptr += 1; - } - - outptr[0] = sum; - outptr += 1; - } - } - } + v_weight_data.release(); + v_bias_data.release(); } + } - // xqk = xq * xk - // xq (embed_dim_per_head, src_seqlen) - // xk (embed_dim_per_head, dst_seqlen) + { + o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 0); // constantA + pd.set(5, 1); // constantB + pd.set(6, 1); // constantC + pd.set(7, 0); // M = outch + pd.set(8, embed_dim); // N = size + pd.set(9, embed_dim); // K = maxk*inch + pd.set(10, 4); // constant_broadcast_type_C = null + pd.set(11, 0); // output_N1M + o_gemm->load_param(pd); + Mat weights[2]; + weights[0] = out_weight_data; + weights[1] = out_bias_data; + o_gemm->load_model(ModelBinFromMatArray(weights)); + o_gemm->create_pipeline(optopt); + + if (optopt.lightmode) { - const Mat xqm = xq.channel(q); - const Mat xkm = xk.channel(q); + out_weight_data.release(); + out_bias_data.release(); + } + } - Mat outm = xqk.channel(q); + { + qk_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 1); // transA + pd.set(3, 0); // transB + pd.set(4, 0); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, 0); // M + pd.set(8, 0); // N + pd.set(9, 0); // K + pd.set(10, -1); // constant_broadcast_type_C + pd.set(11, 0); // output_N1M + pd.set(12, 1); // output_elempack + qk_gemm->load_param(pd); + qk_gemm->load_model(ModelBinFromMatArray(0)); + Option opt1 = optopt; + opt1.num_threads = 1; + qk_gemm->create_pipeline(opt1); + } - Mat upxkm; - convert_packing(xkm, upxkm, 1); + { + qkv_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 0); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, 0); // M + pd.set(8, 0); // N + pd.set(9, 0); // K + pd.set(10, -1); // constant_broadcast_type_C + pd.set(11, 0); // 
output_N1M + pd.set(12, 1); // output_elempack + qkv_gemm->load_param(pd); + qkv_gemm->load_model(ModelBinFromMatArray(0)); + Option opt1 = optopt; + opt1.num_threads = 1; + qkv_gemm->create_pipeline(opt1); + } - for (int i = 0; i < src_seqlen; i++) - { - float* outptr = outm.row(i); - - for (int j = 0; j < dst_seqlen * dst_elempack; j++) - { - const float* qptr = xqm.row(i); - const float* kptr = upxkm.row(j); - - float32x4_t _sum = vdupq_n_f32(0.f); - for (int k = 0; k < embed_dim_per_head; k++) - { - float32x4_t _q = vld1q_f32(qptr); - float32x4_t _k = vdupq_n_f32(kptr[0]); - _sum = vmlaq_f32(_sum, _q, _k); - qptr += 4; - kptr += 1; - } - - vst1q_f32(outptr, _sum); - outptr += 4; - } - } - } + return 0; + } +#endif - // softmax(xqk) - { - Mat outm = xqk.channel(q); - for (int i = 0; i < src_seqlen; i++) - { - float* ptr = outm.row(i); - - float32x4_t _max = vdupq_n_f32(-FLT_MAX); - for (int j = 0; j < dst_seqlen * dst_elempack; j++) - { - float32x4_t _p = vld1q_f32(ptr + j * 4); - _max = vmaxq_f32(_max, _p); - } - - float32x4_t _sum = vdupq_n_f32(0.f); - for (int j = 0; j < dst_seqlen * dst_elempack; j++) - { - float32x4_t _p = vld1q_f32(ptr + j * 4); - _p = exp_ps(vsubq_f32(_p, _max)); - vst1q_f32(ptr + j * 4, _p); - _sum = vaddq_f32(_sum, _p); - } - - for (int j = 0; j < dst_seqlen * dst_elempack; j++) - { - float32x4_t _p = vld1q_f32(ptr + j * 4); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); + Option optopt = opt; + optopt.use_bf16_storage = false; + optopt.use_fp16_arithmetic = false; + optopt.use_fp16_packed = false; + optopt.use_fp16_storage = false; + + { + const int embed_dim_per_head = embed_dim / num_head; + const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + + q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(0, inv_sqrt_embed_dim_per_head); + pd.set(1, 1.f); + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 1); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, embed_dim); // M + pd.set(8, 0); // N + pd.set(9, embed_dim); // K + pd.set(10, 1); // constant_broadcast_type_C + pd.set(11, 0); // output_N1M + pd.set(12, 1); // output_elempack + q_gemm->load_param(pd); + Mat weights[2]; + weights[0] = q_weight_data; + weights[1] = q_bias_data; + q_gemm->load_model(ModelBinFromMatArray(weights)); + q_gemm->create_pipeline(optopt); + + if (optopt.lightmode) + { + q_weight_data.release(); + q_bias_data.release(); + } + } + + { + k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 1); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, embed_dim); // M + pd.set(8, 0); // N + pd.set(9, kdim); // K + pd.set(10, 1); // constant_broadcast_type_C + pd.set(11, 0); // output_N1M + pd.set(12, 1); // output_elempack + k_gemm->load_param(pd); + Mat weights[2]; + weights[0] = k_weight_data; + weights[1] = k_bias_data; + k_gemm->load_model(ModelBinFromMatArray(weights)); + k_gemm->create_pipeline(optopt); + + if (optopt.lightmode) + { + k_weight_data.release(); + k_bias_data.release(); + } + } + + { + v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 1); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, embed_dim); // M + pd.set(8, 0); // N + pd.set(9, vdim); // K + pd.set(10, 1); // constant_broadcast_type_C + pd.set(11, 0); // 
output_N1M + pd.set(12, 1); // output_elempack + v_gemm->load_param(pd); + Mat weights[2]; + weights[0] = v_weight_data; + weights[1] = v_bias_data; + v_gemm->load_model(ModelBinFromMatArray(weights)); + v_gemm->create_pipeline(optopt); + + if (optopt.lightmode) + { + v_weight_data.release(); + v_bias_data.release(); + } + } + + { + o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 0); // constantA + pd.set(5, 1); // constantB + pd.set(6, 1); // constantC + pd.set(7, 0); // M = outch + pd.set(8, embed_dim); // N = size + pd.set(9, embed_dim); // K = maxk*inch + pd.set(10, 4); // constant_broadcast_type_C = null + pd.set(11, 0); // output_N1M + o_gemm->load_param(pd); + Mat weights[2]; + weights[0] = out_weight_data; + weights[1] = out_bias_data; + o_gemm->load_model(ModelBinFromMatArray(weights)); + o_gemm->create_pipeline(optopt); + + if (optopt.lightmode) + { + out_weight_data.release(); + out_bias_data.release(); + } + } + + { + qk_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 1); // transA + pd.set(3, 0); // transB + pd.set(4, 0); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, 0); // M + pd.set(8, 0); // N + pd.set(9, 0); // K + pd.set(10, -1); // constant_broadcast_type_C + pd.set(11, 0); // output_N1M + pd.set(12, 1); // output_elempack + qk_gemm->load_param(pd); + qk_gemm->load_model(ModelBinFromMatArray(0)); + Option opt1 = optopt; + opt1.num_threads = 1; + qk_gemm->create_pipeline(opt1); + } + + { + qkv_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + ncnn::ParamDict pd; + pd.set(2, 0); // transA + pd.set(3, 1); // transB + pd.set(4, 0); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, 0); // M + pd.set(8, 0); // N + pd.set(9, 0); // K + pd.set(10, -1); // constant_broadcast_type_C + pd.set(11, 0); // output_N1M + pd.set(12, 1); // output_elempack + qkv_gemm->load_param(pd); + qkv_gemm->load_model(ModelBinFromMatArray(0)); + Option opt1 = optopt; + opt1.num_threads = 1; + qkv_gemm->create_pipeline(opt1); + } + + return 0; +} + +int MultiHeadAttention_arm::destroy_pipeline(const Option& opt) +{ + Option opt32 = opt; + opt32.use_bf16_storage = false; + opt32.use_fp16_arithmetic = false; + opt32.use_fp16_packed = false; + opt32.use_fp16_storage = false; + + if (cvtfp16_to_fp32) + { + cvtfp16_to_fp32->destroy_pipeline(opt); + delete cvtfp16_to_fp32; + cvtfp16_to_fp32 = 0; + } + if (cvtfp32_to_fp16) + { + cvtfp32_to_fp16->destroy_pipeline(opt); + delete cvtfp32_to_fp16; + cvtfp32_to_fp16 = 0; + } + + if (qk_softmax) + { + qk_softmax->destroy_pipeline(opt32); + delete qk_softmax; + qk_softmax = 0; + } + + if (permute_wch) + { + permute_wch->destroy_pipeline(opt32); + delete permute_wch; + permute_wch = 0; + } + + +#if NCNN_ARM82 + if(support_fp16_storage && opt.use_fp16_packed) + { + Option optopt = opt; + + if (q_gemm) + { + q_gemm->destroy_pipeline(optopt); + delete q_gemm; + q_gemm = 0; + } + + if (k_gemm) + { + k_gemm->destroy_pipeline(optopt); + delete k_gemm; + k_gemm = 0; + } + + if (v_gemm) + { + v_gemm->destroy_pipeline(optopt); + delete v_gemm; + v_gemm = 0; + } + + if (o_gemm) + { + o_gemm->destroy_pipeline(optopt); + delete o_gemm; + o_gemm = 0; + } + + if (qk_gemm) + { + qk_gemm->destroy_pipeline(optopt); + delete qk_gemm; + qk_gemm = 0; + } + + if (qkv_gemm) + { + qkv_gemm->destroy_pipeline(optopt); + delete qkv_gemm; + qkv_gemm = 0; + } + + return 0; + } #endif - vst1q_f32(ptr 
+ j * 4, _p); - } - } - } - // xqkv = xqk * xv - // xqk (dst_seqlen, src_seqlen) - // xv (dst_seqlen, embed_dim_per_head) - // out (embed_dim_per_head, num_head, src_seqlen) - { - const Mat xqkm = xqk.channel(q); - const Mat xvm = xv.channel(q); + Option optopt = opt; + optopt.use_bf16_storage = false; + optopt.use_fp16_arithmetic = false; + optopt.use_fp16_packed = false; + optopt.use_fp16_storage = false; - for (int i = 0; i < src_seqlen; i++) - { - float* outptr = xqkv.channel(i).row(q); - - for (int j = 0; j < embed_dim_per_head; j++) - { - const float* qkptr = xqkm.row(i); - const float* vptr = xvm.row(j); - - float32x4_t _sum = vdupq_n_f32(0.f); - for (int k = 0; k < dst_seqlen * dst_elempack; k++) - { - float32x4_t _qk = vld1q_f32(qkptr); - float32x4_t _v = vdupq_n_f32(vptr[0]); - _sum = vmlaq_f32(_sum, _qk, _v); - qkptr += 4; - vptr += 1; - } - - vst1q_f32(outptr, _sum); - outptr += 4; - } - } - } + if (q_gemm) + { + q_gemm->destroy_pipeline(optopt); + delete q_gemm; + q_gemm = 0; + } + + if (k_gemm) + { + k_gemm->destroy_pipeline(optopt); + delete k_gemm; + k_gemm = 0; + } + + if (v_gemm) + { + v_gemm->destroy_pipeline(optopt); + delete v_gemm; + v_gemm = 0; + } + + if (o_gemm) + { + o_gemm->destroy_pipeline(optopt); + delete o_gemm; + o_gemm = 0; + } + + if (qk_gemm) + { + qk_gemm->destroy_pipeline(optopt); + delete qk_gemm; + qk_gemm = 0; + } + + if (qkv_gemm) + { + qkv_gemm->destroy_pipeline(optopt); + delete qkv_gemm; + qkv_gemm = 0; + } + + return 0; +} + +int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& q_blob = bottom_blobs[0]; + const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; + const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? 
k_blob : bottom_blobs[2]; + + const int embed_dim_per_head = embed_dim / num_head; + const int src_seqlen = q_blob.h * q_blob.elempack; + const int dst_seqlen = k_blob.h * k_blob.elempack; + + const int elembits = q_blob.elembits(); + + Option opt32 = opt; + opt32.use_bf16_storage = false; + opt32.use_fp16_arithmetic = false; + opt32.use_fp16_packed = false; + opt32.use_fp16_storage = false; + +#if NCNN_ARM82 + if(support_fp16_storage && opt.use_fp16_packed && elembits == 16) + { + printf("FP16\n"); + Mat q_affine, k_affine, v_affine; + Mat qk_cross(dst_seqlen, src_seqlen * num_head, 2u, opt.blob_allocator); + Mat qkv_cross(embed_dim_per_head, src_seqlen, num_head, 2u, opt.blob_allocator); + Mat qkv_wch_fp16(embed_dim, src_seqlen, 2u, opt.blob_allocator); + + + q_gemm->forward(q_blob, q_affine, opt); + k_gemm->forward(k_blob, k_affine, opt); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < num_head; i++) + { + std::vector qk_bottom_blobs(2); + qk_bottom_blobs[0] = q_affine.row_range(i * embed_dim_per_head, embed_dim_per_head); + qk_bottom_blobs[1] = k_affine.row_range(i * embed_dim_per_head, embed_dim_per_head); + std::vector qk_top_blobs(1); + qk_top_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen); + Option opt1 = opt; + opt1.num_threads = 1; + qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1); } - // out = affine(xqkv) - // xqkv (embed_dim, src_seqlen) + q_affine.release(); + k_affine.release(); + + Mat qk_cross_fp32, qk_cross_fp32_fp16; + cvtfp16_to_fp32->forward(qk_cross, qk_cross_fp32, opt); + qk_softmax->forward_inplace(qk_cross_fp32, opt32); + cvtfp32_to_fp16->forward(qk_cross_fp32, qk_cross_fp32_fp16, opt); + + qk_cross.release(); + qk_cross_fp32.release(); + + v_gemm->forward(v_blob, v_affine, opt); + #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < src_seqlen; i++) + for (int i = 0; i < num_head; i++) { - float* outptr = top_blob.row(i); + std::vector qkv_bottom_blobs(2); + qkv_bottom_blobs[0] = qk_cross_fp32_fp16.row_range(i * src_seqlen, src_seqlen); + qkv_bottom_blobs[1] = v_affine.row_range(i * embed_dim_per_head, embed_dim_per_head); + std::vector qkv_top_blobs(1); + qkv_top_blobs[0] = qkv_cross.channel(i); + Option opt1 = opt; + opt1.num_threads = 1; + qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1); + } - for (int j = 0; j < embed_dim; j++) - { - const float* ptr = xqkv.channel(i); - const float* kptr = (const float*)out_weight_data + embed_dim * j; + qk_cross_fp32_fp16.release(); + v_affine.release(); - float32x4_t _sum = vdupq_n_f32(out_bias_data[j]); - for (int k = 0; k < embed_dim; k++) + // permute + reshape + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < src_seqlen; q++) + { + __fp16* outptr = qkv_wch_fp16.row<__fp16>(q); + for (int i = 0; i < num_head; i++) + { + __fp16* ptr = qkv_cross.channel(i).row<__fp16>(q); + for (int j = 0; j < embed_dim_per_head; j++) { - float32x4_t _val = vld1q_f32(ptr); - float32x4_t _k = vdupq_n_f32(kptr[0]); - _sum = vmlaq_f32(_sum, _val, _k); - ptr += 4; - kptr += 1; + *outptr++ = ptr[j]; } - - vst1q_f32(outptr, _sum); - outptr += 4; } } + qkv_cross.release(); + + o_gemm->forward(qkv_wch_fp16, top_blobs[0], opt); + return 0; } -#endif // __ARM_NEON +#endif - // fallback to native implement - std::vector bottom_blobs_unpacked = bottom_blobs; - if (dst_elempack == 4) + printf("FP32\n"); + + Mat q_affine; + q_gemm->forward(q_blob, q_affine, opt32); + + Mat k_affine; + k_gemm->forward(k_blob, k_affine, opt32); + + Mat 
qk_cross(dst_seqlen, src_seqlen * num_head, 4u, opt32.blob_allocator); + #pragma omp parallel for num_threads(opt32.num_threads) + for (int i = 0; i < num_head; i++) { - convert_packing(bottom_blobs[1], bottom_blobs_unpacked[1], 1, opt); - if (bottom_blobs.size() == 3) - convert_packing(bottom_blobs[2], bottom_blobs_unpacked[2], 1, opt); + std::vector qk_bottom_blobs(2); + qk_bottom_blobs[0] = q_affine.row_range(i * embed_dim_per_head, embed_dim_per_head); + qk_bottom_blobs[1] = k_affine.row_range(i * embed_dim_per_head, embed_dim_per_head); + std::vector qk_top_blobs(1); + qk_top_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen); + Option opt1 = opt32; + opt1.num_threads = 1; + qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1); } - return MultiHeadAttention::forward(bottom_blobs_unpacked, top_blobs, opt); + + q_affine.release(); + k_affine.release(); + + qk_softmax->forward_inplace(qk_cross, opt32); + + Mat v_affine; + v_gemm->forward(v_blob, v_affine, opt32); + + Mat qkv_cross(embed_dim_per_head, src_seqlen, num_head, 4u, opt32.blob_allocator); + #pragma omp parallel for num_threads(opt32.num_threads) + for (int i = 0; i < num_head; i++) + { + std::vector qkv_bottom_blobs(2); + qkv_bottom_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen); + qkv_bottom_blobs[1] = v_affine.row_range(i * embed_dim_per_head, embed_dim_per_head); + std::vector qkv_top_blobs(1); + qkv_top_blobs[0] = qkv_cross.channel(i); + Option opt1 = opt32; + opt1.num_threads = 1; + qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1); + } + + qk_cross.release(); + v_affine.release(); + + { + Mat qkv_wch; + permute_wch->forward(qkv_cross, qkv_wch, opt32); + + qkv_cross.release(); + + qkv_wch = qkv_wch.reshape(embed_dim, src_seqlen); + + o_gemm->forward(qkv_wch, top_blobs[0], opt32); + } + + return 0; } } // namespace ncnn diff --git a/src/layer/arm/multiheadattention_arm.h b/src/layer/arm/multiheadattention_arm.h index c3856a013687..4eaf845aee10 100644 --- a/src/layer/arm/multiheadattention_arm.h +++ b/src/layer/arm/multiheadattention_arm.h @@ -24,7 +24,25 @@ class MultiHeadAttention_arm : virtual public MultiHeadAttention public: MultiHeadAttention_arm(); + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + Layer* cvtfp16_to_fp32; + Layer* cvtfp32_to_fp16; + + Layer* q_gemm; + Layer* k_gemm; + Layer* v_gemm; + Layer* o_gemm; + + Layer* qk_gemm; + Layer* qkv_gemm; + + Layer* qk_softmax; + Layer* permute_wch; }; } // namespace ncnn diff --git a/tests/testutil.h b/tests/testutil.h index 5910cdfb45e7..3e4542e8c851 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -1392,6 +1392,12 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec a_fp16 = a; } + // precision whitelist + if (strcmp(layer_type, "MultiHeadAttention") == 0) + { + epsilon = epsilon * 5; + } + std::vector weights_fp16; float epsilon_fp16; if (opt.use_bf16_storage) @@ -1532,6 +1538,12 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec a_fp16 = a; } + // precision whitelist + if (strcmp(layer_type, "MultiHeadAttention") == 0) + { + epsilon = epsilon * 5; + } + std::vector weights_fp16; float epsilon_fp16; if (opt.use_bf16_storage) From d44d4c5510b453328735267f9c0a26e2c81febfe Mon Sep 17 00:00:00 2001 From: EdVince <2456510228@qq.com> Date: Tue, 10 Jan 2023 17:42:58 +0800 Subject: [PATCH 02/11] rm:printf --- 
src/layer/arm/multiheadattention_arm.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp index 6993ea027af3..b169c153d5b5 100644 --- a/src/layer/arm/multiheadattention_arm.cpp +++ b/src/layer/arm/multiheadattention_arm.cpp @@ -17,8 +17,6 @@ #include "cpu.h" #include "layer_type.h" -#include - namespace ncnn { MultiHeadAttention_arm::MultiHeadAttention_arm() @@ -584,7 +582,6 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v #if NCNN_ARM82 if(support_fp16_storage && opt.use_fp16_packed && elembits == 16) { - printf("FP16\n"); Mat q_affine, k_affine, v_affine; Mat qk_cross(dst_seqlen, src_seqlen * num_head, 2u, opt.blob_allocator); Mat qkv_cross(embed_dim_per_head, src_seqlen, num_head, 2u, opt.blob_allocator); @@ -659,8 +656,6 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v } #endif - printf("FP32\n"); - Mat q_affine; q_gemm->forward(q_blob, q_affine, opt32); From 2f0ed3225c612f36dc99f8b92cae9ec1ad755014 Mon Sep 17 00:00:00 2001 From: EdVince Date: Tue, 10 Jan 2023 10:13:21 +0000 Subject: [PATCH 03/11] apply code-format changes --- src/layer/arm/multiheadattention_arm.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp index b169c153d5b5..75e8ece08888 100644 --- a/src/layer/arm/multiheadattention_arm.cpp +++ b/src/layer/arm/multiheadattention_arm.cpp @@ -21,7 +21,6 @@ namespace ncnn { MultiHeadAttention_arm::MultiHeadAttention_arm() { - #if __ARM_NEON support_packing = true; #if NCNN_ARM82 @@ -90,7 +89,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt) } #if NCNN_ARM82 - if(support_fp16_storage && opt.use_fp16_packed) + if (support_fp16_storage && opt.use_fp16_packed) { Option optopt = opt; @@ -458,9 +457,8 @@ int MultiHeadAttention_arm::destroy_pipeline(const Option& opt) permute_wch = 0; } - #if NCNN_ARM82 - if(support_fp16_storage && opt.use_fp16_packed) + if (support_fp16_storage && opt.use_fp16_packed) { Option optopt = opt; @@ -580,14 +578,13 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v opt32.use_fp16_storage = false; #if NCNN_ARM82 - if(support_fp16_storage && opt.use_fp16_packed && elembits == 16) + if (support_fp16_storage && opt.use_fp16_packed && elembits == 16) { Mat q_affine, k_affine, v_affine; Mat qk_cross(dst_seqlen, src_seqlen * num_head, 2u, opt.blob_allocator); Mat qkv_cross(embed_dim_per_head, src_seqlen, num_head, 2u, opt.blob_allocator); Mat qkv_wch_fp16(embed_dim, src_seqlen, 2u, opt.blob_allocator); - q_gemm->forward(q_blob, q_affine, opt); k_gemm->forward(k_blob, k_affine, opt); @@ -606,7 +603,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v q_affine.release(); k_affine.release(); - + Mat qk_cross_fp32, qk_cross_fp32_fp16; cvtfp16_to_fp32->forward(qk_cross, qk_cross_fp32, opt); qk_softmax->forward_inplace(qk_cross_fp32, opt32); @@ -616,7 +613,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v qk_cross_fp32.release(); v_gemm->forward(v_blob, v_affine, opt); - + #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < num_head; i++) { From dcb79942a39ea87f5df24b09a14542689561a933 Mon Sep 17 00:00:00 2001 From: EdVince <2456510228@qq.com> Date: Tue, 10 Jan 2023 22:16:50 +0800 Subject: [PATCH 04/11] add:gelu arm --- src/layer/arm/gelu_arm.cpp | 252 
+++++++++++++++++++++++++++++++++++++ src/layer/arm/gelu_arm.h | 42 +++++++ 2 files changed, 294 insertions(+) create mode 100644 src/layer/arm/gelu_arm.cpp create mode 100644 src/layer/arm/gelu_arm.h diff --git a/src/layer/arm/gelu_arm.cpp b/src/layer/arm/gelu_arm.cpp new file mode 100644 index 000000000000..2dfccabf993e --- /dev/null +++ b/src/layer/arm/gelu_arm.cpp @@ -0,0 +1,252 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "gelu_arm.h" + +#include + +#if __ARM_NEON +#include +#include "neon_mathfun.h" +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#include "neon_mathfun_fp16s.h" +#endif +#endif // __ARM_NEON + +#include "arm_usability.h" +#include "cpu.h" + +namespace ncnn { + +GELU_arm::GELU_arm() +{ +#if __ARM_NEON + support_packing = true; +#if NCNN_ARM82 + support_fp16_storage = cpu_support_arm_asimdhp(); +#endif +#endif // __ARM_NEON + +#if NCNN_BF16 + support_bf16_storage = true; +#endif +} + +int GELU_arm::create_pipeline(const Option& opt) +{ + if (!fast_gelu) + { + support_packing = false; + support_fp16_storage = false; + support_bf16_storage = false; + } + return 0; +} + +int GELU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + if (!fast_gelu) + { + return GELU::forward_inplace(bottom_top_blob, opt); + } + + int elembits = bottom_top_blob.elembits(); + +#if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_inplace_fp16sa(bottom_top_blob, opt); + else + return forward_inplace_fp16s(bottom_top_blob, opt); + } +#endif + +#if NCNN_BF16 + if (opt.use_bf16_storage && elembits == 16) + return forward_inplace_bf16s(bottom_top_blob, opt); +#endif + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int elempack = bottom_top_blob.elempack; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; + +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _pLoad = vld1q_f32(ptr); + + float32x4_t _blob = vmulq_f32(_pLoad, _pLoad); + _blob = vmulq_f32(_pLoad, _blob); + _blob = vmulq_f32(vdupq_n_f32(0.044715f * 0.79788452f), _blob); + _blob = vmlaq_f32(_blob, vdupq_n_f32(0.79788452f), _pLoad); + _blob = tanh_ps(_blob); + _blob = vaddq_f32(vdupq_n_f32(1.f), _blob); + _blob = vmulq_f32(vdupq_n_f32(0.5f), vmulq_f32(_blob, _pLoad)); + vst1q_f32(ptr, _blob); + ptr += 4; + } +#endif + for (; i < size; i++) + { + // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3))) + *ptr = 0.5f * *ptr * (1.0f + tanhf(0.79788452f * (*ptr + 0.044715f * *ptr * *ptr * *ptr))); + + ptr++; + } + } + + return 0; +} + + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC & NCNN_ARM82 +int GELU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) 
const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int elempack = bottom_top_blob.elempack; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = (__fp16*)bottom_top_blob.channel(q); + + int i = 0; + + for (; i + 3 < size; i += 4) + { + float32x4_t _pLoad = vcvt_f32_f16(vld1_f16(ptr)); + + float32x4_t _blob = vmulq_f32(_pLoad, _pLoad); + _blob = vmulq_f32(_pLoad, _blob); + _blob = vmulq_f32(vdupq_n_f32(0.044715f * 0.79788452f), _blob); + _blob = vmlaq_f32(_blob, vdupq_n_f32(0.79788452f), _pLoad); + _blob = tanh_ps(_blob); + _blob = vaddq_f32(vdupq_n_f32(1.f), _blob); + _blob = vmulq_f32(vdupq_n_f32(0.5f), vmulq_f32(_blob, _pLoad)); + vst1_f16(ptr, vcvt_f16_f32(_blob)); + ptr += 4; + } + + for (; i < size; i++) + { + float v = (float)*ptr; + v = 0.5f * v * (1.0f + tanhf(0.79788452f * (v + 0.044715f * v * v * v))); + *ptr = (__fp16)v; + ptr++; + } + } + + return 0; +} + +int GELU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int elempack = bottom_top_blob.elempack; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = (__fp16*)bottom_top_blob.channel(q); + + int i = 0; + + for (; i + 7 < size; i += 8) + { + float16x8_t _pLoad = vld1q_f16(ptr); + + float16x8_t _blob = vmulq_f16(_pLoad, _pLoad); + _blob = vmulq_f16(_pLoad, _blob); + _blob = vmulq_f16(vdupq_n_f16(0.044715f * 0.79788452f), _blob); + _blob = vfmaq_f16(_blob, vdupq_n_f16(0.79788452f), _pLoad); + _blob = tanh_ps(_blob); + _blob = vaddq_f16(vdupq_n_f16(1.f), _blob); + _blob = vmulq_f16(vdupq_n_f16(0.5f), vmulq_f16(_blob, _pLoad)); + vst1q_f16(ptr, _blob); + ptr += 8; + } + + for (; i < size; i++) + { + *ptr = (__fp16)0.5f * *ptr * ((__fp16)1.0f + tanhf((__fp16)0.79788452f * (*ptr + (__fp16)0.044715f * *ptr * *ptr * *ptr))); + ptr++; + } + } + + return 0; +} +#endif + + +#if NCNN_BF16 +int GELU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int elempack = bottom_top_blob.elempack; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + unsigned short* ptr = bottom_top_blob.channel(q); + + int i = 0; + + for (; i + 3 < size; i += 4) + { + float32x4_t _pLoad = bfloat2float(vld1_u16(ptr)); + + float32x4_t _blob = vmulq_f32(_pLoad, _pLoad); + _blob = vmulq_f32(_pLoad, _blob); + _blob = vmulq_f32(vdupq_n_f32(0.044715f * 0.79788452f), _blob); + _blob = vmlaq_f32(_blob, vdupq_n_f32(0.79788452f), _pLoad); + _blob = tanh_ps(_blob); + _blob = vaddq_f32(vdupq_n_f32(1.f), _blob); + _blob = vmulq_f32(vdupq_n_f32(0.5f), vmulq_f32(_blob, _pLoad)); + vst1_u16(ptr, float2bfloat(_blob)); + ptr += 4; + } + + for (; i < size; i++) + { + float v = bfloat16_to_float32(*ptr); + v = 0.5f * v * (1.0f + tanhf(0.79788452f * (v + 0.044715f * v * v * v))); + *ptr = float32_to_bfloat16(v); + ptr++; + } + } + + return 0; +} +#endif // NCNN_BF16 + +} // namespace ncnn diff --git a/src/layer/arm/gelu_arm.h b/src/layer/arm/gelu_arm.h new file mode 100644 index 000000000000..c6a3e3ee1486 --- /dev/null +++ b/src/layer/arm/gelu_arm.h @@ -0,0 +1,42 @@ +// Tencent is pleased to support the open source community by 
making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_GELU_ARM_H +#define LAYER_GELU_ARM_H + +#include "gelu.h" + +namespace ncnn { + +class GELU_arm : virtual public GELU +{ +public: + GELU_arm(); + + virtual int create_pipeline(const Option& opt); + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if NCNN_ARM82 + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; + int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; +#endif +#if NCNN_BF16 + int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_GELU_ARM_H From a501e9752847f6c0b915aa6cb35ccdee73d3c97b Mon Sep 17 00:00:00 2001 From: EdVince Date: Tue, 10 Jan 2023 14:21:41 +0000 Subject: [PATCH 05/11] apply code-format changes --- src/layer/arm/gelu_arm.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/layer/arm/gelu_arm.cpp b/src/layer/arm/gelu_arm.cpp index 2dfccabf993e..da6d912f7020 100644 --- a/src/layer/arm/gelu_arm.cpp +++ b/src/layer/arm/gelu_arm.cpp @@ -119,7 +119,6 @@ int GELU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } - #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC & NCNN_ARM82 int GELU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { @@ -204,7 +203,6 @@ int GELU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co } #endif - #if NCNN_BF16 int GELU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { From fd654b8a3c8443422e4ce210b297af00c89b5582 Mon Sep 17 00:00:00 2001 From: EdVince <2456510228@qq.com> Date: Tue, 10 Jan 2023 22:34:58 +0800 Subject: [PATCH 06/11] update:gelu arm --- src/layer/arm/gelu_arm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/arm/gelu_arm.cpp b/src/layer/arm/gelu_arm.cpp index da6d912f7020..d659685feff2 100644 --- a/src/layer/arm/gelu_arm.cpp +++ b/src/layer/arm/gelu_arm.cpp @@ -19,7 +19,7 @@ #if __ARM_NEON #include #include "neon_mathfun.h" -#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if NCNN_ARM82 #include "neon_mathfun_fp16s.h" #endif #endif // __ARM_NEON @@ -119,7 +119,7 @@ int GELU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } -#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC & NCNN_ARM82 +#if NCNN_ARM82 int GELU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const { int w = bottom_top_blob.w; From 3243bafd93852ff40583fd337a72f7dcb5ee26e3 Mon Sep 17 00:00:00 2001 From: EdVince <2456510228@qq.com> Date: Tue, 10 Jan 2023 22:47:09 +0800 Subject: [PATCH 07/11] rm:gelu arm --- src/layer/arm/gelu_arm.cpp | 250 ------------------------------------- src/layer/arm/gelu_arm.h | 42 ------- 2 files changed, 292 deletions(-) delete mode 100644 src/layer/arm/gelu_arm.cpp delete mode 100644 src/layer/arm/gelu_arm.h diff --git a/src/layer/arm/gelu_arm.cpp 
b/src/layer/arm/gelu_arm.cpp deleted file mode 100644 index d659685feff2..000000000000 --- a/src/layer/arm/gelu_arm.cpp +++ /dev/null @@ -1,250 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#include "gelu_arm.h" - -#include - -#if __ARM_NEON -#include -#include "neon_mathfun.h" -#if NCNN_ARM82 -#include "neon_mathfun_fp16s.h" -#endif -#endif // __ARM_NEON - -#include "arm_usability.h" -#include "cpu.h" - -namespace ncnn { - -GELU_arm::GELU_arm() -{ -#if __ARM_NEON - support_packing = true; -#if NCNN_ARM82 - support_fp16_storage = cpu_support_arm_asimdhp(); -#endif -#endif // __ARM_NEON - -#if NCNN_BF16 - support_bf16_storage = true; -#endif -} - -int GELU_arm::create_pipeline(const Option& opt) -{ - if (!fast_gelu) - { - support_packing = false; - support_fp16_storage = false; - support_bf16_storage = false; - } - return 0; -} - -int GELU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const -{ - if (!fast_gelu) - { - return GELU::forward_inplace(bottom_top_blob, opt); - } - - int elembits = bottom_top_blob.elembits(); - -#if NCNN_ARM82 - if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) - { - if (opt.use_fp16_arithmetic) - return forward_inplace_fp16sa(bottom_top_blob, opt); - else - return forward_inplace_fp16s(bottom_top_blob, opt); - } -#endif - -#if NCNN_BF16 - if (opt.use_bf16_storage && elembits == 16) - return forward_inplace_bf16s(bottom_top_blob, opt); -#endif - - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int elempack = bottom_top_blob.elempack; - int channels = bottom_top_blob.c; - int size = w * h * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* ptr = bottom_top_blob.channel(q); - - int i = 0; - -#if __ARM_NEON - for (; i + 3 < size; i += 4) - { - float32x4_t _pLoad = vld1q_f32(ptr); - - float32x4_t _blob = vmulq_f32(_pLoad, _pLoad); - _blob = vmulq_f32(_pLoad, _blob); - _blob = vmulq_f32(vdupq_n_f32(0.044715f * 0.79788452f), _blob); - _blob = vmlaq_f32(_blob, vdupq_n_f32(0.79788452f), _pLoad); - _blob = tanh_ps(_blob); - _blob = vaddq_f32(vdupq_n_f32(1.f), _blob); - _blob = vmulq_f32(vdupq_n_f32(0.5f), vmulq_f32(_blob, _pLoad)); - vst1q_f32(ptr, _blob); - ptr += 4; - } -#endif - for (; i < size; i++) - { - // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3))) - *ptr = 0.5f * *ptr * (1.0f + tanhf(0.79788452f * (*ptr + 0.044715f * *ptr * *ptr * *ptr))); - - ptr++; - } - } - - return 0; -} - -#if NCNN_ARM82 -int GELU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int elempack = bottom_top_blob.elempack; - int channels = bottom_top_blob.c; - int size = w * h * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = 
(__fp16*)bottom_top_blob.channel(q); - - int i = 0; - - for (; i + 3 < size; i += 4) - { - float32x4_t _pLoad = vcvt_f32_f16(vld1_f16(ptr)); - - float32x4_t _blob = vmulq_f32(_pLoad, _pLoad); - _blob = vmulq_f32(_pLoad, _blob); - _blob = vmulq_f32(vdupq_n_f32(0.044715f * 0.79788452f), _blob); - _blob = vmlaq_f32(_blob, vdupq_n_f32(0.79788452f), _pLoad); - _blob = tanh_ps(_blob); - _blob = vaddq_f32(vdupq_n_f32(1.f), _blob); - _blob = vmulq_f32(vdupq_n_f32(0.5f), vmulq_f32(_blob, _pLoad)); - vst1_f16(ptr, vcvt_f16_f32(_blob)); - ptr += 4; - } - - for (; i < size; i++) - { - float v = (float)*ptr; - v = 0.5f * v * (1.0f + tanhf(0.79788452f * (v + 0.044715f * v * v * v))); - *ptr = (__fp16)v; - ptr++; - } - } - - return 0; -} - -int GELU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int elempack = bottom_top_blob.elempack; - int channels = bottom_top_blob.c; - int size = w * h * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = (__fp16*)bottom_top_blob.channel(q); - - int i = 0; - - for (; i + 7 < size; i += 8) - { - float16x8_t _pLoad = vld1q_f16(ptr); - - float16x8_t _blob = vmulq_f16(_pLoad, _pLoad); - _blob = vmulq_f16(_pLoad, _blob); - _blob = vmulq_f16(vdupq_n_f16(0.044715f * 0.79788452f), _blob); - _blob = vfmaq_f16(_blob, vdupq_n_f16(0.79788452f), _pLoad); - _blob = tanh_ps(_blob); - _blob = vaddq_f16(vdupq_n_f16(1.f), _blob); - _blob = vmulq_f16(vdupq_n_f16(0.5f), vmulq_f16(_blob, _pLoad)); - vst1q_f16(ptr, _blob); - ptr += 8; - } - - for (; i < size; i++) - { - *ptr = (__fp16)0.5f * *ptr * ((__fp16)1.0f + tanhf((__fp16)0.79788452f * (*ptr + (__fp16)0.044715f * *ptr * *ptr * *ptr))); - ptr++; - } - } - - return 0; -} -#endif - -#if NCNN_BF16 -int GELU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int elempack = bottom_top_blob.elempack; - int channels = bottom_top_blob.c; - int size = w * h * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - unsigned short* ptr = bottom_top_blob.channel(q); - - int i = 0; - - for (; i + 3 < size; i += 4) - { - float32x4_t _pLoad = bfloat2float(vld1_u16(ptr)); - - float32x4_t _blob = vmulq_f32(_pLoad, _pLoad); - _blob = vmulq_f32(_pLoad, _blob); - _blob = vmulq_f32(vdupq_n_f32(0.044715f * 0.79788452f), _blob); - _blob = vmlaq_f32(_blob, vdupq_n_f32(0.79788452f), _pLoad); - _blob = tanh_ps(_blob); - _blob = vaddq_f32(vdupq_n_f32(1.f), _blob); - _blob = vmulq_f32(vdupq_n_f32(0.5f), vmulq_f32(_blob, _pLoad)); - vst1_u16(ptr, float2bfloat(_blob)); - ptr += 4; - } - - for (; i < size; i++) - { - float v = bfloat16_to_float32(*ptr); - v = 0.5f * v * (1.0f + tanhf(0.79788452f * (v + 0.044715f * v * v * v))); - *ptr = float32_to_bfloat16(v); - ptr++; - } - } - - return 0; -} -#endif // NCNN_BF16 - -} // namespace ncnn diff --git a/src/layer/arm/gelu_arm.h b/src/layer/arm/gelu_arm.h deleted file mode 100644 index c6a3e3ee1486..000000000000 --- a/src/layer/arm/gelu_arm.h +++ /dev/null @@ -1,42 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. 
You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#ifndef LAYER_GELU_ARM_H -#define LAYER_GELU_ARM_H - -#include "gelu.h" - -namespace ncnn { - -class GELU_arm : virtual public GELU -{ -public: - GELU_arm(); - - virtual int create_pipeline(const Option& opt); - virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; - -protected: -#if NCNN_ARM82 - int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; - int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; -#endif -#if NCNN_BF16 - int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; -#endif -}; - -} // namespace ncnn - -#endif // LAYER_GELU_ARM_H From debe2538471520db46c25109ef63522edb2b43bc Mon Sep 17 00:00:00 2001 From: EdVince <2456510228@qq.com> Date: Wed, 11 Jan 2023 11:50:33 +0800 Subject: [PATCH 08/11] update:copyright --- src/layer/arm/multiheadattention_arm.cpp | 2 +- src/layer/arm/multiheadattention_arm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp index 75e8ece08888..fd58e3bcc1cb 100644 --- a/src/layer/arm/multiheadattention_arm.cpp +++ b/src/layer/arm/multiheadattention_arm.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at diff --git a/src/layer/arm/multiheadattention_arm.h b/src/layer/arm/multiheadattention_arm.h index 4eaf845aee10..427e897a4abe 100644 --- a/src/layer/arm/multiheadattention_arm.h +++ b/src/layer/arm/multiheadattention_arm.h @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. 
You may obtain a copy of the License at From b0fdc5bfff326cc0d5772f44fdc1a63cc078e5ae Mon Sep 17 00:00:00 2001 From: EdVince <2456510228@qq.com> Date: Tue, 21 Feb 2023 15:06:35 +0800 Subject: [PATCH 09/11] fix:bf16 --- src/layer/arm/multiheadattention_arm.cpp | 63 ++++++++++++++---------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp index fd58e3bcc1cb..a4bae46b1637 100644 --- a/src/layer/arm/multiheadattention_arm.cpp +++ b/src/layer/arm/multiheadattention_arm.cpp @@ -28,6 +28,8 @@ MultiHeadAttention_arm::MultiHeadAttention_arm() #endif #endif // __ARM_NEON + support_bf16_storage = false; + cvtfp16_to_fp32 = 0; cvtfp32_to_fp16 = 0; @@ -45,6 +47,9 @@ MultiHeadAttention_arm::MultiHeadAttention_arm() int MultiHeadAttention_arm::create_pipeline(const Option& opt) { + Option optn = opt; + optn.use_bf16_storage = false; + Option opt32 = opt; opt32.use_bf16_storage = false; opt32.use_fp16_arithmetic = false; @@ -58,7 +63,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt) pd.set(1, 1); // from fp32 cvtfp16_to_fp32->load_param(pd); cvtfp16_to_fp32->load_model(ModelBinFromMatArray(0)); - cvtfp16_to_fp32->create_pipeline(opt); + cvtfp16_to_fp32->create_pipeline(optn); } { cvtfp32_to_fp16 = ncnn::create_layer(ncnn::LayerType::Cast); @@ -67,7 +72,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt) pd.set(1, 2); // from fp16 cvtfp32_to_fp16->load_param(pd); cvtfp32_to_fp16->load_model(ModelBinFromMatArray(0)); - cvtfp32_to_fp16->create_pipeline(opt); + cvtfp32_to_fp16->create_pipeline(optn); } { @@ -89,9 +94,9 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt) } #if NCNN_ARM82 - if (support_fp16_storage && opt.use_fp16_packed) + if (support_fp16_storage && optn.use_fp16_packed) { - Option optopt = opt; + Option optopt = optn; { const int embed_dim_per_head = embed_dim / num_head; @@ -255,7 +260,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt) } #endif - Option optopt = opt; + Option optopt = optn; optopt.use_bf16_storage = false; optopt.use_fp16_arithmetic = false; optopt.use_fp16_packed = false; @@ -424,7 +429,10 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt) int MultiHeadAttention_arm::destroy_pipeline(const Option& opt) { - Option opt32 = opt; + Option optn = opt; + optn.use_bf16_storage = false; + + Option opt32 = optn; opt32.use_bf16_storage = false; opt32.use_fp16_arithmetic = false; opt32.use_fp16_packed = false; @@ -432,13 +440,13 @@ int MultiHeadAttention_arm::destroy_pipeline(const Option& opt) if (cvtfp16_to_fp32) { - cvtfp16_to_fp32->destroy_pipeline(opt); + cvtfp16_to_fp32->destroy_pipeline(optn); delete cvtfp16_to_fp32; cvtfp16_to_fp32 = 0; } if (cvtfp32_to_fp16) { - cvtfp32_to_fp16->destroy_pipeline(opt); + cvtfp32_to_fp16->destroy_pipeline(optn); delete cvtfp32_to_fp16; cvtfp32_to_fp16 = 0; } @@ -458,9 +466,9 @@ int MultiHeadAttention_arm::destroy_pipeline(const Option& opt) } #if NCNN_ARM82 - if (support_fp16_storage && opt.use_fp16_packed) + if (support_fp16_storage && optn.use_fp16_packed) { - Option optopt = opt; + Option optopt = optn; if (q_gemm) { @@ -508,7 +516,7 @@ int MultiHeadAttention_arm::destroy_pipeline(const Option& opt) } #endif - Option optopt = opt; + Option optopt = optn; optopt.use_bf16_storage = false; optopt.use_fp16_arithmetic = false; optopt.use_fp16_packed = false; @@ -571,24 +579,27 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v const int 
elembits = q_blob.elembits(); - Option opt32 = opt; + Option optn = opt; + optn.use_bf16_storage = false; + + Option opt32 = optn; opt32.use_bf16_storage = false; opt32.use_fp16_arithmetic = false; opt32.use_fp16_packed = false; opt32.use_fp16_storage = false; #if NCNN_ARM82 - if (support_fp16_storage && opt.use_fp16_packed && elembits == 16) + if (support_fp16_storage && optn.use_fp16_packed && elembits == 16) { Mat q_affine, k_affine, v_affine; - Mat qk_cross(dst_seqlen, src_seqlen * num_head, 2u, opt.blob_allocator); - Mat qkv_cross(embed_dim_per_head, src_seqlen, num_head, 2u, opt.blob_allocator); + Mat qk_cross(dst_seqlen, src_seqlen * num_head, 2u, optn.blob_allocator); + Mat qkv_cross(embed_dim_per_head, src_seqlen, num_head, 2u, optn.blob_allocator); Mat qkv_wch_fp16(embed_dim, src_seqlen, 2u, opt.blob_allocator); - q_gemm->forward(q_blob, q_affine, opt); - k_gemm->forward(k_blob, k_affine, opt); + q_gemm->forward(q_blob, q_affine, optn); + k_gemm->forward(k_blob, k_affine, optn); - #pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(optn.num_threads) for (int i = 0; i < num_head; i++) { std::vector qk_bottom_blobs(2); @@ -596,7 +607,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v qk_bottom_blobs[1] = k_affine.row_range(i * embed_dim_per_head, embed_dim_per_head); std::vector qk_top_blobs(1); qk_top_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen); - Option opt1 = opt; + Option opt1 = optn; opt1.num_threads = 1; qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1); } @@ -605,16 +616,16 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v k_affine.release(); Mat qk_cross_fp32, qk_cross_fp32_fp16; - cvtfp16_to_fp32->forward(qk_cross, qk_cross_fp32, opt); + cvtfp16_to_fp32->forward(qk_cross, qk_cross_fp32, optn); qk_softmax->forward_inplace(qk_cross_fp32, opt32); - cvtfp32_to_fp16->forward(qk_cross_fp32, qk_cross_fp32_fp16, opt); + cvtfp32_to_fp16->forward(qk_cross_fp32, qk_cross_fp32_fp16, optn); qk_cross.release(); qk_cross_fp32.release(); - v_gemm->forward(v_blob, v_affine, opt); + v_gemm->forward(v_blob, v_affine, optn); - #pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(optn.num_threads) for (int i = 0; i < num_head; i++) { std::vector qkv_bottom_blobs(2); @@ -622,7 +633,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v qkv_bottom_blobs[1] = v_affine.row_range(i * embed_dim_per_head, embed_dim_per_head); std::vector qkv_top_blobs(1); qkv_top_blobs[0] = qkv_cross.channel(i); - Option opt1 = opt; + Option opt1 = optn; opt1.num_threads = 1; qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1); } @@ -631,7 +642,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v v_affine.release(); // permute + reshape - #pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(optn.num_threads) for (int q = 0; q < src_seqlen; q++) { __fp16* outptr = qkv_wch_fp16.row<__fp16>(q); @@ -647,7 +658,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v qkv_cross.release(); - o_gemm->forward(qkv_wch_fp16, top_blobs[0], opt); + o_gemm->forward(qkv_wch_fp16, top_blobs[0], optn); return 0; } From 968f83c132573bbd2fa9885ab02a1511f6f03811 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 22 Feb 2023 19:18:47 +0800 Subject: [PATCH 10/11] permute free mha --- src/layer/arm/multiheadattention_arm.cpp | 87 +++++++----------------- 
From 968f83c132573bbd2fa9885ab02a1511f6f03811 Mon Sep 17 00:00:00 2001
From: nihuini
Date: Wed, 22 Feb 2023 19:18:47 +0800
Subject: [PATCH 10/11] permute free mha

---
 src/layer/arm/multiheadattention_arm.cpp | 87 +++++++-----------------
 src/layer/arm/multiheadattention_arm.h   |  1 -
 2 files changed, 26 insertions(+), 62 deletions(-)

diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp
index a4bae46b1637..7ed28b4c1943 100644
--- a/src/layer/arm/multiheadattention_arm.cpp
+++ b/src/layer/arm/multiheadattention_arm.cpp
@@ -42,7 +42,6 @@ MultiHeadAttention_arm::MultiHeadAttention_arm()
     qkv_gemm = 0;
 
     qk_softmax = 0;
-    permute_wch = 0;
 }
 
 int MultiHeadAttention_arm::create_pipeline(const Option& opt)
@@ -84,14 +83,6 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
         qk_softmax->load_model(ModelBinFromMatArray(0));
         qk_softmax->create_pipeline(opt32);
     }
-    {
-        permute_wch = ncnn::create_layer(ncnn::LayerType::Permute);
-        ncnn::ParamDict pd;
-        pd.set(0, 2); // wch
-        permute_wch->load_param(pd);
-        permute_wch->load_model(ModelBinFromMatArray(0));
-        permute_wch->create_pipeline(opt32);
-    }
 
 #if NCNN_ARM82
     if (support_fp16_storage && optn.use_fp16_packed)
@@ -117,6 +108,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
             pd.set(10, 1); // constant_broadcast_type_C
             pd.set(11, 0); // output_N1M
             pd.set(12, 1); // output_elempack
+            pd.set(14, 0); // output_transpose
             q_gemm->load_param(pd);
             Mat weights[2];
             weights[0] = q_weight_data;
@@ -145,6 +137,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
             pd.set(10, 1); // constant_broadcast_type_C
             pd.set(11, 0); // output_N1M
             pd.set(12, 1); // output_elempack
+            pd.set(14, 0); // output_transpose
             k_gemm->load_param(pd);
             Mat weights[2];
             weights[0] = k_weight_data;
@@ -173,6 +166,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
             pd.set(10, 1); // constant_broadcast_type_C
             pd.set(11, 0); // output_N1M
             pd.set(12, 1); // output_elempack
+            pd.set(14, 0); // output_transpose
             v_gemm->load_param(pd);
             Mat weights[2];
             weights[0] = v_weight_data;
@@ -190,7 +184,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
         {
             o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
             ncnn::ParamDict pd;
-            pd.set(2, 0); // transA
+            pd.set(2, 1); // transA
             pd.set(3, 1); // transB
             pd.set(4, 0); // constantA
             pd.set(5, 1); // constantB
@@ -249,6 +243,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
             pd.set(10, -1); // constant_broadcast_type_C
             pd.set(11, 0); // output_N1M
             pd.set(12, 1); // output_elempack
+            pd.set(14, 1); // output_transpose
             qkv_gemm->load_param(pd);
             qkv_gemm->load_model(ModelBinFromMatArray(0));
             Option opt1 = optopt;
@@ -285,6 +280,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
         pd.set(10, 1); // constant_broadcast_type_C
         pd.set(11, 0); // output_N1M
         pd.set(12, 1); // output_elempack
+        pd.set(14, 0); // output_transpose
         q_gemm->load_param(pd);
         Mat weights[2];
         weights[0] = q_weight_data;
@@ -313,6 +309,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
         pd.set(10, 1); // constant_broadcast_type_C
         pd.set(11, 0); // output_N1M
         pd.set(12, 1); // output_elempack
+        pd.set(14, 0); // output_transpose
         k_gemm->load_param(pd);
         Mat weights[2];
         weights[0] = k_weight_data;
@@ -341,6 +338,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
         pd.set(10, 1); // constant_broadcast_type_C
         pd.set(11, 0); // output_N1M
         pd.set(12, 1); // output_elempack
+        pd.set(14, 0); // output_transpose
         v_gemm->load_param(pd);
         Mat weights[2];
         weights[0] = v_weight_data;
@@ -358,7 +356,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
     {
         o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
         ncnn::ParamDict pd;
-        pd.set(2, 0); // transA
+        pd.set(2, 1); // transA
         pd.set(3, 1); // transB
         pd.set(4, 0); // constantA
         pd.set(5, 1); // constantB
@@ -417,6 +415,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
         pd.set(10, -1); // constant_broadcast_type_C
         pd.set(11, 0); // output_N1M
         pd.set(12, 1); // output_elempack
+        pd.set(14, 1); // output_transpose
         qkv_gemm->load_param(pd);
         qkv_gemm->load_model(ModelBinFromMatArray(0));
         Option opt1 = optopt;
@@ -458,13 +457,6 @@ int MultiHeadAttention_arm::destroy_pipeline(const Option& opt)
         qk_softmax = 0;
     }
 
-    if (permute_wch)
-    {
-        permute_wch->destroy_pipeline(opt32);
-        delete permute_wch;
-        permute_wch = 0;
-    }
-
 #if NCNN_ARM82
     if (support_fp16_storage && optn.use_fp16_packed)
     {
@@ -589,16 +581,15 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v
     opt32.use_fp16_storage = false;
 
 #if NCNN_ARM82
-    if (support_fp16_storage && optn.use_fp16_packed && elembits == 16)
+    if (support_fp16_storage && optn.use_fp16_storage && elembits == 16)
     {
-        Mat q_affine, k_affine, v_affine;
-        Mat qk_cross(dst_seqlen, src_seqlen * num_head, 2u, optn.blob_allocator);
-        Mat qkv_cross(embed_dim_per_head, src_seqlen, num_head, 2u, optn.blob_allocator);
-        Mat qkv_wch_fp16(embed_dim, src_seqlen, 2u, opt.blob_allocator);
-
+        Mat q_affine;
         q_gemm->forward(q_blob, q_affine, optn);
+
+        Mat k_affine;
         k_gemm->forward(k_blob, k_affine, optn);
 
+        Mat qk_cross(dst_seqlen, src_seqlen * num_head, 2u, optn.blob_allocator);
         #pragma omp parallel for num_threads(optn.num_threads)
         for (int i = 0; i < num_head; i++)
         {
@@ -615,50 +606,34 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v
         q_affine.release();
         k_affine.release();
 
-        Mat qk_cross_fp32, qk_cross_fp32_fp16;
+        // TODO implement fp16s softmax
+        Mat qk_cross_fp32;
         cvtfp16_to_fp32->forward(qk_cross, qk_cross_fp32, optn);
         qk_softmax->forward_inplace(qk_cross_fp32, opt32);
-        cvtfp32_to_fp16->forward(qk_cross_fp32, qk_cross_fp32_fp16, optn);
+        cvtfp32_to_fp16->forward(qk_cross_fp32, qk_cross, optn);
 
-        qk_cross.release();
         qk_cross_fp32.release();
 
+        Mat v_affine;
         v_gemm->forward(v_blob, v_affine, optn);
 
+        Mat qkv_cross(src_seqlen, embed_dim_per_head * num_head, 2u, optn.blob_allocator);
         #pragma omp parallel for num_threads(optn.num_threads)
         for (int i = 0; i < num_head; i++)
         {
             std::vector<Mat> qkv_bottom_blobs(2);
-            qkv_bottom_blobs[0] = qk_cross_fp32_fp16.row_range(i * src_seqlen, src_seqlen);
+            qkv_bottom_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen);
             qkv_bottom_blobs[1] = v_affine.row_range(i * embed_dim_per_head, embed_dim_per_head);
             std::vector<Mat> qkv_top_blobs(1);
-            qkv_top_blobs[0] = qkv_cross.channel(i);
+            qkv_top_blobs[0] = qkv_cross.row_range(i * embed_dim_per_head, embed_dim_per_head);
             Option opt1 = optn;
             opt1.num_threads = 1;
             qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1);
         }
 
-        qk_cross_fp32_fp16.release();
         v_affine.release();
 
-        // permute + reshape
-        #pragma omp parallel for num_threads(optn.num_threads)
-        for (int q = 0; q < src_seqlen; q++)
-        {
-            __fp16* outptr = qkv_wch_fp16.row<__fp16>(q);
-            for (int i = 0; i < num_head; i++)
-            {
-                __fp16* ptr = qkv_cross.channel(i).row<__fp16>(q);
-                for (int j = 0; j < embed_dim_per_head; j++)
-                {
-                    *outptr++ = ptr[j];
-                }
-            }
-        }
-
-        qkv_cross.release();
-
-        o_gemm->forward(qkv_wch_fp16, top_blobs[0], optn);
+        o_gemm->forward(qkv_cross, top_blobs[0], optn);
 
         return 0;
     }
@@ -692,7 +667,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v
     Mat v_affine;
     v_gemm->forward(v_blob, v_affine, opt32);
 
-    Mat qkv_cross(embed_dim_per_head, src_seqlen, num_head, 4u, opt32.blob_allocator);
+    Mat qkv_cross(src_seqlen, embed_dim_per_head * num_head, 4u, opt32.blob_allocator);
     #pragma omp parallel for num_threads(opt32.num_threads)
     for (int i = 0; i < num_head; i++)
     {
@@ -700,25 +675,15 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v
         qkv_bottom_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen);
         qkv_bottom_blobs[1] = v_affine.row_range(i * embed_dim_per_head, embed_dim_per_head);
         std::vector<Mat> qkv_top_blobs(1);
-        qkv_top_blobs[0] = qkv_cross.channel(i);
+        qkv_top_blobs[0] = qkv_cross.row_range(i * embed_dim_per_head, embed_dim_per_head);
         Option opt1 = opt32;
         opt1.num_threads = 1;
         qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1);
     }
 
-    qk_cross.release();
     v_affine.release();
 
-    {
-        Mat qkv_wch;
-        permute_wch->forward(qkv_cross, qkv_wch, opt32);
-
-        qkv_cross.release();
-
-        qkv_wch = qkv_wch.reshape(embed_dim, src_seqlen);
-
-        o_gemm->forward(qkv_wch, top_blobs[0], opt32);
-    }
+    o_gemm->forward(qkv_cross, top_blobs[0], opt32);
 
     return 0;
 }
diff --git a/src/layer/arm/multiheadattention_arm.h b/src/layer/arm/multiheadattention_arm.h
index 427e897a4abe..616d98bac55d 100644
--- a/src/layer/arm/multiheadattention_arm.h
+++ b/src/layer/arm/multiheadattention_arm.h
@@ -42,7 +42,6 @@ class MultiHeadAttention_arm : virtual public MultiHeadAttention
     Layer* qkv_gemm;
 
     Layer* qk_softmax;
-    Layer* permute_wch;
 };
 
 } // namespace ncnn
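The permute free rework above rests on a layout identity: when each per-head result (src_seqlen x embed_dim_per_head) is written transposed, which is what the Gemm output_transpose parameter (pd.set(14, 1)) requests, and the per-head blocks are stacked as row ranges of a single (num_head * embed_dim_per_head) x src_seqlen matrix, that matrix is exactly the transpose of the (src_seqlen x embed_dim) matrix the old permute + reshape used to build, so the output projection can consume it directly with transA = 1 (pd.set(2, 1)). A small self-contained check of that identity on plain arrays; the names and sizes are illustrative only, not ncnn code:

    #include <cassert>
    #include <vector>

    int main()
    {
        const int num_head = 2, seq = 3, dph = 4; // dph = embed_dim_per_head
        const int embed_dim = num_head * dph;

        // per-head attention outputs H[i], each seq x dph, filled with dummy values
        std::vector<float> H(num_head * seq * dph);
        for (size_t x = 0; x < H.size(); x++)
            H[x] = (float)x;

        // old layout: permute + reshape into A (seq x embed_dim), heads side by side
        std::vector<float> A(seq * embed_dim);
        for (int q = 0; q < seq; q++)
            for (int i = 0; i < num_head; i++)
                for (int j = 0; j < dph; j++)
                    A[q * embed_dim + i * dph + j] = H[(i * seq + q) * dph + j];

        // new layout: each head stores its output transposed (dph x seq) into a row
        // block of M (embed_dim x seq), the shape that output_transpose=1 produces
        std::vector<float> M(embed_dim * seq);
        for (int i = 0; i < num_head; i++)
            for (int j = 0; j < dph; j++)
                for (int q = 0; q < seq; q++)
                    M[(i * dph + j) * seq + q] = H[(i * seq + q) * dph + j];

        // M is A transposed, so reading M with transA=1 feeds o_gemm the same matrix
        for (int q = 0; q < seq; q++)
            for (int c = 0; c < embed_dim; c++)
                assert(M[c * seq + q] == A[q * embed_dim + c]);

        return 0;
    }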
From b9df882fa9ea83917babf060561a2330404dba66 Mon Sep 17 00:00:00 2001
From: nihuini
Date: Wed, 22 Feb 2023 19:41:33 +0800
Subject: [PATCH 11/11] move epsilon setting to test file

---
 src/layer/arm/multiheadattention_arm.cpp |  5 +++--
 tests/test_multiheadattention.cpp        | 16 +++++++++++-----
 tests/testutil.h                         | 12 ------------
 3 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp
index 7ed28b4c1943..328b79fd0bb3 100644
--- a/src/layer/arm/multiheadattention_arm.cpp
+++ b/src/layer/arm/multiheadattention_arm.cpp
@@ -85,7 +85,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& opt)
     }
 
 #if NCNN_ARM82
-    if (support_fp16_storage && optn.use_fp16_packed)
+    if (support_fp16_storage && optn.use_fp16_storage)
     {
         Option optopt = optn;
 
@@ -458,7 +458,7 @@ int MultiHeadAttention_arm::destroy_pipeline(const Option& opt)
     }
 
 #if NCNN_ARM82
-    if (support_fp16_storage && optn.use_fp16_packed)
+    if (support_fp16_storage && optn.use_fp16_storage)
     {
         Option optopt = optn;
 
@@ -583,6 +583,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v
 #if NCNN_ARM82
     if (support_fp16_storage && optn.use_fp16_storage && elembits == 16)
     {
+        // TODO implement true fp16s with gemm output_elemtype fp32
         Mat q_affine;
         q_gemm->forward(q_blob, q_affine, optn);
 
diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp
index e7440fd55bda..7ed18c4fe46a 100644
--- a/tests/test_multiheadattention.cpp
+++ b/tests/test_multiheadattention.cpp
@@ -41,7 +41,9 @@ static int test_multiheadattention(const ncnn::Mat& q, const ncnn::Mat& k, const
     as[1] = k;
     as[2] = v;
 
-    int ret = test_layer("MultiHeadAttention", pd, weights, as);
+    float epsilon = 0.005;
+
+    int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_multiheadattention failed q=(%d %d) k=(%d %d) v=(%d %d)\n", q.w, q.h, k.w, k.h, v.w, v.h);
     }
@@ -75,10 +77,12 @@ static int test_multiheadattention_samekv(const ncnn::Mat& q, const ncnn::Mat& k
     as[0] = q;
     as[1] = kv;
 
-    int ret = test_layer("MultiHeadAttention", pd, weights, as);
+    float epsilon = 0.005;
+
+    int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon);
     if (ret != 0)
     {
-        fprintf(stderr, "test_multiheadattention failed q=(%d %d) kv=(%d %d)\n", q.w, q.h, kv.w, kv.h);
+        fprintf(stderr, "test_multiheadattention_samekv failed q=(%d %d) kv=(%d %d)\n", q.w, q.h, kv.w, kv.h);
     }
 
     return ret;
@@ -106,10 +110,12 @@ static int test_multiheadattention_sameqkv(const ncnn::Mat& a, int num_heads)
     std::vector<ncnn::Mat> as(1);
     as[0] = a;
 
-    int ret = test_layer("MultiHeadAttention", pd, weights, as);
+    float epsilon = 0.005;
+
+    int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon);
     if (ret != 0)
     {
-        fprintf(stderr, "test_multiheadattention failed a=(%d %d)\n", a.w, a.h);
+        fprintf(stderr, "test_multiheadattention_sameqkv failed a=(%d %d)\n", a.w, a.h);
     }
 
     return ret;
diff --git a/tests/testutil.h b/tests/testutil.h
index 1e7072aecc04..5a892d687a8d 100644
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -1392,12 +1392,6 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec
         a_fp16 = a;
     }
 
-    // precision whitelist
-    if (strcmp(layer_type, "MultiHeadAttention") == 0)
-    {
-        epsilon = epsilon * 5;
-    }
-
     std::vector<ncnn::Mat> weights_fp16;
     float epsilon_fp16;
     if (opt.use_bf16_storage)
@@ -1538,12 +1532,6 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec
         a_fp16 = a;
     }
 
-    // precision whitelist
-    if (strcmp(layer_type, "MultiHeadAttention") == 0)
-    {
-        epsilon = epsilon * 5;
-    }
-
     std::vector<ncnn::Mat> weights_fp16;
     float epsilon_fp16;
     if (opt.use_bf16_storage