From 66d1104930a93bbe05dbdeddd986c14652ca06ae Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 13 Jun 2022 17:14:01 +0800 Subject: [PATCH 01/36] feat(tools/quantize): support .ini format feat(src/layer): add mha int8 --- benchmark/benchncnn.cpp | 3 + benchmark/vision_transformer_int8.param | 146 ++ docs/developer-guide/operators.md | 13 +- .../quantized-int8-inference.md | 28 +- src/layer/multiheadattention.cpp | 333 ++- src/layer/multiheadattention.h | 30 + tests/test_multiheadattention.cpp | 60 +- tools/CMakeLists.txt | 2 +- tools/modelwriter.cpp | 2069 +++++++++++++++++ tools/modelwriter.h | 2054 +--------------- tools/quantize/CMakeLists.txt | 9 +- tools/quantize/imreadwrite.h | 2 +- tools/quantize/ini_config.cpp | 224 ++ tools/quantize/ini_config.h | 323 +++ tools/quantize/ncnn2int8.cpp | 497 +--- tools/quantize/ncnn2table.cpp | 103 +- tools/quantize/net_quantize.cpp | 625 +++++ tools/quantize/net_quantize.h | 48 + 18 files changed, 3998 insertions(+), 2571 deletions(-) create mode 100644 benchmark/vision_transformer_int8.param create mode 100644 tools/modelwriter.cpp create mode 100644 tools/quantize/ini_config.cpp create mode 100644 tools/quantize/ini_config.h create mode 100644 tools/quantize/net_quantize.cpp create mode 100644 tools/quantize/net_quantize.h diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 032e3f9fbc42..283c76a340e2 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -321,6 +321,9 @@ int main(int argc, char** argv) benchmark("vision_transformer", ncnn::Mat(384, 384, 3), opt); benchmark("FastestDet", ncnn::Mat(352, 352, 3), opt); + + benchmark("vision_transformer_int8", ncnn::Mat(384, 384, 3), opt); + #if NCNN_VULKAN delete g_blob_vkallocator; delete g_staging_vkallocator; diff --git a/benchmark/vision_transformer_int8.param b/benchmark/vision_transformer_int8.param new file mode 100644 index 000000000000..0a7e92383add --- /dev/null +++ b/benchmark/vision_transformer_int8.param @@ -0,0 +1,146 @@ +7767517 +144 192 +Input input 0 1 input +MemoryData backbone.cls_token 0 1 backbone.cls_token 0=768 1=1 +MemoryData backbone.pos_embed 0 1 backbone.pos_embed 0=768 1=145 +Convolution Conv_0 1 1 input onnx::Reshape_153 0=768 1=32 3=32 5=1 6=2359296 8=2 +Reshape Reshape_2 1 1 onnx::Reshape_153 onnx::Transpose_155 0=-1 1=768 +Permute Transpose_3 1 1 onnx::Transpose_155 onnx::Concat_156 0=1 +Concat Concat_4 2 1 backbone.cls_token onnx::Concat_156 onnx::Add_157 +BinaryOp Add_5 2 1 onnx::Add_157 backbone.pos_embed input.1 +Split splitncnn_0 1 2 input.1 input.1_splitncnn_0 input.1_splitncnn_1 +LayerNorm LayerNorm_6 1 1 input.1_splitncnn_1 qkv_input 0=768 1=1.000000e-06 +Split splitncnn_1 1 3 qkv_input qkv_input_splitncnn_0 qkv_input_splitncnn_1 qkv_input_splitncnn_2 +MultiHeadAttention MultiHeadAttention_15 3 1 qkv_input_splitncnn_2 qkv_input_splitncnn_1 qkv_input_splitncnn_0 onnx::Add_168 0=768 1=12 2=589824 3=1 +BinaryOp Add_16 2 1 input.1_splitncnn_0 onnx::Add_168 input.4 +Split splitncnn_2 1 2 input.4 input.4_splitncnn_0 input.4_splitncnn_1 +LayerNorm LayerNorm_17 1 1 input.4_splitncnn_1 onnx::Gemm_170 0=768 1=1.000000e-06 +InnerProduct Gemm_18 1 1 onnx::Gemm_170 mmdeploy::Gelu_171 0=3072 1=1 2=2359296 8=2 +GELU Gelu_19 1 1 mmdeploy::Gelu_171 input.8 0=1 +InnerProduct Gemm_20 1 1 input.8 input.12 0=768 1=1 2=2359296 8=2 +BinaryOp Add_21 2 1 input.4_splitncnn_0 input.12 input.16 +Split splitncnn_3 1 2 input.16 input.16_splitncnn_0 input.16_splitncnn_1 +LayerNorm LayerNorm_22 1 1 input.16_splitncnn_1 qkv_input.3 0=768 1=1.000000e-06 
+Split splitncnn_4 1 3 qkv_input.3 qkv_input.3_splitncnn_0 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_2 +MultiHeadAttention MultiHeadAttention_31 3 1 qkv_input.3_splitncnn_2 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_0 onnx::Add_184 0=768 1=12 2=589824 3=1 +BinaryOp Add_32 2 1 input.16_splitncnn_0 onnx::Add_184 input.20 +Split splitncnn_5 1 2 input.20 input.20_splitncnn_0 input.20_splitncnn_1 +LayerNorm LayerNorm_33 1 1 input.20_splitncnn_1 onnx::Gemm_186 0=768 1=1.000000e-06 +InnerProduct Gemm_34 1 1 onnx::Gemm_186 mmdeploy::Gelu_187 0=3072 1=1 2=2359296 8=2 +GELU Gelu_35 1 1 mmdeploy::Gelu_187 input.24 0=1 +InnerProduct Gemm_36 1 1 input.24 input.28 0=768 1=1 2=2359296 8=2 +BinaryOp Add_37 2 1 input.20_splitncnn_0 input.28 input.32 +Split splitncnn_6 1 2 input.32 input.32_splitncnn_0 input.32_splitncnn_1 +LayerNorm LayerNorm_38 1 1 input.32_splitncnn_1 qkv_input.7 0=768 1=1.000000e-06 +Split splitncnn_7 1 3 qkv_input.7 qkv_input.7_splitncnn_0 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_2 +MultiHeadAttention MultiHeadAttention_47 3 1 qkv_input.7_splitncnn_2 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_0 onnx::Add_200 0=768 1=12 2=589824 3=1 +BinaryOp Add_48 2 1 input.32_splitncnn_0 onnx::Add_200 input.36 +Split splitncnn_8 1 2 input.36 input.36_splitncnn_0 input.36_splitncnn_1 +LayerNorm LayerNorm_49 1 1 input.36_splitncnn_1 onnx::Gemm_202 0=768 1=1.000000e-06 +InnerProduct Gemm_50 1 1 onnx::Gemm_202 mmdeploy::Gelu_203 0=3072 1=1 2=2359296 8=2 +GELU Gelu_51 1 1 mmdeploy::Gelu_203 input.40 0=1 +InnerProduct Gemm_52 1 1 input.40 input.44 0=768 1=1 2=2359296 8=2 +BinaryOp Add_53 2 1 input.36_splitncnn_0 input.44 input.48 +Split splitncnn_9 1 2 input.48 input.48_splitncnn_0 input.48_splitncnn_1 +LayerNorm LayerNorm_54 1 1 input.48_splitncnn_1 qkv_input.11 0=768 1=1.000000e-06 +Split splitncnn_10 1 3 qkv_input.11 qkv_input.11_splitncnn_0 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_2 +MultiHeadAttention MultiHeadAttention_63 3 1 qkv_input.11_splitncnn_2 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_0 onnx::Add_216 0=768 1=12 2=589824 3=1 +BinaryOp Add_64 2 1 input.48_splitncnn_0 onnx::Add_216 input.52 +Split splitncnn_11 1 2 input.52 input.52_splitncnn_0 input.52_splitncnn_1 +LayerNorm LayerNorm_65 1 1 input.52_splitncnn_1 onnx::Gemm_218 0=768 1=1.000000e-06 +InnerProduct Gemm_66 1 1 onnx::Gemm_218 mmdeploy::Gelu_219 0=3072 1=1 2=2359296 8=2 +GELU Gelu_67 1 1 mmdeploy::Gelu_219 input.56 0=1 +InnerProduct Gemm_68 1 1 input.56 input.60 0=768 1=1 2=2359296 8=2 +BinaryOp Add_69 2 1 input.52_splitncnn_0 input.60 input.64 +Split splitncnn_12 1 2 input.64 input.64_splitncnn_0 input.64_splitncnn_1 +LayerNorm LayerNorm_70 1 1 input.64_splitncnn_1 qkv_input.15 0=768 1=1.000000e-06 +Split splitncnn_13 1 3 qkv_input.15 qkv_input.15_splitncnn_0 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_2 +MultiHeadAttention MultiHeadAttention_79 3 1 qkv_input.15_splitncnn_2 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_0 onnx::Add_232 0=768 1=12 2=589824 3=1 +BinaryOp Add_80 2 1 input.64_splitncnn_0 onnx::Add_232 input.68 +Split splitncnn_14 1 2 input.68 input.68_splitncnn_0 input.68_splitncnn_1 +LayerNorm LayerNorm_81 1 1 input.68_splitncnn_1 onnx::Gemm_234 0=768 1=1.000000e-06 +InnerProduct Gemm_82 1 1 onnx::Gemm_234 mmdeploy::Gelu_235 0=3072 1=1 2=2359296 8=2 +GELU Gelu_83 1 1 mmdeploy::Gelu_235 input.72 0=1 +InnerProduct Gemm_84 1 1 input.72 input.76 0=768 1=1 2=2359296 8=2 +BinaryOp Add_85 2 1 input.68_splitncnn_0 input.76 input.80 +Split splitncnn_15 1 2 input.80 input.80_splitncnn_0 
input.80_splitncnn_1 +LayerNorm LayerNorm_86 1 1 input.80_splitncnn_1 qkv_input.19 0=768 1=1.000000e-06 +Split splitncnn_16 1 3 qkv_input.19 qkv_input.19_splitncnn_0 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_2 +MultiHeadAttention MultiHeadAttention_95 3 1 qkv_input.19_splitncnn_2 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_0 onnx::Add_248 0=768 1=12 2=589824 3=1 +BinaryOp Add_96 2 1 input.80_splitncnn_0 onnx::Add_248 input.84 +Split splitncnn_17 1 2 input.84 input.84_splitncnn_0 input.84_splitncnn_1 +LayerNorm LayerNorm_97 1 1 input.84_splitncnn_1 onnx::Gemm_250 0=768 1=1.000000e-06 +InnerProduct Gemm_98 1 1 onnx::Gemm_250 mmdeploy::Gelu_251 0=3072 1=1 2=2359296 8=2 +GELU Gelu_99 1 1 mmdeploy::Gelu_251 input.88 0=1 +InnerProduct Gemm_100 1 1 input.88 input.92 0=768 1=1 2=2359296 8=2 +BinaryOp Add_101 2 1 input.84_splitncnn_0 input.92 input.96 +Split splitncnn_18 1 2 input.96 input.96_splitncnn_0 input.96_splitncnn_1 +LayerNorm LayerNorm_102 1 1 input.96_splitncnn_1 qkv_input.23 0=768 1=1.000000e-06 +Split splitncnn_19 1 3 qkv_input.23 qkv_input.23_splitncnn_0 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_2 +MultiHeadAttention MultiHeadAttention_111 3 1 qkv_input.23_splitncnn_2 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_0 onnx::Add_264 0=768 1=12 2=589824 3=1 +BinaryOp Add_112 2 1 input.96_splitncnn_0 onnx::Add_264 input.100 +Split splitncnn_20 1 2 input.100 input.100_splitncnn_0 input.100_splitncnn_1 +LayerNorm LayerNorm_113 1 1 input.100_splitncnn_1 onnx::Gemm_266 0=768 1=1.000000e-06 +InnerProduct Gemm_114 1 1 onnx::Gemm_266 mmdeploy::Gelu_267 0=3072 1=1 2=2359296 8=2 +GELU Gelu_115 1 1 mmdeploy::Gelu_267 input.104 0=1 +InnerProduct Gemm_116 1 1 input.104 input.108 0=768 1=1 2=2359296 8=2 +BinaryOp Add_117 2 1 input.100_splitncnn_0 input.108 input.112 +Split splitncnn_21 1 2 input.112 input.112_splitncnn_0 input.112_splitncnn_1 +LayerNorm LayerNorm_118 1 1 input.112_splitncnn_1 qkv_input.27 0=768 1=1.000000e-06 +Split splitncnn_22 1 3 qkv_input.27 qkv_input.27_splitncnn_0 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_2 +MultiHeadAttention MultiHeadAttention_127 3 1 qkv_input.27_splitncnn_2 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_0 onnx::Add_280 0=768 1=12 2=589824 3=1 +BinaryOp Add_128 2 1 input.112_splitncnn_0 onnx::Add_280 input.116 +Split splitncnn_23 1 2 input.116 input.116_splitncnn_0 input.116_splitncnn_1 +LayerNorm LayerNorm_129 1 1 input.116_splitncnn_1 onnx::Gemm_282 0=768 1=1.000000e-06 +InnerProduct Gemm_130 1 1 onnx::Gemm_282 mmdeploy::Gelu_283 0=3072 1=1 2=2359296 8=2 +GELU Gelu_131 1 1 mmdeploy::Gelu_283 input.120 0=1 +InnerProduct Gemm_132 1 1 input.120 input.124 0=768 1=1 2=2359296 8=2 +BinaryOp Add_133 2 1 input.116_splitncnn_0 input.124 input.128 +Split splitncnn_24 1 2 input.128 input.128_splitncnn_0 input.128_splitncnn_1 +LayerNorm LayerNorm_134 1 1 input.128_splitncnn_1 qkv_input.31 0=768 1=1.000000e-06 +Split splitncnn_25 1 3 qkv_input.31 qkv_input.31_splitncnn_0 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_2 +MultiHeadAttention MultiHeadAttention_143 3 1 qkv_input.31_splitncnn_2 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_0 onnx::Add_296 0=768 1=12 2=589824 3=1 +BinaryOp Add_144 2 1 input.128_splitncnn_0 onnx::Add_296 input.132 +Split splitncnn_26 1 2 input.132 input.132_splitncnn_0 input.132_splitncnn_1 +LayerNorm LayerNorm_145 1 1 input.132_splitncnn_1 onnx::Gemm_298 0=768 1=1.000000e-06 +InnerProduct Gemm_146 1 1 onnx::Gemm_298 mmdeploy::Gelu_299 0=3072 1=1 2=2359296 8=2 +GELU Gelu_147 1 1 mmdeploy::Gelu_299 input.136 0=1 
+InnerProduct Gemm_148 1 1 input.136 input.140 0=768 1=1 2=2359296 8=2 +BinaryOp Add_149 2 1 input.132_splitncnn_0 input.140 input.144 +Split splitncnn_27 1 2 input.144 input.144_splitncnn_0 input.144_splitncnn_1 +LayerNorm LayerNorm_150 1 1 input.144_splitncnn_1 qkv_input.35 0=768 1=1.000000e-06 +Split splitncnn_28 1 3 qkv_input.35 qkv_input.35_splitncnn_0 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_2 +MultiHeadAttention MultiHeadAttention_159 3 1 qkv_input.35_splitncnn_2 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_0 onnx::Add_312 0=768 1=12 2=589824 3=1 +BinaryOp Add_160 2 1 input.144_splitncnn_0 onnx::Add_312 input.148 +Split splitncnn_29 1 2 input.148 input.148_splitncnn_0 input.148_splitncnn_1 +LayerNorm LayerNorm_161 1 1 input.148_splitncnn_1 onnx::Gemm_314 0=768 1=1.000000e-06 +InnerProduct Gemm_162 1 1 onnx::Gemm_314 mmdeploy::Gelu_315 0=3072 1=1 2=2359296 8=2 +GELU Gelu_163 1 1 mmdeploy::Gelu_315 input.152 0=1 +InnerProduct Gemm_164 1 1 input.152 input.156 0=768 1=1 2=2359296 8=2 +BinaryOp Add_165 2 1 input.148_splitncnn_0 input.156 input.160 +Split splitncnn_30 1 2 input.160 input.160_splitncnn_0 input.160_splitncnn_1 +LayerNorm LayerNorm_166 1 1 input.160_splitncnn_1 qkv_input.39 0=768 1=1.000000e-06 +Split splitncnn_31 1 3 qkv_input.39 qkv_input.39_splitncnn_0 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_2 +MultiHeadAttention MultiHeadAttention_175 3 1 qkv_input.39_splitncnn_2 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_0 onnx::Add_328 0=768 1=12 2=589824 3=1 +BinaryOp Add_176 2 1 input.160_splitncnn_0 onnx::Add_328 input.164 +Split splitncnn_32 1 2 input.164 input.164_splitncnn_0 input.164_splitncnn_1 +LayerNorm LayerNorm_177 1 1 input.164_splitncnn_1 onnx::Gemm_330 0=768 1=1.000000e-06 +InnerProduct Gemm_178 1 1 onnx::Gemm_330 mmdeploy::Gelu_331 0=3072 1=1 2=2359296 8=2 +GELU Gelu_179 1 1 mmdeploy::Gelu_331 input.168 0=1 +InnerProduct Gemm_180 1 1 input.168 input.172 0=768 1=1 2=2359296 8=2 +BinaryOp Add_181 2 1 input.164_splitncnn_0 input.172 input.176 +Split splitncnn_33 1 2 input.176 input.176_splitncnn_0 input.176_splitncnn_1 +LayerNorm LayerNorm_182 1 1 input.176_splitncnn_1 qkv_input.43 0=768 1=1.000000e-06 +Split splitncnn_34 1 3 qkv_input.43 qkv_input.43_splitncnn_0 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_2 +MultiHeadAttention MultiHeadAttention_191 3 1 qkv_input.43_splitncnn_2 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_0 onnx::Add_344 0=768 1=12 2=589824 3=1 +BinaryOp Add_192 2 1 input.176_splitncnn_0 onnx::Add_344 input.180 +Split splitncnn_35 1 2 input.180 input.180_splitncnn_0 input.180_splitncnn_1 +LayerNorm LayerNorm_193 1 1 input.180_splitncnn_1 onnx::Gemm_346 0=768 1=1.000000e-06 +InnerProduct Gemm_194 1 1 onnx::Gemm_346 mmdeploy::Gelu_347 0=3072 1=1 2=2359296 8=2 +GELU Gelu_195 1 1 mmdeploy::Gelu_347 input.184 0=1 +InnerProduct Gemm_196 1 1 input.184 input.188 0=768 1=1 2=2359296 8=2 +BinaryOp Add_197 2 1 input.180_splitncnn_0 input.188 input.192 +LayerNorm LayerNorm_198 1 1 input.192 onnx::Gather_351 0=768 1=1.000000e-06 +Crop Gather_200 1 1 onnx::Gather_351 onnx::Gemm_353 -23309=1,0 -23310=1,1 -23311=1,0 +InnerProduct Gemm_201 1 1 onnx::Gemm_353 cls_score 0=1000 1=1 2=768000 8=2 +Softmax Softmax_202 1 1 cls_score output diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 5366da1e112c..578a726fc916 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -1084,9 +1084,10 @@ y = affine(out) | 0 | embed_dim | int | 0 | | | 1 | num_head | int | 1 | | | 2 | 
weight_data_size| int | 0 | | +| 3 | int8_scale_term| int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | +| weight | type | shape | description | +| ------------- | ----- | --- | --------------------- | | q_weight_data | float/fp16/int8 | [weight_data_size] | | q_bias_data | float | [embed_dim] | | k_weight_data | float/fp16/int8 | [weight_data_size] | @@ -1095,6 +1096,14 @@ y = affine(out) | v_bias_data | float | [embed_dim] | | out_weight_data| float/fp16/int8 | [weight_data_size] | | out_bias_data | float | [embed_dim] | +| q_input_scale | float | [1] | +| k_input_scale | float | [1] | +| v_input_scale | float | [1] | +| q_weight_scales | float | [embed_dim] | +| k_weight_scales | float | [embed_dim] | +| v_weight_scales | float | [embed_dim] | +| internal_scales | float | [5] | scales for xq/xk/xv/before_softmax/before_output | + # MVN ``` diff --git a/docs/how-to-use-and-FAQ/quantized-int8-inference.md b/docs/how-to-use-and-FAQ/quantized-int8-inference.md index cf8e05c20952..a8846fc96ebf 100644 --- a/docs/how-to-use-and-FAQ/quantized-int8-inference.md +++ b/docs/how-to-use-and-FAQ/quantized-int8-inference.md @@ -20,7 +20,7 @@ Some imagenet sample images here https://github.com/nihui/imagenet-sample-images ```shell find images/ -type f > imagelist.txt -./ncnn2table mobilenet-opt.param mobilenet-opt.bin imagelist.txt mobilenet.table mean=[104,117,123] norm=[0.017,0.017,0.017] shape=[224,224,3] pixel=BGR thread=8 method=kl +./ncnn2table mobilenet-opt.param mobilenet-opt.bin imagelist.txt mobilenet.table mean=[104,117,123] norm=[0.017,0.017,0.017] shape=[224,224,3] pixel=BGR thread=8 method=kl format=txt ``` * mean and norm are the values you passed to ```Mat::substract_mean_normalize()``` @@ -35,6 +35,7 @@ find images/ -type f > imagelist.txt * pixel is the pixel format of your model, image pixels will be converted to this type before ```Extractor::input()``` * thread is the CPU thread count that could be used for parallel inference * method is the post training quantization algorithm, kl and aciq are currently supported +* format is the output file type of quantization parameters, choose `ini` for `txt`. 
Using `txt` by default If your model has multiple input nodes, you can use multiple list files and other parameters @@ -60,7 +61,7 @@ mobilenet.load_model("mobilenet-int8.bin"); ## mixed precision inference -Before quantize your model, comment the layer weight scale line in table file, then the layer will do the float32 inference +Before quantize your model, comment layer weight scale line in the table file with `txt` format, then the layer will do the float32 inference ``` conv1_param_0 156.639840536 @@ -69,3 +70,26 @@ conv1_param_0 156.639840536 ``` #conv1_param_0 156.639840536 ``` + +If you are using `ini` format, just remove whole quantization parameters of the layer, for example: + +``` +[conv0] +type = "Conv" +weight = [ 156.639840536 ] +input_scale = 1.23 + +[fire] +type = "Gemm" +weight = [ 156.639840536 ] +input_scale = 1.23 +``` + +to + +``` +[fire] +type = "Gemm" +weight = [ 156.639840536 ] +input_scale = 1.23 +``` diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index ac26f599f048..80ec43518b25 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -15,6 +15,9 @@ #include "multiheadattention.h" #include +#ifdef NCNN_INT8 +#include +#endif namespace ncnn { @@ -27,50 +30,332 @@ int MultiHeadAttention::load_param(const ParamDict& pd) embed_dim = pd.get(0, 0); num_head = pd.get(1, 1); weight_data_size = pd.get(2, 0); + int8_scale_term = pd.get(3, 0); + if (int8_scale_term) + { +#if NCNN_INT8 + support_int8_storage = true; +#else + NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference"); + return -1; +#endif + } return 0; } int MultiHeadAttention::load_model(const ModelBin& mb) { - q_weight_data = mb.load(weight_data_size, 0); - if (q_weight_data.empty()) - return -100; +#define LOAD_MAT(name, len) \ + name = mb.load(len, 0); \ + if (name.empty()) \ + { \ + return -100; \ + } - q_bias_data = mb.load(embed_dim, 1); - if (q_bias_data.empty()) - return -100; +#define LOAD_FLOAT_MAT(name, len) \ + name = mb.load(len, 1); \ + if (name.empty()) \ + { \ + return -100; \ + } - k_weight_data = mb.load(weight_data_size, 0); - if (k_weight_data.empty()) - return -100; + LOAD_MAT(q_weight_data, weight_data_size); + LOAD_FLOAT_MAT(q_bias_data, embed_dim); - k_bias_data = mb.load(embed_dim, 1); - if (k_bias_data.empty()) - return -100; + LOAD_MAT(k_weight_data, weight_data_size); + LOAD_FLOAT_MAT(k_bias_data, embed_dim); - v_weight_data = mb.load(weight_data_size, 0); - if (v_weight_data.empty()) - return -100; + LOAD_MAT(v_weight_data, weight_data_size); + LOAD_FLOAT_MAT(v_bias_data, embed_dim); - v_bias_data = mb.load(embed_dim, 1); - if (v_bias_data.empty()) - return -100; + LOAD_MAT(out_weight_data, weight_data_size); + LOAD_FLOAT_MAT(out_bias_data, embed_dim); - out_weight_data = mb.load(weight_data_size, 0); - if (out_weight_data.empty()) - return -100; +#if NCNN_INT8 + if (int8_scale_term) + { + LOAD_FLOAT_MAT(q_input_scale, 1); + LOAD_FLOAT_MAT(k_input_scale, 1); + LOAD_FLOAT_MAT(v_input_scale, 1); - out_bias_data = mb.load(embed_dim, 1); - if (out_bias_data.empty()) - return -100; + LOAD_FLOAT_MAT(q_weight_scales, embed_dim); + LOAD_FLOAT_MAT(k_weight_scales, embed_dim); + LOAD_FLOAT_MAT(v_weight_scales, embed_dim); + LOAD_FLOAT_MAT(o_weight_scales, embed_dim); + + LOAD_FLOAT_MAT(internal_scales, 5); + } +#endif // NCNN_INT8 + +#undef LOAD_MAT +#undef LOAD_FLOAT_MAT return 0; } +#ifdef NCNN_INT8 +static int affine_input( + const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, + const Mat& 
input_scale, const Mat& weight_scales, const float transform_scale, + const int num_head, const Option& opt, bool transpose) +{ + const int embed_dim = input.w; + const int seqlen = input.h; + const int embed_dim_per_head = embed_dim / num_head; + const float scale = 1.0 / input_scale[0]; + + Mat input_int8; + if (input.elemsize != 1) + { + quantize_to_int8(input, input_int8, input_scale, opt); + } + + Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); + + if (transpose) + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < embed_dim_per_head; i++) + { + for (int j = 0; j < seqlen; j++) + { + const int8_t* ptr = input_int8.row(j); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + i); + + int32_t sum = 0; + const int32_t index = q * embed_dim_per_head + i; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + float* outptr = outm.row(i); + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + else + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const int8_t* ptr = input_int8.row(i); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + j); + + int32_t sum = 0; + const int32_t index = q * embed_dim_per_head + j; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + + Mat transform(1, 4u, opt.workspace_allocator); + transform[0] = transform_scale; + quantize_to_int8(buffer, out_int8, transform, opt); + return 0; +} + +static inline int32_t float2int8(float v) +{ + int int32 = static_cast(round(v)); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return int32; +} + +int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& q_blob = bottom_blobs[0]; + const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; + const Mat& v_blob = bottom_blobs.size() == 1 ? 
q_blob : bottom_blobs[2]; + + const int seqlen = q_blob.h; + const int embed_dim_per_head = embed_dim / num_head; + + Option opt_g = opt; + opt_g.blob_allocator = opt.workspace_allocator; + opt_g.use_packing_layout = false; + + Mat xq(embed_dim_per_head, seqlen, num_head, 1u, opt.workspace_allocator); + Mat xk(embed_dim_per_head, seqlen, num_head, 1u, opt.workspace_allocator); + Mat xv(seqlen, embed_dim_per_head, num_head, 1u, opt.workspace_allocator); + + affine_input(q_blob, q_weight_data, q_bias_data, xq, q_input_scale, q_weight_scales, internal_scales[0], num_head, opt_g, false); + affine_input(k_blob, k_weight_data, k_bias_data, xk, k_input_scale, k_weight_scales, internal_scales[1], num_head, opt_g, false); + affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); + + // transpose(v) for better gemm performance + // Mat xv(seqlen, embed_dim_per_head, num_head, 1u, opt.workspace_allocator); + // Mat debug_xv; + // transform_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], opt_g, debug_xv, true); + + // xq @ qk * inv_sqrt_embed_dim_per_head + const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + + Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); + { + // xqk = xq * xk + // xq (embed_dim_per_head, seqlen) + // xk (embed_dim_per_head, seqlen) + const float out_scale = inv_sqrt_embed_dim_per_head / (internal_scales[0] * internal_scales[1]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; ++q) + { + const Mat xqm = xq.channel(q); + const Mat xkm = xk.channel(q); + + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < seqlen; j++) + { + const int8_t* qptr = xqm.row(i); + const int8_t* kptr = xkm.row(j); + + int32_t sum = 0; + for (int k = 0; k < embed_dim_per_head; k++) + { + sum += *qptr++ * *kptr++; + } + + outptr[j] = sum * out_scale; + } + } + } + + // fp32_softmax(xqk) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; q++) + { + // softmax(xqk) + { + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* ptr = outm.row(i); + + float max = -FLT_MAX; + for (int j = 0; j < seqlen; j++) + { + max = std::max(max, ptr[j]); + } + + float sum = 0.f; + for (int j = 0; j < seqlen; j++) + { + ptr[j] = (float)(exp(ptr[j] - max)); + sum += ptr[j]; + } + + for (int j = 0; j < seqlen; j++) + { + ptr[j] = ptr[j] / sum; + } + } + } + } + } + + // xqkv int4 @ int8, implement by shift + Mat xqkv(embed_dim_per_head, num_head, seqlen, 1u, opt.workspace_allocator); + + const float xqkv_out_scale = internal_scales[4] / internal_scales[2]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; ++q) + { + // xqkv = xqk * xv + // xqk (seqlen, seqlen) + // xv (seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, seqlen) + const Mat xqkm = xqk.channel(q); + const Mat xvm = xv.channel(q); + + for (int i = 0; i < seqlen; i++) + { + int8_t* outptr = xqkv.channel(i).row(q); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* qkptr = xqkm.row(i); + const int8_t* vptr = xvm.row(j); + + float sum = 0; + for (int k = 0; k < seqlen; k++) + { + sum += (*vptr++) * (*qkptr++); + } + + outptr[j] = float2int8(sum * xqkv_out_scale); + } + } + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(embed_dim, seqlen, 4u, opt.blob_allocator); 
+ if (top_blob.empty()) + return -1; + + const float out_scale = 1.0f / internal_scales[4]; + // out = affine(xqkv) + // xqkv (embed_dim, seqlen) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < seqlen; i++) + { + float* outptr = top_blob.row(i); + + for (int j = 0; j < embed_dim; j++) + { + const int8_t* ptr = xqkv.channel(i); + const int8_t* kptr = (const int8_t*)out_weight_data + embed_dim * j; + + int32_t sum = 0; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum * out_scale / o_weight_scales[j] + out_bias_data[j]; + } + } + + return 0; +} + +#endif + // refers to https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { +#if NCNN_INT8 + if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) + { + return forward_int8(bottom_blobs, top_blobs, opt); + } +#endif + const Mat& q_blob = bottom_blobs[0]; const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[2]; diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index b878055385d0..31a967804391 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -30,10 +30,15 @@ class MultiHeadAttention : public Layer virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#ifdef NCNN_INT8 + int forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif + public: int embed_dim; int num_head; int weight_data_size; + int int8_scale_term; Mat q_weight_data; Mat q_bias_data; @@ -43,6 +48,31 @@ class MultiHeadAttention : public Layer Mat v_bias_data; Mat out_weight_data; Mat out_bias_data; + +#ifdef NCNN_INT8 + Mat q_input_scale; + Mat k_input_scale; + Mat v_input_scale; + + Mat q_weight_scales; + Mat k_weight_scales; + Mat v_weight_scales; + Mat o_weight_scales; + + /** + * @brief mha consists of multiple GEMM, they also have input scale + * + * internal_scales = [ + * q_affine_scale, + * k_affine_scale, + * v_affine_scale, + * energy_scale, + * feat_scael + * ] + * + */ + Mat internal_scales; +#endif }; } // namespace ncnn diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index f4e0b1b44f58..ba6e8d32e899 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -93,11 +93,69 @@ static int test_multiheadattention_1() || test_multiheadattention_sameqkv(RandomMat(64, 127), 32); } +#ifdef NCNN_INT8 +static int test_multiheadattention_int8(const ncnn::Mat& a, int num_heads) +{ + int embed_dim = a.w; + + ncnn::ParamDict pd; + pd.set(0, embed_dim); + pd.set(1, num_heads); + pd.set(2, embed_dim * embed_dim); + pd.set(3, 1); + + std::vector weights(16); + weights[0] = RandomIntMat(embed_dim * embed_dim); + weights[1] = RandomIntMat(embed_dim); + weights[2] = RandomIntMat(embed_dim * embed_dim); + weights[3] = RandomIntMat(embed_dim); + weights[4] = RandomIntMat(embed_dim * embed_dim); + weights[5] = RandomIntMat(embed_dim); + weights[6] = RandomIntMat(embed_dim * embed_dim); + weights[7] = RandomIntMat(embed_dim); + + weights[8] = RandomMat(1); + weights[9] = RandomMat(1); + weights[10] = RandomMat(1); + + weights[11] = 
RandomMat(embed_dim); + weights[12] = RandomMat(embed_dim); + weights[13] = RandomMat(embed_dim); + weights[14] = RandomMat(embed_dim); + + weights[15] = RandomMat(5); + + std::vector as(1); + as[0] = a; + + int ret = test_layer("MultiHeadAttention", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_multiheadattention failed a=(%d %d)\n", a.w, a.h); + } + + return ret; +} + +static int test_multiheadattention_2() +{ + return 0 + || test_multiheadattention_int8(RandomMat(64, 128), 8) + || test_multiheadattention_int8(RandomMat(64, 127), 32); +} +#endif + int main() { SRAND(7767517); - +#ifdef NCNN_INT8 + return 0 + || test_multiheadattention_0() + || test_multiheadattention_1() + || test_multiheadattention_2(); +#else return 0 || test_multiheadattention_0() || test_multiheadattention_1(); +#endif } diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 965046be48d0..31399e79f97d 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -29,7 +29,7 @@ if(NCNN_VULKAN) target_link_libraries(ncnn2mem PRIVATE ${Vulkan_LIBRARY}) endif() -add_executable(ncnnoptimize ncnnoptimize.cpp) +add_executable(ncnnoptimize ncnnoptimize.cpp modelwriter.cpp) target_link_libraries(ncnnoptimize PRIVATE ncnn) if(NCNN_VULKAN) target_link_libraries(ncnnoptimize PRIVATE ${Vulkan_LIBRARY}) diff --git a/tools/modelwriter.cpp b/tools/modelwriter.cpp new file mode 100644 index 000000000000..6096f7ea1cc1 --- /dev/null +++ b/tools/modelwriter.cpp @@ -0,0 +1,2069 @@ +#include "modelwriter.h" + +MemoryFootprintAllocator::MemoryFootprintAllocator() +{ + current_memory_usage = 0; + memory_footprint = 0; +} + +void* MemoryFootprintAllocator::fastMalloc(size_t size) +{ + ncnn::MutexLockGuard g(lock); + void* ptr = ncnn::fastMalloc(size); + bookkeeper[ptr] = size; + current_memory_usage += size; + memory_footprint = std::max(memory_footprint, current_memory_usage); + return ptr; +} + +void MemoryFootprintAllocator::fastFree(void* ptr) +{ + ncnn::MutexLockGuard g(lock); + size_t size = bookkeeper[ptr]; + current_memory_usage -= size; + bookkeeper.erase(bookkeeper.find(ptr)); + ncnn::fastFree(ptr); +} + +int CustomLayer::load_param(const ncnn::ParamDict& pd) +{ + mpd = pd; + return 0; +} + +void CustomLayer::write_param(FILE* pp) +{ + for (int i = 0; i < NCNN_MAX_PARAM_COUNT; i++) + { + int type = mpd.type(i); + if (type == 0) + continue; + + if (type == 2) + { + fprintf(pp, " %d=%d", i, mpd.get(i, 0)); + } + if (type == 3) + { + fprintf(pp, " %d=%e", i, mpd.get(i, 0.f)); + } + if (type == 5) + { + ncnn::Mat v = mpd.get(i, ncnn::Mat()); + int len = v.w; + fprintf(pp, " %d=%d", -i - 23300, len); + const int* p = v; + for (int j = 0; j < len; j++) + { + fprintf(pp, ",%d", p[j]); + } + } + if (type == 6) + { + ncnn::Mat v = mpd.get(i, ncnn::Mat()); + int len = v.w; + fprintf(pp, " %d=%d", -i - 23300, len); + const float* p = v; + for (int j = 0; j < len; j++) + { + fprintf(pp, ",%e", p[j]); + } + } + } +} + +DEFINE_LAYER_CREATOR(CustomLayer) + +ModelWriter::ModelWriter() + : blobs(mutable_blobs()), layers(mutable_layers()) +{ + opt.lightmode = false; + has_custom_layer = false; + gen_random_weight = false; + cutstart = -1; + cutend = -1; + + SRAND(7767517); +} + +ncnn::Layer* ModelWriter::create_custom_layer(const char* type) +{ + ncnn::Layer* layer = Net::create_custom_layer(type); + if (layer) + return layer; + + fprintf(stderr, "create_custom_layer %s\n", type); + + register_custom_layer(type, CustomLayer_layer_creator); + + has_custom_layer = true; + + return Net::create_custom_layer(type); +} + +int 
ModelWriter::set_cutparam(const char* cutstartname, const char* cutendname) +{ + if (cutstartname != nullptr) + { + int layindex = find_layer_index_by_name(cutstartname); + if (layindex >= 0) + { + cutstart = layindex; + fprintf(stderr, "cutstart layer %d:%s\n", layindex, cutstartname); + } + else + { + fprintf(stderr, "not find target cutstart layer %s\n", cutstartname); + return -1; + } + } + + if (cutendname != nullptr) + { + int layindex = find_layer_index_by_name(cutendname); + if (layindex >= 0) + { + cutend = layindex; + fprintf(stderr, "cutend layer %d:%s\n", layindex, cutendname); + } + else + { + fprintf(stderr, "not find target cutend layer %s\n", cutendname); + return -1; + } + } + + return 0; +} + +int ModelWriter::shape_inference() +{ + if (has_custom_layer) + { + fprintf(stderr, "model has custom layer, shape_inference skipped\n"); + return -1; + } + + const size_t layer_count = layers.size(); + const size_t blob_count = blobs.size(); + + // recreate layer pipeline for param and weight changes + for (size_t i = 0; i < layer_count; i++) + { + ncnn::Layer* layer = layers[i]; + + layer->destroy_pipeline(opt); + + int cret = layer->create_pipeline(opt); + if (cret != 0) + { + NCNN_LOGE("layer create_pipeline %d %s failed", (int)i, layer->name.c_str()); + return -1; + } + } + + ncnn::Extractor ex = create_extractor(); + ex.set_light_mode(true); + + // prepare Input blobs + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + if (layer->type != "Input") + continue; + + ncnn::Input* input = (ncnn::Input*)layer; + + int w = input->w; + int h = input->h; + int c = input->c; + + int dims = 0; + if (w == 0 && h == 0 && c == 0) dims = 0; + if (w != 0 && h == 0 && c == 0) dims = 1; + if (w != 0 && h != 0 && c == 0) dims = 2; + if (w != 0 && h != 0 && c != 0) dims = 3; + + if (dims == 0) + { + fprintf(stderr, "Input layer %s without shape info, shape_inference skipped\n", layer->name.c_str()); + return -1; + } + + ncnn::Mat m; + if (dims == 1) m.create(w); + if (dims == 2) m.create(w, h); + if (dims == 3) m.create(w, h, c); + + ex.input(layer->tops[0], m); + } + + // prepare blobs with predefined shape + for (size_t i = 0; i < blob_count; i++) + { + const ncnn::Blob& blob = blobs[i]; + + int dims = blob.shape.dims; + int w = blob.shape.w; + int h = blob.shape.h; + int c = blob.shape.c; + + if (dims == 0) + continue; + + ncnn::Mat m; + if (dims == 1) m.create(w); + if (dims == 2) m.create(w, h); + if (dims == 3) m.create(w, h, c); + + m.fill(0.f); + + ex.input(int(i), m); + } + + fprintf(stderr, "shape_inference\n"); + + // resolve all layer output blob shape + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + for (size_t j = 0; j < layer->tops.size(); j++) + { + int top_blob_index = layer->tops[j]; + + ncnn::Mat m; + ex.extract(top_blob_index, m); + + blobs[top_blob_index].shape = m; + } + } + + // assign all layer blob shape + for (size_t i = 0; i < layer_count; i++) + { + ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + layer->bottom_shapes.resize(layer->bottoms.size()); + for (size_t j = 0; j < layer->bottoms.size(); j++) + { + int bottom_blob_index = layer->bottoms[j]; + + layer->bottom_shapes[j] = blobs[bottom_blob_index].shape; + } + + layer->top_shapes.resize(layer->tops.size()); + for (size_t j = 0; j < layer->tops.size(); j++) + { + int top_blob_index = layer->tops[j]; + + 
layer->top_shapes[j] = blobs[top_blob_index].shape; + + // fprintf(stderr, "%d %4d %4d %4d | %2d %s\n", blobs[top_blob_index].shape.dims, blobs[top_blob_index].shape.w, blobs[top_blob_index].shape.h, blobs[top_blob_index].shape.c, top_blob_index, blobs[top_blob_index].name.c_str()); + } + } + + return 0; +} + +int ModelWriter::estimate_memory_footprint() +{ + if (has_custom_layer) + { + fprintf(stderr, "model has custom layer, estimate_memory_footprint skipped\n"); + return -1; + } + + const size_t layer_count = layers.size(); + const size_t blob_count = blobs.size(); + + MemoryFootprintAllocator allocator; + + ncnn::Extractor ex = create_extractor(); + ex.set_light_mode(true); + + ex.set_blob_allocator(&allocator); + ex.set_workspace_allocator(&allocator); + + // prepare Input blobs + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + if (layer->type != "Input") + continue; + + ncnn::Input* input = (ncnn::Input*)layer; + + int w = input->w; + int h = input->h; + int c = input->c; + + int dims = 0; + if (w == 0 && h == 0 && c == 0) dims = 0; + if (w != 0 && h == 0 && c == 0) dims = 1; + if (w != 0 && h != 0 && c == 0) dims = 2; + if (w != 0 && h != 0 && c != 0) dims = 3; + + if (dims == 0) + { + fprintf(stderr, "Input layer %s without shape info, estimate_memory_footprint skipped\n", layer->name.c_str()); + return -1; + } + + ncnn::Mat m; + if (dims == 1) m.create(w, 4u, &allocator); + if (dims == 2) m.create(w, h, 4u, &allocator); + if (dims == 3) m.create(w, h, c, 4u, &allocator); + + ex.input(layer->tops[0], m); + + fprintf(stderr, "input = %s\n", blobs[layer->tops[0]].name.c_str()); + } + + // find output blobs and do inference + std::vector outputs; + for (size_t i = 0; i < blob_count; i++) + { + const ncnn::Blob& blob = blobs[i]; + + if (blob.producer == -1 || blob.consumer != -1) + continue; + + if (layers[blob.producer]->type == "ncnnfused") + continue; + + // treat blob without any consumers as output + ncnn::Mat m; + ex.extract(int(i), m); + outputs.push_back(m); + + fprintf(stderr, "extract = %s\n", blob.name.c_str()); + } + + fprintf(stderr, "estimated memory footprint = %.2f KB = %.2f MB\n", allocator.memory_footprint / 1024.f, allocator.memory_footprint / 1024.f / 1024.f); + + return 0; +} + +int ModelWriter::fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp) +{ + const int count = m.w; + const int* ptr = m; + + fprintf(pp, " -%d=%d", 23300 + id, count); + for (int i = 0; i < count; i++) + { + fprintf(pp, ",%d", ptr[i]); + } + + return 0; +} + +int ModelWriter::fprintf_param_float_array(int id, const ncnn::Mat& m, FILE* pp) +{ + const int count = m.w; + const float* ptr = m; + + fprintf(pp, " -%d=%d", 23300 + id, count); + for (int i = 0; i < count; i++) + { + fprintf(pp, ",%e", ptr[i]); + } + + return 0; +} + +static inline size_t alignSize(size_t sz, int n) +{ + return (sz + n - 1) & -n; +} + +static void replace_denormals_with_zero(float* data, size_t data_length) +{ + const int total = static_cast(data_length); + for (size_t i = 0; i < data_length; ++i) + { + float value = data[i]; + + if (fabsf(value) < 1e-30 && fabsf(value) != 0.f) + { + data[i] = 0.f; + } + } +} + +static float RandomFloat(float a = -1.2f, float b = 1.2f) +{ + float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX; + float diff = b - a; + float r = random * diff; + return a + r; +} + +static void Randomize(ncnn::Mat& m, float a = -1.2f, float b = 1.2f) +{ + if (m.elemsize == 4) + { + for (size_t i = 
0; i < m.total(); i++) + { + m[i] = RandomFloat(a, b); + } + } + else if (m.elemsize == 2) + { + unsigned short* p = m; + for (size_t i = 0; i < m.total(); i++) + { + p[i] = ncnn::float32_to_float16(RandomFloat(a, b)); + } + } + else if (m.elemsize == 1) + { + signed char* p = m; + for (size_t i = 0; i < m.total(); i++) + { + p[i] = (signed char)RandomFloat(-127, 127); + } + } +} + +int ModelWriter::fwrite_weight_tag_data(const ncnn::Mat& data, FILE* bp, float a, float b) +{ + int p0 = ftell(bp); + + ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.d * data.c); + if (gen_random_weight) + Randomize(data_flattened, a, b); + + if (data_flattened.elemsize == 4) + { + if (storage_type == 1) + { + const int tag = 0x01306B47; // fp16 magic + fwrite(&tag, sizeof(int), 1, bp); + ncnn::Mat data_flattened_fp16; + ncnn::cast_float32_to_float16(data_flattened, data_flattened_fp16); + fwrite(data_flattened_fp16.data, data_flattened_fp16.elemsize, data_flattened_fp16.w, bp); + } + else + { + const int tag = 0; // fp32 magic + fwrite(&tag, sizeof(int), 1, bp); + replace_denormals_with_zero(data_flattened, data_flattened.w); + fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); + } + } + else if (data_flattened.elemsize == 2) + { + const int tag = 0x01306B47; // fp16 magic + fwrite(&tag, sizeof(int), 1, bp); + fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); + } + else if (data_flattened.elemsize == 1) + { + const int tag = 0x000D4B38; // int8 magic + fwrite(&tag, sizeof(int), 1, bp); + fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); + } + else + { + fprintf(stderr, "unknown weight data type %d\n", (int)data_flattened.elemsize); + } + + // padding to 32bit align + int nwrite = ftell(bp) - p0; + size_t nalign = alignSize(nwrite, 4); + unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; + fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); + + return 0; +} + +int ModelWriter::fwrite_weight_data(const ncnn::Mat& data, FILE* bp, float a, float b) +{ + int p0 = ftell(bp); + + ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.d * data.c); + if (gen_random_weight) + Randomize(data_flattened, a, b); + + if (data_flattened.elemsize == 4) // fp32 + { + replace_denormals_with_zero(data_flattened, data_flattened.w); + } + + fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); + + // padding to 32bit align + int nwrite = ftell(bp) - p0; + size_t nalign = alignSize(nwrite, 4); + unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; + fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); + + return 0; +} + +int ModelWriter::save(const char* parampath, const char* binpath) +{ + uint64_t mac = 0; + + FILE* pp = fopen(parampath, "wb"); + FILE* bp = fopen(binpath, "wb"); + + fprintf(pp, "7767517\n"); + + const size_t layer_count = layers.size(); + + int layer_count_fused = 0; + std::set blob_names; + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + layer_count_fused++; + + size_t bottom_count = layer->bottoms.size(); + for (size_t j = 0; j < bottom_count; j++) + { + int bottom_blob_index = layer->bottoms[j]; + blob_names.insert(blobs[bottom_blob_index].name); + } + + size_t top_count = layer->tops.size(); + for (size_t j = 0; j < top_count; j++) + { + int top_blob_index = layer->tops[j]; + blob_names.insert(blobs[top_blob_index].name); + } + } + + size_t blob_count_fused = blob_names.size(); + + 
fprintf(pp, "%d %zd\n", layer_count_fused, blob_count_fused); + + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + if (cutstart > 0 && i < cutstart) + continue; + + if (cutend > 0 && i > cutend) + continue; + + size_t bottom_count = layer->bottoms.size(); + size_t top_count = layer->tops.size(); + + fprintf(pp, "%-24s %-24s %zd %zd", layer->type.c_str(), layer->name.c_str(), bottom_count, top_count); + + for (size_t j = 0; j < bottom_count; j++) + { + int bottom_blob_index = layer->bottoms[j]; + fprintf(pp, " %s", blobs[bottom_blob_index].name.c_str()); + } + for (size_t j = 0; j < top_count; j++) + { + int top_blob_index = layer->tops[j]; + fprintf(pp, " %s", blobs[top_blob_index].name.c_str()); + } + + // write shape hints + bool shape_ready = true; + for (size_t j = 0; j < top_count; j++) + { + int top_blob_index = layer->tops[j]; + + int dims = blobs[top_blob_index].shape.dims; + if (dims == 0) + { + shape_ready = false; + break; + } + } + if (shape_ready) + { + fprintf(pp, " -23330=%zd", top_count * 4); + for (size_t j = 0; j < top_count; j++) + { + int top_blob_index = layer->tops[j]; + + int dims = blobs[top_blob_index].shape.dims; + int w = blobs[top_blob_index].shape.w; + int h = blobs[top_blob_index].shape.h; + int c = blobs[top_blob_index].shape.c; + + fprintf(pp, ",%d,%d,%d,%d", dims, w, h, c); + } + } + + // custom op + if (layer->typeindex & ncnn::LayerType::CustomBit) + { + ((CustomLayer*)layer)->write_param(pp); + + fprintf(pp, "\n"); + + continue; + } + + ncnn::Layer* layer_default = ncnn::create_layer(layer->typeindex); + + ncnn::ParamDict pd; + layer_default->load_param(pd); + +#define fprintf_param_value(format, phase) \ + { \ + if (op->phase != op_default->phase) fprintf(pp, format, op->phase); \ + } + + if (layer->type == "BatchNorm") + { + ncnn::BatchNorm* op = (ncnn::BatchNorm*)layer; + ncnn::BatchNorm* op_default = (ncnn::BatchNorm*)layer_default; + + fprintf_param_value(" 0=%d", channels) + fprintf_param_value(" 1=%e", eps) + + fwrite_weight_data(op->slope_data, bp); + fwrite_weight_data(op->mean_data, bp); + fwrite_weight_data(op->var_data, bp); + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "Bias") + { + ncnn::Bias* op = (ncnn::Bias*)layer; + ncnn::Bias* op_default = (ncnn::Bias*)layer_default; + + fprintf_param_value(" 0=%d", bias_data_size) + + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "BinaryOp") + { + ncnn::BinaryOp* op = (ncnn::BinaryOp*)layer; + ncnn::BinaryOp* op_default = (ncnn::BinaryOp*)layer_default; + + fprintf_param_value(" 0=%d", op_type) + fprintf_param_value(" 1=%d", with_scalar) + fprintf_param_value(" 2=%e", b) + } + else if (layer->type == "Clip") + { + ncnn::Clip* op = (ncnn::Clip*)layer; + ncnn::Clip* op_default = (ncnn::Clip*)layer_default; + + fprintf_param_value(" 0=%e", min) + fprintf_param_value(" 1=%e", max) + } + else if (layer->type == "Concat") + { + ncnn::Concat* op = (ncnn::Concat*)layer; + ncnn::Concat* op_default = (ncnn::Concat*)layer_default; + + fprintf_param_value(" 0=%d", axis) + } + else if (layer->type == "Convolution") + { + ncnn::Convolution* op = (ncnn::Convolution*)layer; + ncnn::Convolution* op_default = (ncnn::Convolution*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != 
op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 8=%d", int8_scale_term) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term) + { + fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); + fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); + fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1); + } +#endif // NCNN_INT8 + + if (shape_ready) + { + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_h * op->kernel_w * outw * outh * outc * inc; + } + } + else if (layer->type == "Convolution1D") + { + ncnn::Convolution1D* op = (ncnn::Convolution1D*)layer; + ncnn::Convolution1D* op_default = (ncnn::Convolution1D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", dilation_w) + fprintf_param_value(" 3=%d", stride_w) + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inh = blobs[layer->bottoms[0]].shape.h; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + + mac += (uint64_t)op->kernel_w * outw * outh * inh; + } + } + else if (layer->type == "Convolution3D") + { + ncnn::Convolution3D* op = (ncnn::Convolution3D*)layer; + ncnn::Convolution3D* op_default = (ncnn::Convolution3D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); + 
} + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + int outd = blobs[layer->tops[0]].shape.d; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * outw * outh * outd * outc * inc; + } + } + else if (layer->type == "ConvolutionDepthWise") + { + ncnn::ConvolutionDepthWise* op = (ncnn::ConvolutionDepthWise*)layer; + ncnn::ConvolutionDepthWise* op_default = (ncnn::ConvolutionDepthWise*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 8=%d", int8_scale_term) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term == 1 || op->int8_scale_term == 101) + { + op->bottom_blob_int8_scales.w = 1; + } + if (op->int8_scale_term == 2 || op->int8_scale_term == 102) + { + op->weight_data_int8_scales.w = 1; + op->bottom_blob_int8_scales.w = 1; + } + if (op->int8_scale_term > 100) + { + op->top_blob_int8_scales.w = 1; + } + + if (op->int8_scale_term) + { + fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); + fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); + fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1); + } +#endif // NCNN_INT8 + + if (shape_ready) + { + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_h * op->kernel_w * outw * outh * (outc / op->group) * (inc / op->group) * op->group; + } + } + else if (layer->type == "ConvolutionDepthWise1D") + { + ncnn::ConvolutionDepthWise1D* op = (ncnn::ConvolutionDepthWise1D*)layer; + ncnn::ConvolutionDepthWise1D* op_default = (ncnn::ConvolutionDepthWise1D*)layer_default; + + 
fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", dilation_w) + fprintf_param_value(" 3=%d", stride_w) + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inh = blobs[layer->bottoms[0]].shape.h; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + + mac += (uint64_t)op->kernel_w * outw * (outh / op->group) * (inh / op->group) * op->group; + } + } + else if (layer->type == "ConvolutionDepthWise3D") + { + ncnn::ConvolutionDepthWise3D* op = (ncnn::ConvolutionDepthWise3D*)layer; + ncnn::ConvolutionDepthWise3D* op_default = (ncnn::ConvolutionDepthWise3D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + int outd = blobs[layer->tops[0]].shape.d; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * outw * outh * outd * (outc / op->group) * (inc / op->group) * op->group; + } + } + else if (layer->type == "Crop") + { + ncnn::Crop* op = (ncnn::Crop*)layer; + ncnn::Crop* op_default = (ncnn::Crop*)layer_default; + + fprintf_param_value(" 0=%d", woffset) + fprintf_param_value(" 1=%d", hoffset) + fprintf_param_value(" 2=%d", coffset) + fprintf_param_value(" 3=%d", outw) + fprintf_param_value(" 4=%d", outh) + fprintf_param_value(" 5=%d", outc) + fprintf_param_value(" 6=%d", woffset2) + fprintf_param_value(" 7=%d", hoffset2) + 
fprintf_param_value(" 8=%d", coffset2) + { + if (!op->starts.empty()) fprintf_param_int_array(9, op->starts, pp); + } + { + if (!op->ends.empty()) fprintf_param_int_array(10, op->ends, pp); + } + { + if (!op->axes.empty()) fprintf_param_int_array(11, op->axes, pp); + } + } + else if (layer->type == "Deconvolution") + { + ncnn::Deconvolution* op = (ncnn::Deconvolution*)layer; + ncnn::Deconvolution* op_default = (ncnn::Deconvolution*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + fprintf_param_value(" 18=%d", output_pad_right) + { + if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); + } + fprintf_param_value(" 20=%d", output_w) + { + if (op->output_h != op->output_w) fprintf(pp, " 21=%d", op->output_h); + } + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int inc = blobs[layer->bottoms[0]].shape.c; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_h * op->kernel_w * inw * inh * outc * inc; + } + } + else if (layer->type == "Deconvolution1D") + { + ncnn::Deconvolution1D* op = (ncnn::Deconvolution1D*)layer; + ncnn::Deconvolution1D* op_default = (ncnn::Deconvolution1D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", dilation_w) + fprintf_param_value(" 3=%d", stride_w) + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + fprintf_param_value(" 18=%d", output_pad_right) + fprintf_param_value(" 20=%d", output_w) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int outh = blobs[layer->tops[0]].shape.h; + + mac += (uint64_t)op->kernel_w * inw * outh * inh; + } + } + else if (layer->type == "Deconvolution3D") + { + ncnn::Deconvolution3D* op = (ncnn::Deconvolution3D*)layer; + ncnn::Deconvolution3D* op_default = (ncnn::Deconvolution3D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if 
(op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); + } + fprintf_param_value(" 18=%d", output_pad_right) + { + if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); + if (op->output_pad_behind != op->output_pad_right) fprintf(pp, " 20=%d", op->output_pad_behind); + } + fprintf_param_value(" 25=%d", output_w) + { + if (op->output_h != op->output_w) fprintf(pp, " 26=%d", op->output_h); + if (op->output_d != op->output_w) fprintf(pp, " 27=%d", op->output_d); + } + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int ind = blobs[layer->bottoms[0]].shape.d; + int inc = blobs[layer->bottoms[0]].shape.c; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * inw * inh * ind * outc * inc; + } + } + else if (layer->type == "DeconvolutionDepthWise") + { + ncnn::DeconvolutionDepthWise* op = (ncnn::DeconvolutionDepthWise*)layer; + ncnn::DeconvolutionDepthWise* op_default = (ncnn::DeconvolutionDepthWise*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + fprintf_param_value(" 18=%d", output_pad_right) + { + if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); + } + fprintf_param_value(" 20=%d", output_w) + { + if (op->output_h != op->output_w) fprintf(pp, " 21=%d", op->output_h); + } + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } 
+ + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int inc = blobs[layer->bottoms[0]].shape.c; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_h * op->kernel_w * inw * inh * (outc / op->group) * (inc / op->group) * op->group; + } + } + else if (layer->type == "DeconvolutionDepthWise1D") + { + ncnn::DeconvolutionDepthWise1D* op = (ncnn::DeconvolutionDepthWise1D*)layer; + ncnn::DeconvolutionDepthWise1D* op_default = (ncnn::DeconvolutionDepthWise1D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", dilation_w) + fprintf_param_value(" 3=%d", stride_w) + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + fprintf_param_value(" 18=%d", output_pad_right) + fprintf_param_value(" 20=%d", output_w) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int outh = blobs[layer->tops[0]].shape.h; + + mac += (uint64_t)op->kernel_w * inw * (outh / op->group) * (inh / op->group) * op->group; + } + } + else if (layer->type == "DeconvolutionDepthWise3D") + { + ncnn::DeconvolutionDepthWise3D* op = (ncnn::DeconvolutionDepthWise3D*)layer; + ncnn::DeconvolutionDepthWise3D* op_default = (ncnn::DeconvolutionDepthWise3D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); + } + fprintf_param_value(" 18=%d", output_pad_right) + { + if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); + if (op->output_pad_behind != op->output_pad_right) fprintf(pp, " 20=%d", op->output_pad_behind); + } + fprintf_param_value(" 25=%d", output_w) + { + if (op->output_h != op->output_w) fprintf(pp, " 26=%d", op->output_h); + if (op->output_d != op->output_w) fprintf(pp, " 27=%d", op->output_d); + } + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + 
fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int ind = blobs[layer->bottoms[0]].shape.d; + int inc = blobs[layer->bottoms[0]].shape.c; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * inw * inh * ind * (outc / op->group) * (inc / op->group) * op->group; + } + } + else if (layer->type == "DetectionOutput") + { + ncnn::DetectionOutput* op = (ncnn::DetectionOutput*)layer; + ncnn::DetectionOutput* op_default = (ncnn::DetectionOutput*)layer_default; + + fprintf_param_value(" 0=%d", num_class) + fprintf_param_value(" 1=%e", nms_threshold) + fprintf_param_value(" 2=%d", nms_top_k) + fprintf_param_value(" 3=%d", keep_top_k) + fprintf_param_value(" 4=%e", confidence_threshold) + fprintf_param_value(" 5=%e", variances[0]) + fprintf_param_value(" 6=%e", variances[1]) + fprintf_param_value(" 7=%e", variances[2]) + fprintf_param_value(" 8=%e", variances[3]) + } + else if (layer->type == "Dropout") + { + ncnn::Dropout* op = (ncnn::Dropout*)layer; + ncnn::Dropout* op_default = (ncnn::Dropout*)layer_default; + + fprintf_param_value(" 0=%e", scale) + } + else if (layer->type == "Eltwise") + { + ncnn::Eltwise* op = (ncnn::Eltwise*)layer; + ncnn::Eltwise* op_default = (ncnn::Eltwise*)layer_default; + + fprintf_param_value(" 0=%d", op_type) + { + if (!op->coeffs.empty()) fprintf_param_float_array(1, op->coeffs, pp); + } + } + else if (layer->type == "ELU") + { + ncnn::ELU* op = (ncnn::ELU*)layer; + ncnn::ELU* op_default = (ncnn::ELU*)layer_default; + + fprintf_param_value(" 0=%e", alpha) + } + else if (layer->type == "Embed") + { + ncnn::Embed* op = (ncnn::Embed*)layer; + ncnn::Embed* op_default = (ncnn::Embed*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", input_dim) + fprintf_param_value(" 2=%d", bias_term) + fprintf_param_value(" 3=%d", weight_data_size) + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "Exp") + { + ncnn::Exp* op = (ncnn::Exp*)layer; + ncnn::Exp* op_default = (ncnn::Exp*)layer_default; + + fprintf_param_value(" 0=%e", base) + fprintf_param_value(" 1=%e", scale) + fprintf_param_value(" 2=%e", shift) + } + else if (layer->type == "ExpandDims") + { + ncnn::ExpandDims* op = (ncnn::ExpandDims*)layer; + ncnn::ExpandDims* op_default = (ncnn::ExpandDims*)layer_default; + + fprintf_param_value(" 0=%d", expand_w) + fprintf_param_value(" 1=%d", expand_h) + fprintf_param_value(" 2=%d", expand_c) + { + if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); + } + } + else if (layer->type == "GELU") + { + ncnn::GELU* op = (ncnn::GELU*)layer; + ncnn::GELU* op_default = (ncnn::GELU*)layer_default; + + fprintf_param_value(" 0=%d", fast_gelu) + } + else if (layer->type == "Gemm") + { + ncnn::Gemm* op = (ncnn::Gemm*)layer; + ncnn::Gemm* op_default = (ncnn::Gemm*)layer_default; + + fprintf_param_value(" 0=%e", alpha) + fprintf_param_value(" 1=%e", beta) + fprintf_param_value(" 2=%d", transA) + fprintf_param_value(" 3=%d", transB) + } + else if (layer->type == "GroupNorm") + { + ncnn::GroupNorm* op = (ncnn::GroupNorm*)layer; + ncnn::GroupNorm* op_default = 
(ncnn::GroupNorm*)layer_default; + + fprintf_param_value(" 0=%d", group) + fprintf_param_value(" 1=%d", channels) + fprintf_param_value(" 2=%e", eps) + fprintf_param_value(" 3=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + fwrite_weight_data(op->beta_data, bp); + } + else if (layer->type == "GRU") + { + ncnn::GRU* op = (ncnn::GRU*)layer; + ncnn::GRU* op_default = (ncnn::GRU*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", weight_data_size) + fprintf_param_value(" 2=%d", direction) + + fwrite_weight_tag_data(op->weight_xc_data, bp); + fwrite_weight_tag_data(op->bias_c_data, bp); + fwrite_weight_tag_data(op->weight_hc_data, bp); + } + else if (layer->type == "HardSigmoid") + { + ncnn::HardSigmoid* op = (ncnn::HardSigmoid*)layer; + ncnn::HardSigmoid* op_default = (ncnn::HardSigmoid*)layer_default; + + fprintf_param_value(" 0=%e", alpha) + fprintf_param_value(" 1=%e", beta) + } + else if (layer->type == "HardSwish") + { + ncnn::HardSwish* op = (ncnn::HardSwish*)layer; + ncnn::HardSwish* op_default = (ncnn::HardSwish*)layer_default; + + fprintf_param_value(" 0=%e", alpha) + fprintf_param_value(" 1=%e", beta) + } + else if (layer->type == "InnerProduct") + { + ncnn::InnerProduct* op = (ncnn::InnerProduct*)layer; + ncnn::InnerProduct* op_default = (ncnn::InnerProduct*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", bias_term) + fprintf_param_value(" 2=%d", weight_data_size) + fprintf_param_value(" 8=%d", int8_scale_term) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term) + { + fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); + fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); + } +#endif // NCNN_INT8 + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + + mac += (uint64_t)inw * inh * inc * outw; + } + } + else if (layer->type == "Input") + { + ncnn::Input* op = (ncnn::Input*)layer; + ncnn::Input* op_default = (ncnn::Input*)layer_default; + + fprintf_param_value(" 0=%d", w) + fprintf_param_value(" 1=%d", h) + fprintf_param_value(" 2=%d", c) + } + else if (layer->type == "InstanceNorm") + { + ncnn::InstanceNorm* op = (ncnn::InstanceNorm*)layer; + ncnn::InstanceNorm* op_default = (ncnn::InstanceNorm*)layer_default; + + fprintf_param_value(" 0=%d", channels) + fprintf_param_value(" 1=%e", eps) + fprintf_param_value(" 2=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + fwrite_weight_data(op->beta_data, bp); + } + else if (layer->type == "Interp") + { + ncnn::Interp* op = (ncnn::Interp*)layer; + ncnn::Interp* op_default = (ncnn::Interp*)layer_default; + + fprintf_param_value(" 0=%d", resize_type) + fprintf_param_value(" 1=%e", height_scale) + fprintf_param_value(" 2=%e", width_scale) + fprintf_param_value(" 3=%d", output_height) + fprintf_param_value(" 4=%d", output_width) + fprintf_param_value(" 5=%d", dynamic_target_size) + fprintf_param_value(" 6=%d", align_corner) + } + else if (layer->type == "LayerNorm") + { + ncnn::LayerNorm* op = (ncnn::LayerNorm*)layer; + ncnn::LayerNorm* op_default = (ncnn::LayerNorm*)layer_default; + + fprintf_param_value(" 0=%d", 
affine_size) + fprintf_param_value(" 1=%e", eps) + fprintf_param_value(" 2=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + fwrite_weight_data(op->beta_data, bp); + } + else if (layer->type == "Log") + { + ncnn::Log* op = (ncnn::Log*)layer; + ncnn::Log* op_default = (ncnn::Log*)layer_default; + + fprintf_param_value(" 0=%e", base) + fprintf_param_value(" 1=%e", scale) + fprintf_param_value(" 2=%e", shift) + } + else if (layer->type == "LRN") + { + ncnn::LRN* op = (ncnn::LRN*)layer; + ncnn::LRN* op_default = (ncnn::LRN*)layer_default; + + fprintf_param_value(" 0=%d", region_type) + fprintf_param_value(" 1=%d", local_size) + fprintf_param_value(" 2=%e", alpha) + fprintf_param_value(" 3=%e", beta) + fprintf_param_value(" 4=%e", bias) + } + else if (layer->type == "LSTM") + { + ncnn::LSTM* op = (ncnn::LSTM*)layer; + ncnn::LSTM* op_default = (ncnn::LSTM*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", weight_data_size) + fprintf_param_value(" 2=%d", direction) + + fwrite_weight_tag_data(op->weight_xc_data, bp); + fwrite_weight_tag_data(op->bias_c_data, bp); + fwrite_weight_tag_data(op->weight_hc_data, bp); + } + else if (layer->type == "MatMul") + { + ncnn::MatMul* op = (ncnn::MatMul*)layer; + ncnn::MatMul* op_default = (ncnn::MatMul*)layer_default; + + fprintf_param_value(" 0=%d", transB) + } + else if (layer->type == "MemoryData") + { + ncnn::MemoryData* op = (ncnn::MemoryData*)layer; + ncnn::MemoryData* op_default = (ncnn::MemoryData*)layer_default; + + fprintf_param_value(" 0=%d", w) + fprintf_param_value(" 1=%d", h) + fprintf_param_value(" 2=%d", c) + fprintf_param_value(" 11=%d", d) + fwrite_weight_data(op->data, bp); + } + else if (layer->type == "MultiHeadAttention") + { + ncnn::MultiHeadAttention* op = (ncnn::MultiHeadAttention*)layer; + ncnn::MultiHeadAttention* op_default = (ncnn::MultiHeadAttention*)layer_default; + + fprintf_param_value(" 0=%d", embed_dim) + fprintf_param_value(" 1=%d", num_head) + fprintf_param_value(" 2=%d", weight_data_size) + fprintf_param_value(" 3=%d", int8_scale_term); + + fwrite_weight_tag_data(op->q_weight_data, bp); + fwrite_weight_data(op->q_bias_data, bp); + fwrite_weight_tag_data(op->k_weight_data, bp); + fwrite_weight_data(op->k_bias_data, bp); + fwrite_weight_tag_data(op->v_weight_data, bp); + fwrite_weight_data(op->v_bias_data, bp); + fwrite_weight_tag_data(op->out_weight_data, bp); + fwrite_weight_data(op->out_bias_data, bp); + +#ifdef NCNN_INT8 + if (op->int8_scale_term) + { + fwrite_weight_data(op->q_input_scale, bp, 90, 100); + fwrite_weight_data(op->k_input_scale, bp, 90, 100); + fwrite_weight_data(op->v_input_scale, bp, 90, 100); + + fwrite_weight_data(op->q_weight_scales, bp, 0.001, 1); + fwrite_weight_data(op->k_weight_scales, bp, 0.001, 1); + fwrite_weight_data(op->v_weight_scales, bp, 0.001, 1); + fwrite_weight_data(op->o_weight_scales, bp, 0.001, 1); + fwrite_weight_data(op->internal_scales, bp, 0.001, 1); + } +#endif + } + else if (layer->type == "MVN") + { + ncnn::MVN* op = (ncnn::MVN*)layer; + ncnn::MVN* op_default = (ncnn::MVN*)layer_default; + + fprintf_param_value(" 0=%d", normalize_variance) + fprintf_param_value(" 1=%d", across_channels) + fprintf_param_value(" 2=%e", eps) + } + else if (layer->type == "Normalize") + { + ncnn::Normalize* op = (ncnn::Normalize*)layer; + ncnn::Normalize* op_default = (ncnn::Normalize*)layer_default; + + fprintf_param_value(" 0=%d", across_spatial) + fprintf_param_value(" 1=%d", channel_shared) + fprintf_param_value(" 2=%e", eps) + 
fprintf_param_value(" 3=%d", scale_data_size) + fprintf_param_value(" 4=%d", across_channel) + fprintf_param_value(" 9=%d", eps_mode) + + fwrite_weight_data(op->scale_data, bp); + } + else if (layer->type == "Padding") + { + ncnn::Padding* op = (ncnn::Padding*)layer; + ncnn::Padding* op_default = (ncnn::Padding*)layer_default; + + fprintf_param_value(" 0=%d", top) + fprintf_param_value(" 1=%d", bottom) + fprintf_param_value(" 2=%d", left) + fprintf_param_value(" 3=%d", right) + fprintf_param_value(" 4=%d", type) + fprintf_param_value(" 5=%e", value) + fprintf_param_value(" 6=%d", per_channel_pad_data_size) + fprintf_param_value(" 7=%d", front) + fprintf_param_value(" 8=%d", behind) + + fwrite_weight_data(op->per_channel_pad_data, bp); + } + else if (layer->type == "Permute") + { + ncnn::Permute* op = (ncnn::Permute*)layer; + ncnn::Permute* op_default = (ncnn::Permute*)layer_default; + + fprintf_param_value(" 0=%d", order_type) + } + else if (layer->type == "PixelShuffle") + { + ncnn::PixelShuffle* op = (ncnn::PixelShuffle*)layer; + ncnn::PixelShuffle* op_default = (ncnn::PixelShuffle*)layer_default; + + fprintf_param_value(" 0=%d", upscale_factor) + fprintf_param_value(" 1=%d", mode) + } + else if (layer->type == "Pooling") + { + ncnn::Pooling* op = (ncnn::Pooling*)layer; + ncnn::Pooling* op_default = (ncnn::Pooling*)layer_default; + + fprintf_param_value(" 0=%d", pooling_type) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 12=%d", op->stride_h); + } + fprintf_param_value(" 3=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 13=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 15=%d", op->pad_bottom); + } + fprintf_param_value(" 4=%d", global_pooling) + fprintf_param_value(" 5=%d", pad_mode) + fprintf_param_value(" 6=%d", avgpool_count_include_pad) + fprintf_param_value(" 7=%d", adaptive_pooling) + fprintf_param_value(" 8=%d", out_w) + { + if (op->out_h != op->out_w) fprintf(pp, " 18=%d", op->out_h); + } + } + else if (layer->type == "Pooling1D") + { + ncnn::Pooling1D* op = (ncnn::Pooling1D*)layer; + ncnn::Pooling1D* op_default = (ncnn::Pooling1D*)layer_default; + + fprintf_param_value(" 0=%d", pooling_type) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", stride_w) + fprintf_param_value(" 3=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); + } + fprintf_param_value(" 4=%d", global_pooling) + fprintf_param_value(" 5=%d", pad_mode) + fprintf_param_value(" 6=%d", avgpool_count_include_pad) + fprintf_param_value(" 7=%d", adaptive_pooling) + fprintf_param_value(" 8=%d", out_w) + } + else if (layer->type == "Pooling3D") + { + ncnn::Pooling3D* op = (ncnn::Pooling3D*)layer; + ncnn::Pooling3D* op_default = (ncnn::Pooling3D*)layer_default; + + fprintf_param_value(" 0=%d", pooling_type) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 12=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 22=%d", op->stride_d); + } + fprintf_param_value(" 3=%d", pad_left) + { + if (op->pad_top != 
op->pad_left) fprintf(pp, " 13=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 23=%d", op->pad_front); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 15=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 16=%d", op->pad_behind); + } + fprintf_param_value(" 4=%d", global_pooling) + fprintf_param_value(" 5=%d", pad_mode) + fprintf_param_value(" 6=%d", avgpool_count_include_pad) + fprintf_param_value(" 7=%d", adaptive_pooling) + fprintf_param_value(" 8=%d", out_w) + { + if (op->out_h != op->out_w) fprintf(pp, " 18=%d", op->out_h); + if (op->out_d != op->out_w) fprintf(pp, " 28=%d", op->out_d); + } + } + else if (layer->type == "Power") + { + ncnn::Power* op = (ncnn::Power*)layer; + ncnn::Power* op_default = (ncnn::Power*)layer_default; + + fprintf_param_value(" 0=%e", power) + fprintf_param_value(" 1=%e", scale) + fprintf_param_value(" 2=%e", shift) + } + else if (layer->type == "PReLU") + { + ncnn::PReLU* op = (ncnn::PReLU*)layer; + ncnn::PReLU* op_default = (ncnn::PReLU*)layer_default; + + fprintf_param_value(" 0=%d", num_slope) + + fwrite_weight_data(op->slope_data, bp); + } + else if (layer->type == "PriorBox") + { + ncnn::PriorBox* op = (ncnn::PriorBox*)layer; + ncnn::PriorBox* op_default = (ncnn::PriorBox*)layer_default; + + { + if (!op->min_sizes.empty()) fprintf_param_float_array(0, op->min_sizes, pp); + } + { + if (!op->max_sizes.empty()) fprintf_param_float_array(1, op->max_sizes, pp); + } + { + if (!op->aspect_ratios.empty()) fprintf_param_float_array(2, op->aspect_ratios, pp); + } + fprintf_param_value(" 3=%e", variances[0]) + fprintf_param_value(" 4=%e", variances[1]) + fprintf_param_value(" 5=%e", variances[2]) + fprintf_param_value(" 6=%e", variances[3]) + fprintf_param_value(" 7=%d", flip) + fprintf_param_value(" 8=%d", clip) + fprintf_param_value(" 9=%d", image_width) + fprintf_param_value(" 10=%d", image_height) + fprintf_param_value(" 11=%e", step_width) + fprintf_param_value(" 12=%e", step_height) + fprintf_param_value(" 13=%e", offset) + } + else if (layer->type == "Proposal") + { + ncnn::Proposal* op = (ncnn::Proposal*)layer; + ncnn::Proposal* op_default = (ncnn::Proposal*)layer_default; + + fprintf_param_value(" 0=%d", feat_stride) + fprintf_param_value(" 1=%d", base_size) + fprintf_param_value(" 2=%d", pre_nms_topN) + fprintf_param_value(" 3=%d", after_nms_topN) + fprintf_param_value(" 4=%e", nms_thresh) + fprintf_param_value(" 5=%d", min_size) + } + else if (layer->type == "PSROIPooling") + { + ncnn::PSROIPooling* op = (ncnn::PSROIPooling*)layer; + ncnn::PSROIPooling* op_default = (ncnn::PSROIPooling*)layer_default; + + fprintf_param_value(" 0=%d", pooled_width) + fprintf_param_value(" 1=%d", pooled_height) + fprintf_param_value(" 2=%e", spatial_scale) + fprintf_param_value(" 3=%d", output_dim) + } + else if (layer->type == "Quantize") + { + ncnn::Quantize* op = (ncnn::Quantize*)layer; + ncnn::Quantize* op_default = (ncnn::Quantize*)layer_default; + + fprintf_param_value(" 0=%d", scale_data_size) + + fwrite_weight_data(op->scale_data, bp); + } + else if (layer->type == "Reduction") + { + ncnn::Reduction* op = (ncnn::Reduction*)layer; + ncnn::Reduction* op_default = (ncnn::Reduction*)layer_default; + + fprintf_param_value(" 0=%d", operation) + fprintf_param_value(" 1=%d", reduce_all) + fprintf_param_value(" 2=%e", coeff) + { + if (!op->axes.empty()) fprintf_param_int_array(3, op->axes, pp); + } + fprintf_param_value(" 
4=%d", keepdims) + + // HACK + if (!op->axes.empty()) + { + int fixbug0 = 1; + fprintf(pp, " 5=%d", fixbug0); + } + } + else if (layer->type == "ReLU") + { + ncnn::ReLU* op = (ncnn::ReLU*)layer; + ncnn::ReLU* op_default = (ncnn::ReLU*)layer_default; + + fprintf_param_value(" 0=%e", slope) + } + else if (layer->type == "Reorg") + { + ncnn::Reorg* op = (ncnn::Reorg*)layer; + ncnn::Reorg* op_default = (ncnn::Reorg*)layer_default; + + fprintf_param_value(" 0=%d", stride) + fprintf_param_value(" 1=%d", mode) + } + else if (layer->type == "Requantize") + { + ncnn::Requantize* op = (ncnn::Requantize*)layer; + ncnn::Requantize* op_default = (ncnn::Requantize*)layer_default; + + fprintf_param_value(" 0=%d", scale_in_data_size) + fprintf_param_value(" 1=%d", scale_out_data_size) + fprintf_param_value(" 2=%d", bias_data_size) + fprintf_param_value(" 3=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(4, op->activation_params, pp); + } + + fwrite_weight_data(op->scale_in_data, bp); + fwrite_weight_data(op->scale_out_data, bp); + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "Reshape") + { + ncnn::Reshape* op = (ncnn::Reshape*)layer; + ncnn::Reshape* op_default = (ncnn::Reshape*)layer_default; + + fprintf_param_value(" 0=%d", w) + fprintf_param_value(" 1=%d", h) + fprintf_param_value(" 2=%d", c) + fprintf_param_value(" 3=%d", permute) + } + else if (layer->type == "RNN") + { + ncnn::RNN* op = (ncnn::RNN*)layer; + ncnn::RNN* op_default = (ncnn::RNN*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", weight_data_size) + fprintf_param_value(" 2=%d", direction) + + fwrite_weight_tag_data(op->weight_xc_data, bp); + fwrite_weight_tag_data(op->bias_c_data, bp); + fwrite_weight_tag_data(op->weight_hc_data, bp); + } + else if (layer->type == "ROIAlign") + { + ncnn::ROIAlign* op = (ncnn::ROIAlign*)layer; + ncnn::ROIAlign* op_default = (ncnn::ROIAlign*)layer_default; + + fprintf_param_value(" 0=%d", pooled_width) + fprintf_param_value(" 1=%d", pooled_height) + fprintf_param_value(" 2=%e", spatial_scale) + fprintf_param_value(" 3=%d", sampling_ratio) + fprintf_param_value(" 4=%d", aligned) + fprintf_param_value(" 5=%d", version) + } + else if (layer->type == "ROIPooling") + { + ncnn::ROIPooling* op = (ncnn::ROIPooling*)layer; + ncnn::ROIPooling* op_default = (ncnn::ROIPooling*)layer_default; + + fprintf_param_value(" 0=%d", pooled_width) + fprintf_param_value(" 1=%d", pooled_height) + fprintf_param_value(" 2=%e", spatial_scale) + } + else if (layer->type == "Scale") + { + ncnn::Scale* op = (ncnn::Scale*)layer; + ncnn::Scale* op_default = (ncnn::Scale*)layer_default; + + fprintf_param_value(" 0=%d", scale_data_size) + fprintf_param_value(" 1=%d", bias_term) + + fwrite_weight_data(op->scale_data, bp); + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "ShuffleChannel") + { + ncnn::ShuffleChannel* op = (ncnn::ShuffleChannel*)layer; + ncnn::ShuffleChannel* op_default = (ncnn::ShuffleChannel*)layer_default; + + fprintf_param_value(" 0=%d", group) + fprintf_param_value(" 1=%d", reverse) + } + else if (layer->type == "Slice") + { + ncnn::Slice* op = (ncnn::Slice*)layer; + ncnn::Slice* op_default = (ncnn::Slice*)layer_default; + + { + if (!op->slices.empty()) fprintf_param_int_array(0, op->slices, pp); + } + fprintf_param_value(" 1=%d", axis) + } + else if (layer->type == "Softmax") + { + ncnn::Softmax* op = (ncnn::Softmax*)layer; + ncnn::Softmax* op_default = (ncnn::Softmax*)layer_default; + + 
fprintf_param_value(" 0=%d", axis) + + // HACK + if (op->axis != 0) + { + int fixbug0 = 1; + fprintf(pp, " 1=%d", fixbug0); + } + } + else if (layer->type == "Squeeze") + { + ncnn::Squeeze* op = (ncnn::Squeeze*)layer; + ncnn::Squeeze* op_default = (ncnn::Squeeze*)layer_default; + + fprintf_param_value(" 0=%d", squeeze_w) + fprintf_param_value(" 1=%d", squeeze_h) + fprintf_param_value(" 2=%d", squeeze_c) + { + if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); + } + } + else if (layer->type == "Threshold") + { + ncnn::Threshold* op = (ncnn::Threshold*)layer; + ncnn::Threshold* op_default = (ncnn::Threshold*)layer_default; + + fprintf_param_value(" 0=%e", threshold) + } + else if (layer->type == "UnaryOp") + { + ncnn::UnaryOp* op = (ncnn::UnaryOp*)layer; + ncnn::UnaryOp* op_default = (ncnn::UnaryOp*)layer_default; + + fprintf_param_value(" 0=%d", op_type) + } + else if (layer->type == "YoloDetectionOutput") + { + ncnn::YoloDetectionOutput* op = (ncnn::YoloDetectionOutput*)layer; + ncnn::YoloDetectionOutput* op_default = (ncnn::YoloDetectionOutput*)layer_default; + + fprintf_param_value(" 0=%d", num_class) + fprintf_param_value(" 1=%d", num_box) + fprintf_param_value(" 2=%e", confidence_threshold) + fprintf_param_value(" 3=%e", nms_threshold) + { + if (!op->biases.empty()) fprintf_param_float_array(4, op->biases, pp); + } + } + else if (layer->type == "Yolov3DetectionOutput") + { + ncnn::Yolov3DetectionOutput* op = (ncnn::Yolov3DetectionOutput*)layer; + ncnn::Yolov3DetectionOutput* op_default = (ncnn::Yolov3DetectionOutput*)layer_default; + + fprintf_param_value(" 0=%d", num_class) + fprintf_param_value(" 1=%d", num_box) + fprintf_param_value(" 2=%e", confidence_threshold) + fprintf_param_value(" 3=%e", nms_threshold) + { + if (!op->biases.empty()) fprintf_param_float_array(4, op->biases, pp); + } + { + if (!op->mask.empty()) fprintf_param_int_array(5, op->mask, pp); + } + { + if (!op->anchors_scale.empty()) fprintf_param_float_array(6, op->anchors_scale, pp); + } + } + +#undef fprintf_param_value + + fprintf(pp, "\n"); + + delete layer_default; + } + + fclose(pp); + fclose(bp); + + if (mac) + { + fprintf(stderr, "mac = %llu = %.2f M\n", static_cast(mac), mac / 1000000.0); + } + + return 0; +} diff --git a/tools/modelwriter.h b/tools/modelwriter.h index e9ff979176a7..844c6d4f6efa 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -11,7 +11,7 @@ // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
- +#pragma once #ifdef _MSC_VER #define _CRT_SECURE_NO_DEPRECATE #endif @@ -115,30 +115,11 @@ static struct prng_rand_t g_prng_rand_state; class MemoryFootprintAllocator : public ncnn::Allocator { public: - MemoryFootprintAllocator() - { - current_memory_usage = 0; - memory_footprint = 0; - } + MemoryFootprintAllocator(); - virtual void* fastMalloc(size_t size) - { - ncnn::MutexLockGuard g(lock); - void* ptr = ncnn::fastMalloc(size); - bookkeeper[ptr] = size; - current_memory_usage += size; - memory_footprint = std::max(memory_footprint, current_memory_usage); - return ptr; - } + virtual void* fastMalloc(size_t size); - virtual void fastFree(void* ptr) - { - ncnn::MutexLockGuard g(lock); - size_t size = bookkeeper[ptr]; - current_memory_usage -= size; - bookkeeper.erase(bookkeeper.find(ptr)); - ncnn::fastFree(ptr); - } + virtual void fastFree(void* ptr); public: int current_memory_usage; @@ -150,59 +131,14 @@ class MemoryFootprintAllocator : public ncnn::Allocator class CustomLayer : public ncnn::Layer { public: - virtual int load_param(const ncnn::ParamDict& pd) - { - mpd = pd; - return 0; - } - - void write_param(FILE* pp) - { - for (int i = 0; i < NCNN_MAX_PARAM_COUNT; i++) - { - int type = mpd.type(i); - if (type == 0) - continue; + virtual int load_param(const ncnn::ParamDict& pd); - if (type == 2) - { - fprintf(pp, " %d=%d", i, mpd.get(i, 0)); - } - if (type == 3) - { - fprintf(pp, " %d=%e", i, mpd.get(i, 0.f)); - } - if (type == 5) - { - ncnn::Mat v = mpd.get(i, ncnn::Mat()); - int len = v.w; - fprintf(pp, " %d=%d", -i - 23300, len); - const int* p = v; - for (int j = 0; j < len; j++) - { - fprintf(pp, ",%d", p[j]); - } - } - if (type == 6) - { - ncnn::Mat v = mpd.get(i, ncnn::Mat()); - int len = v.w; - fprintf(pp, " %d=%d", -i - 23300, len); - const float* p = v; - for (int j = 0; j < len; j++) - { - fprintf(pp, ",%e", p[j]); - } - } - } - } + void write_param(FILE* pp); public: ncnn::ParamDict mpd; }; -DEFINE_LAYER_CREATOR(CustomLayer) - class ModelWriter : public ncnn::Net { public: @@ -240,1981 +176,3 @@ class ModelWriter : public ncnn::Net int save(const char* parampath, const char* binpath); }; - -ModelWriter::ModelWriter() - : blobs(mutable_blobs()), layers(mutable_layers()) -{ - opt.lightmode = false; - has_custom_layer = false; - gen_random_weight = false; - cutstart = -1; - cutend = -1; - - SRAND(7767517); -} - -ncnn::Layer* ModelWriter::create_custom_layer(const char* type) -{ - ncnn::Layer* layer = Net::create_custom_layer(type); - if (layer) - return layer; - - fprintf(stderr, "create_custom_layer %s\n", type); - - register_custom_layer(type, CustomLayer_layer_creator); - - has_custom_layer = true; - - return Net::create_custom_layer(type); -} - -int ModelWriter::set_cutparam(const char* cutstartname, const char* cutendname) -{ - if (cutstartname != nullptr) - { - int layindex = find_layer_index_by_name(cutstartname); - if (layindex >= 0) - { - cutstart = layindex; - fprintf(stderr, "cutstart layer %d:%s\n", layindex, cutstartname); - } - else - { - fprintf(stderr, "not find target cutstart layer %s\n", cutstartname); - return -1; - } - } - - if (cutendname != nullptr) - { - int layindex = find_layer_index_by_name(cutendname); - if (layindex >= 0) - { - cutend = layindex; - fprintf(stderr, "cutend layer %d:%s\n", layindex, cutendname); - } - else - { - fprintf(stderr, "not find target cutend layer %s\n", cutendname); - return -1; - } - } - - return 0; -} - -int ModelWriter::shape_inference() -{ - if (has_custom_layer) - { - fprintf(stderr, "model has custom layer, 
shape_inference skipped\n"); - return -1; - } - - const size_t layer_count = layers.size(); - const size_t blob_count = blobs.size(); - - // recreate layer pipeline for param and weight changes - for (size_t i = 0; i < layer_count; i++) - { - ncnn::Layer* layer = layers[i]; - - layer->destroy_pipeline(opt); - - int cret = layer->create_pipeline(opt); - if (cret != 0) - { - NCNN_LOGE("layer create_pipeline %d %s failed", (int)i, layer->name.c_str()); - return -1; - } - } - - ncnn::Extractor ex = create_extractor(); - ex.set_light_mode(true); - - // prepare Input blobs - for (size_t i = 0; i < layer_count; i++) - { - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - if (layer->type != "Input") - continue; - - ncnn::Input* input = (ncnn::Input*)layer; - - int w = input->w; - int h = input->h; - int c = input->c; - - int dims = 0; - if (w == 0 && h == 0 && c == 0) dims = 0; - if (w != 0 && h == 0 && c == 0) dims = 1; - if (w != 0 && h != 0 && c == 0) dims = 2; - if (w != 0 && h != 0 && c != 0) dims = 3; - - if (dims == 0) - { - fprintf(stderr, "Input layer %s without shape info, shape_inference skipped\n", layer->name.c_str()); - return -1; - } - - ncnn::Mat m; - if (dims == 1) m.create(w); - if (dims == 2) m.create(w, h); - if (dims == 3) m.create(w, h, c); - - ex.input(layer->tops[0], m); - } - - // prepare blobs with predefined shape - for (size_t i = 0; i < blob_count; i++) - { - const ncnn::Blob& blob = blobs[i]; - - int dims = blob.shape.dims; - int w = blob.shape.w; - int h = blob.shape.h; - int c = blob.shape.c; - - if (dims == 0) - continue; - - ncnn::Mat m; - if (dims == 1) m.create(w); - if (dims == 2) m.create(w, h); - if (dims == 3) m.create(w, h, c); - - m.fill(0.f); - - ex.input(int(i), m); - } - - fprintf(stderr, "shape_inference\n"); - - // resolve all layer output blob shape - for (size_t i = 0; i < layer_count; i++) - { - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - for (size_t j = 0; j < layer->tops.size(); j++) - { - int top_blob_index = layer->tops[j]; - - ncnn::Mat m; - ex.extract(top_blob_index, m); - - blobs[top_blob_index].shape = m; - } - } - - // assign all layer blob shape - for (size_t i = 0; i < layer_count; i++) - { - ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - layer->bottom_shapes.resize(layer->bottoms.size()); - for (size_t j = 0; j < layer->bottoms.size(); j++) - { - int bottom_blob_index = layer->bottoms[j]; - - layer->bottom_shapes[j] = blobs[bottom_blob_index].shape; - } - - layer->top_shapes.resize(layer->tops.size()); - for (size_t j = 0; j < layer->tops.size(); j++) - { - int top_blob_index = layer->tops[j]; - - layer->top_shapes[j] = blobs[top_blob_index].shape; - - // fprintf(stderr, "%d %4d %4d %4d | %2d %s\n", blobs[top_blob_index].shape.dims, blobs[top_blob_index].shape.w, blobs[top_blob_index].shape.h, blobs[top_blob_index].shape.c, top_blob_index, blobs[top_blob_index].name.c_str()); - } - } - - return 0; -} - -int ModelWriter::estimate_memory_footprint() -{ - if (has_custom_layer) - { - fprintf(stderr, "model has custom layer, estimate_memory_footprint skipped\n"); - return -1; - } - - const size_t layer_count = layers.size(); - const size_t blob_count = blobs.size(); - - MemoryFootprintAllocator allocator; - - ncnn::Extractor ex = create_extractor(); - ex.set_light_mode(true); - - ex.set_blob_allocator(&allocator); - ex.set_workspace_allocator(&allocator); - - // prepare Input blobs - for (size_t i = 0; i < layer_count; i++) - 
{ - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - if (layer->type != "Input") - continue; - - ncnn::Input* input = (ncnn::Input*)layer; - - int w = input->w; - int h = input->h; - int c = input->c; - - int dims = 0; - if (w == 0 && h == 0 && c == 0) dims = 0; - if (w != 0 && h == 0 && c == 0) dims = 1; - if (w != 0 && h != 0 && c == 0) dims = 2; - if (w != 0 && h != 0 && c != 0) dims = 3; - - if (dims == 0) - { - fprintf(stderr, "Input layer %s without shape info, estimate_memory_footprint skipped\n", layer->name.c_str()); - return -1; - } - - ncnn::Mat m; - if (dims == 1) m.create(w, 4u, &allocator); - if (dims == 2) m.create(w, h, 4u, &allocator); - if (dims == 3) m.create(w, h, c, 4u, &allocator); - - ex.input(layer->tops[0], m); - - fprintf(stderr, "input = %s\n", blobs[layer->tops[0]].name.c_str()); - } - - // find output blobs and do inference - std::vector<ncnn::Mat> outputs; - for (size_t i = 0; i < blob_count; i++) - { - const ncnn::Blob& blob = blobs[i]; - - if (blob.producer == -1 || blob.consumer != -1) - continue; - - if (layers[blob.producer]->type == "ncnnfused") - continue; - - // treat blob without any consumers as output - ncnn::Mat m; - ex.extract(int(i), m); - outputs.push_back(m); - - fprintf(stderr, "extract = %s\n", blob.name.c_str()); - } - - fprintf(stderr, "estimated memory footprint = %.2f KB = %.2f MB\n", allocator.memory_footprint / 1024.f, allocator.memory_footprint / 1024.f / 1024.f); - - return 0; -} - -int ModelWriter::fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp) -{ - const int count = m.w; - const int* ptr = m; - - fprintf(pp, " -%d=%d", 23300 + id, count); - for (int i = 0; i < count; i++) - { - fprintf(pp, ",%d", ptr[i]); - } - - return 0; -} - -int ModelWriter::fprintf_param_float_array(int id, const ncnn::Mat& m, FILE* pp) -{ - const int count = m.w; - const float* ptr = m; - - fprintf(pp, " -%d=%d", 23300 + id, count); - for (int i = 0; i < count; i++) - { - fprintf(pp, ",%e", ptr[i]); - } - - return 0; -} - -static inline size_t alignSize(size_t sz, int n) -{ - return (sz + n - 1) & -n; -} - -static void replace_denormals_with_zero(float* data, size_t data_length) -{ - const int total = static_cast<int>(data_length); - for (size_t i = 0; i < data_length; ++i) - { - float value = data[i]; - - if (fabsf(value) < 1e-30 && fabsf(value) != 0.f) - { - data[i] = 0.f; - } - } -} - -static float RandomFloat(float a = -1.2f, float b = 1.2f) -{ - float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX; - float diff = b - a; - float r = random * diff; - return a + r; -} - -static void Randomize(ncnn::Mat& m, float a = -1.2f, float b = 1.2f) -{ - if (m.elemsize == 4) - { - for (size_t i = 0; i < m.total(); i++) - { - m[i] = RandomFloat(a, b); - } - } - else if (m.elemsize == 2) - { - unsigned short* p = m; - for (size_t i = 0; i < m.total(); i++) - { - p[i] = ncnn::float32_to_float16(RandomFloat(a, b)); - } - } - else if (m.elemsize == 1) - { - signed char* p = m; - for (size_t i = 0; i < m.total(); i++) - { - p[i] = (signed char)RandomFloat(-127, 127); - } - } -} - -int ModelWriter::fwrite_weight_tag_data(const ncnn::Mat& data, FILE* bp, float a, float b) -{ - int p0 = ftell(bp); - - ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.d * data.c); - if (gen_random_weight) - Randomize(data_flattened, a, b); - - if (data_flattened.elemsize == 4) - { - if (storage_type == 1) - { - const int tag = 0x01306B47; // fp16 magic - fwrite(&tag, sizeof(int), 1, bp); - ncnn::Mat data_flattened_fp16; - 
ncnn::cast_float32_to_float16(data_flattened, data_flattened_fp16); - fwrite(data_flattened_fp16.data, data_flattened_fp16.elemsize, data_flattened_fp16.w, bp); - } - else - { - const int tag = 0; // fp32 magic - fwrite(&tag, sizeof(int), 1, bp); - replace_denormals_with_zero(data_flattened, data_flattened.w); - fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); - } - } - else if (data_flattened.elemsize == 2) - { - const int tag = 0x01306B47; // fp16 magic - fwrite(&tag, sizeof(int), 1, bp); - fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); - } - else if (data_flattened.elemsize == 1) - { - const int tag = 0x000D4B38; // int8 magic - fwrite(&tag, sizeof(int), 1, bp); - fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); - } - else - { - fprintf(stderr, "unknown weight data type %d\n", (int)data_flattened.elemsize); - } - - // padding to 32bit align - int nwrite = ftell(bp) - p0; - size_t nalign = alignSize(nwrite, 4); - unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; - fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); - - return 0; -} - -int ModelWriter::fwrite_weight_data(const ncnn::Mat& data, FILE* bp, float a, float b) -{ - int p0 = ftell(bp); - - ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.d * data.c); - if (gen_random_weight) - Randomize(data_flattened, a, b); - - if (data_flattened.elemsize == 4) // fp32 - { - replace_denormals_with_zero(data_flattened, data_flattened.w); - } - - fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); - - // padding to 32bit align - int nwrite = ftell(bp) - p0; - size_t nalign = alignSize(nwrite, 4); - unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; - fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); - - return 0; -} - -int ModelWriter::save(const char* parampath, const char* binpath) -{ - uint64_t mac = 0; - - FILE* pp = fopen(parampath, "wb"); - FILE* bp = fopen(binpath, "wb"); - - fprintf(pp, "7767517\n"); - - const size_t layer_count = layers.size(); - - int layer_count_fused = 0; - std::set<std::string> blob_names; - for (size_t i = 0; i < layer_count; i++) - { - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - layer_count_fused++; - - size_t bottom_count = layer->bottoms.size(); - for (size_t j = 0; j < bottom_count; j++) - { - int bottom_blob_index = layer->bottoms[j]; - blob_names.insert(blobs[bottom_blob_index].name); - } - - size_t top_count = layer->tops.size(); - for (size_t j = 0; j < top_count; j++) - { - int top_blob_index = layer->tops[j]; - blob_names.insert(blobs[top_blob_index].name); - } - } - - size_t blob_count_fused = blob_names.size(); - - fprintf(pp, "%d %zd\n", layer_count_fused, blob_count_fused); - - for (size_t i = 0; i < layer_count; i++) - { - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - if (cutstart > 0 && i < cutstart) - continue; - - if (cutend > 0 && i > cutend) - continue; - - size_t bottom_count = layer->bottoms.size(); - size_t top_count = layer->tops.size(); - - fprintf(pp, "%-24s %-24s %zd %zd", layer->type.c_str(), layer->name.c_str(), bottom_count, top_count); - - for (size_t j = 0; j < bottom_count; j++) - { - int bottom_blob_index = layer->bottoms[j]; - fprintf(pp, " %s", blobs[bottom_blob_index].name.c_str()); - } - for (size_t j = 0; j < top_count; j++) - { - int top_blob_index = layer->tops[j]; - fprintf(pp, " %s", blobs[top_blob_index].name.c_str()); - } - - // write shape hints - bool 
shape_ready = true; - for (size_t j = 0; j < top_count; j++) - { - int top_blob_index = layer->tops[j]; - - int dims = blobs[top_blob_index].shape.dims; - if (dims == 0) - { - shape_ready = false; - break; - } - } - if (shape_ready) - { - fprintf(pp, " -23330=%zd", top_count * 4); - for (size_t j = 0; j < top_count; j++) - { - int top_blob_index = layer->tops[j]; - - int dims = blobs[top_blob_index].shape.dims; - int w = blobs[top_blob_index].shape.w; - int h = blobs[top_blob_index].shape.h; - int c = blobs[top_blob_index].shape.c; - - fprintf(pp, ",%d,%d,%d,%d", dims, w, h, c); - } - } - - // custom op - if (layer->typeindex & ncnn::LayerType::CustomBit) - { - ((CustomLayer*)layer)->write_param(pp); - - fprintf(pp, "\n"); - - continue; - } - - ncnn::Layer* layer_default = ncnn::create_layer(layer->typeindex); - - ncnn::ParamDict pd; - layer_default->load_param(pd); - -#define fprintf_param_value(format, phase) \ - { \ - if (op->phase != op_default->phase) fprintf(pp, format, op->phase); \ - } - - if (layer->type == "BatchNorm") - { - ncnn::BatchNorm* op = (ncnn::BatchNorm*)layer; - ncnn::BatchNorm* op_default = (ncnn::BatchNorm*)layer_default; - - fprintf_param_value(" 0=%d", channels) - fprintf_param_value(" 1=%e", eps) - - fwrite_weight_data(op->slope_data, bp); - fwrite_weight_data(op->mean_data, bp); - fwrite_weight_data(op->var_data, bp); - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "Bias") - { - ncnn::Bias* op = (ncnn::Bias*)layer; - ncnn::Bias* op_default = (ncnn::Bias*)layer_default; - - fprintf_param_value(" 0=%d", bias_data_size) - - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "BinaryOp") - { - ncnn::BinaryOp* op = (ncnn::BinaryOp*)layer; - ncnn::BinaryOp* op_default = (ncnn::BinaryOp*)layer_default; - - fprintf_param_value(" 0=%d", op_type) - fprintf_param_value(" 1=%d", with_scalar) - fprintf_param_value(" 2=%e", b) - } - else if (layer->type == "Clip") - { - ncnn::Clip* op = (ncnn::Clip*)layer; - ncnn::Clip* op_default = (ncnn::Clip*)layer_default; - - fprintf_param_value(" 0=%e", min) - fprintf_param_value(" 1=%e", max) - } - else if (layer->type == "Concat") - { - ncnn::Concat* op = (ncnn::Concat*)layer; - ncnn::Concat* op_default = (ncnn::Concat*)layer_default; - - fprintf_param_value(" 0=%d", axis) - } - else if (layer->type == "Convolution") - { - ncnn::Convolution* op = (ncnn::Convolution*)layer; - ncnn::Convolution* op_default = (ncnn::Convolution*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 8=%d", int8_scale_term) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - 
fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - -#if NCNN_INT8 - // write int8_scale data - if (op->int8_scale_term) - { - fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); - fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); - fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1); - } -#endif // NCNN_INT8 - - if (shape_ready) - { - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_h * op->kernel_w * outw * outh * outc * inc; - } - } - else if (layer->type == "Convolution1D") - { - ncnn::Convolution1D* op = (ncnn::Convolution1D*)layer; - ncnn::Convolution1D* op_default = (ncnn::Convolution1D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", dilation_w) - fprintf_param_value(" 3=%d", stride_w) - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inh = blobs[layer->bottoms[0]].shape.h; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - - mac += (uint64_t)op->kernel_w * outw * outh * inh; - } - } - else if (layer->type == "Convolution3D") - { - ncnn::Convolution3D* op = (ncnn::Convolution3D*)layer; - ncnn::Convolution3D* op_default = (ncnn::Convolution3D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - { - if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - int outd = 
blobs[layer->tops[0]].shape.d; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * outw * outh * outd * outc * inc; - } - } - else if (layer->type == "ConvolutionDepthWise") - { - ncnn::ConvolutionDepthWise* op = (ncnn::ConvolutionDepthWise*)layer; - ncnn::ConvolutionDepthWise* op_default = (ncnn::ConvolutionDepthWise*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 8=%d", int8_scale_term) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - -#if NCNN_INT8 - // write int8_scale data - if (op->int8_scale_term == 1 || op->int8_scale_term == 101) - { - op->bottom_blob_int8_scales.w = 1; - } - if (op->int8_scale_term == 2 || op->int8_scale_term == 102) - { - op->weight_data_int8_scales.w = 1; - op->bottom_blob_int8_scales.w = 1; - } - if (op->int8_scale_term > 100) - { - op->top_blob_int8_scales.w = 1; - } - - if (op->int8_scale_term) - { - fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); - fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); - fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1); - } -#endif // NCNN_INT8 - - if (shape_ready) - { - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_h * op->kernel_w * outw * outh * (outc / op->group) * (inc / op->group) * op->group; - } - } - else if (layer->type == "ConvolutionDepthWise1D") - { - ncnn::ConvolutionDepthWise1D* op = (ncnn::ConvolutionDepthWise1D*)layer; - ncnn::ConvolutionDepthWise1D* op_default = (ncnn::ConvolutionDepthWise1D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", dilation_w) - fprintf_param_value(" 3=%d", stride_w) - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inh = blobs[layer->bottoms[0]].shape.h; - int outw = blobs[layer->tops[0]].shape.w; 
- int outh = blobs[layer->tops[0]].shape.h; - - mac += (uint64_t)op->kernel_w * outw * (outh / op->group) * (inh / op->group) * op->group; - } - } - else if (layer->type == "ConvolutionDepthWise3D") - { - ncnn::ConvolutionDepthWise3D* op = (ncnn::ConvolutionDepthWise3D*)layer; - ncnn::ConvolutionDepthWise3D* op_default = (ncnn::ConvolutionDepthWise3D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - { - if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - int outd = blobs[layer->tops[0]].shape.d; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * outw * outh * outd * (outc / op->group) * (inc / op->group) * op->group; - } - } - else if (layer->type == "Crop") - { - ncnn::Crop* op = (ncnn::Crop*)layer; - ncnn::Crop* op_default = (ncnn::Crop*)layer_default; - - fprintf_param_value(" 0=%d", woffset) - fprintf_param_value(" 1=%d", hoffset) - fprintf_param_value(" 2=%d", coffset) - fprintf_param_value(" 3=%d", outw) - fprintf_param_value(" 4=%d", outh) - fprintf_param_value(" 5=%d", outc) - fprintf_param_value(" 6=%d", woffset2) - fprintf_param_value(" 7=%d", hoffset2) - fprintf_param_value(" 8=%d", coffset2) - { - if (!op->starts.empty()) fprintf_param_int_array(9, op->starts, pp); - } - { - if (!op->ends.empty()) fprintf_param_int_array(10, op->ends, pp); - } - { - if (!op->axes.empty()) fprintf_param_int_array(11, op->axes, pp); - } - } - else if (layer->type == "Deconvolution") - { - ncnn::Deconvolution* op = (ncnn::Deconvolution*)layer; - ncnn::Deconvolution* op_default = (ncnn::Deconvolution*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) 
fprintf(pp, " 13=%d", op->stride_h); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - fprintf_param_value(" 18=%d", output_pad_right) - { - if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); - } - fprintf_param_value(" 20=%d", output_w) - { - if (op->output_h != op->output_w) fprintf(pp, " 21=%d", op->output_h); - } - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int inc = blobs[layer->bottoms[0]].shape.c; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_h * op->kernel_w * inw * inh * outc * inc; - } - } - else if (layer->type == "Deconvolution1D") - { - ncnn::Deconvolution1D* op = (ncnn::Deconvolution1D*)layer; - ncnn::Deconvolution1D* op_default = (ncnn::Deconvolution1D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", dilation_w) - fprintf_param_value(" 3=%d", stride_w) - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - fprintf_param_value(" 18=%d", output_pad_right) - fprintf_param_value(" 20=%d", output_w) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int outh = blobs[layer->tops[0]].shape.h; - - mac += (uint64_t)op->kernel_w * inw * outh * inh; - } - } - else if (layer->type == "Deconvolution3D") - { - ncnn::Deconvolution3D* op = (ncnn::Deconvolution3D*)layer; - ncnn::Deconvolution3D* op_default = (ncnn::Deconvolution3D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - 
{ - if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); - } - fprintf_param_value(" 18=%d", output_pad_right) - { - if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); - if (op->output_pad_behind != op->output_pad_right) fprintf(pp, " 20=%d", op->output_pad_behind); - } - fprintf_param_value(" 25=%d", output_w) - { - if (op->output_h != op->output_w) fprintf(pp, " 26=%d", op->output_h); - if (op->output_d != op->output_w) fprintf(pp, " 27=%d", op->output_d); - } - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int ind = blobs[layer->bottoms[0]].shape.d; - int inc = blobs[layer->bottoms[0]].shape.c; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * inw * inh * ind * outc * inc; - } - } - else if (layer->type == "DeconvolutionDepthWise") - { - ncnn::DeconvolutionDepthWise* op = (ncnn::DeconvolutionDepthWise*)layer; - ncnn::DeconvolutionDepthWise* op_default = (ncnn::DeconvolutionDepthWise*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - fprintf_param_value(" 18=%d", output_pad_right) - { - if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); - } - fprintf_param_value(" 20=%d", output_w) - { - if (op->output_h != op->output_w) fprintf(pp, " 21=%d", op->output_h); - } - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int inc = blobs[layer->bottoms[0]].shape.c; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_h * op->kernel_w * inw * inh * (outc / op->group) * (inc / op->group) * op->group; - } - } - else if (layer->type == "DeconvolutionDepthWise1D") - { - ncnn::DeconvolutionDepthWise1D* op = (ncnn::DeconvolutionDepthWise1D*)layer; - ncnn::DeconvolutionDepthWise1D* op_default = (ncnn::DeconvolutionDepthWise1D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", dilation_w) - fprintf_param_value(" 3=%d", stride_w) - 
fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - fprintf_param_value(" 18=%d", output_pad_right) - fprintf_param_value(" 20=%d", output_w) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int outh = blobs[layer->tops[0]].shape.h; - - mac += (uint64_t)op->kernel_w * inw * (outh / op->group) * (inh / op->group) * op->group; - } - } - else if (layer->type == "DeconvolutionDepthWise3D") - { - ncnn::DeconvolutionDepthWise3D* op = (ncnn::DeconvolutionDepthWise3D*)layer; - ncnn::DeconvolutionDepthWise3D* op_default = (ncnn::DeconvolutionDepthWise3D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - { - if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); - } - fprintf_param_value(" 18=%d", output_pad_right) - { - if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); - if (op->output_pad_behind != op->output_pad_right) fprintf(pp, " 20=%d", op->output_pad_behind); - } - fprintf_param_value(" 25=%d", output_w) - { - if (op->output_h != op->output_w) fprintf(pp, " 26=%d", op->output_h); - if (op->output_d != op->output_w) fprintf(pp, " 27=%d", op->output_d); - } - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int ind = blobs[layer->bottoms[0]].shape.d; - int inc = blobs[layer->bottoms[0]].shape.c; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * inw * inh * ind * (outc / op->group) * (inc / op->group) * op->group; - } - } - else if (layer->type == "DetectionOutput") - { - ncnn::DetectionOutput* op = (ncnn::DetectionOutput*)layer; - ncnn::DetectionOutput* op_default = 
(ncnn::DetectionOutput*)layer_default; - - fprintf_param_value(" 0=%d", num_class) - fprintf_param_value(" 1=%e", nms_threshold) - fprintf_param_value(" 2=%d", nms_top_k) - fprintf_param_value(" 3=%d", keep_top_k) - fprintf_param_value(" 4=%e", confidence_threshold) - fprintf_param_value(" 5=%e", variances[0]) - fprintf_param_value(" 6=%e", variances[1]) - fprintf_param_value(" 7=%e", variances[2]) - fprintf_param_value(" 8=%e", variances[3]) - } - else if (layer->type == "Dropout") - { - ncnn::Dropout* op = (ncnn::Dropout*)layer; - ncnn::Dropout* op_default = (ncnn::Dropout*)layer_default; - - fprintf_param_value(" 0=%e", scale) - } - else if (layer->type == "Eltwise") - { - ncnn::Eltwise* op = (ncnn::Eltwise*)layer; - ncnn::Eltwise* op_default = (ncnn::Eltwise*)layer_default; - - fprintf_param_value(" 0=%d", op_type) - { - if (!op->coeffs.empty()) fprintf_param_float_array(1, op->coeffs, pp); - } - } - else if (layer->type == "ELU") - { - ncnn::ELU* op = (ncnn::ELU*)layer; - ncnn::ELU* op_default = (ncnn::ELU*)layer_default; - - fprintf_param_value(" 0=%e", alpha) - } - else if (layer->type == "Embed") - { - ncnn::Embed* op = (ncnn::Embed*)layer; - ncnn::Embed* op_default = (ncnn::Embed*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", input_dim) - fprintf_param_value(" 2=%d", bias_term) - fprintf_param_value(" 3=%d", weight_data_size) - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "Exp") - { - ncnn::Exp* op = (ncnn::Exp*)layer; - ncnn::Exp* op_default = (ncnn::Exp*)layer_default; - - fprintf_param_value(" 0=%e", base) - fprintf_param_value(" 1=%e", scale) - fprintf_param_value(" 2=%e", shift) - } - else if (layer->type == "ExpandDims") - { - ncnn::ExpandDims* op = (ncnn::ExpandDims*)layer; - ncnn::ExpandDims* op_default = (ncnn::ExpandDims*)layer_default; - - fprintf_param_value(" 0=%d", expand_w) - fprintf_param_value(" 1=%d", expand_h) - fprintf_param_value(" 2=%d", expand_c) - { - if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); - } - } - else if (layer->type == "GELU") - { - ncnn::GELU* op = (ncnn::GELU*)layer; - ncnn::GELU* op_default = (ncnn::GELU*)layer_default; - - fprintf_param_value(" 0=%d", fast_gelu) - } - else if (layer->type == "Gemm") - { - ncnn::Gemm* op = (ncnn::Gemm*)layer; - ncnn::Gemm* op_default = (ncnn::Gemm*)layer_default; - - fprintf_param_value(" 0=%e", alpha) - fprintf_param_value(" 1=%e", beta) - fprintf_param_value(" 2=%d", transA) - fprintf_param_value(" 3=%d", transB) - } - else if (layer->type == "GroupNorm") - { - ncnn::GroupNorm* op = (ncnn::GroupNorm*)layer; - ncnn::GroupNorm* op_default = (ncnn::GroupNorm*)layer_default; - - fprintf_param_value(" 0=%d", group) - fprintf_param_value(" 1=%d", channels) - fprintf_param_value(" 2=%e", eps) - fprintf_param_value(" 3=%d", affine) - - fwrite_weight_data(op->gamma_data, bp); - fwrite_weight_data(op->beta_data, bp); - } - else if (layer->type == "GRU") - { - ncnn::GRU* op = (ncnn::GRU*)layer; - ncnn::GRU* op_default = (ncnn::GRU*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", weight_data_size) - fprintf_param_value(" 2=%d", direction) - - fwrite_weight_tag_data(op->weight_xc_data, bp); - fwrite_weight_tag_data(op->bias_c_data, bp); - fwrite_weight_tag_data(op->weight_hc_data, bp); - } - else if (layer->type == "HardSigmoid") - { - ncnn::HardSigmoid* op = (ncnn::HardSigmoid*)layer; - ncnn::HardSigmoid* op_default = 
(ncnn::HardSigmoid*)layer_default; - - fprintf_param_value(" 0=%e", alpha) - fprintf_param_value(" 1=%e", beta) - } - else if (layer->type == "HardSwish") - { - ncnn::HardSwish* op = (ncnn::HardSwish*)layer; - ncnn::HardSwish* op_default = (ncnn::HardSwish*)layer_default; - - fprintf_param_value(" 0=%e", alpha) - fprintf_param_value(" 1=%e", beta) - } - else if (layer->type == "InnerProduct") - { - ncnn::InnerProduct* op = (ncnn::InnerProduct*)layer; - ncnn::InnerProduct* op_default = (ncnn::InnerProduct*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", bias_term) - fprintf_param_value(" 2=%d", weight_data_size) - fprintf_param_value(" 8=%d", int8_scale_term) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - -#if NCNN_INT8 - // write int8_scale data - if (op->int8_scale_term) - { - fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); - fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); - } -#endif // NCNN_INT8 - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - - mac += (uint64_t)inw * inh * inc * outw; - } - } - else if (layer->type == "Input") - { - ncnn::Input* op = (ncnn::Input*)layer; - ncnn::Input* op_default = (ncnn::Input*)layer_default; - - fprintf_param_value(" 0=%d", w) - fprintf_param_value(" 1=%d", h) - fprintf_param_value(" 2=%d", c) - } - else if (layer->type == "InstanceNorm") - { - ncnn::InstanceNorm* op = (ncnn::InstanceNorm*)layer; - ncnn::InstanceNorm* op_default = (ncnn::InstanceNorm*)layer_default; - - fprintf_param_value(" 0=%d", channels) - fprintf_param_value(" 1=%e", eps) - fprintf_param_value(" 2=%d", affine) - - fwrite_weight_data(op->gamma_data, bp); - fwrite_weight_data(op->beta_data, bp); - } - else if (layer->type == "Interp") - { - ncnn::Interp* op = (ncnn::Interp*)layer; - ncnn::Interp* op_default = (ncnn::Interp*)layer_default; - - fprintf_param_value(" 0=%d", resize_type) - fprintf_param_value(" 1=%e", height_scale) - fprintf_param_value(" 2=%e", width_scale) - fprintf_param_value(" 3=%d", output_height) - fprintf_param_value(" 4=%d", output_width) - fprintf_param_value(" 5=%d", dynamic_target_size) - fprintf_param_value(" 6=%d", align_corner) - } - else if (layer->type == "LayerNorm") - { - ncnn::LayerNorm* op = (ncnn::LayerNorm*)layer; - ncnn::LayerNorm* op_default = (ncnn::LayerNorm*)layer_default; - - fprintf_param_value(" 0=%d", affine_size) - fprintf_param_value(" 1=%e", eps) - fprintf_param_value(" 2=%d", affine) - - fwrite_weight_data(op->gamma_data, bp); - fwrite_weight_data(op->beta_data, bp); - } - else if (layer->type == "Log") - { - ncnn::Log* op = (ncnn::Log*)layer; - ncnn::Log* op_default = (ncnn::Log*)layer_default; - - fprintf_param_value(" 0=%e", base) - fprintf_param_value(" 1=%e", scale) - fprintf_param_value(" 2=%e", shift) - } - else if (layer->type == "LRN") - { - ncnn::LRN* op = (ncnn::LRN*)layer; - ncnn::LRN* op_default = (ncnn::LRN*)layer_default; - - fprintf_param_value(" 0=%d", region_type) - fprintf_param_value(" 1=%d", local_size) - fprintf_param_value(" 2=%e", alpha) - fprintf_param_value(" 3=%e", beta) - fprintf_param_value(" 4=%e", bias) - } - else if (layer->type == "LSTM") - { - ncnn::LSTM* op = 
(ncnn::LSTM*)layer; - ncnn::LSTM* op_default = (ncnn::LSTM*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", weight_data_size) - fprintf_param_value(" 2=%d", direction) - - fwrite_weight_tag_data(op->weight_xc_data, bp); - fwrite_weight_tag_data(op->bias_c_data, bp); - fwrite_weight_tag_data(op->weight_hc_data, bp); - } - else if (layer->type == "MatMul") - { - ncnn::MatMul* op = (ncnn::MatMul*)layer; - ncnn::MatMul* op_default = (ncnn::MatMul*)layer_default; - - fprintf_param_value(" 0=%d", transB) - } - else if (layer->type == "MemoryData") - { - ncnn::MemoryData* op = (ncnn::MemoryData*)layer; - ncnn::MemoryData* op_default = (ncnn::MemoryData*)layer_default; - - fprintf_param_value(" 0=%d", w) - fprintf_param_value(" 1=%d", h) - fprintf_param_value(" 2=%d", c) - fprintf_param_value(" 11=%d", d) - fwrite_weight_data(op->data, bp); - } - else if (layer->type == "MultiHeadAttention") - { - ncnn::MultiHeadAttention* op = (ncnn::MultiHeadAttention*)layer; - ncnn::MultiHeadAttention* op_default = (ncnn::MultiHeadAttention*)layer_default; - - fprintf_param_value(" 0=%d", embed_dim) - fprintf_param_value(" 1=%d", num_head) - fprintf_param_value(" 2=%d", weight_data_size) - - fwrite_weight_tag_data(op->q_weight_data, bp); - fwrite_weight_data(op->q_bias_data, bp); - fwrite_weight_tag_data(op->k_weight_data, bp); - fwrite_weight_data(op->k_bias_data, bp); - fwrite_weight_tag_data(op->v_weight_data, bp); - fwrite_weight_data(op->v_bias_data, bp); - fwrite_weight_tag_data(op->out_weight_data, bp); - fwrite_weight_data(op->out_bias_data, bp); - } - else if (layer->type == "MVN") - { - ncnn::MVN* op = (ncnn::MVN*)layer; - ncnn::MVN* op_default = (ncnn::MVN*)layer_default; - - fprintf_param_value(" 0=%d", normalize_variance) - fprintf_param_value(" 1=%d", across_channels) - fprintf_param_value(" 2=%e", eps) - } - else if (layer->type == "Normalize") - { - ncnn::Normalize* op = (ncnn::Normalize*)layer; - ncnn::Normalize* op_default = (ncnn::Normalize*)layer_default; - - fprintf_param_value(" 0=%d", across_spatial) - fprintf_param_value(" 1=%d", channel_shared) - fprintf_param_value(" 2=%e", eps) - fprintf_param_value(" 3=%d", scale_data_size) - fprintf_param_value(" 4=%d", across_channel) - fprintf_param_value(" 9=%d", eps_mode) - - fwrite_weight_data(op->scale_data, bp); - } - else if (layer->type == "Padding") - { - ncnn::Padding* op = (ncnn::Padding*)layer; - ncnn::Padding* op_default = (ncnn::Padding*)layer_default; - - fprintf_param_value(" 0=%d", top) - fprintf_param_value(" 1=%d", bottom) - fprintf_param_value(" 2=%d", left) - fprintf_param_value(" 3=%d", right) - fprintf_param_value(" 4=%d", type) - fprintf_param_value(" 5=%e", value) - fprintf_param_value(" 6=%d", per_channel_pad_data_size) - fprintf_param_value(" 7=%d", front) - fprintf_param_value(" 8=%d", behind) - - fwrite_weight_data(op->per_channel_pad_data, bp); - } - else if (layer->type == "Permute") - { - ncnn::Permute* op = (ncnn::Permute*)layer; - ncnn::Permute* op_default = (ncnn::Permute*)layer_default; - - fprintf_param_value(" 0=%d", order_type) - } - else if (layer->type == "PixelShuffle") - { - ncnn::PixelShuffle* op = (ncnn::PixelShuffle*)layer; - ncnn::PixelShuffle* op_default = (ncnn::PixelShuffle*)layer_default; - - fprintf_param_value(" 0=%d", upscale_factor) - fprintf_param_value(" 1=%d", mode) - } - else if (layer->type == "Pooling") - { - ncnn::Pooling* op = (ncnn::Pooling*)layer; - ncnn::Pooling* op_default = (ncnn::Pooling*)layer_default; - - fprintf_param_value(" 0=%d", 
pooling_type) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 12=%d", op->stride_h); - } - fprintf_param_value(" 3=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 13=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 15=%d", op->pad_bottom); - } - fprintf_param_value(" 4=%d", global_pooling) - fprintf_param_value(" 5=%d", pad_mode) - fprintf_param_value(" 6=%d", avgpool_count_include_pad) - fprintf_param_value(" 7=%d", adaptive_pooling) - fprintf_param_value(" 8=%d", out_w) - { - if (op->out_h != op->out_w) fprintf(pp, " 18=%d", op->out_h); - } - } - else if (layer->type == "Pooling1D") - { - ncnn::Pooling1D* op = (ncnn::Pooling1D*)layer; - ncnn::Pooling1D* op_default = (ncnn::Pooling1D*)layer_default; - - fprintf_param_value(" 0=%d", pooling_type) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", stride_w) - fprintf_param_value(" 3=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); - } - fprintf_param_value(" 4=%d", global_pooling) - fprintf_param_value(" 5=%d", pad_mode) - fprintf_param_value(" 6=%d", avgpool_count_include_pad) - fprintf_param_value(" 7=%d", adaptive_pooling) - fprintf_param_value(" 8=%d", out_w) - } - else if (layer->type == "Pooling3D") - { - ncnn::Pooling3D* op = (ncnn::Pooling3D*)layer; - ncnn::Pooling3D* op_default = (ncnn::Pooling3D*)layer_default; - - fprintf_param_value(" 0=%d", pooling_type) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 12=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 22=%d", op->stride_d); - } - fprintf_param_value(" 3=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 13=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 23=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 15=%d", op->pad_bottom); - } - { - if (op->pad_behind != op->pad_front) fprintf(pp, " 16=%d", op->pad_behind); - } - fprintf_param_value(" 4=%d", global_pooling) - fprintf_param_value(" 5=%d", pad_mode) - fprintf_param_value(" 6=%d", avgpool_count_include_pad) - fprintf_param_value(" 7=%d", adaptive_pooling) - fprintf_param_value(" 8=%d", out_w) - { - if (op->out_h != op->out_w) fprintf(pp, " 18=%d", op->out_h); - if (op->out_d != op->out_w) fprintf(pp, " 28=%d", op->out_d); - } - } - else if (layer->type == "Power") - { - ncnn::Power* op = (ncnn::Power*)layer; - ncnn::Power* op_default = (ncnn::Power*)layer_default; - - fprintf_param_value(" 0=%e", power) - fprintf_param_value(" 1=%e", scale) - fprintf_param_value(" 2=%e", shift) - } - else if (layer->type == "PReLU") - { - ncnn::PReLU* op = (ncnn::PReLU*)layer; - ncnn::PReLU* op_default = (ncnn::PReLU*)layer_default; - - fprintf_param_value(" 0=%d", num_slope) - - fwrite_weight_data(op->slope_data, bp); - } - else if (layer->type == "PriorBox") - { - ncnn::PriorBox* op = (ncnn::PriorBox*)layer; - ncnn::PriorBox* op_default = (ncnn::PriorBox*)layer_default; 
- - { - if (!op->min_sizes.empty()) fprintf_param_float_array(0, op->min_sizes, pp); - } - { - if (!op->max_sizes.empty()) fprintf_param_float_array(1, op->max_sizes, pp); - } - { - if (!op->aspect_ratios.empty()) fprintf_param_float_array(2, op->aspect_ratios, pp); - } - fprintf_param_value(" 3=%e", variances[0]) - fprintf_param_value(" 4=%e", variances[1]) - fprintf_param_value(" 5=%e", variances[2]) - fprintf_param_value(" 6=%e", variances[3]) - fprintf_param_value(" 7=%d", flip) - fprintf_param_value(" 8=%d", clip) - fprintf_param_value(" 9=%d", image_width) - fprintf_param_value(" 10=%d", image_height) - fprintf_param_value(" 11=%e", step_width) - fprintf_param_value(" 12=%e", step_height) - fprintf_param_value(" 13=%e", offset) - } - else if (layer->type == "Proposal") - { - ncnn::Proposal* op = (ncnn::Proposal*)layer; - ncnn::Proposal* op_default = (ncnn::Proposal*)layer_default; - - fprintf_param_value(" 0=%d", feat_stride) - fprintf_param_value(" 1=%d", base_size) - fprintf_param_value(" 2=%d", pre_nms_topN) - fprintf_param_value(" 3=%d", after_nms_topN) - fprintf_param_value(" 4=%e", nms_thresh) - fprintf_param_value(" 5=%d", min_size) - } - else if (layer->type == "PSROIPooling") - { - ncnn::PSROIPooling* op = (ncnn::PSROIPooling*)layer; - ncnn::PSROIPooling* op_default = (ncnn::PSROIPooling*)layer_default; - - fprintf_param_value(" 0=%d", pooled_width) - fprintf_param_value(" 1=%d", pooled_height) - fprintf_param_value(" 2=%e", spatial_scale) - fprintf_param_value(" 3=%d", output_dim) - } - else if (layer->type == "Quantize") - { - ncnn::Quantize* op = (ncnn::Quantize*)layer; - ncnn::Quantize* op_default = (ncnn::Quantize*)layer_default; - - fprintf_param_value(" 0=%d", scale_data_size) - - fwrite_weight_data(op->scale_data, bp); - } - else if (layer->type == "Reduction") - { - ncnn::Reduction* op = (ncnn::Reduction*)layer; - ncnn::Reduction* op_default = (ncnn::Reduction*)layer_default; - - fprintf_param_value(" 0=%d", operation) - fprintf_param_value(" 1=%d", reduce_all) - fprintf_param_value(" 2=%e", coeff) - { - if (!op->axes.empty()) fprintf_param_int_array(3, op->axes, pp); - } - fprintf_param_value(" 4=%d", keepdims) - - // HACK - if (!op->axes.empty()) - { - int fixbug0 = 1; - fprintf(pp, " 5=%d", fixbug0); - } - } - else if (layer->type == "ReLU") - { - ncnn::ReLU* op = (ncnn::ReLU*)layer; - ncnn::ReLU* op_default = (ncnn::ReLU*)layer_default; - - fprintf_param_value(" 0=%e", slope) - } - else if (layer->type == "Reorg") - { - ncnn::Reorg* op = (ncnn::Reorg*)layer; - ncnn::Reorg* op_default = (ncnn::Reorg*)layer_default; - - fprintf_param_value(" 0=%d", stride) - fprintf_param_value(" 1=%d", mode) - } - else if (layer->type == "Requantize") - { - ncnn::Requantize* op = (ncnn::Requantize*)layer; - ncnn::Requantize* op_default = (ncnn::Requantize*)layer_default; - - fprintf_param_value(" 0=%d", scale_in_data_size) - fprintf_param_value(" 1=%d", scale_out_data_size) - fprintf_param_value(" 2=%d", bias_data_size) - fprintf_param_value(" 3=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(4, op->activation_params, pp); - } - - fwrite_weight_data(op->scale_in_data, bp); - fwrite_weight_data(op->scale_out_data, bp); - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "Reshape") - { - ncnn::Reshape* op = (ncnn::Reshape*)layer; - ncnn::Reshape* op_default = (ncnn::Reshape*)layer_default; - - fprintf_param_value(" 0=%d", w) - fprintf_param_value(" 1=%d", h) - fprintf_param_value(" 2=%d", c) - fprintf_param_value(" 3=%d", 
permute) - } - else if (layer->type == "RNN") - { - ncnn::RNN* op = (ncnn::RNN*)layer; - ncnn::RNN* op_default = (ncnn::RNN*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", weight_data_size) - fprintf_param_value(" 2=%d", direction) - - fwrite_weight_tag_data(op->weight_xc_data, bp); - fwrite_weight_tag_data(op->bias_c_data, bp); - fwrite_weight_tag_data(op->weight_hc_data, bp); - } - else if (layer->type == "ROIAlign") - { - ncnn::ROIAlign* op = (ncnn::ROIAlign*)layer; - ncnn::ROIAlign* op_default = (ncnn::ROIAlign*)layer_default; - - fprintf_param_value(" 0=%d", pooled_width) - fprintf_param_value(" 1=%d", pooled_height) - fprintf_param_value(" 2=%e", spatial_scale) - fprintf_param_value(" 3=%d", sampling_ratio) - fprintf_param_value(" 4=%d", aligned) - fprintf_param_value(" 5=%d", version) - } - else if (layer->type == "ROIPooling") - { - ncnn::ROIPooling* op = (ncnn::ROIPooling*)layer; - ncnn::ROIPooling* op_default = (ncnn::ROIPooling*)layer_default; - - fprintf_param_value(" 0=%d", pooled_width) - fprintf_param_value(" 1=%d", pooled_height) - fprintf_param_value(" 2=%e", spatial_scale) - } - else if (layer->type == "Scale") - { - ncnn::Scale* op = (ncnn::Scale*)layer; - ncnn::Scale* op_default = (ncnn::Scale*)layer_default; - - fprintf_param_value(" 0=%d", scale_data_size) - fprintf_param_value(" 1=%d", bias_term) - - fwrite_weight_data(op->scale_data, bp); - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "ShuffleChannel") - { - ncnn::ShuffleChannel* op = (ncnn::ShuffleChannel*)layer; - ncnn::ShuffleChannel* op_default = (ncnn::ShuffleChannel*)layer_default; - - fprintf_param_value(" 0=%d", group) - fprintf_param_value(" 1=%d", reverse) - } - else if (layer->type == "Slice") - { - ncnn::Slice* op = (ncnn::Slice*)layer; - ncnn::Slice* op_default = (ncnn::Slice*)layer_default; - - { - if (!op->slices.empty()) fprintf_param_int_array(0, op->slices, pp); - } - fprintf_param_value(" 1=%d", axis) - } - else if (layer->type == "Softmax") - { - ncnn::Softmax* op = (ncnn::Softmax*)layer; - ncnn::Softmax* op_default = (ncnn::Softmax*)layer_default; - - fprintf_param_value(" 0=%d", axis) - - // HACK - if (op->axis != 0) - { - int fixbug0 = 1; - fprintf(pp, " 1=%d", fixbug0); - } - } - else if (layer->type == "Squeeze") - { - ncnn::Squeeze* op = (ncnn::Squeeze*)layer; - ncnn::Squeeze* op_default = (ncnn::Squeeze*)layer_default; - - fprintf_param_value(" 0=%d", squeeze_w) - fprintf_param_value(" 1=%d", squeeze_h) - fprintf_param_value(" 2=%d", squeeze_c) - { - if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); - } - } - else if (layer->type == "Threshold") - { - ncnn::Threshold* op = (ncnn::Threshold*)layer; - ncnn::Threshold* op_default = (ncnn::Threshold*)layer_default; - - fprintf_param_value(" 0=%e", threshold) - } - else if (layer->type == "UnaryOp") - { - ncnn::UnaryOp* op = (ncnn::UnaryOp*)layer; - ncnn::UnaryOp* op_default = (ncnn::UnaryOp*)layer_default; - - fprintf_param_value(" 0=%d", op_type) - } - else if (layer->type == "YoloDetectionOutput") - { - ncnn::YoloDetectionOutput* op = (ncnn::YoloDetectionOutput*)layer; - ncnn::YoloDetectionOutput* op_default = (ncnn::YoloDetectionOutput*)layer_default; - - fprintf_param_value(" 0=%d", num_class) - fprintf_param_value(" 1=%d", num_box) - fprintf_param_value(" 2=%e", confidence_threshold) - fprintf_param_value(" 3=%e", nms_threshold) - { - if (!op->biases.empty()) fprintf_param_float_array(4, op->biases, pp); - } - } - else if (layer->type == 
"Yolov3DetectionOutput") - { - ncnn::Yolov3DetectionOutput* op = (ncnn::Yolov3DetectionOutput*)layer; - ncnn::Yolov3DetectionOutput* op_default = (ncnn::Yolov3DetectionOutput*)layer_default; - - fprintf_param_value(" 0=%d", num_class) - fprintf_param_value(" 1=%d", num_box) - fprintf_param_value(" 2=%e", confidence_threshold) - fprintf_param_value(" 3=%e", nms_threshold) - { - if (!op->biases.empty()) fprintf_param_float_array(4, op->biases, pp); - } - { - if (!op->mask.empty()) fprintf_param_int_array(5, op->mask, pp); - } - { - if (!op->anchors_scale.empty()) fprintf_param_float_array(6, op->anchors_scale, pp); - } - } - -#undef fprintf_param_value - - fprintf(pp, "\n"); - - delete layer_default; - } - - fclose(pp); - fclose(bp); - - if (mac) - { - fprintf(stderr, "mac = %llu = %.2f M\n", static_cast(mac), mac / 1000000.0); - } - - return 0; -} diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index 72c76d135015..3cf1460592ae 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -1,3 +1,4 @@ +set(CMAKE_CXX_STANDARD 11) if(NCNN_PIXEL) if(NOT NCNN_SIMPLEOCV) @@ -17,15 +18,15 @@ if(NCNN_PIXEL) set(OpenCV_FOUND FALSE) if(OpenCV_FOUND) - add_executable(ncnn2table ncnn2table.cpp) + add_executable(ncnn2table ncnn2table.cpp ini_config.cpp) target_include_directories(ncnn2table PRIVATE ${OpenCV_INCLUDE_DIRS}) target_link_libraries(ncnn2table PRIVATE ncnn ${OpenCV_LIBS}) elseif(NCNN_SIMPLEOCV) - add_executable(ncnn2table ncnn2table.cpp) + add_executable(ncnn2table ncnn2table.cpp ini_config.cpp) target_compile_definitions(ncnn2table PUBLIC USE_NCNN_SIMPLEOCV) target_link_libraries(ncnn2table PRIVATE ncnn) else() - add_executable(ncnn2table ncnn2table.cpp imreadwrite.cpp) + add_executable(ncnn2table ncnn2table.cpp imreadwrite.cpp ini_config.cpp) target_compile_definitions(ncnn2table PUBLIC USE_LOCAL_IMREADWRITE) target_link_libraries(ncnn2table PRIVATE ncnn) endif() @@ -34,7 +35,7 @@ if(NCNN_PIXEL) set_property(TARGET ncnn2table PROPERTY FOLDER "tools/optimization") endif() -add_executable(ncnn2int8 ncnn2int8.cpp) +add_executable(ncnn2int8 ncnn2int8.cpp ini_config.cpp net_quantize.cpp ../modelwriter.cpp) target_link_libraries(ncnn2int8 PRIVATE ncnn) # add ncnn2int8 tool to a virtual project group diff --git a/tools/quantize/imreadwrite.h b/tools/quantize/imreadwrite.h index 5a955dfd35e1..f25286502194 100644 --- a/tools/quantize/imreadwrite.h +++ b/tools/quantize/imreadwrite.h @@ -11,7 +11,7 @@ // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
- +#pragma once #ifndef IMREADWRITE_H #define IMREADWRITE_H diff --git a/tools/quantize/ini_config.cpp b/tools/quantize/ini_config.cpp new file mode 100644 index 000000000000..9da3ae9fcb86 --- /dev/null +++ b/tools/quantize/ini_config.cpp @@ -0,0 +1,224 @@ +#include "ini_config.h" +#include +#include + +namespace ini { + +template +void Value::set(T val) +{ + text = std::to_string(f); +} + +void Value::set(std::string str) +{ + text = '\"' + str + '\"'; +} + +template +void Value::set(const std::vector& data) +{ + text = "[ "; + + size_t len = data.size(); + if (len > 0) + { + size_t i = 0; + for (; i < len - 1; ++i) + { + text += std::to_string(data[i]); + text += ", "; + } + text += std::to_string(data[i]); + text += " "; + } + + text += "]"; +} + +template +T Value::get() +{ + T result; + std::stringstream ss; + ss << text; + ss >> result; + return result; +} + +template +std::vector Value::get() +{ + std::vector result; + + std::string no_brace; + { + // remove brace + auto start = text.find('['); + auto end = text.find(']'); + no_brace = text.substr(start + 1, end); + } + + { + // split with the separator ',' + std::stringstream ss; + size_t end = 0, start = 0; + while (true) + { + end = no_brace.find(',', start); + if (end == std::string::npos) + { + break; + } + + std::string val_str = no_brace.substr(start, end); + start = end + 1; + + T val; + ss << val_str; + ss >> val; + ss.clear(); + result.emplace_back(val); + } + + // parse the last one + std::string val_str = no_brace.substr(start); + T val; + ss << val_str; + ss >> val; + result.emplace_back(val); + } + + return result; +} + +std::string Value::stringify() +{ + return text; +} + +void Table::feed(std::string line) +{ + auto pos = line.find(':'); + assert(pos != std::string::npos); + + std::string key = line.substr(0, pos - 1); + std::string value_str = line.substr(pos + 1); + + values[key] = std::make_shared(value_str); +} + +void Table::feed(const std::vector& lines) +{ + for (auto& line : lines) + { + feed(line); + } +} + +void Table::append(std::string key, float data) +{ + auto pVal = std::make_shared(); + pVal->set(data); + values[key] = pVal; +} + +void Table::append(std::string key, const std::vector& data) +{ + auto pVal = std::make_shared(); + pVal->set(data); + values[key] = pVal; +} + +void Table::append(std::string key, std::string data) +{ + auto pVal = std::make_shared(); + pVal->set(data); + values[key] = pVal; +} + +std::shared_ptr Table::operator[](std::string key) +{ + return values[key]; +} + +std::string Table::stringify() +{ + std::string result; + for (auto itra = values.begin(); itra != values.end(); ++itra) + { + result += itra->first; + result += " = "; + result += itra->second->stringify(); + result += '\n'; + } + return result; +} + +void Config::read(std::string path) +{ + std::ifstream fin; + fin.open(path, std::ios::in); + + if (!fin.is_open()) + { + fprintf(stderr, "open %s failed\n", path.c_str()); + return; + } + + bool recoding = false; + std::shared_ptr pTable = nullptr; + + std::string line; + while (fin >> line) + { + if (nullptr == pTable) + { + auto start = line.find('['); + auto end = line.find(']'); + assert(start != std::string::npos); + assert(end != std::string::npos); + + std::string key = line.substr(start + 1, end); + pTable = std::make_shared
<Table>();
+            tables[key] = pTable;
+            continue;
+        }
+
+        if (line.length() <= 2)
+        {
+            pTable = nullptr;
+            continue;
+        }
+
+        pTable->feed(line);
+    }
+}
+
+std::vector<std::string> Config::list_all()
+{
+    std::vector<std::string> result;
+    for (auto itra = tables.begin(); itra != tables.end(); ++itra)
+    {
+        result.push_back(itra->first);
+    }
+    return result;
+}
+
+std::shared_ptr<Table> Config::operator[](std::string key)
+{
+    return tables[key];
+}
+
+void Config::append(std::string key, std::shared_ptr<Table>
table) +{ + tables[key] = table; +} + +void Config::write(std::string path) +{ + // TODO +} + +} // namespace ini diff --git a/tools/quantize/ini_config.h b/tools/quantize/ini_config.h new file mode 100644 index 000000000000..7509b48efeb5 --- /dev/null +++ b/tools/quantize/ini_config.h @@ -0,0 +1,323 @@ +// tpoisonooo is pleased to support the open source community by making ncnn available. +// +// author:tpoisonooo (https://github.com/tpoisonooo/) . +// +// Copyright (C) 2022 tpoisonooo. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ini format table reader and writer +// file example: +// +// [Conv_0] +// type = "Conv" +// input_scale = 127.0 +// weight = [ 1117.265625, 8819.232421875 ] +// +// [LayerNorm_66] +// type = "LayerNorm" +// zero_point = -24 + +namespace ini { + +template +std::string value_set(T data) +{ + return std::to_string(data); +} + +template<> +std::string value_set(std::string data); + +template<> +std::string value_set(const char* data); + +template +std::string value_set(const std::vector& data) +{ + std::string text = "[ "; + size_t len = data.size(); + if (len > 0) + { + size_t i = 0; + for (; i < len - 1; ++i) + { + text += std::to_string(data[i]); + text += ", "; + } + text += std::to_string(data[i]); + text += " "; + } + text += "]"; + return text; +} + +template +T value_get(std::string text) +{ + T result; + std::stringstream ss; + ss << text; + ss >> result; + return result; +} + +template<> +std::string value_get(std::string text); + +/** + * @brief parse `[1, 2.2]` format to value list + * + * @tparam T + * @param text + * @return std::vector + */ +template +std::vector value_get_list(std::string text) +{ + std::vector result; + std::string no_brace; + { + // remove brace + auto start = text.find('['); + auto end = text.find(']'); + no_brace = text.substr(start + 1, end - start - 1); + } + + { + // split with the separator ',' + std::stringstream ss; + size_t end = 0, start = 0; + while (true) + { + end = no_brace.find(',', start); + if (end == std::string::npos) + { + break; + } + + std::string val_str = no_brace.substr(start, end - start); + start = end + 1; + + T val; + ss << val_str; + ss >> val; + ss.clear(); + result.emplace_back(val); + } + + // parse the last one + std::string val_str = no_brace.substr(start); + T val; + ss << val_str; + ss >> val; + result.emplace_back(val); + } + return result; +} + +/** + * @brief contains multiple `key=value` lines + * + */ +class Table +{ +public: + Table() + { + } + + void feed(std::string line) + { + auto pos = line.find('='); + assert(pos != std::string::npos); + + std::string key = line.substr(0, pos - 1); + std::string value_str = line.substr(pos + 2); + + values[key] = value_str; + } + + void feed(const std::vector& lines) + { + for (auto& line : lines) + { + feed(line); + } + } + + std::string operator[](std::string key) + { + return values[key]; + } + + template + T 
get(std::string key)
+    {
+        std::string text = values.at(key);
+        return value_get<T>(text);
+    }
+
+    template<typename T>
+    std::vector<T> get_list(std::string key)
+    {
+        std::string text = values[key];
+        return value_get_list<T>(text);
+    }
+
+    template<typename T>
+    void append(std::string key, T data)
+    {
+        values[key] = value_set(data);
+    }
+
+    template<typename T>
+    void append(std::string key, const std::vector<T>& data)
+    {
+        values[key] = value_set(data);
+    }
+
+    std::string stringify()
+    {
+        std::string result;
+        for (auto itra = values.begin(); itra != values.end(); ++itra)
+        {
+            result += itra->first;
+            result += " = ";
+            result += itra->second;
+            result += '\n';
+        }
+        return result;
+    }
+
+private:
+    std::map<std::string, std::string> values;
+};
+
+/**
+ * @brief `Config` consists of multiple named tables
+ *
+ */
+class Config
+{
+public:
+    Config()
+    {
+    }
+
+    void read(std::string path)
+    {
+        std::ifstream fin;
+        fin.open(path, std::ios::in);
+
+        if (!fin.is_open())
+        {
+            fprintf(stderr, "open %s failed\n", path.c_str());
+            return;
+        }
+
+        std::shared_ptr<Table>
pTable = nullptr; + constexpr int BUF_LEN = 1024 * 1024; + char buf[BUF_LEN] = {0}; + std::string line; + while (!fin.eof()) + { + fin.getline(buf, BUF_LEN); + line = std::string(buf); + + if (line.length() <= 2) + { + pTable = nullptr; + continue; + } + + if (nullptr == pTable) + { + auto start = line.find('['); + auto end = line.find(']'); + assert(start != std::string::npos); + assert(end != std::string::npos); + + std::string key = line.substr(start + 1, end - start - 1); + + pTable = std::make_shared
<Table>();
+                append(key, pTable);
+                continue;
+            }
+
+            pTable->feed(line);
+        }
+
+        fin.close();
+    }
+
+    std::vector<std::string> keys()
+    {
+        std::vector<std::string> result;
+        for (auto& pair : tables)
+        {
+            result.push_back(std::get<0>(pair));
+        }
+        return result;
+    }
+
+    size_t size()
+    {
+        return tables.size();
+    }
+
+    std::tuple<std::string, std::shared_ptr<Table> > operator[](size_t i)
+    {
+        return tables[i];
+    }
+
+    void append(const std::string& key, std::shared_ptr<Table>
table) + { + tables.emplace_back(std::make_pair(key, table)); + } + + void write(const std::string& path) + { + std::ofstream fout; + fout.open(path, std::ios::out); + if (!fout.is_open()) + { + fprintf(stderr, "open %s failed\n", path.c_str()); + } + + for (auto& pair : tables) + { + std::string name = std::get<0>(pair); + std::shared_ptr
ptable = std::get<1>(pair); + fout << "[" << name << "]\n"; + fout << ptable->stringify(); + fout << "\n"; + } + fout.flush(); + fout.close(); + } + +private: + std::vector > > tables; +}; + +} // namespace ini diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index f712306b0228..e820217392b3 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -24,12 +24,9 @@ // ncnn public header #include "datareader.h" -#include "layer.h" -#include "layer_type.h" -#include "net.h" // ncnn private header -#include "../modelwriter.h" +#include "net_quantize.h" class DataReaderFromEmpty : public ncnn::DataReader { @@ -45,476 +42,6 @@ class DataReaderFromEmpty : public ncnn::DataReader } }; -static bool read_int8scale_table(const char* filepath, std::map& blob_int8scale_table, std::map& weight_int8scale_table) -{ - blob_int8scale_table.clear(); - weight_int8scale_table.clear(); - - FILE* fp = fopen(filepath, "rb"); - if (!fp) - { - fprintf(stderr, "Open %s failed.\n", filepath); - return false; - } - - std::string key_str; - std::vector scales; - - std::vector line(10240000); - char* pch = NULL; - size_t len = 0; - - while (!feof(fp)) - { - char* s = fgets(line.data(), (int)line.size(), fp); - if (!s) - break; - - float scale = 1.f; - char key[256]; - line[strcspn(line.data(), "\r\n")] = 0; - - pch = strtok(line.data(), " "); - - if (pch == NULL) break; - - bool is_key = true; - while (pch != NULL) - { - if (is_key) - { - sscanf(pch, "%255s", key); - - key_str = key; - is_key = false; - } - else - { - sscanf(pch, "%f", &scale); - - scales.push_back(scale); - } - - pch = strtok(NULL, " "); - } - - // XYZ_param_N pattern - if (strstr(key_str.c_str(), "_param_")) - { - weight_int8scale_table[key_str] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); - } - else - { - blob_int8scale_table[key_str] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); - } - key_str.clear(); - scales.clear(); - } - - fclose(fp); - - return true; -} - -class NetQuantize : public ModelWriter -{ -public: - NetQuantize(); - - std::map blob_int8scale_table; - std::map weight_int8scale_table; - -public: - int quantize_convolution(); - int quantize_convolutiondepthwise(); - int quantize_innerproduct(); - - int fuse_requantize(); -}; - -NetQuantize::NetQuantize() - : ModelWriter() -{ -} - -int NetQuantize::quantize_convolution() -{ - const int layer_count = static_cast(layers.size()); - for (int i = 0; i < layer_count; i++) - { - // find convolution layer - if (layers[i]->type != "Convolution") - continue; - - // find convolution layer - std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); - if (iter_data == blob_int8scale_table.end()) - continue; - - char key[256]; - sprintf(key, "%s_param_0", layers[i]->name.c_str()); - - std::map::iterator iter = weight_int8scale_table.find(key); - if (iter == weight_int8scale_table.end()) - { - fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); - return -1; - } - - // Convolution - quantize weight from fp32 to int8 - ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i]; - - ncnn::Mat bottom_blob_int8_scales = iter_data->second; - ncnn::Mat weight_data_int8_scales = iter->second; - - fprintf(stderr, "quantize_convolution %s\n", convolution->name.c_str()); - - { - const int maxk = convolution->kernel_w * convolution->kernel_h; - const int num_input = convolution->weight_data_size / convolution->num_output / maxk; - - ncnn::Mat weight_data_r2 = 
convolution->weight_data.reshape(maxk, num_input, convolution->num_output); - - ncnn::Mat weight_data_int8; - - ncnn::Option opt_q = opt; - opt_q.blob_allocator = convolution->weight_data.allocator; - opt_q.use_packing_layout = false; - ncnn::quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); - if (weight_data_int8.empty()) - return -100; - - convolution->weight_data = weight_data_int8.reshape(convolution->weight_data_size); - } - - convolution->int8_scale_term = 2; - convolution->weight_data_int8_scales = weight_data_int8_scales; - convolution->bottom_blob_int8_scales = bottom_blob_int8_scales; - } - - return 0; -} - -int NetQuantize::quantize_convolutiondepthwise() -{ - const int layer_count = static_cast(layers.size()); - for (int i = 0; i < layer_count; i++) - { - // find convolution layer - if (layers[i]->type != "ConvolutionDepthWise") - continue; - - // find convolutiondepthwise layer - std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); - if (iter_data == blob_int8scale_table.end()) - continue; - - char key[256]; - sprintf(key, "%s_param_0", layers[i]->name.c_str()); - - std::map::iterator iter = weight_int8scale_table.find(key); - if (iter == weight_int8scale_table.end()) - { - fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); - return -1; - } - - // Convolution - quantize weight from fp32 to int8 - ncnn::ConvolutionDepthWise* convdw = (ncnn::ConvolutionDepthWise*)layers[i]; - - ncnn::Mat bottom_blob_int8_scales = iter_data->second; - ncnn::Mat weight_data_int8_scales = iter->second; - - fprintf(stderr, "quantize_convolutiondepthwise %s\n", convdw->name.c_str()); - - { - ncnn::Mat int8_weight_data(convdw->weight_data_size, (size_t)1u); - if (int8_weight_data.empty()) - return -100; - - const int weight_data_size_g = convdw->weight_data_size / convdw->group; - - for (int g = 0; g < convdw->group; g++) - { - ncnn::Option opt_q = opt; - opt_q.blob_allocator = int8_weight_data.allocator; - opt_q.use_packing_layout = false; - - const ncnn::Mat weight_data_g = convdw->weight_data.range(weight_data_size_g * g, weight_data_size_g); - ncnn::Mat int8_weight_data_g = int8_weight_data.range(weight_data_size_g * g, weight_data_size_g); - const ncnn::Mat weight_data_int8_scales_g = weight_data_int8_scales.range(g, 1); - ncnn::quantize_to_int8(weight_data_g, int8_weight_data_g, weight_data_int8_scales_g, opt_q); - } - - convdw->weight_data = int8_weight_data; - } - - convdw->int8_scale_term = 1; - convdw->weight_data_int8_scales = weight_data_int8_scales; - convdw->bottom_blob_int8_scales = bottom_blob_int8_scales; - } - - return 0; -} - -int NetQuantize::quantize_innerproduct() -{ - const int layer_count = static_cast(layers.size()); - for (int i = 0; i < layer_count; i++) - { - // find convolution layer - if (layers[i]->type != "InnerProduct") - continue; - - // find InnerProduct layer - std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); - if (iter_data == blob_int8scale_table.end()) - continue; - - char key[256]; - sprintf(key, "%s_param_0", layers[i]->name.c_str()); - - std::map::iterator iter = weight_int8scale_table.find(key); - if (iter == weight_int8scale_table.end()) - { - fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); - return -1; - } - - // InnerProduct - quantize weight from fp32 to int8 - ncnn::InnerProduct* fc = (ncnn::InnerProduct*)layers[i]; - - ncnn::Mat bottom_blob_int8_scales = iter_data->second; - ncnn::Mat weight_data_int8_scales = 
iter->second; - - fprintf(stderr, "quantize_innerproduct %s\n", fc->name.c_str()); - - { - const int num_input = fc->weight_data_size / fc->num_output; - - ncnn::Mat weight_data_r2 = fc->weight_data.reshape(num_input, fc->num_output); - - ncnn::Mat weight_data_int8; - ncnn::Option opt_q = opt; - opt_q.use_packing_layout = false; - ncnn::quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); - if (weight_data_int8.empty()) - return -100; - - fc->weight_data = weight_data_int8.reshape(fc->weight_data_size); - } - - fc->int8_scale_term = 2; - fc->weight_data_int8_scales = weight_data_int8_scales; - fc->bottom_blob_int8_scales = bottom_blob_int8_scales; - } - - return 0; -} - -int NetQuantize::fuse_requantize() -{ - const size_t layer_count = layers.size(); - for (size_t i = 0; i < layer_count; i++) - { - if (layers[i]->type != "Convolution" && layers[i]->type != "ConvolutionDepthWise") - continue; - - // Convolution/ConvolutionDepthWise - Convolution/ConvolutionDepthWise - int top_blob_index = layers[i]->tops[0]; - - size_t j = i + 1; - for (; j < layer_count; j++) - { - if (layers[j]->type != "Convolution" && layers[j]->type != "ConvolutionDepthWise") - continue; - - if (layers[j]->bottoms.size() != 1) - continue; - - if (layers[j]->bottoms[0] == top_blob_index) - break; - } - - if (j == layer_count) - continue; - - // fuse requantize - fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), layers[j]->name.c_str()); - - if (layers[i]->type == "Convolution" && layers[j]->type == "Convolution") - { - ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; - ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "Convolution" && layers[j]->type == "ConvolutionDepthWise") - { - ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; - ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "Convolution") - { - ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; - ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "ConvolutionDepthWise") - { - ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; - ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - } - - for (size_t i = 0; i < layer_count; i++) - { - if (layers[i]->type != "Convolution" && layers[i]->type != "ConvolutionDepthWise") - continue; - - // 
Convolution/ConvolutionDepthWise - Split - Convolution/ConvolutionDepthWise - int top_blob_index = layers[i]->tops[0]; - - size_t j = i + 1; - for (; j < layer_count; j++) - { - if (layers[j]->type != "Split") - continue; - - if (layers[j]->bottoms.size() != 1) - continue; - - if (layers[j]->bottoms[0] == top_blob_index) - break; - } - - if (j == layer_count) - continue; - - ncnn::Split* split = (ncnn::Split*)layers[j]; - - bool all_conv = true; - for (size_t p = 0; p < split->tops.size(); p++) - { - int split_top_blob_index = split->tops[p]; - - size_t k = j + 1; - for (; k < layer_count; k++) - { - if (layers[k]->type != "Convolution" && layers[k]->type != "ConvolutionDepthWise") - continue; - - if (layers[k]->bottoms.size() != 1) - continue; - - if (layers[k]->bottoms[0] == split_top_blob_index) - break; - } - - if (k == layer_count) - { - all_conv = false; - break; - } - - if (layers[k]->type == "Convolution") - { - ncnn::Convolution* convolution = (ncnn::Convolution*)layers[k]; - if (convolution->weight_data.elemsize != 1u) - { - all_conv = false; - break; - } - } - if (layers[k]->type == "ConvolutionDepthWise") - { - ncnn::ConvolutionDepthWise* convolution = (ncnn::ConvolutionDepthWise*)layers[k]; - if (convolution->weight_data.elemsize != 1u) - { - all_conv = false; - break; - } - } - } - - if (!all_conv) - continue; - - j = blobs[split->tops[0]].consumer; - - // fuse requantize - fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), split->name.c_str()); - - if (layers[i]->type == "Convolution" && layers[j]->type == "Convolution") - { - ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; - ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "Convolution" && layers[j]->type == "ConvolutionDepthWise") - { - ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; - ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "Convolution") - { - ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; - ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "ConvolutionDepthWise") - { - ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; - ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - } - - return 0; -} - int main(int argc, char** argv) { if (argc != 6) @@ -532,14 +59,21 @@ int main(int argc, char** argv) NetQuantize quantizer; // parse the 
calibration scale table - if (int8scale_table_path) + bool success = false; + if (std::string(int8scale_table_path).find(".ini") == std::string::npos) + { + quantizer.set_weight_suffix("_param_0"); + success = quantizer.read_txt_format(int8scale_table_path); + } + else { - bool s2 = read_int8scale_table(int8scale_table_path, quantizer.blob_int8scale_table, quantizer.weight_int8scale_table); - if (!s2) - { - fprintf(stderr, "read_int8scale_table failed\n"); - return -1; - } + success = quantizer.read_ini_format(int8scale_table_path); + } + + if (!success) + { + fprintf(stderr, "read_int8scale_table failed\n"); + return -1; } quantizer.load_param(inparam); @@ -552,6 +86,7 @@ int main(int argc, char** argv) else quantizer.load_model(inbin); + quantizer.quantize_mha(); quantizer.quantize_convolution(); quantizer.quantize_convolutiondepthwise(); quantizer.quantize_innerproduct(); diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index 9fbafa2d1813..5f83706a0b9b 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -38,6 +38,7 @@ #endif #include #include +#include "ini_config.h" // ncnn public header #include "benchmark.h" @@ -91,6 +92,7 @@ class QuantNet : public ncnn::Net int init(); void print_quant_info() const; int save_table(const char* tablepath); + int save_ini(const char* filepath); int quantize_KL(); int quantize_ACIQ(); int quantize_EQ(); @@ -98,6 +100,7 @@ class QuantNet : public ncnn::Net public: std::vector input_blobs; std::vector conv_layers; + std::vector type_list; std::vector conv_bottom_blobs; std::vector conv_top_blobs; @@ -132,6 +135,7 @@ int QuantNet::init() if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct") { conv_layers.push_back(i); + type_list.push_back(layer->type); conv_bottom_blobs.push_back(layer->bottoms[0]); conv_top_blobs.push_back(layer->tops[0]); } @@ -190,6 +194,62 @@ int QuantNet::save_table(const char* tablepath) return 0; } +int QuantNet::save_ini(const char* filepath) +{ + auto root = ini::Config(); + + const int conv_layer_count = static_cast(conv_layers.size()); + const int conv_bottom_blob_count = static_cast(conv_bottom_blobs.size()); + + for (int i = 0; i < conv_layer_count; i++) + { + auto tbl = std::make_shared(); + + // write opr type + auto type = type_list[i]; + if (type == "Convolution" || type == "ConvolutionDepthWise") + { + tbl->append("type", std::string("Conv")); + } + else if (type == "InnerProduct") + { + tbl->append("type", std::string("Gemm")); + } + else + { + fprintf(stderr, "unknown type %s\n", type.c_str()); + } + + // write weight scales + { + const ncnn::Mat& weight_scale = weight_scales[i]; + + std::vector scales = {}; + for (int j = 0; j < weight_scale.w; j++) + { + scales.push_back(static_cast(weight_scale[j])); + } + tbl->append("weight", scales); + } + + // write input scale + { + const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i]; + if (bottom_blob_scale.w != 1) + { + fprintf(stderr, "not support conv input scale length=%d\n", bottom_blob_scale.w); + return -1; + } + tbl->append("input_scale", static_cast(bottom_blob_scale[0])); + } + + const std::string name = layers[conv_layers[i]]->name; + root.append(name, tbl); + } + root.write(std::string(filepath)); + return 0; +} + void QuantNet::print_quant_info() const { for (int i = 0; i < (int)conv_bottom_blobs.size(); i++) @@ -1586,7 +1646,8 @@ static void show_usage() fprintf(stderr, " pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n"); fprintf(stderr, " thread=8\n"); 
fprintf(stderr, " method=kl/aciq/eq\n"); - fprintf(stderr, "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n"); + fprintf(stderr, " format=txt/ini\n"); + fprintf(stderr, "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl format=txt\n"); } int main(int argc, char** argv) @@ -1629,6 +1690,7 @@ int main(int argc, char** argv) net.listspaths = parse_comma_path_list(lists); std::string method = "kl"; + std::string format = "txt"; for (int i = 5; i < argc; i++) { @@ -1649,17 +1711,37 @@ int main(int argc, char** argv) // load mean norm shape if (memcmp(key, "mean", 4) == 0) + { net.means = parse_comma_float_array_list(value); - if (memcmp(key, "norm", 4) == 0) + } + else if (memcmp(key, "norm", 4) == 0) + { net.norms = parse_comma_float_array_list(value); - if (memcmp(key, "shape", 5) == 0) + } + else if (memcmp(key, "shape", 5) == 0) + { net.shapes = parse_comma_int_array_list(value); - if (memcmp(key, "pixel", 5) == 0) + } + else if (memcmp(key, "pixel", 5) == 0) + { net.type_to_pixels = parse_comma_pixel_type_list(value); - if (memcmp(key, "thread", 6) == 0) + } + else if (memcmp(key, "thread", 6) == 0) + { net.quantize_num_threads = atoi(value); - if (memcmp(key, "method", 6) == 0) + } + else if (memcmp(key, "method", 6) == 0) + { method = std::string(value); + } + else if (memcmp(key, "format", 6) == 0) + { + format = std::string(value); + } + else + { + fprintf(stderr, "unknown key=%s\n", key); + } } // sanity check @@ -1735,7 +1817,14 @@ int main(int argc, char** argv) net.print_quant_info(); - net.save_table(outtable); + if (format == "ini") + { + net.save_ini(outtable); + } + else + { + net.save_table(outtable); + } return 0; } diff --git a/tools/quantize/net_quantize.cpp b/tools/quantize/net_quantize.cpp new file mode 100644 index 000000000000..b77925ffd235 --- /dev/null +++ b/tools/quantize/net_quantize.cpp @@ -0,0 +1,625 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer.h" +#include "layer_type.h" +#include "net.h" +#include "net_quantize.h" +#include +#include +#include + +void NetQuantize::set_weight_suffix(std::string str) +{ + suffix = str; +} + +bool NetQuantize::read_txt_format(const char* filepath) +{ + blob_int8scale_table.clear(); + weight_int8scale_table.clear(); + + FILE* fp = fopen(filepath, "rb"); + if (!fp) + { + fprintf(stderr, "Open %s failed.\n", filepath); + return false; + } + + std::string key_str; + std::vector scales; + + std::vector line(10240000); + char* pch = NULL; + size_t len = 0; + + while (!feof(fp)) + { + char* s = fgets(line.data(), (int)line.size(), fp); + if (!s) + break; + + float scale = 1.f; + char key[256]; + line[strcspn(line.data(), "\r\n")] = 0; + + pch = strtok(line.data(), " "); + + if (pch == NULL) break; + + bool is_key = true; + while (pch != NULL) + { + if (is_key) + { + sscanf(pch, "%255s", key); + + key_str = key; + is_key = false; + } + else + { + sscanf(pch, "%f", &scale); + + scales.push_back(scale); + } + + pch = strtok(NULL, " "); + } + + // XYZ_param_N pattern + if (strstr(key_str.c_str(), "_param_")) + { + weight_int8scale_table[key_str] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + } + else + { + blob_int8scale_table[key_str] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + } + key_str.clear(); + scales.clear(); + } + + fclose(fp); + + return true; +} + +bool NetQuantize::read_ini_format(const char* path) +{ + blob_int8scale_table.clear(); + weight_int8scale_table.clear(); + mha_table.clear(); + + ini::Config root; + root.read(std::string(path)); + + size_t len = root.size(); + std::string name, type; + std::shared_ptr ptable; + for (size_t i = 0; i < len; ++i) + { + std::tie(name, ptable) = root[i]; + type = ptable->get("type"); + + if (type == "Conv" || type == "Gemm") + { + // load weight scales + { + std::vector scales = ptable->get_list("weight"); + weight_int8scale_table[name] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + } + + // load input scale + { + std::vector scales = {ptable->get("input_scale")}; + blob_int8scale_table[name] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + } + } + else if (type == "MultiHeadAttention") + { + mha_table[name] = ptable; + } + } + + return true; +} + +int NetQuantize::quantize_mha() +{ + const int layer_count = static_cast(layers.size()); + auto base_opt = opt; + + for (int i = 0; i < layer_count; i++) + { + // find convolution layer + if (layers[i]->type != "MultiHeadAttention") + continue; + + std::string name = layers[i]->name; + if (mha_table.find(name) == mha_table.end()) + { + fprintf(stderr, "cannot find %s quant param.\n", name.c_str()); + continue; + } + + ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layers[i]; + fprintf(stderr, "quantize_multiheadattention %s\n", mha->name.c_str()); + + auto& table = mha_table.at(name); + { + // write weights + // convert fp32 mat to int8 mat with the scales + auto convert = [table, mha, base_opt](ncnn::Mat& weight, std::string key, ncnn::Mat& w_scales) -> int { + ncnn::Option opt_q = base_opt; + opt_q.blob_allocator = weight.allocator; + opt_q.use_packing_layout = false; + + auto scales = table->get_list(key); + if (scales.empty()) + { + return -100; + } + w_scales = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + + { + ncnn::Mat weight_int8; + const int num_input = mha->embed_dim; + const int num_output = mha->weight_data_size / num_input; + + ncnn::Mat weight_data_r2 = weight.reshape(num_input, 
num_output); + ncnn::quantize_to_int8(weight_data_r2, weight_int8, w_scales, opt_q); + if (weight_int8.empty()) + return -100; + + weight = weight_int8.reshape(mha->weight_data_size).clone(); + } + return 0; + }; + + int success = 0; + success += convert(mha->q_weight_data, "weight_q", mha->q_weight_scales); + success += convert(mha->k_weight_data, "weight_k", mha->k_weight_scales); + success += convert(mha->v_weight_data, "weight_v", mha->v_weight_scales); + success += convert(mha->out_weight_data, "weight_o", mha->o_weight_scales); + + if (success != 0) + { + fprintf(stderr, "convert fp32 weight to int8 failed. \n"); + return -1; + } + } + + { + // write input scale + auto convert = [table, base_opt](const std::string key, ncnn::Mat& mat) -> int { + std::vector scales = {table->get(key)}; + if (scales.empty()) + { + return -100; + } + + mat = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + return 0; + }; + + int success = 0; + success += convert("input_scale_q", mha->q_input_scale); + success += convert("input_scale_k", mha->k_input_scale); + success += convert("input_scale_v", mha->v_input_scale); + if (success != 0) + { + fprintf(stderr, "load input scale failed. \n"); + return -100; + } + } + + { + // write internal scales + std::vector internal_scales; + internal_scales.emplace_back(table->get("internal_scale_q")); + internal_scales.emplace_back(table->get("internal_scale_k")); + internal_scales.emplace_back(table->get("internal_scale_v")); + internal_scales.emplace_back(table->get("internal_scale_energy")); + internal_scales.emplace_back(table->get("internal_scale_feat")); + + mha->internal_scales = ncnn::Mat((int)internal_scales.size(), (void*)internal_scales.data()).clone(); + } + + { + // write control variable + mha->int8_scale_term = 1; + } + } + + return 0; +} + +int NetQuantize::quantize_convolution() +{ + const int layer_count = static_cast(layers.size()); + for (int i = 0; i < layer_count; i++) + { + // find convolution layer + if (layers[i]->type != "Convolution") + continue; + + // find convolution layer + std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); + if (iter_data == blob_int8scale_table.end()) + continue; + + char key[256]; + sprintf(key, "%s%s", layers[i]->name.c_str(), suffix.c_str()); + + std::map::iterator iter = weight_int8scale_table.find(key); + if (iter == weight_int8scale_table.end()) + { + fprintf(stderr, "%s need to be quantized, but no scale param!\n", key); + return -1; + } + + // Convolution - quantize weight from fp32 to int8 + ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i]; + + ncnn::Mat bottom_blob_int8_scales = iter_data->second; + ncnn::Mat weight_data_int8_scales = iter->second; + + fprintf(stderr, "quantize_convolution %s\n", convolution->name.c_str()); + + { + const int maxk = convolution->kernel_w * convolution->kernel_h; + const int num_input = convolution->weight_data_size / convolution->num_output / maxk; + + ncnn::Mat weight_data_r2 = convolution->weight_data.reshape(maxk, num_input, convolution->num_output); + + ncnn::Mat weight_data_int8; + + ncnn::Option opt_q = opt; + opt_q.blob_allocator = convolution->weight_data.allocator; + opt_q.use_packing_layout = false; + ncnn::quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); + if (weight_data_int8.empty()) + return -100; + + convolution->weight_data = weight_data_int8.reshape(convolution->weight_data_size); + } + + convolution->int8_scale_term = 2; + convolution->weight_data_int8_scales = 
weight_data_int8_scales; + convolution->bottom_blob_int8_scales = bottom_blob_int8_scales; + } + + return 0; +} + +int NetQuantize::quantize_convolutiondepthwise() +{ + const int layer_count = static_cast(layers.size()); + for (int i = 0; i < layer_count; i++) + { + // find convolution layer + if (layers[i]->type != "ConvolutionDepthWise") + continue; + + // find convolutiondepthwise layer + std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); + if (iter_data == blob_int8scale_table.end()) + continue; + + char key[256]; + sprintf(key, "%s%s", layers[i]->name.c_str(), suffix.c_str()); + + std::map::iterator iter = weight_int8scale_table.find(key); + if (iter == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + // Convolution - quantize weight from fp32 to int8 + ncnn::ConvolutionDepthWise* convdw = (ncnn::ConvolutionDepthWise*)layers[i]; + + ncnn::Mat bottom_blob_int8_scales = iter_data->second; + ncnn::Mat weight_data_int8_scales = iter->second; + + fprintf(stderr, "quantize_convolutiondepthwise %s\n", convdw->name.c_str()); + + { + ncnn::Mat int8_weight_data(convdw->weight_data_size, (size_t)1u); + if (int8_weight_data.empty()) + return -100; + + const int weight_data_size_g = convdw->weight_data_size / convdw->group; + + for (int g = 0; g < convdw->group; g++) + { + ncnn::Option opt_q = opt; + opt_q.blob_allocator = int8_weight_data.allocator; + opt_q.use_packing_layout = false; + + const ncnn::Mat weight_data_g = convdw->weight_data.range(weight_data_size_g * g, weight_data_size_g); + ncnn::Mat int8_weight_data_g = int8_weight_data.range(weight_data_size_g * g, weight_data_size_g); + const ncnn::Mat weight_data_int8_scales_g = weight_data_int8_scales.range(g, 1); + ncnn::quantize_to_int8(weight_data_g, int8_weight_data_g, weight_data_int8_scales_g, opt_q); + } + + convdw->weight_data = int8_weight_data; + } + + convdw->int8_scale_term = 1; + convdw->weight_data_int8_scales = weight_data_int8_scales; + convdw->bottom_blob_int8_scales = bottom_blob_int8_scales; + } + + return 0; +} + +int NetQuantize::quantize_innerproduct() +{ + const int layer_count = static_cast(layers.size()); + for (int i = 0; i < layer_count; i++) + { + // find convolution layer + if (layers[i]->type != "InnerProduct") + continue; + + // find InnerProduct layer + std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); + if (iter_data == blob_int8scale_table.end()) + continue; + + char key[256]; + sprintf(key, "%s%s", layers[i]->name.c_str(), suffix.c_str()); + + std::map::iterator iter = weight_int8scale_table.find(key); + if (iter == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + // InnerProduct - quantize weight from fp32 to int8 + ncnn::InnerProduct* fc = (ncnn::InnerProduct*)layers[i]; + + ncnn::Mat bottom_blob_int8_scales = iter_data->second; + ncnn::Mat weight_data_int8_scales = iter->second; + + fprintf(stderr, "quantize_innerproduct %s\n", fc->name.c_str()); + + { + const int num_input = fc->weight_data_size / fc->num_output; + + ncnn::Mat weight_data_r2 = fc->weight_data.reshape(num_input, fc->num_output); + + ncnn::Mat weight_data_int8; + ncnn::Option opt_q = opt; + opt_q.use_packing_layout = false; + ncnn::quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); + if (weight_data_int8.empty()) + return -100; + + fc->weight_data = 
weight_data_int8.reshape(fc->weight_data_size); + } + + fc->int8_scale_term = 2; + fc->weight_data_int8_scales = weight_data_int8_scales; + fc->bottom_blob_int8_scales = bottom_blob_int8_scales; + } + + return 0; +} + +int NetQuantize::fuse_requantize() +{ + const size_t layer_count = layers.size(); + for (size_t i = 0; i < layer_count; i++) + { + if (layers[i]->type != "Convolution" && layers[i]->type != "ConvolutionDepthWise") + continue; + + // Convolution/ConvolutionDepthWise - Convolution/ConvolutionDepthWise + int top_blob_index = layers[i]->tops[0]; + + size_t j = i + 1; + for (; j < layer_count; j++) + { + if (layers[j]->type != "Convolution" && layers[j]->type != "ConvolutionDepthWise") + continue; + + if (layers[j]->bottoms.size() != 1) + continue; + + if (layers[j]->bottoms[0] == top_blob_index) + break; + } + + if (j == layer_count) + continue; + + // fuse requantize + fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), layers[j]->name.c_str()); + + if (layers[i]->type == "Convolution" && layers[j]->type == "Convolution") + { + ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; + ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "Convolution" && layers[j]->type == "ConvolutionDepthWise") + { + ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; + ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "Convolution") + { + ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; + ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "ConvolutionDepthWise") + { + ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; + ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + } + + for (size_t i = 0; i < layer_count; i++) + { + if (layers[i]->type != "Convolution" && layers[i]->type != "ConvolutionDepthWise") + continue; + + // Convolution/ConvolutionDepthWise - Split - Convolution/ConvolutionDepthWise + int top_blob_index = layers[i]->tops[0]; + + size_t j = i + 1; + for (; j < layer_count; j++) + { + if (layers[j]->type != "Split") + continue; + + if (layers[j]->bottoms.size() != 1) + continue; + + if (layers[j]->bottoms[0] == top_blob_index) + break; + } + + if (j == layer_count) + continue; + + ncnn::Split* split = (ncnn::Split*)layers[j]; + + bool all_conv = true; + for (size_t p = 0; p < split->tops.size(); p++) + { + int 
split_top_blob_index = split->tops[p]; + + size_t k = j + 1; + for (; k < layer_count; k++) + { + if (layers[k]->type != "Convolution" && layers[k]->type != "ConvolutionDepthWise") + continue; + + if (layers[k]->bottoms.size() != 1) + continue; + + if (layers[k]->bottoms[0] == split_top_blob_index) + break; + } + + if (k == layer_count) + { + all_conv = false; + break; + } + + if (layers[k]->type == "Convolution") + { + ncnn::Convolution* convolution = (ncnn::Convolution*)layers[k]; + if (convolution->weight_data.elemsize != 1u) + { + all_conv = false; + break; + } + } + if (layers[k]->type == "ConvolutionDepthWise") + { + ncnn::ConvolutionDepthWise* convolution = (ncnn::ConvolutionDepthWise*)layers[k]; + if (convolution->weight_data.elemsize != 1u) + { + all_conv = false; + break; + } + } + } + + if (!all_conv) + continue; + + j = blobs[split->tops[0]].consumer; + + // fuse requantize + fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), split->name.c_str()); + + if (layers[i]->type == "Convolution" && layers[j]->type == "Convolution") + { + ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; + ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "Convolution" && layers[j]->type == "ConvolutionDepthWise") + { + ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; + ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "Convolution") + { + ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; + ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "ConvolutionDepthWise") + { + ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; + ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + } + + return 0; +} diff --git a/tools/quantize/net_quantize.h b/tools/quantize/net_quantize.h new file mode 100644 index 000000000000..79c8a163b92e --- /dev/null +++ b/tools/quantize/net_quantize.h @@ -0,0 +1,48 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#pragma once +// ncnn private header +#include +#include "../modelwriter.h" +#include "ini_config.h" + +class NetQuantize : public ModelWriter +{ +public: + NetQuantize() + { + } + // conv and gemm quant param + std::map blob_int8scale_table; + std::map weight_int8scale_table; + + // MutiHeadAttention quant param + std::map > mha_table; + +public: + bool read_txt_format(const char* path); + bool read_ini_format(const char* path); + + int quantize_convolution(); + int quantize_convolutiondepthwise(); + int quantize_innerproduct(); + int quantize_mha(); + int fuse_requantize(); + + void set_weight_suffix(std::string s); + +private: + std::string suffix; +}; From e0a0ca690765cf820a7b749acb47dd558c2b72f0 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 28 Jul 2022 10:25:13 +0000 Subject: [PATCH 02/36] apply code-format changes --- benchmark/benchncnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 283c76a340e2..abb1bdef851d 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -321,7 +321,7 @@ int main(int argc, char** argv) benchmark("vision_transformer", ncnn::Mat(384, 384, 3), opt); benchmark("FastestDet", ncnn::Mat(352, 352, 3), opt); - + benchmark("vision_transformer_int8", ncnn::Mat(384, 384, 3), opt); #if NCNN_VULKAN From 7565af0c2771fab3df09fc6dee1a0a607d42578a Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 28 Jul 2022 18:43:27 +0800 Subject: [PATCH 03/36] fix(CI): rebase error --- src/layer/multiheadattention.cpp | 5 - tools/quantize/ini_config.cpp | 237 ++++--------------------------- 2 files changed, 27 insertions(+), 215 deletions(-) mode change 100644 => 100755 src/layer/multiheadattention.cpp mode change 100644 => 100755 tools/quantize/ini_config.cpp diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp old mode 100644 new mode 100755 index 80ec43518b25..14f9ddc83ad1 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -202,11 +202,6 @@ int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std:: affine_input(k_blob, k_weight_data, k_bias_data, xk, k_input_scale, k_weight_scales, internal_scales[1], num_head, opt_g, false); affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); - // transpose(v) for better gemm performance - // Mat xv(seqlen, embed_dim_per_head, num_head, 1u, opt.workspace_allocator); - // Mat debug_xv; - // transform_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], opt_g, debug_xv, true); - // xq @ qk * inv_sqrt_embed_dim_per_head const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); diff --git a/tools/quantize/ini_config.cpp b/tools/quantize/ini_config.cpp old mode 100644 new mode 100755 index 9da3ae9fcb86..227a5ea05b92 --- a/tools/quantize/ini_config.cpp +++ b/tools/quantize/ini_config.cpp @@ -1,224 +1,41 @@ +// tpoisonooo is pleased to support the open source community by making ncnn available. 
+// +// author:tpoisonooo (https://github.com/tpoisonooo/) . +// +// Copyright (C) 2022 tpoisonooo. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + #include "ini_config.h" -#include -#include namespace ini { - -template -void Value::set(T val) -{ - text = std::to_string(f); -} - -void Value::set(std::string str) -{ - text = '\"' + str + '\"'; -} - -template -void Value::set(const std::vector& data) -{ - text = "[ "; - - size_t len = data.size(); - if (len > 0) - { - size_t i = 0; - for (; i < len - 1; ++i) - { - text += std::to_string(data[i]); - text += ", "; - } - text += std::to_string(data[i]); - text += " "; - } - - text += "]"; -} - -template -T Value::get() -{ - T result; - std::stringstream ss; - ss << text; - ss >> result; - return result; -} - -template -std::vector Value::get() -{ - std::vector result; - - std::string no_brace; - { - // remove brace - auto start = text.find('['); - auto end = text.find(']'); - no_brace = text.substr(start + 1, end); - } - - { - // split with the separator ',' - std::stringstream ss; - size_t end = 0, start = 0; - while (true) - { - end = no_brace.find(',', start); - if (end == std::string::npos) - { - break; - } - - std::string val_str = no_brace.substr(start, end); - start = end + 1; - - T val; - ss << val_str; - ss >> val; - ss.clear(); - result.emplace_back(val); - } - - // parse the last one - std::string val_str = no_brace.substr(start); - T val; - ss << val_str; - ss >> val; - result.emplace_back(val); - } - - return result; -} - -std::string Value::stringify() -{ - return text; -} - -void Table::feed(std::string line) -{ - auto pos = line.find(':'); - assert(pos != std::string::npos); - - std::string key = line.substr(0, pos - 1); - std::string value_str = line.substr(pos + 1); - - values[key] = std::make_shared(value_str); -} - -void Table::feed(const std::vector& lines) -{ - for (auto& line : lines) - { - feed(line); - } -} - -void Table::append(std::string key, float data) -{ - auto pVal = std::make_shared(); - pVal->set(data); - values[key] = pVal; -} - -void Table::append(std::string key, const std::vector& data) +template<> +std::string value_set(std::string data) { - auto pVal = std::make_shared(); - pVal->set(data); - values[key] = pVal; + return "\"" + data + "\""; } -void Table::append(std::string key, std::string data) +template<> +std::string value_set(const char* data) { - auto pVal = std::make_shared(); - pVal->set(data); - values[key] = pVal; + return "\"" + std::string(data) + "\""; } -std::shared_ptr Table::operator[](std::string key) +template<> +std::string value_get(std::string text) { - return values[key]; -} - -std::string Table::stringify() -{ - std::string result; - for (auto itra = values.begin(); itra != values.end(); ++itra) - { - result += itra->first; - result += " = "; - result += itra->second->stringify(); - result += '\n'; - } - return result; -} - -void Config::read(std::string path) -{ - std::ifstream fin; - fin.open(path, std::ios::in); - - if (!fin.is_open()) - { - fprintf(stderr, 
"open %s failed\n", path.c_str()); - return; - } - - bool recoding = false; - std::shared_ptr
pTable = nullptr; + auto start = text.find('\"'); + auto end = text.find_last_of('\"'); - std::string line; - while (fin >> line) - { - if (nullptr == pTable) - { - auto start = line.find('['); - auto end = line.find(']'); - assert(start != std::string::npos); - assert(end != std::string::npos); - - std::string key = line.substr(start + 1, end); - pTable = std::make_shared
(); - tables[key] = pTable; - continue; - } - - if (line.length() <= 2) - { - pTable = nullptr; - continue; - ; - } - - pTable->feed(line); - } -} - -std::vector Config::list_all() -{ - std::vector result; - for (auto itra = tables.begin(); itra != tables.end(); ++itra) - { - result.push_back(itra->first); - } - return result; -} - -std::shared_ptr
Config::operator[](std::string key) -{ - return tables[key]; -} - -void Config::append(std::string key, std::shared_ptr
table) -{ - tables[key] = table; -} - -void Config::write(std::string path) -{ - // TODO + return text.substr(start + 1, end - start - 1); } } // namespace ini From d5f7835ea560e984703606227420b6e7afed9c8b Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 28 Jul 2022 13:25:06 +0000 Subject: [PATCH 04/36] apply code-format changes --- src/layer/multiheadattention.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 src/layer/multiheadattention.cpp diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp old mode 100755 new mode 100644 From fa8b0bc098b01a88bc8df5afbda5e12c00497c13 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 1 Aug 2022 18:01:12 +0800 Subject: [PATCH 05/36] fix(CI): test mha exceeding --- tests/test_multiheadattention.cpp | 38 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index ba6e8d32e899..acf3bba1d157 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -105,25 +105,25 @@ static int test_multiheadattention_int8(const ncnn::Mat& a, int num_heads) pd.set(3, 1); std::vector weights(16); - weights[0] = RandomIntMat(embed_dim * embed_dim); - weights[1] = RandomIntMat(embed_dim); - weights[2] = RandomIntMat(embed_dim * embed_dim); - weights[3] = RandomIntMat(embed_dim); - weights[4] = RandomIntMat(embed_dim * embed_dim); - weights[5] = RandomIntMat(embed_dim); - weights[6] = RandomIntMat(embed_dim * embed_dim); - weights[7] = RandomIntMat(embed_dim); - - weights[8] = RandomMat(1); - weights[9] = RandomMat(1); - weights[10] = RandomMat(1); - - weights[11] = RandomMat(embed_dim); - weights[12] = RandomMat(embed_dim); - weights[13] = RandomMat(embed_dim); - weights[14] = RandomMat(embed_dim); - - weights[15] = RandomMat(5); + weights[0] = RandomMat(embed_dim * embed_dim); + weights[1] = RandomMat(embed_dim); + weights[2] = RandomMat(embed_dim * embed_dim); + weights[3] = RandomMat(embed_dim); + weights[4] = RandomMat(embed_dim * embed_dim); + weights[5] = RandomMat(embed_dim); + weights[6] = RandomMat(embed_dim * embed_dim); + weights[7] = RandomMat(embed_dim); + + weights[8] = RandomMat(1, 1.f, 10.f); + weights[9] = RandomMat(1, 1.f, 10.f); + weights[10] = RandomMat(1, 1.f, 10.f); + + weights[11] = RandomMat(embed_dim, 1.f, 10.f); + weights[12] = RandomMat(embed_dim, 1.f, 10.f); + weights[13] = RandomMat(embed_dim, 1.f, 10.f); + weights[14] = RandomMat(embed_dim, 1.f, 10.f); + + weights[15] = RandomMat(5, 1.f, 10.f); std::vector as(1); as[0] = a; From 9d3fb108ba89dc0592219de9ff429d6538209c68 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 1 Aug 2022 19:04:07 +0800 Subject: [PATCH 06/36] improvement(src/mha): add file structure --- src/layer/x86/multiheadattention_x86.cpp | 517 +++++++++++++++++++++++ src/layer/x86/multiheadattention_x86.h | 44 ++ 2 files changed, 561 insertions(+) create mode 100644 src/layer/x86/multiheadattention_x86.cpp create mode 100644 src/layer/x86/multiheadattention_x86.h diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp new file mode 100644 index 000000000000..72084706b6a7 --- /dev/null +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -0,0 +1,517 @@ +// tpoisonooo is pleased to support the open source community by making ncnn available. +// +// author:tpoisonooo (https://github.com/tpoisonooo/) . +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "multiheadattention_x86.h" +#include +#ifdef NCNN_INT8 +#include +#endif + + +namespace ncnn { + +MultiHeadAttention_x86::MultiHeadAttention_x86() +{ + support_packing = false; +} + +int MultiHeadAttention_x86::create_pipeline(const Option& opt) +{ +#if NCNN_INT8 + if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_x86(opt); + } +#endif + return 0; +} + +#ifdef NCNN_INT8 +int MultiHeadAttention_x86::create_pipeline_int8_x86(const Option& opt) +{ + return 0; +} + +int MultiHeadAttention_x86::destroy_pipeline(const Option& opt) +{ + return 0; +} + +static int affine_input( + const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, + const Mat& input_scale, const Mat& weight_scales, const float transform_scale, + const int num_head, const Option& opt, bool transpose) +{ + const int embed_dim = input.w; + const int seqlen = input.h; + const int embed_dim_per_head = embed_dim / num_head; + const float scale = 1.0 / input_scale[0]; + + Mat input_int8; + if (input.elemsize != 1) + { + quantize_to_int8(input, input_int8, input_scale, opt); + } + + Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); + + if (transpose) + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < embed_dim_per_head; i++) + { + for (int j = 0; j < seqlen; j++) + { + const int8_t* ptr = input_int8.row(j); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + i); + + int32_t sum = 0; + const int32_t index = q * embed_dim_per_head + i; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + float* outptr = outm.row(i); + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + else + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const int8_t* ptr = input_int8.row(i); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + j); + + int32_t sum = 0; + const int32_t index = q * embed_dim_per_head + j; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + + Mat transform(1, 4u, opt.workspace_allocator); + transform[0] = transform_scale; + quantize_to_int8(buffer, out_int8, transform, opt); + return 0; +} + +static inline int32_t float2int8(float v) +{ + int int32 = static_cast(round(v)); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return int32; +} + +int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& q_blob = bottom_blobs[0]; + const Mat& 
k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; + const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[2]; + + const int seqlen = q_blob.h; + const int embed_dim_per_head = embed_dim / num_head; + + Option opt_g = opt; + opt_g.blob_allocator = opt.workspace_allocator; + opt_g.use_packing_layout = false; + + Mat xq(embed_dim_per_head, seqlen, num_head, 1u, opt.workspace_allocator); + Mat xk(embed_dim_per_head, seqlen, num_head, 1u, opt.workspace_allocator); + Mat xv(seqlen, embed_dim_per_head, num_head, 1u, opt.workspace_allocator); + + affine_input(q_blob, q_weight_data, q_bias_data, xq, q_input_scale, q_weight_scales, internal_scales[0], num_head, opt_g, false); + affine_input(k_blob, k_weight_data, k_bias_data, xk, k_input_scale, k_weight_scales, internal_scales[1], num_head, opt_g, false); + affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); + + // xq @ qk * inv_sqrt_embed_dim_per_head + const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + + Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); + { + // xqk = xq * xk + // xq (embed_dim_per_head, seqlen) + // xk (embed_dim_per_head, seqlen) + const float out_scale = inv_sqrt_embed_dim_per_head / (internal_scales[0] * internal_scales[1]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; ++q) + { + const Mat xqm = xq.channel(q); + const Mat xkm = xk.channel(q); + + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < seqlen; j++) + { + const int8_t* qptr = xqm.row(i); + const int8_t* kptr = xkm.row(j); + + int32_t sum = 0; + for (int k = 0; k < embed_dim_per_head; k++) + { + sum += *qptr++ * *kptr++; + } + + outptr[j] = sum * out_scale; + } + } + } + + // fp32_softmax(xqk) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; q++) + { + // softmax(xqk) + { + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* ptr = outm.row(i); + + float max = -FLT_MAX; + for (int j = 0; j < seqlen; j++) + { + max = std::max(max, ptr[j]); + } + + float sum = 0.f; + for (int j = 0; j < seqlen; j++) + { + ptr[j] = (float)(exp(ptr[j] - max)); + sum += ptr[j]; + } + + for (int j = 0; j < seqlen; j++) + { + ptr[j] = ptr[j] / sum; + } + } + } + } + } + + // xqkv int4 @ int8, implement by shift + Mat xqkv(embed_dim_per_head, num_head, seqlen, 1u, opt.workspace_allocator); + + const float xqkv_out_scale = internal_scales[4] / internal_scales[2]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; ++q) + { + // xqkv = xqk * xv + // xqk (seqlen, seqlen) + // xv (seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, seqlen) + const Mat xqkm = xqk.channel(q); + const Mat xvm = xv.channel(q); + + for (int i = 0; i < seqlen; i++) + { + int8_t* outptr = xqkv.channel(i).row(q); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* qkptr = xqkm.row(i); + const int8_t* vptr = xvm.row(j); + + float sum = 0; + for (int k = 0; k < seqlen; k++) + { + sum += (*vptr++) * (*qkptr++); + } + + outptr[j] = float2int8(sum * xqkv_out_scale); + } + } + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(embed_dim, seqlen, 4u, opt.blob_allocator); + if (top_blob.empty()) + return -1; + + const float out_scale = 1.0f / internal_scales[4]; + // out = affine(xqkv) + // xqkv (embed_dim, seqlen) + #pragma omp parallel for 
num_threads(opt.num_threads) + for (int i = 0; i < seqlen; i++) + { + float* outptr = top_blob.row(i); + + for (int j = 0; j < embed_dim; j++) + { + const int8_t* ptr = xqkv.channel(i); + const int8_t* kptr = (const int8_t*)out_weight_data + embed_dim * j; + + int32_t sum = 0; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum * out_scale / o_weight_scales[j] + out_bias_data[j]; + } + } + + return 0; +} + +#endif + +// refers to https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html +int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) + { + return forward_int8(bottom_blobs, top_blobs, opt); + } +#endif + + const Mat& q_blob = bottom_blobs[0]; + const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; + const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[2]; + + const int seqlen = q_blob.h; + const int embed_dim_per_head = embed_dim / num_head; + + Mat& top_blob = top_blobs[0]; + top_blob.create(embed_dim, seqlen, 4u, opt.blob_allocator); + if (top_blob.empty()) + return -1; + + Mat xq(embed_dim_per_head, seqlen, num_head, 4u, opt.workspace_allocator); + Mat xk(embed_dim_per_head, seqlen, num_head, 4u, opt.workspace_allocator); + Mat xv(seqlen, embed_dim_per_head, num_head, 4u, opt.workspace_allocator); + + Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); + + Mat xqkv(embed_dim_per_head, num_head, seqlen, 4u, opt.workspace_allocator); + + const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; q++) + { + // xq = affine(q) * inv_sqrt_embed_dim_per_head + { + Mat outm = xq.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* ptr = q_blob.row(i); + const float* kptr = (const float*)q_weight_data + embed_dim * (q * embed_dim_per_head + j); + + float sum = q_bias_data[q * embed_dim_per_head + j]; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum * inv_sqrt_embed_dim_per_head; + } + } + } + + // xk = affine(k) + { + Mat outm = xk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* ptr = k_blob.row(i); + const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); + + float sum = k_bias_data[q * embed_dim_per_head + j]; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum; + } + } + } + + // xv = affine(v) + { + Mat outm = xv.channel(q); + + for (int i = 0; i < embed_dim_per_head; i++) + { + for (int j = 0; j < seqlen; j++) + { + const float* ptr = v_blob.row(j); + const float* kptr = (const float*)v_weight_data + embed_dim * (q * embed_dim_per_head + i); + + float sum = v_bias_data[q * embed_dim_per_head + i]; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + float* outptr = outm.row(i); + + outptr[j] = sum; + } + } + } + + // xqk = xq * xk + // xq (embed_dim_per_head, seqlen) + // xk (embed_dim_per_head, seqlen) + { + const Mat xqm = xq.channel(q); + const Mat xkm = 
xk.channel(q); + + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < seqlen; j++) + { + const float* qptr = xqm.row(i); + const float* kptr = xkm.row(j); + + float sum = 0.f; + for (int k = 0; k < embed_dim_per_head; k++) + { + sum += *qptr++ * *kptr++; + } + + outptr[j] = sum; + } + } + } + + // softmax(xqk) + { + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* ptr = outm.row(i); + + float max = -FLT_MAX; + for (int j = 0; j < seqlen; j++) + { + max = std::max(max, ptr[j]); + } + + float sum = 0.f; + for (int j = 0; j < seqlen; j++) + { + ptr[j] = (float)(exp(ptr[j] - max)); + sum += ptr[j]; + } + + for (int j = 0; j < seqlen; j++) + { + ptr[j] /= sum; + } + } + } + + // xqkv = xqk * xv + // xqk (seqlen, seqlen) + // xv (seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, seqlen) + { + const Mat xqkm = xqk.channel(q); + const Mat xvm = xv.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = xqkv.channel(i).row(q); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* qkptr = xqkm.row(i); + const float* vptr = xvm.row(j); + + float sum = 0.f; + for (int k = 0; k < seqlen; k++) + { + sum += *qkptr++ * *vptr++; + } + + outptr[j] = sum; + } + } + } + } + + // out = affine(xqkv) + // xqkv (embed_dim, seqlen) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < seqlen; i++) + { + float* outptr = top_blob.row(i); + + for (int j = 0; j < embed_dim; j++) + { + const float* ptr = xqkv.channel(i); + const float* kptr = (const float*)out_weight_data + embed_dim * j; + + float sum = out_bias_data[j]; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h new file mode 100644 index 000000000000..6f6982e89847 --- /dev/null +++ b/src/layer/x86/multiheadattention_x86.h @@ -0,0 +1,44 @@ +// tpoisonooo is pleased to support the open source community by making ncnn available. +// +// author:tpoisonooo (https://github.com/tpoisonooo/) . +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
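In the int8 path above, each int32 accumulator of int8*int8 products is first brought back to real units by dividing out the scales of its two operands, and is then pushed into the next stage's int8 domain with that stage's internal scale (for example xqk uses inv_sqrt_embed_dim_per_head / (internal_scales[0] * internal_scales[1])). A minimal standalone sketch of that requantization step, assuming two int8 operands; this is a generic helper for illustration, not the ncnn API:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Saturating float -> int8, same behaviour as float2int8() above.
static inline int8_t saturate_int8(float v)
{
    int i = static_cast<int>(std::round(v));
    return (int8_t)std::min(127, std::max(-127, i));
}

// sum        : int32 dot product of two int8 vectors
// scale_a/b  : scales that were used to quantize the two operands
// next_scale : internal scale of the stage consuming the result
static inline int8_t requantize(int32_t sum, float scale_a, float scale_b, float next_scale)
{
    const float real = (float)sum / (scale_a * scale_b); // back to fp32 units
    return saturate_int8(real * next_scale);             // into the next int8 domain
}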
+ +#ifndef LAYER_MULTIHEADATTENTION_X86_H +#define LAYER_MULTIHEADATTENTION_X86_H + +#include "multiheadattention.h" + +namespace ncnn { + +class MultiHeadAttention_x86 : virtual public MultiHeadAttention +{ +public: + MultiHeadAttention_x86(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +protected: +#if NCNN_INT8 + int create_pipeline_int8_x86(const Option& opt); + int forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif + +}; + +} // namespace ncnn + +#endif // LAYER_MULTIHEADATTENTION_X86_H From f38ca739497fa0c6a74bdf23c8d5001549e9c663 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 1 Aug 2022 11:06:30 +0000 Subject: [PATCH 07/36] apply code-format changes --- src/layer/x86/multiheadattention_x86.cpp | 1 - src/layer/x86/multiheadattention_x86.h | 1 - 2 files changed, 2 deletions(-) diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index 72084706b6a7..2ddf8a4d48b7 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -20,7 +20,6 @@ #include #endif - namespace ncnn { MultiHeadAttention_x86::MultiHeadAttention_x86() diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h index 6f6982e89847..284d1701e100 100644 --- a/src/layer/x86/multiheadattention_x86.h +++ b/src/layer/x86/multiheadattention_x86.h @@ -36,7 +36,6 @@ class MultiHeadAttention_x86 : virtual public MultiHeadAttention int create_pipeline_int8_x86(const Option& opt); int forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif - }; } // namespace ncnn From 4a05da2af0cf7a5ce13a9ea9ba82726bc66bfb7a Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Tue, 2 Aug 2022 16:56:01 +0800 Subject: [PATCH 08/36] improvement(src/layer/mha): add const fold --- src/layer/multiheadattention.cpp | 10 +- src/layer/multiheadattention.h | 3 + src/layer/x86/multiheadattention_x86.cpp | 128 ++++++++++------------- src/layer/x86/multiheadattention_x86.h | 4 + src/layer/x86/softmax_x86.cpp | 1 + src/layer/x86/x86_usability.h | 55 ++++++++++ 6 files changed, 122 insertions(+), 79 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 14f9ddc83ad1..0e9c54e93a79 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -32,6 +32,9 @@ int MultiHeadAttention::load_param(const ParamDict& pd) weight_data_size = pd.get(2, 0); int8_scale_term = pd.get(3, 0); + embed_dim_per_head = embed_dim / num_head; + inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + if (int8_scale_term) { #if NCNN_INT8 @@ -173,12 +176,12 @@ static int affine_input( return 0; } -static inline int32_t float2int8(float v) +static inline signed char float2int8(float v) { int int32 = static_cast(round(v)); if (int32 > 127) return 127; if (int32 < -127) return -127; - return int32; + return (signed char)int32; } int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -188,7 +191,6 @@ int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std:: const Mat& v_blob = bottom_blobs.size() == 1 ? 
q_blob : bottom_blobs[2]; const int seqlen = q_blob.h; - const int embed_dim_per_head = embed_dim / num_head; Option opt_g = opt; opt_g.blob_allocator = opt.workspace_allocator; @@ -203,8 +205,6 @@ int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std:: affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); // xq @ qk * inv_sqrt_embed_dim_per_head - const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); - Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); { // xqk = xq * xk diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index 31a967804391..98a63e05a28e 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -40,6 +40,9 @@ class MultiHeadAttention : public Layer int weight_data_size; int int8_scale_term; + int embed_dim_per_head; + float inv_sqrt_embed_dim_per_head; + Mat q_weight_data; Mat q_bias_data; Mat k_weight_data; diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index 72084706b6a7..c0603f09787d 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -15,7 +15,10 @@ // specific language governing permissions and limitations under the License. #include "multiheadattention_x86.h" +#include "x86_usability.h" +#include "layer_type.h" #include + #ifdef NCNN_INT8 #include #endif @@ -26,16 +29,56 @@ namespace ncnn { MultiHeadAttention_x86::MultiHeadAttention_x86() { support_packing = false; + softmax = 0; } int MultiHeadAttention_x86::create_pipeline(const Option& opt) { + embed_dim_per_head = embed_dim / num_head; + inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + #if NCNN_INT8 if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) { return create_pipeline_int8_x86(opt); } #endif + + // for fp32 inference, const fold inv_sqrt_embed_dim_per_head into `q_w` and `q_bias` +#if 0 + // FIXME! + float scale_vals[1] = {inv_sqrt_embed_dim_per_head}; + q_weight_fold_data = q_weight_data.clone(); + q_weight_fold_data.substract_mean_normalize(0, scale_vals); + q_bias_fold_data = q_bias_data.clone(); + q_bias_fold_data.substract_mean_normalize(0, scale_vals); +#else + q_weight_fold_data = q_weight_data.clone(); + for (int i = 0; i < q_weight_fold_data.w; ++i) { + q_weight_fold_data[i] *= inv_sqrt_embed_dim_per_head; + } + q_bias_fold_data = q_bias_data.clone(); + for (int i = 0; i < q_bias_fold_data.w; ++i) { + q_bias_fold_data[i] *= inv_sqrt_embed_dim_per_head; + } +#endif + + { + softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + + ncnn::ParamDict pd; + pd.set(0, 1); + pd.set(1, 1); + + softmax->load_param(pd); + softmax->create_pipeline(opt); + } + + if (opt.lightmode) + { + q_weight_data.release(); + q_bias_data.release(); + } return 0; } @@ -143,7 +186,6 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob const Mat& v_blob = bottom_blobs.size() == 1 ? 
q_blob : bottom_blobs[2]; const int seqlen = q_blob.h; - const int embed_dim_per_head = embed_dim / num_head; Option opt_g = opt; opt_g.blob_allocator = opt.workspace_allocator; @@ -158,7 +200,6 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); // xq @ qk * inv_sqrt_embed_dim_per_head - const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); { @@ -302,7 +343,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v #if NCNN_INT8 if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) { - return forward_int8(bottom_blobs, top_blobs, opt); + return forward_int8_x86(bottom_blobs, top_blobs, opt); } #endif @@ -326,9 +367,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v Mat xqkv(embed_dim_per_head, num_head, seqlen, 4u, opt.workspace_allocator); - const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); - #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < num_head; q++) { // xq = affine(q) * inv_sqrt_embed_dim_per_head @@ -342,15 +381,9 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v for (int j = 0; j < embed_dim_per_head; j++) { const float* ptr = q_blob.row(i); - const float* kptr = (const float*)q_weight_data + embed_dim * (q * embed_dim_per_head + j); + const float* kptr = (const float*)q_weight_fold_data + embed_dim * (q * embed_dim_per_head + j); - float sum = q_bias_data[q * embed_dim_per_head + j]; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - outptr[j] = sum * inv_sqrt_embed_dim_per_head; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + q_bias_fold_data[q * embed_dim_per_head + j]; } } } @@ -368,13 +401,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* ptr = k_blob.row(i); const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); - float sum = k_bias_data[q * embed_dim_per_head + j]; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + k_bias_data[q * embed_dim_per_head + j]; } } } @@ -385,20 +412,14 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v for (int i = 0; i < embed_dim_per_head; i++) { + float* outptr = outm.row(i); + for (int j = 0; j < seqlen; j++) { const float* ptr = v_blob.row(j); const float* kptr = (const float*)v_weight_data + embed_dim * (q * embed_dim_per_head + i); - float sum = v_bias_data[q * embed_dim_per_head + i]; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - float* outptr = outm.row(i); - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + v_bias_data[q * embed_dim_per_head + i]; } } } @@ -421,43 +442,14 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* qptr = xqm.row(i); const float* kptr = xkm.row(j); - float sum = 0.f; - for (int k = 0; k < embed_dim_per_head; k++) - { - sum += *qptr++ * *kptr++; - } - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(qptr, kptr, embed_dim_per_head); } } } - // softmax(xqk) { 
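            // note (sketch of the change): softmax over each attention row is delegated to
            // the shared ncnn Softmax layer built in create_pipeline(); it replaces the
            // hand-rolled max/exp/normalize loops that are removed just below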
Mat outm = xqk.channel(q); - - for (int i = 0; i < seqlen; i++) - { - float* ptr = outm.row(i); - - float max = -FLT_MAX; - for (int j = 0; j < seqlen; j++) - { - max = std::max(max, ptr[j]); - } - - float sum = 0.f; - for (int j = 0; j < seqlen; j++) - { - ptr[j] = (float)(exp(ptr[j] - max)); - sum += ptr[j]; - } - - for (int j = 0; j < seqlen; j++) - { - ptr[j] /= sum; - } - } + softmax->forward_inplace(outm, opt); } // xqkv = xqk * xv @@ -477,13 +469,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* qkptr = xqkm.row(i); const float* vptr = xvm.row(j); - float sum = 0.f; - for (int k = 0; k < seqlen; k++) - { - sum += *qkptr++ * *vptr++; - } - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(qkptr, vptr, seqlen); } } } @@ -501,13 +487,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* ptr = xqkv.channel(i); const float* kptr = (const float*)out_weight_data + embed_dim * j; - float sum = out_bias_data[j]; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + out_bias_data[j]; } } diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h index 6f6982e89847..ebc2b1cd8353 100644 --- a/src/layer/x86/multiheadattention_x86.h +++ b/src/layer/x86/multiheadattention_x86.h @@ -37,6 +37,10 @@ class MultiHeadAttention_x86 : virtual public MultiHeadAttention int forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif +private: + Mat q_weight_fold_data; + Mat q_bias_fold_data; + Layer* softmax; }; } // namespace ncnn diff --git a/src/layer/x86/softmax_x86.cpp b/src/layer/x86/softmax_x86.cpp index d1df7e446cf7..69091ad75e08 100644 --- a/src/layer/x86/softmax_x86.cpp +++ b/src/layer/x86/softmax_x86.cpp @@ -317,6 +317,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return -100; max.fill(-FLT_MAX); + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_top_blob.channel(q); diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 28ddfd50b952..dd05aa38a019 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -494,4 +494,59 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) #endif // __AVX__ #endif // __SSE2__ +/** + * @brief A wrapper for simd computation + * + * result = reduce(a[] * b[]) + * + * @param a + * @param b + * @param size + * @return float + */ + static NCNN_FORCEINLINE float mul_add_reduce_no_align(const float* a, const float* b, const int size) +{ + float ret = 0.f; + int align = 0; +#if __AVX512F__ + align = (size >> 4) << 4; + __m512 _sum = _mm512_set1_ps(0.f); + + for (int i = 0; i < align; i+=16) { + __m512 val0 = _mm512_loadu_ps(a + i); + __m512 val1 = _mm512_loadu_ps(b + i); + _sum = _mm512_add_ps(_sum, _mm512_mul_ps(val0, val1)); + } + ret += _mm512_reduce_add_ps(_sum); + +#elif __AVX__ + align = (size >> 3) << 3; + __m256 _sum = _mm256_set1_ps(0.f); + for (int i = 0; i < align; i+=8) { + __m256 val0 = _mm256_loadu_ps(a + i); + __m256 val1 = _mm256_loadu_ps(b + i); + _sum = _mm256_comp_fmadd_ps(val0, val1, _sum); + } + ret += _mm256_reduce_add_ps(_sum); + +#elif __SSE2__ + align = (size >> 2) << 2; + __m128 _sum = _mm_set1_ps(0.f); + for (int i = 0; i < align; i+=8) { + __m128 val0 = _mm_loadu_ps(a + i); + __m128 val1 = _mm_loadu_ps(b + 
i); + _sum = _mm_add_ps(_sum, _mm_mul_ps(val0, val1)); + } + + ret += _mm_reduce_add_ps(_sum); + +#endif + float sum = 0.f; + for (int i = align; i < size; ++i) { + sum += a[i] * b[i]; + } + ret += sum; + return ret; +} + #endif // X86_USABILITY_H From 49cbb14db339181319bb76650fffb178fef63b70 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Tue, 2 Aug 2022 09:14:00 +0000 Subject: [PATCH 09/36] apply code-format changes --- src/layer/x86/multiheadattention_x86.cpp | 13 +++++----- src/layer/x86/multiheadattention_x86.h | 6 ++--- src/layer/x86/x86_usability.h | 30 ++++++++++++++---------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index cff2cf9d0b43..42af2ad3f20c 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -53,11 +53,13 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) q_bias_fold_data.substract_mean_normalize(0, scale_vals); #else q_weight_fold_data = q_weight_data.clone(); - for (int i = 0; i < q_weight_fold_data.w; ++i) { + for (int i = 0; i < q_weight_fold_data.w; ++i) + { q_weight_fold_data[i] *= inv_sqrt_embed_dim_per_head; } q_bias_fold_data = q_bias_data.clone(); - for (int i = 0; i < q_bias_fold_data.w; ++i) { + for (int i = 0; i < q_bias_fold_data.w; ++i) + { q_bias_fold_data[i] *= inv_sqrt_embed_dim_per_head; } #endif @@ -73,7 +75,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) softmax->create_pipeline(opt); } - if (opt.lightmode) + if (opt.lightmode) { q_weight_data.release(); q_bias_data.release(); @@ -366,7 +368,6 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v Mat xqkv(embed_dim_per_head, num_head, seqlen, 4u, opt.workspace_allocator); - for (int q = 0; q < num_head; q++) { // xq = affine(q) * inv_sqrt_embed_dim_per_head @@ -382,7 +383,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* ptr = q_blob.row(i); const float* kptr = (const float*)q_weight_fold_data + embed_dim * (q * embed_dim_per_head + j); - outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + q_bias_fold_data[q * embed_dim_per_head + j]; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + q_bias_fold_data[q * embed_dim_per_head + j]; } } } @@ -400,7 +401,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* ptr = k_blob.row(i); const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); - outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + k_bias_data[q * embed_dim_per_head + j]; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + k_bias_data[q * embed_dim_per_head + j]; } } } diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h index ebc2b1cd8353..966dbf558bdc 100644 --- a/src/layer/x86/multiheadattention_x86.h +++ b/src/layer/x86/multiheadattention_x86.h @@ -38,9 +38,9 @@ class MultiHeadAttention_x86 : virtual public MultiHeadAttention #endif private: - Mat q_weight_fold_data; - Mat q_bias_fold_data; - Layer* softmax; + Mat q_weight_fold_data; + Mat q_bias_fold_data; + Layer* softmax; }; } // namespace ncnn diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index d4512211abe0..10a14bf5fe4e 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -496,15 +496,15 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) /** * @brief A wrapper 
for simd computation - * - * result = reduce(a[] * b[]) - * - * @param a - * @param b - * @param size - * @return float + * + * result = reduce(a[] * b[]) + * + * @param a + * @param b + * @param size + * @return float */ - static NCNN_FORCEINLINE float mul_add_reduce_no_align(const float* a, const float* b, const int size) +static NCNN_FORCEINLINE float mul_add_reduce_no_align(const float* a, const float* b, const int size) { float sum = 0.f; int align = 0; @@ -512,7 +512,8 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) align = (size >> 4) << 4; __m512 _sum = _mm512_set1_ps(0.f); - for (int i = 0; i < align; i+=16) { + for (int i = 0; i < align; i += 16) + { __m512 val0 = _mm512_loadu_ps(a + i); __m512 val1 = _mm512_loadu_ps(b + i); _sum = _mm512_add_ps(_sum, _mm512_mul_ps(val0, val1)); @@ -522,7 +523,8 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) #elif __AVX__ align = (size >> 3) << 3; __m256 _sum = _mm256_set1_ps(0.f); - for (int i = 0; i < align; i+=8) { + for (int i = 0; i < align; i += 8) + { __m256 val0 = _mm256_loadu_ps(a + i); __m256 val1 = _mm256_loadu_ps(b + i); _sum = _mm256_comp_fmadd_ps(val0, val1, _sum); @@ -532,16 +534,18 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) #elif __SSE2__ align = (size >> 2) << 2; __m128 _sum = _mm_set1_ps(0.f); - for (int i = 0; i < align; i+=8) { + for (int i = 0; i < align; i += 8) + { __m128 val0 = _mm_loadu_ps(a + i); __m128 val1 = _mm_loadu_ps(b + i); _sum = _mm_add_ps(_sum, _mm_mul_ps(val0, val1)); } - sum += _mm_reduce_add_ps(_sum); + sum += _mm_reduce_add_ps(_sum); #endif - for (int i = align; i < size; ++i) { + for (int i = align; i < size; ++i) + { sum += a[i] * b[i]; } return sum; From 37848f11183937e0b9ded1d0235253be3fe3d872 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Tue, 2 Aug 2022 19:19:27 +0800 Subject: [PATCH 10/36] improvement(src/layer/mha): update --- src/layer/x86/multiheadattention_x86.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index cff2cf9d0b43..c0f4000099ae 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -18,6 +18,7 @@ #include "x86_usability.h" #include "layer_type.h" #include +#include #ifdef NCNN_INT8 #include @@ -66,7 +67,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) softmax = ncnn::create_layer(ncnn::LayerType::Softmax); ncnn::ParamDict pd; - pd.set(0, 1); + pd.set(0, 2); pd.set(1, 1); softmax->load_param(pd); @@ -386,7 +387,10 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v } } } + } + for (int q = 0; q < num_head; q++) + { // xk = affine(k) { Mat outm = xk.channel(q); @@ -404,7 +408,10 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v } } } + } + for (int q = 0; q < num_head; q++) + { // xv = affine(v) { Mat outm = xv.channel(q); @@ -422,7 +429,10 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v } } } + } + for (int q = 0; q < num_head; q++) + { // xqk = xq * xk // xq (embed_dim_per_head, seqlen) // xk (embed_dim_per_head, seqlen) @@ -445,12 +455,12 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v } } } + } - { - Mat outm = xqk.channel(q); - softmax->forward_inplace(outm, opt); - } + softmax->forward_inplace(xqk, opt); + for (int q = 0; q < num_head; q++) + { // xqkv = xqk * xv // xqk (seqlen, seqlen) // xv (seqlen, 
embed_dim_per_head) From 95692d745af730d07df6f6afbab476189483b220 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Wed, 3 Aug 2022 19:57:57 +0800 Subject: [PATCH 11/36] fix(src/layer/mha): miss convert weight to int8 --- src/layer/multiheadattention.cpp | 53 ++++++++++++++++++++++++++++++-- src/layer/multiheadattention.h | 6 ++++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 0e9c54e93a79..2d582b0809de 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -25,6 +25,36 @@ MultiHeadAttention::MultiHeadAttention() { } +int MultiHeadAttention::create_pipeline(const Option& opt) +{ + // runtime quantize the weight data + int ret = 0; +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + if (q_weight_data.elemsize == (size_t) 4u) + { + ret += quantize_weight(q_weight_data, q_weight_scales, opt); + } + if (k_weight_data.elemsize == (size_t) 4u) + { + ret += quantize_weight(k_weight_data, k_weight_scales, opt); + } + if (v_weight_data.elemsize == (size_t) 4u) + { + ret += quantize_weight(v_weight_data, v_weight_scales, opt); + } + if (out_weight_data.elemsize == (size_t) 4u) + { + ret += quantize_weight(out_weight_data, o_weight_scales, opt); + } + } +#else + (void)(opt); +#endif // NCNN_INT8 + return ret; +} + int MultiHeadAttention::load_param(const ParamDict& pd) { embed_dim = pd.get(0, 0); @@ -98,10 +128,29 @@ int MultiHeadAttention::load_model(const ModelBin& mb) } #ifdef NCNN_INT8 -static int affine_input( + +int MultiHeadAttention::quantize_weight(Mat& weight_data, const Mat& weight_data_int8_scales, const Option& opt) +{ + const int num_output = embed_dim; + const int num_input = weight_data_size / num_output; + + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + Mat weight_data_int8; + Option opt_q = opt; + opt_q.use_packing_layout = false; + quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); + if (weight_data_int8.empty()) + return -100; + + weight_data = weight_data_int8.reshape(weight_data_size); + return 0; +} + +int MultiHeadAttention::affine_input( const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, - const int num_head, const Option& opt, bool transpose) + const int num_head, const Option& opt, bool transpose) const { const int embed_dim = input.w; const int seqlen = input.h; diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index 98a63e05a28e..a344c24a8f56 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -28,10 +28,16 @@ class MultiHeadAttention : public Layer virtual int load_model(const ModelBin& mb); + virtual int create_pipeline(const Option& opt); + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #ifdef NCNN_INT8 int forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + int quantize_weight(Mat& weight, const Mat& scales, const Option& opt); + + int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8,const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; #endif public: From 07a1424503436a7074a24d19d166453cf7294b72 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Wed, 3 Aug 2022 19:58:52 +0800 Subject: [PATCH 12/36] 
fix(src/layer/x86/mha): update int8 --- src/layer/x86/multiheadattention_x86.cpp | 86 ------------------------ 1 file changed, 86 deletions(-) diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index 6a5f6382bc03..80c82e6ef7fa 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -95,92 +95,6 @@ int MultiHeadAttention_x86::destroy_pipeline(const Option& opt) return 0; } -static int affine_input( - const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, - const Mat& input_scale, const Mat& weight_scales, const float transform_scale, - const int num_head, const Option& opt, bool transpose) -{ - const int embed_dim = input.w; - const int seqlen = input.h; - const int embed_dim_per_head = embed_dim / num_head; - const float scale = 1.0 / input_scale[0]; - - Mat input_int8; - if (input.elemsize != 1) - { - quantize_to_int8(input, input_int8, input_scale, opt); - } - - Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); - - if (transpose) - { - for (int q = 0; q < num_head; q++) - { - Mat outm = buffer.channel(q); - - for (int i = 0; i < embed_dim_per_head; i++) - { - for (int j = 0; j < seqlen; j++) - { - const int8_t* ptr = input_int8.row(j); - const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + i); - - int32_t sum = 0; - const int32_t index = q * embed_dim_per_head + i; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - float* outptr = outm.row(i); - outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; - } - } - } - } - else - { - for (int q = 0; q < num_head; q++) - { - Mat outm = buffer.channel(q); - - for (int i = 0; i < seqlen; i++) - { - float* outptr = outm.row(i); - - for (int j = 0; j < embed_dim_per_head; j++) - { - const int8_t* ptr = input_int8.row(i); - const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + j); - - int32_t sum = 0; - const int32_t index = q * embed_dim_per_head + j; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; - } - } - } - } - - Mat transform(1, 4u, opt.workspace_allocator); - transform[0] = transform_scale; - quantize_to_int8(buffer, out_int8, transform, opt); - return 0; -} - -static inline int32_t float2int8(float v) -{ - int int32 = static_cast(round(v)); - if (int32 > 127) return 127; - if (int32 < -127) return -127; - return int32; -} - int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& q_blob = bottom_blobs[0]; From 9c1c2c9d4f0c4712048b42def367cc56ffd8cba0 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Wed, 3 Aug 2022 12:00:37 +0000 Subject: [PATCH 13/36] apply code-format changes --- src/layer/multiheadattention.cpp | 8 ++++---- src/layer/multiheadattention.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 2d582b0809de..4f77ff862b9d 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -32,19 +32,19 @@ int MultiHeadAttention::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - if (q_weight_data.elemsize == (size_t) 4u) + if (q_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(q_weight_data, q_weight_scales, opt); } - if (k_weight_data.elemsize == 
(size_t) 4u) + if (k_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(k_weight_data, k_weight_scales, opt); } - if (v_weight_data.elemsize == (size_t) 4u) + if (v_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(v_weight_data, v_weight_scales, opt); } - if (out_weight_data.elemsize == (size_t) 4u) + if (out_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(out_weight_data, o_weight_scales, opt); } diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index a344c24a8f56..4fc963d8feb8 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -37,7 +37,7 @@ class MultiHeadAttention : public Layer int quantize_weight(Mat& weight, const Mat& scales, const Option& opt); - int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8,const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; + int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; #endif public: From 9454c5105864f9eaaee1a912c29fb78c4bda3060 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 4 Aug 2022 17:09:26 +0800 Subject: [PATCH 14/36] improvement(src/x86/mha): int8 optimization --- src/layer/multiheadattention.cpp | 8 +- src/layer/multiheadattention.h | 2 +- src/layer/x86/multiheadattention_x86.cpp | 108 ++++++++++++++++++----- src/layer/x86/multiheadattention_x86.h | 3 + src/layer/x86/x86_usability.h | 69 ++++++++++++++- 5 files changed, 162 insertions(+), 28 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 4f77ff862b9d..2d582b0809de 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -32,19 +32,19 @@ int MultiHeadAttention::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - if (q_weight_data.elemsize == (size_t)4u) + if (q_weight_data.elemsize == (size_t) 4u) { ret += quantize_weight(q_weight_data, q_weight_scales, opt); } - if (k_weight_data.elemsize == (size_t)4u) + if (k_weight_data.elemsize == (size_t) 4u) { ret += quantize_weight(k_weight_data, k_weight_scales, opt); } - if (v_weight_data.elemsize == (size_t)4u) + if (v_weight_data.elemsize == (size_t) 4u) { ret += quantize_weight(v_weight_data, v_weight_scales, opt); } - if (out_weight_data.elemsize == (size_t)4u) + if (out_weight_data.elemsize == (size_t) 4u) { ret += quantize_weight(out_weight_data, o_weight_scales, opt); } diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index 4fc963d8feb8..e81c1287a0d4 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -37,7 +37,7 @@ class MultiHeadAttention : public Layer int quantize_weight(Mat& weight, const Mat& scales, const Option& opt); - int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; + virtual int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; #endif public: diff --git a/src/layer/x86/multiheadattention_x86.cpp 
b/src/layer/x86/multiheadattention_x86.cpp index 80c82e6ef7fa..feba621c3db0 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -37,6 +37,17 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) embed_dim_per_head = embed_dim / num_head; inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + { + softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + + ncnn::ParamDict pd; + pd.set(0, 2); + pd.set(1, 1); + + softmax->load_param(pd); + softmax->create_pipeline(opt); + } + #if NCNN_INT8 if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) { @@ -65,16 +76,6 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } #endif - { - softmax = ncnn::create_layer(ncnn::LayerType::Softmax); - - ncnn::ParamDict pd; - pd.set(0, 2); - pd.set(1, 1); - - softmax->load_param(pd); - softmax->create_pipeline(opt); - } if (opt.lightmode) { @@ -95,6 +96,79 @@ int MultiHeadAttention_x86::destroy_pipeline(const Option& opt) return 0; } + +int MultiHeadAttention_x86::affine_input( + const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, + const Mat& input_scale, const Mat& weight_scales, const float transform_scale, + const int num_head, const Option& opt, bool transpose) const +{ + const int embed_dim = input.w; + const int seqlen = input.h; + const int embed_dim_per_head = embed_dim / num_head; + const float scale = 1.0 / input_scale[0]; + + Mat input_int8; + if (input.elemsize != 1) + { + quantize_to_int8(input, input_int8, input_scale, opt); + } + + Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); + + if (transpose) + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < embed_dim_per_head; i++) + { + for (int j = 0; j < seqlen; j++) + { + const int8_t* ptr = input_int8.row(j); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + i); + + const int32_t sum = mul_add_reduce_no_align(ptr, kptr, embed_dim); + + const int32_t index = q * embed_dim_per_head + i; + + float* outptr = outm.row(i); + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + else + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const int8_t* ptr = input_int8.row(i); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + j); + + const int32_t index = q * embed_dim_per_head + j; + + const int32_t sum = mul_add_reduce_no_align(ptr, kptr, embed_dim); + + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + + Mat transform(1, 4u, opt.workspace_allocator); + transform[0] = transform_scale; + quantize_to_int8(buffer, out_int8, transform, opt); + return 0; +} + int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& q_blob = bottom_blobs[0]; @@ -141,11 +215,7 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob const int8_t* qptr = xqm.row(i); const int8_t* kptr = xkm.row(j); - int32_t sum = 0; - for (int k = 0; k < embed_dim_per_head; k++) - { - sum += *qptr++ * *kptr++; - } + const int32_t sum = mul_add_reduce_no_align(qptr, kptr, embed_dim_per_head); 
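                    // sum is the raw int32 accumulation of the two int8 rows; out_scale
                    // (derived from the calibration scales computed earlier, not shown in
                    // this hunk) turns it back into the scaled float logit stored in xqk
                    // before the softmax that follows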
outptr[j] = sum * out_scale; } @@ -156,7 +226,6 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < num_head; q++) { - // softmax(xqk) { Mat outm = xqk.channel(q); @@ -186,7 +255,6 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob } } - // xqkv int4 @ int8, implement by shift Mat xqkv(embed_dim_per_head, num_head, seqlen, 1u, opt.workspace_allocator); const float xqkv_out_scale = internal_scales[4] / internal_scales[2]; @@ -238,11 +306,7 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob const int8_t* ptr = xqkv.channel(i); const int8_t* kptr = (const int8_t*)out_weight_data + embed_dim * j; - int32_t sum = 0; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } + const int32_t sum = mul_add_reduce_no_align(ptr, kptr, embed_dim); outptr[j] = sum * out_scale / o_weight_scales[j] + out_bias_data[j]; } diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h index 966dbf558bdc..4190269ea9c8 100644 --- a/src/layer/x86/multiheadattention_x86.h +++ b/src/layer/x86/multiheadattention_x86.h @@ -35,6 +35,9 @@ class MultiHeadAttention_x86 : virtual public MultiHeadAttention #if NCNN_INT8 int create_pipeline_int8_x86(const Option& opt); int forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + virtual int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; + #endif private: diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 10a14bf5fe4e..f19b165f0461 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -287,6 +287,23 @@ static NCNN_FORCEINLINE float _mm256_reduce_add_ps(__m256 x) return _mm_cvtss_f32(x32); } +static NCNN_FORCEINLINE int32_t hsum_epi32_avx(__m128i x) +{ + __m128i hi64 = _mm_unpackhi_epi64(x, x); // 3-operand non-destructive AVX lets us save a byte without needing a movdqa + __m128i sum64 = _mm_add_epi32(hi64, x); + __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // Swap the low two elements + __m128i sum32 = _mm_add_epi32(sum64, hi32); + return _mm_cvtsi128_si32(sum32); // movd +} + +static NCNN_FORCEINLINE int32_t _mm256_hsum_8x32_i(__m256i v) +{ + __m128i sum128 = _mm_add_epi32( + _mm256_castsi256_si128(v), + _mm256_extracti128_si256(v, 1)); // silly GCC uses a longer AXV512VL instruction if AVX512 is enabled :/ + return hsum_epi32_avx(sum128); +} + static NCNN_FORCEINLINE float _mm256_reduce_max_ps(__m256 x) { const __m128 x128 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x)); @@ -495,7 +512,7 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) #endif // __SSE2__ /** - * @brief A wrapper for simd computation + * @brief A wrapper for fp32 simd computation * * result = reduce(a[] * b[]) * @@ -551,4 +568,54 @@ static NCNN_FORCEINLINE float mul_add_reduce_no_align(const float* a, const floa return sum; } +/** + * @brief A wrapper for int8 simd computation + * + * result = reduce(a[] * b[]) + * + * @param a + * @param b + * @param size + * @return int32_t + */ +static NCNN_FORCEINLINE int32_t mul_add_reduce_no_align(const int8_t* a, const int8_t* b, const int size) +{ + int32_t sum = 0.f; + int align = 0; + +#if __AVXVNNI__ || __AVX512VNNI__ + 
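    // VNNI path: each iteration multiplies 32 int8 pairs and accumulates them into int32
    // lanes with a single vpdpbusd; note that the instruction treats its first operand as
    // unsigned bytes and the second as signed, so this branch is exact only when the
    // values in `a` are non-negative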
align = (size >> 5) << 5; + __m256i _sum = _mm256_setzero_si256(); + for (int i = 0; i < align; i+=32) + { + __m256i val0 = _mm256_lddqu_si256((const __m256i*)(a + i)); + __m256i val1 = _mm256_lddqu_si256((const __m256i*)(b + i)); + _sum = _mm256_dpbusd_epi32(_sum, val0, val1); + } + sum += _mm256_hsum_8x32_i(_sum); + +#elif __AVX2__ + align = (size >> 4) << 4; + __m256i _sum = _mm256_setzero_si256(); + for (int i = 0; i < align; i += 16) + { + __m256i val0 = _mm256_cvtepi8_epi16(_mm_lddqu_si128((const __m128i*)(a + i))); + __m256i val1 = _mm256_cvtepi8_epi16(_mm_lddqu_si128((const __m128i*)(b + i))); + + __m256i lo = _mm256_mullo_epi16(val0, val1); + __m256i hi = _mm256_mulhi_epi16(val0, val1); + _sum = _mm256_add_epi32(_sum, _mm256_unpacklo_epi16(lo, hi)); + _sum = _mm256_add_epi32(_sum, _mm256_unpackhi_epi16(lo, hi)); + } + sum += _mm256_hsum_8x32_i(_sum); + +#endif + + for (int i = align; i < size; ++i) + { + sum += a[i] * b[i]; + } + return sum; +} + #endif // X86_USABILITY_H From 42ad426cb9d1dc8f7a18d33435b6af64cf4dcc01 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 4 Aug 2022 09:11:43 +0000 Subject: [PATCH 15/36] apply code-format changes --- src/layer/multiheadattention.cpp | 8 ++++---- src/layer/x86/multiheadattention_x86.cpp | 2 -- src/layer/x86/x86_usability.h | 16 ++++++++-------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 2d582b0809de..4f77ff862b9d 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -32,19 +32,19 @@ int MultiHeadAttention::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - if (q_weight_data.elemsize == (size_t) 4u) + if (q_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(q_weight_data, q_weight_scales, opt); } - if (k_weight_data.elemsize == (size_t) 4u) + if (k_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(k_weight_data, k_weight_scales, opt); } - if (v_weight_data.elemsize == (size_t) 4u) + if (v_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(v_weight_data, v_weight_scales, opt); } - if (out_weight_data.elemsize == (size_t) 4u) + if (out_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(out_weight_data, o_weight_scales, opt); } diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index feba621c3db0..def263558bfa 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -76,7 +76,6 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } #endif - if (opt.lightmode) { q_weight_data.release(); @@ -96,7 +95,6 @@ int MultiHeadAttention_x86::destroy_pipeline(const Option& opt) return 0; } - int MultiHeadAttention_x86::affine_input( const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index f19b165f0461..c1865e211c73 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -289,18 +289,18 @@ static NCNN_FORCEINLINE float _mm256_reduce_add_ps(__m256 x) static NCNN_FORCEINLINE int32_t hsum_epi32_avx(__m128i x) { - __m128i hi64 = _mm_unpackhi_epi64(x, x); // 3-operand non-destructive AVX lets us save a byte without needing a movdqa + __m128i hi64 = _mm_unpackhi_epi64(x, x); // 3-operand non-destructive AVX lets us save a byte without needing a 
movdqa __m128i sum64 = _mm_add_epi32(hi64, x); - __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // Swap the low two elements + __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // Swap the low two elements __m128i sum32 = _mm_add_epi32(sum64, hi32); - return _mm_cvtsi128_si32(sum32); // movd + return _mm_cvtsi128_si32(sum32); // movd } static NCNN_FORCEINLINE int32_t _mm256_hsum_8x32_i(__m256i v) { - __m128i sum128 = _mm_add_epi32( - _mm256_castsi256_si128(v), - _mm256_extracti128_si256(v, 1)); // silly GCC uses a longer AXV512VL instruction if AVX512 is enabled :/ + __m128i sum128 = _mm_add_epi32( + _mm256_castsi256_si128(v), + _mm256_extracti128_si256(v, 1)); // silly GCC uses a longer AXV512VL instruction if AVX512 is enabled :/ return hsum_epi32_avx(sum128); } @@ -586,7 +586,7 @@ static NCNN_FORCEINLINE int32_t mul_add_reduce_no_align(const int8_t* a, const i #if __AVXVNNI__ || __AVX512VNNI__ align = (size >> 5) << 5; __m256i _sum = _mm256_setzero_si256(); - for (int i = 0; i < align; i+=32) + for (int i = 0; i < align; i += 32) { __m256i val0 = _mm256_lddqu_si256((const __m256i*)(a + i)); __m256i val1 = _mm256_lddqu_si256((const __m256i*)(b + i)); @@ -596,7 +596,7 @@ static NCNN_FORCEINLINE int32_t mul_add_reduce_no_align(const int8_t* a, const i #elif __AVX2__ align = (size >> 4) << 4; - __m256i _sum = _mm256_setzero_si256(); + __m256i _sum = _mm256_setzero_si256(); for (int i = 0; i < align; i += 16) { __m256i val0 = _mm256_cvtepi8_epi16(_mm_lddqu_si128((const __m128i*)(a + i))); From 6854ef764336bb82374ed1ea5ef3800d953f3fb4 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Fri, 5 Aug 2022 18:39:24 +0800 Subject: [PATCH 16/36] feat(src/layer): layernorm support int8 --- src/layer/convolution.cpp | 10 +- src/layer/layernorm.cpp | 149 +++++++++++++++++++++++++++++- src/layer/layernorm.h | 9 ++ src/layer/mathfun.h | 27 ++++++ src/layer/multiheadattention.cpp | 9 +- tests/test_layernorm.cpp | 41 +++++++- tests/test_multiheadattention.cpp | 3 +- tools/quantize/net_quantize.cpp | 60 +++++++++++- tools/quantize/net_quantize.h | 3 + 9 files changed, 288 insertions(+), 23 deletions(-) create mode 100644 src/layer/mathfun.h diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp index 4acf91869ae6..91c07f4038d4 100644 --- a/src/layer/convolution.cpp +++ b/src/layer/convolution.cpp @@ -18,6 +18,8 @@ #include "fused_activation.h" +#include "mathfun.h" + namespace ncnn { Convolution::Convolution() @@ -383,14 +385,6 @@ void Convolution::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered } #if NCNN_INT8 -static inline signed char float2int8(float v) -{ - int int32 = static_cast(round(v)); - if (int32 > 127) return 127; - if (int32 < -127) return -127; - return (signed char)int32; -} - int Convolution::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int w = bottom_blob.w; diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index c95609e3799e..4b4981b80d70 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -13,8 +13,9 @@ // specific language governing permissions and limitations under the License. 
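// A sketch of the integer LayerNorm scheme implemented below (assuming per-column input
// scales and a single per-tensor output scale): y = (x - mean) / std * gamma + beta is
// evaluated on the requantized copy of x as  y_q = (sign(A) * M * x_q + B) / N  where
//   A = gamma[j] * out_scale / (std * in_scale_max) is approximated by M / N, with N = 2^k
//   picked by get_MN() so that M stays within 8 bits, and
//   B = round((beta[j] - mean / std * gamma[j]) * out_scale * N).
// For example A = 0.37 gives bit = 7 - floor(log2(0.37)) = 9, N = 512, M = floor(512 * 0.37) = 189,
// and 189 / 512 ~= 0.369, so one integer multiply-add per element replaces the float math.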
#include "layernorm.h" - +#include #include +#include "mathfun.h" namespace ncnn { @@ -30,6 +31,7 @@ int LayerNorm::load_param(const ParamDict& pd) eps = pd.get(1, 0.001f); affine = pd.get(2, 1); + int8_scale_term = pd.get(3, 0); return 0; } @@ -46,11 +48,156 @@ int LayerNorm::load_model(const ModelBin& mb) if (beta_data.empty()) return -100; +#ifdef NCNN_INT8 + if (int8_scale_term) + { + input_scales = mb.load(affine_size, 1); + output_scale = mb.load(1, 1); + } +#endif return 0; } +#ifdef NCNN_INT8 +static inline void get_MN(const float x, uint32_t& M, uint32_t& N) +{ + static uint32_t pow2_table[] = { + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, + 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, + 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648}; + + int bit = 7 - round(floor(log2(x))); + bit = bit < 0 ? 0 : bit; + bit = bit > 31 ? 31 : bit; + + N = pow2_table[bit]; + + // N > 0 and x > 0 + M = round(floor(N * x)); + M = M > 255 ? 255 : M; + + return; +} + +int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const +{ + if (!affine || bottom_top_blob.dims != 3 || bottom_top_blob.c != 1) + { + // non transformer int8 layernorm not implemented + return -100; + } + + if (bottom_top_blob.w != affine_size) + { + // check input parameter + return -200; + } + + // Transformer using BNC format + float in_scale_max = -FLT_MAX; + const float out_scale = output_scale[0]; + { + for (int i = 0; i < affine_size; ++i) + { + if (in_scale_max < input_scales[i]) + { + in_scale_max = input_scales[i]; + } + } + } + + // quantize input to int8 + Mat xq; + const int elem_count = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.c; + if (bottom_top_blob.elemsize == (size_t)1u) + { + xq = bottom_top_blob; + // if input int8, rescale input + for (int i = 0; i < bottom_top_blob.h; ++i) + { + int8_t* ptr = xq.row(i); + for (int j = 0; j < bottom_top_blob.w; ++j) + { + ptr[j] = float2int8(ptr[j] * in_scale_max / input_scales[j]); + } + } + } + else + { + xq.create(bottom_top_blob.w, bottom_top_blob.h, 1u, opt.workspace_allocator); + // else fuse ((in * in_scale).round() * (in_scale_max / in_scale)).round to (in*in_scale_max).round() + int8_t* ptr = (int8_t*)xq.data; + for (int i = 0; i < elem_count; ++i) + { + ptr[i] = float2int8(bottom_top_blob[i] * in_scale_max); + } + } + + // get mean and std + for (int i = 0; i < xq.h; ++i) + { + // get mean and std + int32_t sum = 0; + int32_t sum_pow2 = 0; + int8_t* ptr = xq.row(i); + for (int j = 0; j < xq.w; ++j) + { + sum += ptr[j]; + sum_pow2 += ptr[j] * ptr[j]; + } + + const float mean = sum * 1.0f / in_scale_max / affine_size; + const float std = sqrt(affine_size * sum_pow2 - sum * sum) * in_scale_max / affine_size; + + // update xq + const float scale_a = out_scale / std / in_scale_max; + const float scale_b = mean / std; + for (int j = 0; j < affine_size; ++j) + { + float A = gamma_data[j] * scale_a; + const float sign = A > 0.f ? 
1.f : -1.f; + + uint32_t M, N; + get_MN(abs(A), M, N); + + int32_t B = round((beta_data[j] - scale_b * gamma_data[j]) * out_scale * N); + + ptr[j] = float2int8((sign * M * ptr[j] + B) / N); + } + } + + if (int8_scale_term >= 100) + { + // output int8 + bottom_top_blob = xq; + } + else + { + // dequant and output fp32 + if (bottom_top_blob.elemsize == (size_t)1u) + { + bottom_top_blob.create(bottom_top_blob.w, bottom_top_blob.h, (size_t)4u, opt.workspace_allocator); + } + + int8_t* ptr = (int8_t*)xq.data; + for (int i = 0; i < elem_count; ++i) + { + bottom_top_blob[i] = ptr[i] / out_scale; + } + } + + return 0; +} +#endif + int LayerNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { +#ifdef NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_inplace_int8(bottom_top_blob, opt); + } +#endif // x = (x - mean) / sqrt(var + eps) * gamma + beta int dims = bottom_top_blob.dims; diff --git a/src/layer/layernorm.h b/src/layer/layernorm.h index 375c0eb40bcb..a771f3fd91e5 100644 --- a/src/layer/layernorm.h +++ b/src/layer/layernorm.h @@ -30,11 +30,20 @@ class LayerNorm : public Layer virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +#ifdef NCNN_INT8 + int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const; +#endif + public: // param int affine_size; float eps; int affine; + int int8_scale_term; +#ifdef NCNN_INT8 + Mat input_scales; + Mat output_scale; +#endif // model Mat gamma_data; diff --git a/src/layer/mathfun.h b/src/layer/mathfun.h new file mode 100644 index 000000000000..d404dfff538f --- /dev/null +++ b/src/layer/mathfun.h @@ -0,0 +1,27 @@ +// tpoisonooo is pleased to support the open source community by making ncnn available. +// +// author:tpoisonooo (https://github.com/tpoisonooo/) . +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
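// Shared scalar quantization helper extracted from the convolution and attention layers:
// float2int8 below rounds to nearest and saturates symmetrically to [-127, 127]
// (so -128 is never produced), keeping every int8 layer on the same convention.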
+ +#pragma once + +#ifdef NCNN_INT8 +static inline signed char float2int8(float v) +{ + int int32 = static_cast(round(v)); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} +#endif \ No newline at end of file diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 4f77ff862b9d..c722c43ea3d5 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -17,6 +17,7 @@ #include #ifdef NCNN_INT8 #include +#include "mathfun.h" #endif namespace ncnn { @@ -225,14 +226,6 @@ int MultiHeadAttention::affine_input( return 0; } -static inline signed char float2int8(float v) -{ - int int32 = static_cast(round(v)); - if (int32 > 127) return 127; - if (int32 < -127) return -127; - return (signed char)int32; -} - int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& q_blob = bottom_blobs[0]; diff --git a/tests/test_layernorm.cpp b/tests/test_layernorm.cpp index b4e3ad7fa1e6..8f69bb47447b 100644 --- a/tests/test_layernorm.cpp +++ b/tests/test_layernorm.cpp @@ -35,6 +35,29 @@ static int test_layernorm(const ncnn::Mat& a, int affine_size, float eps, int af return ret; } +static int test_layernorm_int8(const ncnn::Mat& a, int affine_size, float eps, int int8_scale_term) +{ + ncnn::ParamDict pd; + pd.set(0, affine_size); + pd.set(1, eps); + pd.set(2, 1); + pd.set(3, int8_scale_term); + + std::vector weights(4); + weights[0] = RandomMat(affine_size); + weights[1] = RandomMat(affine_size); + weights[2] = RandomMat(affine_size, 1.0f, 127.f/1.2f); + weights[3] = RandomMat(1); + + int ret = test_layer("LayerNorm", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_layernorm failed a.dims=%d a=(%d %d %d) affine_size=%d eps=%f int8_scale_term=%d\n", a.dims, a.w, a.h, a.c, affine_size, eps, int8_scale_term); + } + + return ret; +} + static int test_layernorm_0() { return 0 @@ -95,13 +118,23 @@ static int test_layernorm_3() || test_layernorm(RandomMat(24), 24, 0.001f, 1); } +static int test_layernorm_4() +{ + return 0 + || test_layernorm_int8(RandomMat(6, 4, 1), 6, 0.01f, 101) + || test_layernorm_int8(RandomMat(768, 127, 1), 6, 0.01f, 101) + || test_layernorm_int8(RandomMat(6, 7, 1), 6, 0.001f, 1) + || test_layernorm_int8(RandomMat(768, 127, 1), 6, 0.01f, 1); +} + int main() { SRAND(7767517); return 0 - || test_layernorm_0() - || test_layernorm_1() - || test_layernorm_2() - || test_layernorm_3(); + // || test_layernorm_0() + // || test_layernorm_1() + // || test_layernorm_2() + // || test_layernorm_3() + || test_layernorm_4(); } diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index acf3bba1d157..46d89b5fbcf7 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -141,7 +141,8 @@ static int test_multiheadattention_2() { return 0 || test_multiheadattention_int8(RandomMat(64, 128), 8) - || test_multiheadattention_int8(RandomMat(64, 127), 32); + || test_multiheadattention_int8(RandomMat(512, 512), 32); + } #endif diff --git a/tools/quantize/net_quantize.cpp b/tools/quantize/net_quantize.cpp index b77925ffd235..fe8744ab9461 100644 --- a/tools/quantize/net_quantize.cpp +++ b/tools/quantize/net_quantize.cpp @@ -131,6 +131,10 @@ bool NetQuantize::read_ini_format(const char* path) { mha_table[name] = ptable; } + else if (type == "LayerNorm") + { + layernorm_table[name] = ptable; + } } return true; @@ -143,7 +147,7 @@ int NetQuantize::quantize_mha() for (int i = 0; i < 
layer_count; i++) { - // find convolution layer + // find mha layer if (layers[i]->type != "MultiHeadAttention") continue; @@ -421,6 +425,60 @@ int NetQuantize::quantize_innerproduct() return 0; } +int NetQuantize::quantize_layernorm() +{ + const int layer_count = static_cast(layers.size()); + auto base_opt = opt; + + for (int i = 0; i < layer_count; i++) + { + // find layernorm layer + if (layers[i]->type != "LayerNorm") + continue; + + std::string name = layers[i]->name; + if (layernorm_table.find(name) == layernorm_table.end()) + { + fprintf(stderr, "cannot find %s quant param.\n", name.c_str()); + continue; + } + + ncnn::LayerNorm* ln = (ncnn::LayerNorm*)layers[i]; + fprintf(stderr, "quantize_layernorm %s\n", ln->name.c_str()); + + auto& table = layernorm_table.at(name); + { + // write input scale + auto convert = [table, base_opt](const std::string key, ncnn::Mat& mat) -> int { + std::vector scales = {table->get(key)}; + if (scales.empty()) + { + return -100; + } + + mat = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + return 0; + }; + + int success = 0; + success += convert("input_scales", ln->input_scales); + success += convert("output_scale", ln->output_scale); + if (success != 0) + { + fprintf(stderr, "load layernorm scale failed. \n"); + return -100; + } + } + + { + // write control variable + ln->int8_scale_term = 1; + } + } + + return 0; +} + int NetQuantize::fuse_requantize() { const size_t layer_count = layers.size(); diff --git a/tools/quantize/net_quantize.h b/tools/quantize/net_quantize.h index 79c8a163b92e..c02e86e2e9c9 100644 --- a/tools/quantize/net_quantize.h +++ b/tools/quantize/net_quantize.h @@ -30,6 +30,8 @@ class NetQuantize : public ModelWriter // MutiHeadAttention quant param std::map > mha_table; + // LayerNorm quant param + std::map > layernorm_table; public: bool read_txt_format(const char* path); @@ -39,6 +41,7 @@ class NetQuantize : public ModelWriter int quantize_convolutiondepthwise(); int quantize_innerproduct(); int quantize_mha(); + int quantize_layernorm(); int fuse_requantize(); void set_weight_suffix(std::string s); From 9bd2ac455f278595ba42217292bc2695144a6c6b Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Fri, 5 Aug 2022 10:41:21 +0000 Subject: [PATCH 17/36] apply code-format changes --- src/layer/layernorm.cpp | 5 +++-- tests/test_layernorm.cpp | 10 +++++----- tests/test_multiheadattention.cpp | 1 - 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index 4b4981b80d70..ea50fad8bea1 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -64,7 +64,8 @@ static inline void get_MN(const float x, uint32_t& M, uint32_t& N) static uint32_t pow2_table[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, - 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648}; + 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648 + }; int bit = 7 - round(floor(log2(x))); bit = bit < 0 ? 
0 : bit; @@ -146,7 +147,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con sum_pow2 += ptr[j] * ptr[j]; } - const float mean = sum * 1.0f / in_scale_max / affine_size; + const float mean = sum * 1.0f / in_scale_max / affine_size; const float std = sqrt(affine_size * sum_pow2 - sum * sum) * in_scale_max / affine_size; // update xq diff --git a/tests/test_layernorm.cpp b/tests/test_layernorm.cpp index 8f69bb47447b..8e983144d0ff 100644 --- a/tests/test_layernorm.cpp +++ b/tests/test_layernorm.cpp @@ -46,7 +46,7 @@ static int test_layernorm_int8(const ncnn::Mat& a, int affine_size, float eps, i std::vector weights(4); weights[0] = RandomMat(affine_size); weights[1] = RandomMat(affine_size); - weights[2] = RandomMat(affine_size, 1.0f, 127.f/1.2f); + weights[2] = RandomMat(affine_size, 1.0f, 127.f / 1.2f); weights[3] = RandomMat(1); int ret = test_layer("LayerNorm", pd, weights, a); @@ -132,9 +132,9 @@ int main() SRAND(7767517); return 0 - // || test_layernorm_0() - // || test_layernorm_1() - // || test_layernorm_2() - // || test_layernorm_3() + // || test_layernorm_0() + // || test_layernorm_1() + // || test_layernorm_2() + // || test_layernorm_3() || test_layernorm_4(); } diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index 46d89b5fbcf7..bbf71c4d9c64 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -142,7 +142,6 @@ static int test_multiheadattention_2() return 0 || test_multiheadattention_int8(RandomMat(64, 128), 8) || test_multiheadattention_int8(RandomMat(512, 512), 32); - } #endif From a6a818cd3708706946daf2d31d81e2ef5ce1b566 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 8 Aug 2022 18:51:57 +0800 Subject: [PATCH 18/36] feat(src/layer/layernorm.cpp): add int8 support --- src/layer/layernorm.cpp | 131 +++++++++++++++++++++++++++----- tests/test_layernorm.cpp | 10 +-- tools/modelwriter.cpp | 9 +++ tools/quantize/ncnn2int8.cpp | 1 + tools/quantize/net_quantize.cpp | 29 +++---- 5 files changed, 136 insertions(+), 44 deletions(-) diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index ea50fad8bea1..e81b61908f01 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -17,6 +17,8 @@ #include #include "mathfun.h" +// #include "npy.h" + namespace ncnn { LayerNorm::LayerNorm() @@ -30,7 +32,6 @@ int LayerNorm::load_param(const ParamDict& pd) affine_size = pd.get(0, 0); eps = pd.get(1, 0.001f); affine = pd.get(2, 1); - int8_scale_term = pd.get(3, 0); return 0; } @@ -61,17 +62,11 @@ int LayerNorm::load_model(const ModelBin& mb) #ifdef NCNN_INT8 static inline void get_MN(const float x, uint32_t& M, uint32_t& N) { - static uint32_t pow2_table[] = { - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, - 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, - 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648 - }; - int bit = 7 - round(floor(log2(x))); bit = bit < 0 ? 0 : bit; bit = bit > 31 ? 
31 : bit; - N = pow2_table[bit]; + N = 1u << bit; // N > 0 and x > 0 M = round(floor(N * x)); @@ -82,7 +77,7 @@ static inline void get_MN(const float x, uint32_t& M, uint32_t& N) int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const { - if (!affine || bottom_top_blob.dims != 3 || bottom_top_blob.c != 1) + if (!affine || bottom_top_blob.dims != 2 || bottom_top_blob.c != 1) { // non transformer int8 layernorm not implemented return -100; @@ -94,6 +89,71 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con return -200; } + // { + // // setup input for debug + // { + // // write input + // std::vector shape; + // std::vector data; + // std::string typestr; + // std::string filename = "/home/PJLAB/konghuanjun/GitProjects/FQ-ViT/in_1_197_768.npy"; + // npy::LoadArrayFromNumpy(filename, typestr, shape, data); + + // float* ptr = (float*)bottom_top_blob.data; + // for (int i = 0; i < 151296; ++i) + // { + // ptr[i] = data[i]; + // } + // } + // { + // // write input scales + // std::vector shape; + // std::vector data; + // std::string typestr; + // std::string filename = "/home/PJLAB/konghuanjun/GitProjects/FQ-ViT/input_scales_768.npy"; + // npy::LoadArrayFromNumpy(filename, typestr, shape, data); + + // float* ptr = (float*)input_scales.data; + // for (int i = 0; i < 768; ++i) + // { + // ptr[i] =1.0f / data[i]; + // } + // } + // { + // // write output scale + // float* ptr = (float*)output_scale.data; + // ptr[0] = 1.0f / 0.0833f; + // } + // { + // // write gamma + // std::vector shape; + // std::vector data; + // std::string typestr; + // std::string filename = "/home/PJLAB/konghuanjun/GitProjects/FQ-ViT/gamma_768.npy"; + // npy::LoadArrayFromNumpy(filename, typestr, shape, data); + + // float* ptr = (float*)gamma_data.data; + // for (int i = 0; i < 768; ++i) + // { + // ptr[i] =data[i]; + // } + // } + // { + // // write beta + // std::vector shape; + // std::vector data; + // std::string typestr; + // std::string filename = "/home/PJLAB/konghuanjun/GitProjects/FQ-ViT/beta_768.npy"; + // npy::LoadArrayFromNumpy(filename, typestr, shape, data); + + // float* ptr = (float*)beta_data.data; + // for (int i = 0; i < 768; ++i) + // { + // ptr[i] =data[i]; + // } + // } + // } + // Transformer using BNC format float in_scale_max = -FLT_MAX; const float out_scale = output_scale[0]; @@ -108,39 +168,39 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con } // quantize input to int8 - Mat xq; + Mat xq(bottom_top_blob.w, bottom_top_blob.h, 4u, opt.workspace_allocator); const int elem_count = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.c; if (bottom_top_blob.elemsize == (size_t)1u) { - xq = bottom_top_blob; // if input int8, rescale input for (int i = 0; i < bottom_top_blob.h; ++i) { - int8_t* ptr = xq.row(i); + int32_t* ptr = xq.row(i); for (int j = 0; j < bottom_top_blob.w; ++j) { - ptr[j] = float2int8(ptr[j] * in_scale_max / input_scales[j]); + ptr[j] = round(ptr[j] * in_scale_max / input_scales[j]); } } } else { - xq.create(bottom_top_blob.w, bottom_top_blob.h, 1u, opt.workspace_allocator); // else fuse ((in * in_scale).round() * (in_scale_max / in_scale)).round to (in*in_scale_max).round() - int8_t* ptr = (int8_t*)xq.data; + int32_t* ptr = (int32_t*)xq.data; for (int i = 0; i < elem_count; ++i) { - ptr[i] = float2int8(bottom_top_blob[i] * in_scale_max); + ptr[i] = round(bottom_top_blob[i] * in_scale_max); } } + // std::vector A_save, B_save, result_save; + // get mean and std for (int i = 0; i < 
xq.h; ++i) { // get mean and std int32_t sum = 0; int32_t sum_pow2 = 0; - int8_t* ptr = xq.row(i); + int32_t* ptr = xq.row(i); for (int j = 0; j < xq.w; ++j) { sum += ptr[j]; @@ -148,7 +208,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con } const float mean = sum * 1.0f / in_scale_max / affine_size; - const float std = sqrt(affine_size * sum_pow2 - sum * sum) * in_scale_max / affine_size; + const float std = sqrt(1.0f * affine_size * sum_pow2 - sum * sum) / in_scale_max / affine_size; // update xq const float scale_a = out_scale / std / in_scale_max; @@ -163,14 +223,43 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con int32_t B = round((beta_data[j] - scale_b * gamma_data[j]) * out_scale * N); - ptr[j] = float2int8((sign * M * ptr[j] + B) / N); + ptr[j] = round((sign * M * ptr[j] + B) / N); + + // A_save.emplace_back(A); + // B_save.emplace_back(B); + // result_save.emplace_back(ptr[j]); } } + // { + // // save to numpy + // const unsigned long shape[] = {197 * 768}; + // npy::SaveArrayAsNumpy("A.npy", false, 1, shape, A_save); + // npy::SaveArrayAsNumpy("B.npy", false, 1, shape, B_save); + // npy::SaveArrayAsNumpy("result.npy", false, 1, shape, result_save); + // } + if (int8_scale_term >= 100) { // output int8 - bottom_top_blob = xq; + bottom_top_blob.create(bottom_top_blob.w, bottom_top_blob.h, 1u, opt.workspace_allocator); + int32_t* from = (int32_t*)xq.data; + int8_t* to = (int8_t*)bottom_top_blob.data; + for (int i = 0; i < elem_count; ++i) + { + if (from[i] > 127) + { + to[i] = 127; + } + else if (from[i] < -127) + { + to[i] = -127; + } + else + { + to[i] = from[i]; + } + } } else { @@ -180,7 +269,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con bottom_top_blob.create(bottom_top_blob.w, bottom_top_blob.h, (size_t)4u, opt.workspace_allocator); } - int8_t* ptr = (int8_t*)xq.data; + int32_t* ptr = (int32_t*)xq.data; for (int i = 0; i < elem_count; ++i) { bottom_top_blob[i] = ptr[i] / out_scale; diff --git a/tests/test_layernorm.cpp b/tests/test_layernorm.cpp index 8e983144d0ff..279a1cc7b142 100644 --- a/tests/test_layernorm.cpp +++ b/tests/test_layernorm.cpp @@ -121,7 +121,7 @@ static int test_layernorm_3() static int test_layernorm_4() { return 0 - || test_layernorm_int8(RandomMat(6, 4, 1), 6, 0.01f, 101) + || test_layernorm_int8(RandomMat(768, 197, 1), 768, 0.0001f, 101) || test_layernorm_int8(RandomMat(768, 127, 1), 6, 0.01f, 101) || test_layernorm_int8(RandomMat(6, 7, 1), 6, 0.001f, 1) || test_layernorm_int8(RandomMat(768, 127, 1), 6, 0.01f, 1); @@ -132,9 +132,9 @@ int main() SRAND(7767517); return 0 - // || test_layernorm_0() - // || test_layernorm_1() - // || test_layernorm_2() - // || test_layernorm_3() + || test_layernorm_0() + || test_layernorm_1() + || test_layernorm_2() + || test_layernorm_3() || test_layernorm_4(); } diff --git a/tools/modelwriter.cpp b/tools/modelwriter.cpp index 6096f7ea1cc1..2c003d8478a7 100644 --- a/tools/modelwriter.cpp +++ b/tools/modelwriter.cpp @@ -1542,9 +1542,18 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 0=%d", affine_size) fprintf_param_value(" 1=%e", eps) fprintf_param_value(" 2=%d", affine) + fprintf_param_value(" 3=%d", int8_scale_term); fwrite_weight_data(op->gamma_data, bp); fwrite_weight_data(op->beta_data, bp); + +#ifdef NCNN_INT8 + if (op->int8_scale_term) + { + fwrite_weight_data(op->input_scales, bp, 5, 100); + fwrite_weight_data(op->output_scale, bp, 1, 10); + } +#endif } else if 
(layer->type == "Log") { diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index e820217392b3..1965a81f5d2d 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -90,6 +90,7 @@ int main(int argc, char** argv) quantizer.quantize_convolution(); quantizer.quantize_convolutiondepthwise(); quantizer.quantize_innerproduct(); + quantizer.quantize_layernorm(); quantizer.fuse_requantize(); diff --git a/tools/quantize/net_quantize.cpp b/tools/quantize/net_quantize.cpp index fe8744ab9461..4d4dd1176e31 100644 --- a/tools/quantize/net_quantize.cpp +++ b/tools/quantize/net_quantize.cpp @@ -448,29 +448,22 @@ int NetQuantize::quantize_layernorm() auto& table = layernorm_table.at(name); { - // write input scale - auto convert = [table, base_opt](const std::string key, ncnn::Mat& mat) -> int { - std::vector scales = {table->get(key)}; - if (scales.empty()) - { - return -100; - } - - mat = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); - return 0; - }; + std::vector scales = table->get_list("input_scales"); + if (scales.empty()) + { + fprintf(stderr, "quantize_layernorm input scales empty.\n"); + return -100; + } + ln->input_scales = ncnn::Mat((int) scales.size(), (void*)scales.data()).clone(); - int success = 0; - success += convert("input_scales", ln->input_scales); - success += convert("output_scale", ln->output_scale); - if (success != 0) + scales = {table->get("output_scale")}; + if (std::abs(scales[0]) <= 1e-6) { - fprintf(stderr, "load layernorm scale failed. \n"); + fprintf(stderr, "quantize_layernorm output scale unavailable.\n"); return -100; } - } + ln->output_scale = ncnn::Mat((int) scales.size(), (void*)scales.data()).clone(); - { // write control variable ln->int8_scale_term = 1; } From ac0d745848f95993287cf4346515253414035ea5 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 8 Aug 2022 10:54:04 +0000 Subject: [PATCH 19/36] apply code-format changes --- src/layer/layernorm.cpp | 4 ++-- tests/test_layernorm.cpp | 8 ++++---- tools/quantize/net_quantize.cpp | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index e81b61908f01..f25500fbdebd 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -77,7 +77,7 @@ static inline void get_MN(const float x, uint32_t& M, uint32_t& N) int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const { - if (!affine || bottom_top_blob.dims != 2 || bottom_top_blob.c != 1) + if (!affine || bottom_top_blob.dims != 2 || bottom_top_blob.c != 1) { // non transformer int8 layernorm not implemented return -100; @@ -244,7 +244,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con // output int8 bottom_top_blob.create(bottom_top_blob.w, bottom_top_blob.h, 1u, opt.workspace_allocator); int32_t* from = (int32_t*)xq.data; - int8_t* to = (int8_t*)bottom_top_blob.data; + int8_t* to = (int8_t*)bottom_top_blob.data; for (int i = 0; i < elem_count; ++i) { if (from[i] > 127) diff --git a/tests/test_layernorm.cpp b/tests/test_layernorm.cpp index 279a1cc7b142..f638d299cd4c 100644 --- a/tests/test_layernorm.cpp +++ b/tests/test_layernorm.cpp @@ -132,9 +132,9 @@ int main() SRAND(7767517); return 0 - || test_layernorm_0() - || test_layernorm_1() - || test_layernorm_2() - || test_layernorm_3() + || test_layernorm_0() + || test_layernorm_1() + || test_layernorm_2() + || test_layernorm_3() || test_layernorm_4(); } diff --git a/tools/quantize/net_quantize.cpp 
b/tools/quantize/net_quantize.cpp index 4d4dd1176e31..fc8dcdf70588 100644 --- a/tools/quantize/net_quantize.cpp +++ b/tools/quantize/net_quantize.cpp @@ -454,7 +454,7 @@ int NetQuantize::quantize_layernorm() fprintf(stderr, "quantize_layernorm input scales empty.\n"); return -100; } - ln->input_scales = ncnn::Mat((int) scales.size(), (void*)scales.data()).clone(); + ln->input_scales = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); scales = {table->get("output_scale")}; if (std::abs(scales[0]) <= 1e-6) @@ -462,7 +462,7 @@ int NetQuantize::quantize_layernorm() fprintf(stderr, "quantize_layernorm output scale unavailable.\n"); return -100; } - ln->output_scale = ncnn::Mat((int) scales.size(), (void*)scales.data()).clone(); + ln->output_scale = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); // write control variable ln->int8_scale_term = 1; From 4d1950702c4fec45642074c8fa1cb8e5041a4b4f Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 8 Aug 2022 21:04:44 +0800 Subject: [PATCH 20/36] fix(src/layer): update layernorm OMP option --- src/layer/layernorm.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index e81b61908f01..2ea8bff57140 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -158,6 +158,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con float in_scale_max = -FLT_MAX; const float out_scale = output_scale[0]; { + #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < affine_size; ++i) { if (in_scale_max < input_scales[i]) @@ -173,6 +174,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con if (bottom_top_blob.elemsize == (size_t)1u) { // if input int8, rescale input + #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < bottom_top_blob.h; ++i) { int32_t* ptr = xq.row(i); @@ -186,6 +188,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con { // else fuse ((in * in_scale).round() * (in_scale_max / in_scale)).round to (in*in_scale_max).round() int32_t* ptr = (int32_t*)xq.data; + #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < elem_count; ++i) { ptr[i] = round(bottom_top_blob[i] * in_scale_max); @@ -195,6 +198,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con // std::vector A_save, B_save, result_save; // get mean and std + #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < xq.h; ++i) { // get mean and std @@ -245,6 +249,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con bottom_top_blob.create(bottom_top_blob.w, bottom_top_blob.h, 1u, opt.workspace_allocator); int32_t* from = (int32_t*)xq.data; int8_t* to = (int8_t*)bottom_top_blob.data; + #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < elem_count; ++i) { if (from[i] > 127) @@ -270,6 +275,7 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con } int32_t* ptr = (int32_t*)xq.data; + #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < elem_count; ++i) { bottom_top_blob[i] = ptr[i] / out_scale; From 6ddf7cc8400b63868dd9cd5c011e5c3df5bacd89 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Wed, 10 Aug 2022 19:57:37 +0800 Subject: [PATCH 21/36] feat(src/layer): binaryop support int8 --- src/layer/binaryop.cpp | 241 +++++++++++++++++++++-- src/layer/binaryop.h | 12 ++ src/layer/x86/multiheadattention_x86.cpp | 19 
+- src/layer/x86/multiheadattention_x86.h | 1 - tools/modelwriter.cpp | 4 + tools/quantize/ncnn2int8.cpp | 4 +- tools/quantize/net_quantize.cpp | 226 ++++++++++++++++++++- tools/quantize/net_quantize.h | 12 +- 8 files changed, 484 insertions(+), 35 deletions(-) diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp index 53eb234bc923..851a83720270 100644 --- a/src/layer/binaryop.cpp +++ b/src/layer/binaryop.cpp @@ -15,6 +15,7 @@ #include "binaryop.h" #include +#include "mathfun.h" namespace ncnn { @@ -29,6 +30,12 @@ int BinaryOp::load_param(const ParamDict& pd) op_type = pd.get(0, 0); with_scalar = pd.get(1, 0); b = pd.get(2, 0.f); + int8_scale_term = pd.get(3, 0); +#ifdef NCNN_INT8 + in_scale0 = pd.get(4, 1.f); + in_scale1 = pd.get(5, 1.f); + out_scale = pd.get(6, 1.f); +#endif if (with_scalar != 0) { @@ -918,6 +925,157 @@ struct binary_op_rdiv } }; +#ifdef NCNN_INT8 +template +int BinaryOp::binary_op_int8(const Mat& a, const Mat& b, Mat& c, const Option& opt) const +{ + if (a.w != b.w || a.h != b.h || a.c != b.c || a.d != b.d || a.d != 1) + { + // binaryop int8 only support input same shape, not support packing layout + return -100; + } + + Op op; + const int channels = a.c; + const int size = a.w * a.h; + + if (int8_scale_term > 100){ + // requant + c.create(a.w, a.h, a.c, 1u, opt.workspace_allocator); + + if (a.elemsize == 1u && b.elemsize == 1u) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int8_t* ptr0 = (int8_t*)(a.channel(q).data); + int8_t* ptr1 = (int8_t*)(b.channel(q).data); + int8_t* pout = (int8_t*)(c.channel(q).data); + + for (int i = 0; i < size; i++) + { + int32_t v = op(ptr0[i] / in_scale0, ptr1[i] / in_scale1); + pout[i] = float2int8(v * out_scale); + } + } + return 0; + } + + if (a.elemsize == 1u && b.elemsize == 4u) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int8_t* ptr0 = (int8_t*)(a.channel(q).data); + const float* ptr1 = b.channel(q); + int8_t* pout = (int8_t*)(c.channel(q).data); + + for (int i = 0; i < size; i++) + { + int32_t v = op(ptr0[i] / in_scale0, ptr1[i]); + pout[i] = float2int8(v * out_scale); + } + } + return 0; + } + + if (a.elemsize == 4u && b.elemsize == 1u) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = a.channel(q); + int8_t* ptr1 = (int8_t*)(b.channel(q).data); + int8_t* pout = (int8_t*)(c.channel(q).data); + + for (int i = 0; i < size; i++) + { + int32_t v = op(ptr0[i], ptr1[i] / in_scale1); + pout[i] = float2int8(v * out_scale); + } + } + return 0; + } + + if (a.elemsize == 4u && b.elemsize == 4u) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = a.channel(q); + const float* ptr1 = b.channel(q); + int8_t* pout = (int8_t*)(c.channel(q).data); + + for (int i = 0; i < size; i++) + { + int32_t v = op(ptr0[i], ptr1[i] / in_scale1); + pout[i] = float2int8(v * out_scale); + } + } + return 0; + } + + } else { + // dequant + c.create(a.w, a.h, a.c, 4u, opt.workspace_allocator); + + if (a.elemsize == 1u && b.elemsize == 1u) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int8_t* ptr0 = (int8_t*)(a.channel(q).data); + int8_t* ptr1 = (int8_t*)(b.channel(q).data); + float* pout = c.channel(q); + + for (int i = 0; i < size; i++) + { + pout[i] = op(ptr0[i] / in_scale0, ptr1[i] / in_scale1); + } + } + return 0; + } + + if (a.elemsize == 1u && 
b.elemsize == 4u) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int8_t* ptr0 = (int8_t*)(a.channel(q).data); + const float* ptr1 = b.channel(q); + float* pout = c.channel(q); + + for (int i = 0; i < size; i++) + { + pout[i] = op(ptr0[i] / in_scale0, ptr1[i]); + } + } + return 0; + } + + if (a.elemsize == 4u && b.elemsize == 1u) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = a.channel(q); + int8_t* ptr1 = (int8_t*)(b.channel(q).data); + float* pout = c.channel(q); + + for (int i = 0; i < size; i++) + { + pout[i] = op(ptr0[i], ptr1[i] / in_scale1); + } + } + return 0; + } + } + + return 0; +} + +#endif + int BinaryOp::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; @@ -925,38 +1083,85 @@ int BinaryOp::forward(const std::vector& bottom_blobs, std::vector& to Mat& top_blob = top_blobs[0]; - if (op_type == Operation_ADD) - return binary_op(bottom_blob, bottom_blob1, top_blob, opt); +#ifdef NCNN_INT8 + if (int8_scale_term > 0) + { + // requant + if (op_type == Operation_ADD) + return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_SUB) - return binary_op(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_SUB) + return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_MUL) - return binary_op(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_MUL) + return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_DIV) - return binary_op(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_DIV) + return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_MAX) - return binary_op(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_MAX) + return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_MIN) - return binary_op(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_MIN) + return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_POW) - return binary_op(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_POW) + return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_RSUB) - return binary_op(bottom_blob1, bottom_blob, top_blob, opt); + if (op_type == Operation_RSUB) + return binary_op_int8(bottom_blob1, bottom_blob, top_blob, opt); - if (op_type == Operation_RDIV) - return binary_op(bottom_blob1, bottom_blob, top_blob, opt); + if (op_type == Operation_RDIV) + return binary_op_int8(bottom_blob1, bottom_blob, top_blob, opt); + } +#endif + + switch (op_type) + { + case Operation_ADD: + binary_op(bottom_blob, bottom_blob1, top_blob, opt); + break; + case Operation_SUB: + binary_op(bottom_blob, bottom_blob1, top_blob, opt); + break; + case Operation_MUL: + binary_op(bottom_blob, bottom_blob1, top_blob, opt); + break; + case Operation_DIV: + binary_op(bottom_blob, bottom_blob1, top_blob, opt); + break; + case Operation_MAX: + binary_op(bottom_blob, bottom_blob1, top_blob, opt); + break; + case Operation_MIN: + binary_op(bottom_blob, bottom_blob1, top_blob, opt); + break; + case Operation_POW: + binary_op(bottom_blob, bottom_blob1, top_blob, opt); + break; + case Operation_RSUB: + binary_op(bottom_blob1, bottom_blob, top_blob, opt); + break; + case 
Operation_RDIV: + binary_op(bottom_blob1, bottom_blob, top_blob, opt); + break; + default: + return -100; + } return 0; } int BinaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { +#ifdef NCNN_INT8 + if (int8_scale_term > 0) + { + // int8 BinaryOp with scalar not implemented + return -100; + } +#endif + if (op_type == Operation_ADD) return binary_op_scalar_inplace(bottom_top_blob, b, opt); diff --git a/src/layer/binaryop.h b/src/layer/binaryop.h index 74798f906aa8..e111c7b88014 100644 --- a/src/layer/binaryop.h +++ b/src/layer/binaryop.h @@ -32,6 +32,11 @@ class BinaryOp : public Layer virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +#ifdef NCNN_INT8 + template + int binary_op_int8(const Mat& a, const Mat& b, Mat& c, const Option& opt) const; +#endif + enum OperationType { Operation_ADD = 0, @@ -50,6 +55,13 @@ class BinaryOp : public Layer int op_type; int with_scalar; float b; + + int int8_scale_term; +#ifdef NCNN_INT8 + float in_scale0; + float in_scale1; + float out_scale; +#endif }; } // namespace ncnn diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index def263558bfa..d5352907ee72 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -18,7 +18,6 @@ #include "x86_usability.h" #include "layer_type.h" #include -#include #ifdef NCNN_INT8 #include @@ -48,13 +47,6 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) softmax->create_pipeline(opt); } -#if NCNN_INT8 - if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) - { - return create_pipeline_int8_x86(opt); - } -#endif - // for fp32 inference, const fold inv_sqrt_embed_dim_per_head into `q_w` and `q_bias` #if 0 // FIXME! 
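
Aside: a minimal standalone sketch of the per-tensor requantized elementwise add that the BinaryOp int8 path above implements. The function and helper names here are illustrative only; the real binary_op_int8 works on ncnn::Mat channels, has extra branches for mixed int8/fp32 inputs, and a dequantizing path when int8_scale_term stays at or below 100.

#include <stdint.h>
#include <math.h>

// clamp-and-round to [-127, 127], in the spirit of ncnn's float2int8
static inline int8_t quantize_to_int8(float v)
{
    int q = (int)roundf(v);
    if (q > 127) return 127;
    if (q < -127) return -127;
    return (int8_t)q;
}

// c[i] = requantize( dequant(a[i]) + dequant(b[i]) )
// scale convention: quantized = real * scale, so real = quantized / scale
static void elemwise_add_int8(const int8_t* a, const int8_t* b, int8_t* c, int size,
                              float in_scale0, float in_scale1, float out_scale)
{
    for (int i = 0; i < size; i++)
    {
        float va = a[i] / in_scale0;                    // dequantize operand 0
        float vb = b[i] / in_scale1;                    // dequantize operand 1
        c[i] = quantize_to_int8((va + vb) * out_scale); // requantize the fp32 sum
    }
}
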
@@ -85,13 +77,14 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } #ifdef NCNN_INT8 -int MultiHeadAttention_x86::create_pipeline_int8_x86(const Option& opt) -{ - return 0; -} - int MultiHeadAttention_x86::destroy_pipeline(const Option& opt) { + if (softmax) + { + softmax->destroy_pipeline(opt); + delete softmax; + softmax = 0; + } return 0; } diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h index 4190269ea9c8..868a69273786 100644 --- a/src/layer/x86/multiheadattention_x86.h +++ b/src/layer/x86/multiheadattention_x86.h @@ -33,7 +33,6 @@ class MultiHeadAttention_x86 : virtual public MultiHeadAttention protected: #if NCNN_INT8 - int create_pipeline_int8_x86(const Option& opt); int forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; virtual int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; diff --git a/tools/modelwriter.cpp b/tools/modelwriter.cpp index 2c003d8478a7..274bf4dd1717 100644 --- a/tools/modelwriter.cpp +++ b/tools/modelwriter.cpp @@ -660,6 +660,10 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 0=%d", op_type) fprintf_param_value(" 1=%d", with_scalar) fprintf_param_value(" 2=%e", b) + fprintf_param_value(" 3=%d", int8_scale_term) + fprintf_param_value(" 4=%e", in_scale0) + fprintf_param_value(" 5=%e", in_scale1) + fprintf_param_value(" 6=%e", out_scale) } else if (layer->type == "Clip") { diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 1965a81f5d2d..49208b1b7cf8 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -92,7 +92,9 @@ int main(int argc, char** argv) quantizer.quantize_innerproduct(); quantizer.quantize_layernorm(); - quantizer.fuse_requantize(); + quantizer.fuse_conv_requantize(); + quantizer.fuse_layernorm_requantize(); + int fuse_binaryop_requantize(); quantizer.save(outparam, outbin); diff --git a/tools/quantize/net_quantize.cpp b/tools/quantize/net_quantize.cpp index fc8dcdf70588..b068f7a884c8 100644 --- a/tools/quantize/net_quantize.cpp +++ b/tools/quantize/net_quantize.cpp @@ -135,6 +135,10 @@ bool NetQuantize::read_ini_format(const char* path) { layernorm_table[name] = ptable; } + else if (type == "BinaryOp") + { + binaryop_table[name] = ptable; + } } return true; @@ -472,7 +476,67 @@ int NetQuantize::quantize_layernorm() return 0; } -int NetQuantize::fuse_requantize() + +int NetQuantize::quantize_binaryop() +{ + const int layer_count = static_cast(layers.size()); + auto base_opt = opt; + + for (int i = 0; i < layer_count; i++) + { + // find add layer + if (layers[i]->type != "BinaryOp") + continue; + + ncnn::BinaryOp* op = (ncnn::BinaryOp*)layers[i]; + + if (op->bottoms.size() != 2) + { + // binaryop with scalar, skip + continue; + } + + if (binaryop_table.find(op->name) == binaryop_table.end()) + { + fprintf(stderr, "cannot find %s quant param.\n", op->name.c_str()); + continue; + } + + auto& table = binaryop_table.at(op->name); + { + std::vector scales = table->get_list("input_scales"); + if (scales.size() != 2) + { + fprintf(stderr, "quantize_binaryop input scales len mismatch.\n"); + return -100; + } + op->in_scale0 = scales[0]; + op->in_scale1 = scales[1]; + + op->out_scale = table->get("output_scale"); + if (std::abs(op->out_scale) <= 1e-6) + { + fprintf(stderr, 
"quantize_binaryop output scale too small.\n"); + return -100; + } + + op->int8_scale_term = 1; + } + + // print some tips + switch (op->op_type) + { + case ncnn::BinaryOp::Operation_DIV: + case ncnn::BinaryOp::Operation_RDIV: + case ncnn::BinaryOp::Operation_POW: + fprintf(stderr, "please make sure that you really want to quantize div/rdiv/pow operation +_+ \n"); + break; + } + } + return 0; +} + +int NetQuantize::fuse_conv_requantize() { const size_t layer_count = layers.size(); for (size_t i = 0; i < layer_count; i++) @@ -674,3 +738,163 @@ int NetQuantize::fuse_requantize() return 0; } + +/** + * @brief if [LayerNorm --> X] and X is a type of quantizable layer, then requant layernorm.output, AKA X.input is quantized tensor + * + * @return int + */ +int NetQuantize::fuse_layernorm_requantize() +{ + const size_t layer_count = layers.size(); + for (size_t i = 0; i < layer_count; i++) + { + if (layers[i]->type != "LayerNorm") + continue; + + // LayerNorm --> quantizable_node + int top_blob_index = layers[i]->tops[0]; + + size_t j = i + 1; + for (; j < layer_count; j++) + { + if (quantizable_node.find(layers[j]->type) == quantizable_node.end()) + { + continue; + } + + if (layers[j]->bottoms.size() != 1) + continue; + + if (layers[j]->bottoms[0] == top_blob_index) + break; + } + + if (j == layer_count) + continue; + + // fuse requantize + fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), layers[j]->name.c_str()); + + ncnn::LayerNorm* ln = (ncnn::LayerNorm*)layers[i]; + // layernorm_int8 quantized by , so do not need to update next node's output_scale. + ln->int8_scale_term += 100; + } + + return 0; +} + +/** + * @brief + * + * if all of output is quantized, binaryop use requant + * if none of input and output can be quantize, binaryop skip quant + * + * @return int + */ +int NetQuantize::fuse_binaryop_requantize() +{ + const size_t layer_count = layers.size(); + + auto get_all_connected_outputs = [=](ncnn::BinaryOp* op, int cur) -> std::vector + { + std::vector layers; + for (size_t j = cur; j bottoms) { + if (index == op->tops[0] || index == op->tops[1]) + { + layers.emplace_back(next); + break; + } + } + } + return layers; + }; + + auto get_all_connected_inputs = [=](ncnn::BinaryOp* op, int cur) -> std::vector + { + std::vector layers; + for (size_t j = 0; j tops) { + if (index == op->bottoms[0] || index == op->bottoms[1]) + { + layers.emplace_back(last); + break; + } + } + } + return layers; + }; + + auto is_quantized = [=](ncnn::Layer* layer) -> bool + { + if (quantizable_node.find(layer->name) == quantizable_node.end()) + { + return false; + } + if (layer->name == "Convolution") + return ((ncnn::Convolution*)layer)->int8_scale_term > 0; + + if (layer->name == "MultiHeadAttention") + return ((ncnn::MultiHeadAttention*)layer)->int8_scale_term > 0; + + if (layer->name == "InnerProduct") + return ((ncnn::InnerProduct*)layer)->int8_scale_term > 0; + + if (layer->name == "ConvolutionDepthWise") + return ((ncnn::ConvolutionDepthWise*)layer)->int8_scale_term > 0; + + return false; + }; + + for (size_t i = 0; i < layer_count; i++) + { + if (layers[i]->type != "BinaryOp") + continue; + + ncnn::BinaryOp* op = (ncnn::BinaryOp*)layers[i]; + + auto outputs = get_all_connected_outputs(op, i); + auto inputs = get_all_connected_inputs(op, i); + + // if binaryop outputs are all quantized, requant and return + bool all_output_quantize = true; + // if none of nodes could be quantized, give up quantize binaryop + bool non_can_quantize = true; + + for (ncnn::Layer* output: outputs) + { + if 
(is_quantized(output)) + { + non_can_quantize = false; + } else + { + all_output_quantize = false; + } + } + for (ncnn::Layer* input: inputs) + { + if (is_quantized(input)) + { + non_can_quantize = false; + } + } + + if (all_output_quantize) + { + // enable requant + op->int8_scale_term += 100; + } else if (non_can_quantize){ + // cancel quant + op->int8_scale_term = 0; + op->in_scale0 = 1.f; + op->in_scale1 = 1.f; + op->out_scale = 1.f; + } + } + return 0; +} diff --git a/tools/quantize/net_quantize.h b/tools/quantize/net_quantize.h index c02e86e2e9c9..eb578d45ea8e 100644 --- a/tools/quantize/net_quantize.h +++ b/tools/quantize/net_quantize.h @@ -17,12 +17,14 @@ #include #include "../modelwriter.h" #include "ini_config.h" +#include class NetQuantize : public ModelWriter { public: NetQuantize() { + quantizable_node = {"LayerNorm", "Convolution", "ConvolutionDepthWise", "MultiHeadAttention", "Add"}; } // conv and gemm quant param std::map blob_int8scale_table; @@ -32,6 +34,10 @@ class NetQuantize : public ModelWriter std::map > mha_table; // LayerNorm quant param std::map > layernorm_table; + // BinaryOp quant param + std::map > binaryop_table; + // supported quantizable node + std::set quantizable_node; public: bool read_txt_format(const char* path); @@ -41,8 +47,12 @@ class NetQuantize : public ModelWriter int quantize_convolutiondepthwise(); int quantize_innerproduct(); int quantize_mha(); + int quantize_binaryop(); int quantize_layernorm(); - int fuse_requantize(); + + int fuse_conv_requantize(); + int fuse_layernorm_requantize(); + int fuse_binaryop_requantize(); void set_weight_suffix(std::string s); From 5727e194a811e67a9356d2f2f6fb2f14faea3932 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 11 Aug 2022 14:46:00 +0800 Subject: [PATCH 22/36] feat(src/layer): test add int8 failed --- src/layer/binaryop.cpp | 10 +-- src/layer/layernorm.cpp | 2 +- src/layer/x86/multiheadattention_x86.cpp | 7 ++ tools/quantize/ncnn2int8.cpp | 3 +- tools/quantize/net_quantize.cpp | 97 +++++++++++++++++------- tools/quantize/net_quantize.h | 2 +- 6 files changed, 87 insertions(+), 34 deletions(-) diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp index 851a83720270..b67d2923e10c 100644 --- a/src/layer/binaryop.cpp +++ b/src/layer/binaryop.cpp @@ -945,7 +945,7 @@ int BinaryOp::binary_op_int8(const Mat& a, const Mat& b, Mat& c, const Option& o if (a.elemsize == 1u && b.elemsize == 1u) { - #pragma omp parallel for num_threads(opt.num_threads) + // #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { int8_t* ptr0 = (int8_t*)(a.channel(q).data); @@ -963,7 +963,7 @@ int BinaryOp::binary_op_int8(const Mat& a, const Mat& b, Mat& c, const Option& o if (a.elemsize == 1u && b.elemsize == 4u) { - #pragma omp parallel for num_threads(opt.num_threads) + // #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { int8_t* ptr0 = (int8_t*)(a.channel(q).data); @@ -981,7 +981,7 @@ int BinaryOp::binary_op_int8(const Mat& a, const Mat& b, Mat& c, const Option& o if (a.elemsize == 4u && b.elemsize == 1u) { - #pragma omp parallel for num_threads(opt.num_threads) + // #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr0 = a.channel(q); @@ -999,7 +999,7 @@ int BinaryOp::binary_op_int8(const Mat& a, const Mat& b, Mat& c, const Option& o if (a.elemsize == 4u && b.elemsize == 4u) { - #pragma omp parallel for num_threads(opt.num_threads) + // #pragma omp parallel for num_threads(opt.num_threads) 
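
Aside: the decision rule that fuse_binaryop_requantize above applies to each quantized BinaryOp, reduced to a standalone sketch. The enum and the two boolean inputs are illustrative stand-ins; the real code derives them by walking producer and consumer layers (looking through Split nodes) with direct_connected_inputs/outputs and the is_quantized check.

enum RequantMode
{
    REQUANT_INT8, // every consumer takes int8: int8_scale_term += 100, emit int8 output
    DEQUANT_FP32, // mixed neighbours: int8_scale_term = 1, compute in int8 but emit fp32
    SKIP_QUANT    // no quantized neighbour at all: int8_scale_term = 0, reset scales to 1.f
};

static RequantMode decide_binaryop_requant(bool all_consumers_quantized, bool any_neighbour_quantized)
{
    if (all_consumers_quantized)
        return REQUANT_INT8;
    if (!any_neighbour_quantized)
        return SKIP_QUANT;
    return DEQUANT_FP32;
}
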
for (int q = 0; q < channels; q++) { const float* ptr0 = a.channel(q); @@ -1008,7 +1008,7 @@ int BinaryOp::binary_op_int8(const Mat& a, const Mat& b, Mat& c, const Option& o for (int i = 0; i < size; i++) { - int32_t v = op(ptr0[i], ptr1[i] / in_scale1); + float v = op(ptr0[i], ptr1[i]); pout[i] = float2int8(v * out_scale); } } diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index b6241945881e..4d4d09f52d70 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -77,7 +77,7 @@ static inline void get_MN(const float x, uint32_t& M, uint32_t& N) int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const { - if (!affine || bottom_top_blob.dims != 2 || bottom_top_blob.c != 1) + if (!affine || bottom_top_blob.c != 1) { // non transformer int8 layernorm not implemented return -100; diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index d5352907ee72..30c2a31e127f 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -47,6 +47,13 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) softmax->create_pipeline(opt); } +#if NCNN_INT8 + if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) + { + return 0; + } +#endif + // for fp32 inference, const fold inv_sqrt_embed_dim_per_head into `q_w` and `q_bias` #if 0 // FIXME! diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 49208b1b7cf8..1013e3b7e338 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -91,10 +91,11 @@ int main(int argc, char** argv) quantizer.quantize_convolutiondepthwise(); quantizer.quantize_innerproduct(); quantizer.quantize_layernorm(); + quantizer.quantize_binaryop(); quantizer.fuse_conv_requantize(); quantizer.fuse_layernorm_requantize(); - int fuse_binaryop_requantize(); + quantizer.fuse_binaryop_requantize(); quantizer.save(outparam, outbin); diff --git a/tools/quantize/net_quantize.cpp b/tools/quantize/net_quantize.cpp index b068f7a884c8..4be076bb9059 100644 --- a/tools/quantize/net_quantize.cpp +++ b/tools/quantize/net_quantize.cpp @@ -135,7 +135,7 @@ bool NetQuantize::read_ini_format(const char* path) { layernorm_table[name] = ptable; } - else if (type == "BinaryOp") + else if (type == "Add") { binaryop_table[name] = ptable; } @@ -796,58 +796,95 @@ int NetQuantize::fuse_binaryop_requantize() { const size_t layer_count = layers.size(); - auto get_all_connected_outputs = [=](ncnn::BinaryOp* op, int cur) -> std::vector + auto direct_connected_outputs = [&](ncnn::Layer* op, int cur) -> std::vector { - std::vector layers; + std::vector outputs; for (size_t j = cur; j bottoms) { if (index == op->tops[0] || index == op->tops[1]) { - layers.emplace_back(next); + outputs.emplace_back(next); break; } } } - return layers; + return outputs; }; - auto get_all_connected_inputs = [=](ncnn::BinaryOp* op, int cur) -> std::vector + auto all_outputs = [&](ncnn::BinaryOp* op, int cur) -> std::vector { - std::vector layers; + auto directs = direct_connected_outputs(op, cur); + std::vector outputs; + for (auto node: directs) + { + if (node->type == "Split") + { + auto nexts = direct_connected_outputs(node, cur); + outputs.insert(outputs.end(), nexts.begin(), nexts.end()); + continue; + } + outputs.emplace_back(node); + } + return outputs; + }; + + auto direct_connected_inputs = [=](ncnn::Layer* op, int 
cur) -> std::vector + { + std::vector inputs; for (size_t j = 0; j tops) { if (index == op->bottoms[0] || index == op->bottoms[1]) { - layers.emplace_back(last); + inputs.emplace_back(last); break; } } } - return layers; + return inputs; }; - auto is_quantized = [=](ncnn::Layer* layer) -> bool + auto all_inputs = [&](ncnn::BinaryOp* op, int cur) -> std::vector { - if (quantizable_node.find(layer->name) == quantizable_node.end()) + auto directs = direct_connected_inputs(op, cur); + std::vector inputs; + for (auto node: directs) + { + if (node->type == "Split") { - return false; + auto lasts = direct_connected_inputs(node, cur); + inputs.insert(inputs.end(), lasts.begin(), lasts.end()); + continue; } - if (layer->name == "Convolution") + inputs.emplace_back(node); + } + return inputs; + }; + + auto is_quantized = [=](ncnn::Layer* layer) -> bool + { + if (layer->type == "Convolution") return ((ncnn::Convolution*)layer)->int8_scale_term > 0; - if (layer->name == "MultiHeadAttention") + if (layer->type == "MultiHeadAttention") return ((ncnn::MultiHeadAttention*)layer)->int8_scale_term > 0; - if (layer->name == "InnerProduct") + if (layer->type == "InnerProduct") return ((ncnn::InnerProduct*)layer)->int8_scale_term > 0; - if (layer->name == "ConvolutionDepthWise") + if (layer->type == "ConvolutionDepthWise") return ((ncnn::ConvolutionDepthWise*)layer)->int8_scale_term > 0; - + + if (layer->type == "LayerNorm") + return ((ncnn::LayerNorm*)layer)->int8_scale_term > 0; + + if (layer->type == "BinaryOp") + // suppose that future binaryop could be quantized + return true; + return false; }; @@ -858,33 +895,38 @@ int NetQuantize::fuse_binaryop_requantize() ncnn::BinaryOp* op = (ncnn::BinaryOp*)layers[i]; - auto outputs = get_all_connected_outputs(op, i); - auto inputs = get_all_connected_inputs(op, i); + if (op->int8_scale_term == 0) + { + continue; + } + + auto outputs = all_outputs(op, i); + auto inputs = all_inputs(op, i); // if binaryop outputs are all quantized, requant and return - bool all_output_quantize = true; + bool all_output_support_quant = true; // if none of nodes could be quantized, give up quantize binaryop bool non_can_quantize = true; - for (ncnn::Layer* output: outputs) + for (ncnn::Layer* node: outputs) { - if (is_quantized(output)) + if (is_quantized(node)) { non_can_quantize = false; } else { - all_output_quantize = false; + all_output_support_quant = false; } } - for (ncnn::Layer* input: inputs) + for (ncnn::Layer* node: inputs) { - if (is_quantized(input)) + if (is_quantized(node)) { non_can_quantize = false; } } - if (all_output_quantize) + if (all_output_support_quant) { // enable requant op->int8_scale_term += 100; @@ -894,7 +936,10 @@ int NetQuantize::fuse_binaryop_requantize() op->in_scale0 = 1.f; op->in_scale1 = 1.f; op->out_scale = 1.f; + } else { + op->int8_scale_term = 1; } + fprintf(stderr, "quantize_binaryop %s int8_scale_term %d\n", op->name.c_str(), op->int8_scale_term); } return 0; } diff --git a/tools/quantize/net_quantize.h b/tools/quantize/net_quantize.h index eb578d45ea8e..e48d54fd1b50 100644 --- a/tools/quantize/net_quantize.h +++ b/tools/quantize/net_quantize.h @@ -24,7 +24,7 @@ class NetQuantize : public ModelWriter public: NetQuantize() { - quantizable_node = {"LayerNorm", "Convolution", "ConvolutionDepthWise", "MultiHeadAttention", "Add"}; + quantizable_node = {"LayerNorm", "Convolution", "ConvolutionDepthWise", "MultiHeadAttention", "BinaryOp"}; } // conv and gemm quant param std::map blob_int8scale_table; From 86148bd30ffda344063d020ba2b9323c8155f4db 
Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 11 Aug 2022 17:21:28 +0800 Subject: [PATCH 23/36] feat(src/layer): revert int8 BinaryOp --- benchmark/vision_transformer_int8.param | 216 +- examples/CMakeLists.txt | 1 + examples/vision_transformer_int8.cpp | 83 + src/layer/binaryop.cpp | 241 +- src/layer/binaryop.h | 12 - src/layer/layernorm.cpp | 5 +- src/layer/multiheadattention.cpp | 3 + src/layer/x86/binaryop_x86.cpp | 2791 ---------------------- src/layer/x86/binaryop_x86.h | 34 - src/layer/x86/multiheadattention_x86.cpp | 3 + tools/modelwriter.cpp | 4 - tools/quantize/ncnn2int8.cpp | 2 - tools/quantize/net_quantize.cpp | 202 +- tools/quantize/net_quantize.h | 3 - 14 files changed, 236 insertions(+), 3364 deletions(-) create mode 100644 examples/vision_transformer_int8.cpp delete mode 100644 src/layer/x86/binaryop_x86.cpp delete mode 100644 src/layer/x86/binaryop_x86.h diff --git a/benchmark/vision_transformer_int8.param b/benchmark/vision_transformer_int8.param index 0a7e92383add..76474c8bbce8 100644 --- a/benchmark/vision_transformer_int8.param +++ b/benchmark/vision_transformer_int8.param @@ -9,138 +9,138 @@ Permute Transpose_3 1 1 onnx::Transpose_155 onnx:: Concat Concat_4 2 1 backbone.cls_token onnx::Concat_156 onnx::Add_157 BinaryOp Add_5 2 1 onnx::Add_157 backbone.pos_embed input.1 Split splitncnn_0 1 2 input.1 input.1_splitncnn_0 input.1_splitncnn_1 -LayerNorm LayerNorm_6 1 1 input.1_splitncnn_1 qkv_input 0=768 1=1.000000e-06 +LayerNorm LayerNorm_6 1 1 input.1_splitncnn_1 qkv_input 0=768 1=1.000000e-06 3=101 Split splitncnn_1 1 3 qkv_input qkv_input_splitncnn_0 qkv_input_splitncnn_1 qkv_input_splitncnn_2 MultiHeadAttention MultiHeadAttention_15 3 1 qkv_input_splitncnn_2 qkv_input_splitncnn_1 qkv_input_splitncnn_0 onnx::Add_168 0=768 1=12 2=589824 3=1 BinaryOp Add_16 2 1 input.1_splitncnn_0 onnx::Add_168 input.4 Split splitncnn_2 1 2 input.4 input.4_splitncnn_0 input.4_splitncnn_1 -LayerNorm LayerNorm_17 1 1 input.4_splitncnn_1 onnx::Gemm_170 0=768 1=1.000000e-06 -InnerProduct Gemm_18 1 1 onnx::Gemm_170 mmdeploy::Gelu_171 0=3072 1=1 2=2359296 8=2 -GELU Gelu_19 1 1 mmdeploy::Gelu_171 input.8 0=1 -InnerProduct Gemm_20 1 1 input.8 input.12 0=768 1=1 2=2359296 8=2 -BinaryOp Add_21 2 1 input.4_splitncnn_0 input.12 input.16 -Split splitncnn_3 1 2 input.16 input.16_splitncnn_0 input.16_splitncnn_1 -LayerNorm LayerNorm_22 1 1 input.16_splitncnn_1 qkv_input.3 0=768 1=1.000000e-06 +LayerNorm LayerNorm_17 1 1 input.4_splitncnn_1 A.1 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_18 1 1 A.1 mmdeploy::Gelu_171 0=3072 1=1 2=2359296 8=2 +GELU Gelu_19 1 1 mmdeploy::Gelu_171 A.3 0=1 +InnerProduct Gemm_20 1 1 A.3 input.16 0=768 1=1 2=2359296 8=2 +BinaryOp Add_21 2 1 input.4_splitncnn_0 input.16 input.20 +Split splitncnn_3 1 2 input.20 input.20_splitncnn_0 input.20_splitncnn_1 +LayerNorm LayerNorm_22 1 1 input.20_splitncnn_1 qkv_input.3 0=768 1=1.000000e-06 3=101 Split splitncnn_4 1 3 qkv_input.3 qkv_input.3_splitncnn_0 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_2 MultiHeadAttention MultiHeadAttention_31 3 1 qkv_input.3_splitncnn_2 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_0 onnx::Add_184 0=768 1=12 2=589824 3=1 -BinaryOp Add_32 2 1 input.16_splitncnn_0 onnx::Add_184 input.20 -Split splitncnn_5 1 2 input.20 input.20_splitncnn_0 input.20_splitncnn_1 -LayerNorm LayerNorm_33 1 1 input.20_splitncnn_1 onnx::Gemm_186 0=768 1=1.000000e-06 -InnerProduct Gemm_34 1 1 onnx::Gemm_186 mmdeploy::Gelu_187 0=3072 1=1 2=2359296 8=2 -GELU Gelu_35 1 1 mmdeploy::Gelu_187 input.24 0=1 -InnerProduct Gemm_36 
1 1 input.24 input.28 0=768 1=1 2=2359296 8=2 -BinaryOp Add_37 2 1 input.20_splitncnn_0 input.28 input.32 -Split splitncnn_6 1 2 input.32 input.32_splitncnn_0 input.32_splitncnn_1 -LayerNorm LayerNorm_38 1 1 input.32_splitncnn_1 qkv_input.7 0=768 1=1.000000e-06 +BinaryOp Add_32 2 1 input.20_splitncnn_0 onnx::Add_184 input.24 +Split splitncnn_5 1 2 input.24 input.24_splitncnn_0 input.24_splitncnn_1 +LayerNorm LayerNorm_33 1 1 input.24_splitncnn_1 A.5 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_34 1 1 A.5 mmdeploy::Gelu_187 0=3072 1=1 2=2359296 8=2 +GELU Gelu_35 1 1 mmdeploy::Gelu_187 A.7 0=1 +InnerProduct Gemm_36 1 1 A.7 input.36 0=768 1=1 2=2359296 8=2 +BinaryOp Add_37 2 1 input.24_splitncnn_0 input.36 input.40 +Split splitncnn_6 1 2 input.40 input.40_splitncnn_0 input.40_splitncnn_1 +LayerNorm LayerNorm_38 1 1 input.40_splitncnn_1 qkv_input.7 0=768 1=1.000000e-06 3=101 Split splitncnn_7 1 3 qkv_input.7 qkv_input.7_splitncnn_0 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_2 MultiHeadAttention MultiHeadAttention_47 3 1 qkv_input.7_splitncnn_2 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_0 onnx::Add_200 0=768 1=12 2=589824 3=1 -BinaryOp Add_48 2 1 input.32_splitncnn_0 onnx::Add_200 input.36 -Split splitncnn_8 1 2 input.36 input.36_splitncnn_0 input.36_splitncnn_1 -LayerNorm LayerNorm_49 1 1 input.36_splitncnn_1 onnx::Gemm_202 0=768 1=1.000000e-06 -InnerProduct Gemm_50 1 1 onnx::Gemm_202 mmdeploy::Gelu_203 0=3072 1=1 2=2359296 8=2 -GELU Gelu_51 1 1 mmdeploy::Gelu_203 input.40 0=1 -InnerProduct Gemm_52 1 1 input.40 input.44 0=768 1=1 2=2359296 8=2 -BinaryOp Add_53 2 1 input.36_splitncnn_0 input.44 input.48 -Split splitncnn_9 1 2 input.48 input.48_splitncnn_0 input.48_splitncnn_1 -LayerNorm LayerNorm_54 1 1 input.48_splitncnn_1 qkv_input.11 0=768 1=1.000000e-06 +BinaryOp Add_48 2 1 input.40_splitncnn_0 onnx::Add_200 input.44 +Split splitncnn_8 1 2 input.44 input.44_splitncnn_0 input.44_splitncnn_1 +LayerNorm LayerNorm_49 1 1 input.44_splitncnn_1 A.9 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_50 1 1 A.9 mmdeploy::Gelu_203 0=3072 1=1 2=2359296 8=2 +GELU Gelu_51 1 1 mmdeploy::Gelu_203 A.11 0=1 +InnerProduct Gemm_52 1 1 A.11 input.56 0=768 1=1 2=2359296 8=2 +BinaryOp Add_53 2 1 input.44_splitncnn_0 input.56 input.60 +Split splitncnn_9 1 2 input.60 input.60_splitncnn_0 input.60_splitncnn_1 +LayerNorm LayerNorm_54 1 1 input.60_splitncnn_1 qkv_input.11 0=768 1=1.000000e-06 3=101 Split splitncnn_10 1 3 qkv_input.11 qkv_input.11_splitncnn_0 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_2 MultiHeadAttention MultiHeadAttention_63 3 1 qkv_input.11_splitncnn_2 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_0 onnx::Add_216 0=768 1=12 2=589824 3=1 -BinaryOp Add_64 2 1 input.48_splitncnn_0 onnx::Add_216 input.52 -Split splitncnn_11 1 2 input.52 input.52_splitncnn_0 input.52_splitncnn_1 -LayerNorm LayerNorm_65 1 1 input.52_splitncnn_1 onnx::Gemm_218 0=768 1=1.000000e-06 -InnerProduct Gemm_66 1 1 onnx::Gemm_218 mmdeploy::Gelu_219 0=3072 1=1 2=2359296 8=2 -GELU Gelu_67 1 1 mmdeploy::Gelu_219 input.56 0=1 -InnerProduct Gemm_68 1 1 input.56 input.60 0=768 1=1 2=2359296 8=2 -BinaryOp Add_69 2 1 input.52_splitncnn_0 input.60 input.64 -Split splitncnn_12 1 2 input.64 input.64_splitncnn_0 input.64_splitncnn_1 -LayerNorm LayerNorm_70 1 1 input.64_splitncnn_1 qkv_input.15 0=768 1=1.000000e-06 +BinaryOp Add_64 2 1 input.60_splitncnn_0 onnx::Add_216 input.64 +Split splitncnn_11 1 2 input.64 input.64_splitncnn_0 input.64_splitncnn_1 +LayerNorm LayerNorm_65 1 1 input.64_splitncnn_1 A.13 0=768 1=1.000000e-06 3=101 
+InnerProduct Gemm_66 1 1 A.13 mmdeploy::Gelu_219 0=3072 1=1 2=2359296 8=2 +GELU Gelu_67 1 1 mmdeploy::Gelu_219 A.15 0=1 +InnerProduct Gemm_68 1 1 A.15 input.76 0=768 1=1 2=2359296 8=2 +BinaryOp Add_69 2 1 input.64_splitncnn_0 input.76 input.80 +Split splitncnn_12 1 2 input.80 input.80_splitncnn_0 input.80_splitncnn_1 +LayerNorm LayerNorm_70 1 1 input.80_splitncnn_1 qkv_input.15 0=768 1=1.000000e-06 3=101 Split splitncnn_13 1 3 qkv_input.15 qkv_input.15_splitncnn_0 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_2 MultiHeadAttention MultiHeadAttention_79 3 1 qkv_input.15_splitncnn_2 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_0 onnx::Add_232 0=768 1=12 2=589824 3=1 -BinaryOp Add_80 2 1 input.64_splitncnn_0 onnx::Add_232 input.68 -Split splitncnn_14 1 2 input.68 input.68_splitncnn_0 input.68_splitncnn_1 -LayerNorm LayerNorm_81 1 1 input.68_splitncnn_1 onnx::Gemm_234 0=768 1=1.000000e-06 -InnerProduct Gemm_82 1 1 onnx::Gemm_234 mmdeploy::Gelu_235 0=3072 1=1 2=2359296 8=2 -GELU Gelu_83 1 1 mmdeploy::Gelu_235 input.72 0=1 -InnerProduct Gemm_84 1 1 input.72 input.76 0=768 1=1 2=2359296 8=2 -BinaryOp Add_85 2 1 input.68_splitncnn_0 input.76 input.80 -Split splitncnn_15 1 2 input.80 input.80_splitncnn_0 input.80_splitncnn_1 -LayerNorm LayerNorm_86 1 1 input.80_splitncnn_1 qkv_input.19 0=768 1=1.000000e-06 +BinaryOp Add_80 2 1 input.80_splitncnn_0 onnx::Add_232 input.84 +Split splitncnn_14 1 2 input.84 input.84_splitncnn_0 input.84_splitncnn_1 +LayerNorm LayerNorm_81 1 1 input.84_splitncnn_1 A.17 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_82 1 1 A.17 mmdeploy::Gelu_235 0=3072 1=1 2=2359296 8=2 +GELU Gelu_83 1 1 mmdeploy::Gelu_235 A.19 0=1 +InnerProduct Gemm_84 1 1 A.19 input.96 0=768 1=1 2=2359296 8=2 +BinaryOp Add_85 2 1 input.84_splitncnn_0 input.96 input.100 +Split splitncnn_15 1 2 input.100 input.100_splitncnn_0 input.100_splitncnn_1 +LayerNorm LayerNorm_86 1 1 input.100_splitncnn_1 qkv_input.19 0=768 1=1.000000e-06 3=101 Split splitncnn_16 1 3 qkv_input.19 qkv_input.19_splitncnn_0 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_2 MultiHeadAttention MultiHeadAttention_95 3 1 qkv_input.19_splitncnn_2 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_0 onnx::Add_248 0=768 1=12 2=589824 3=1 -BinaryOp Add_96 2 1 input.80_splitncnn_0 onnx::Add_248 input.84 -Split splitncnn_17 1 2 input.84 input.84_splitncnn_0 input.84_splitncnn_1 -LayerNorm LayerNorm_97 1 1 input.84_splitncnn_1 onnx::Gemm_250 0=768 1=1.000000e-06 -InnerProduct Gemm_98 1 1 onnx::Gemm_250 mmdeploy::Gelu_251 0=3072 1=1 2=2359296 8=2 -GELU Gelu_99 1 1 mmdeploy::Gelu_251 input.88 0=1 -InnerProduct Gemm_100 1 1 input.88 input.92 0=768 1=1 2=2359296 8=2 -BinaryOp Add_101 2 1 input.84_splitncnn_0 input.92 input.96 -Split splitncnn_18 1 2 input.96 input.96_splitncnn_0 input.96_splitncnn_1 -LayerNorm LayerNorm_102 1 1 input.96_splitncnn_1 qkv_input.23 0=768 1=1.000000e-06 +BinaryOp Add_96 2 1 input.100_splitncnn_0 onnx::Add_248 input.104 +Split splitncnn_17 1 2 input.104 input.104_splitncnn_0 input.104_splitncnn_1 +LayerNorm LayerNorm_97 1 1 input.104_splitncnn_1 A.21 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_98 1 1 A.21 mmdeploy::Gelu_251 0=3072 1=1 2=2359296 8=2 +GELU Gelu_99 1 1 mmdeploy::Gelu_251 A.23 0=1 +InnerProduct Gemm_100 1 1 A.23 input.116 0=768 1=1 2=2359296 8=2 +BinaryOp Add_101 2 1 input.104_splitncnn_0 input.116 input.120 +Split splitncnn_18 1 2 input.120 input.120_splitncnn_0 input.120_splitncnn_1 +LayerNorm LayerNorm_102 1 1 input.120_splitncnn_1 qkv_input.23 0=768 1=1.000000e-06 3=101 Split splitncnn_19 1 3 
qkv_input.23 qkv_input.23_splitncnn_0 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_2 MultiHeadAttention MultiHeadAttention_111 3 1 qkv_input.23_splitncnn_2 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_0 onnx::Add_264 0=768 1=12 2=589824 3=1 -BinaryOp Add_112 2 1 input.96_splitncnn_0 onnx::Add_264 input.100 -Split splitncnn_20 1 2 input.100 input.100_splitncnn_0 input.100_splitncnn_1 -LayerNorm LayerNorm_113 1 1 input.100_splitncnn_1 onnx::Gemm_266 0=768 1=1.000000e-06 -InnerProduct Gemm_114 1 1 onnx::Gemm_266 mmdeploy::Gelu_267 0=3072 1=1 2=2359296 8=2 -GELU Gelu_115 1 1 mmdeploy::Gelu_267 input.104 0=1 -InnerProduct Gemm_116 1 1 input.104 input.108 0=768 1=1 2=2359296 8=2 -BinaryOp Add_117 2 1 input.100_splitncnn_0 input.108 input.112 -Split splitncnn_21 1 2 input.112 input.112_splitncnn_0 input.112_splitncnn_1 -LayerNorm LayerNorm_118 1 1 input.112_splitncnn_1 qkv_input.27 0=768 1=1.000000e-06 +BinaryOp Add_112 2 1 input.120_splitncnn_0 onnx::Add_264 input.124 +Split splitncnn_20 1 2 input.124 input.124_splitncnn_0 input.124_splitncnn_1 +LayerNorm LayerNorm_113 1 1 input.124_splitncnn_1 A.25 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_114 1 1 A.25 mmdeploy::Gelu_267 0=3072 1=1 2=2359296 8=2 +GELU Gelu_115 1 1 mmdeploy::Gelu_267 A.27 0=1 +InnerProduct Gemm_116 1 1 A.27 input.136 0=768 1=1 2=2359296 8=2 +BinaryOp Add_117 2 1 input.124_splitncnn_0 input.136 input.140 +Split splitncnn_21 1 2 input.140 input.140_splitncnn_0 input.140_splitncnn_1 +LayerNorm LayerNorm_118 1 1 input.140_splitncnn_1 qkv_input.27 0=768 1=1.000000e-06 3=101 Split splitncnn_22 1 3 qkv_input.27 qkv_input.27_splitncnn_0 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_2 MultiHeadAttention MultiHeadAttention_127 3 1 qkv_input.27_splitncnn_2 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_0 onnx::Add_280 0=768 1=12 2=589824 3=1 -BinaryOp Add_128 2 1 input.112_splitncnn_0 onnx::Add_280 input.116 -Split splitncnn_23 1 2 input.116 input.116_splitncnn_0 input.116_splitncnn_1 -LayerNorm LayerNorm_129 1 1 input.116_splitncnn_1 onnx::Gemm_282 0=768 1=1.000000e-06 -InnerProduct Gemm_130 1 1 onnx::Gemm_282 mmdeploy::Gelu_283 0=3072 1=1 2=2359296 8=2 -GELU Gelu_131 1 1 mmdeploy::Gelu_283 input.120 0=1 -InnerProduct Gemm_132 1 1 input.120 input.124 0=768 1=1 2=2359296 8=2 -BinaryOp Add_133 2 1 input.116_splitncnn_0 input.124 input.128 -Split splitncnn_24 1 2 input.128 input.128_splitncnn_0 input.128_splitncnn_1 -LayerNorm LayerNorm_134 1 1 input.128_splitncnn_1 qkv_input.31 0=768 1=1.000000e-06 +BinaryOp Add_128 2 1 input.140_splitncnn_0 onnx::Add_280 input.144 +Split splitncnn_23 1 2 input.144 input.144_splitncnn_0 input.144_splitncnn_1 +LayerNorm LayerNorm_129 1 1 input.144_splitncnn_1 A.29 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_130 1 1 A.29 mmdeploy::Gelu_283 0=3072 1=1 2=2359296 8=2 +GELU Gelu_131 1 1 mmdeploy::Gelu_283 A.31 0=1 +InnerProduct Gemm_132 1 1 A.31 input.156 0=768 1=1 2=2359296 8=2 +BinaryOp Add_133 2 1 input.144_splitncnn_0 input.156 input.160 +Split splitncnn_24 1 2 input.160 input.160_splitncnn_0 input.160_splitncnn_1 +LayerNorm LayerNorm_134 1 1 input.160_splitncnn_1 qkv_input.31 0=768 1=1.000000e-06 3=101 Split splitncnn_25 1 3 qkv_input.31 qkv_input.31_splitncnn_0 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_2 MultiHeadAttention MultiHeadAttention_143 3 1 qkv_input.31_splitncnn_2 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_0 onnx::Add_296 0=768 1=12 2=589824 3=1 -BinaryOp Add_144 2 1 input.128_splitncnn_0 onnx::Add_296 input.132 -Split splitncnn_26 1 2 input.132 input.132_splitncnn_0 
input.132_splitncnn_1 -LayerNorm LayerNorm_145 1 1 input.132_splitncnn_1 onnx::Gemm_298 0=768 1=1.000000e-06 -InnerProduct Gemm_146 1 1 onnx::Gemm_298 mmdeploy::Gelu_299 0=3072 1=1 2=2359296 8=2 -GELU Gelu_147 1 1 mmdeploy::Gelu_299 input.136 0=1 -InnerProduct Gemm_148 1 1 input.136 input.140 0=768 1=1 2=2359296 8=2 -BinaryOp Add_149 2 1 input.132_splitncnn_0 input.140 input.144 -Split splitncnn_27 1 2 input.144 input.144_splitncnn_0 input.144_splitncnn_1 -LayerNorm LayerNorm_150 1 1 input.144_splitncnn_1 qkv_input.35 0=768 1=1.000000e-06 +BinaryOp Add_144 2 1 input.160_splitncnn_0 onnx::Add_296 input.164 +Split splitncnn_26 1 2 input.164 input.164_splitncnn_0 input.164_splitncnn_1 +LayerNorm LayerNorm_145 1 1 input.164_splitncnn_1 A.33 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_146 1 1 A.33 mmdeploy::Gelu_299 0=3072 1=1 2=2359296 8=2 +GELU Gelu_147 1 1 mmdeploy::Gelu_299 A.35 0=1 +InnerProduct Gemm_148 1 1 A.35 input.176 0=768 1=1 2=2359296 8=2 +BinaryOp Add_149 2 1 input.164_splitncnn_0 input.176 input.180 +Split splitncnn_27 1 2 input.180 input.180_splitncnn_0 input.180_splitncnn_1 +LayerNorm LayerNorm_150 1 1 input.180_splitncnn_1 qkv_input.35 0=768 1=1.000000e-06 3=101 Split splitncnn_28 1 3 qkv_input.35 qkv_input.35_splitncnn_0 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_2 MultiHeadAttention MultiHeadAttention_159 3 1 qkv_input.35_splitncnn_2 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_0 onnx::Add_312 0=768 1=12 2=589824 3=1 -BinaryOp Add_160 2 1 input.144_splitncnn_0 onnx::Add_312 input.148 -Split splitncnn_29 1 2 input.148 input.148_splitncnn_0 input.148_splitncnn_1 -LayerNorm LayerNorm_161 1 1 input.148_splitncnn_1 onnx::Gemm_314 0=768 1=1.000000e-06 -InnerProduct Gemm_162 1 1 onnx::Gemm_314 mmdeploy::Gelu_315 0=3072 1=1 2=2359296 8=2 -GELU Gelu_163 1 1 mmdeploy::Gelu_315 input.152 0=1 -InnerProduct Gemm_164 1 1 input.152 input.156 0=768 1=1 2=2359296 8=2 -BinaryOp Add_165 2 1 input.148_splitncnn_0 input.156 input.160 -Split splitncnn_30 1 2 input.160 input.160_splitncnn_0 input.160_splitncnn_1 -LayerNorm LayerNorm_166 1 1 input.160_splitncnn_1 qkv_input.39 0=768 1=1.000000e-06 +BinaryOp Add_160 2 1 input.180_splitncnn_0 onnx::Add_312 input.184 +Split splitncnn_29 1 2 input.184 input.184_splitncnn_0 input.184_splitncnn_1 +LayerNorm LayerNorm_161 1 1 input.184_splitncnn_1 A.37 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_162 1 1 A.37 mmdeploy::Gelu_315 0=3072 1=1 2=2359296 8=2 +GELU Gelu_163 1 1 mmdeploy::Gelu_315 A.39 0=1 +InnerProduct Gemm_164 1 1 A.39 input.196 0=768 1=1 2=2359296 8=2 +BinaryOp Add_165 2 1 input.184_splitncnn_0 input.196 input.200 +Split splitncnn_30 1 2 input.200 input.200_splitncnn_0 input.200_splitncnn_1 +LayerNorm LayerNorm_166 1 1 input.200_splitncnn_1 qkv_input.39 0=768 1=1.000000e-06 3=101 Split splitncnn_31 1 3 qkv_input.39 qkv_input.39_splitncnn_0 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_2 MultiHeadAttention MultiHeadAttention_175 3 1 qkv_input.39_splitncnn_2 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_0 onnx::Add_328 0=768 1=12 2=589824 3=1 -BinaryOp Add_176 2 1 input.160_splitncnn_0 onnx::Add_328 input.164 -Split splitncnn_32 1 2 input.164 input.164_splitncnn_0 input.164_splitncnn_1 -LayerNorm LayerNorm_177 1 1 input.164_splitncnn_1 onnx::Gemm_330 0=768 1=1.000000e-06 -InnerProduct Gemm_178 1 1 onnx::Gemm_330 mmdeploy::Gelu_331 0=3072 1=1 2=2359296 8=2 -GELU Gelu_179 1 1 mmdeploy::Gelu_331 input.168 0=1 -InnerProduct Gemm_180 1 1 input.168 input.172 0=768 1=1 2=2359296 8=2 -BinaryOp Add_181 2 1 input.164_splitncnn_0 input.172 
input.176 -Split splitncnn_33 1 2 input.176 input.176_splitncnn_0 input.176_splitncnn_1 -LayerNorm LayerNorm_182 1 1 input.176_splitncnn_1 qkv_input.43 0=768 1=1.000000e-06 +BinaryOp Add_176 2 1 input.200_splitncnn_0 onnx::Add_328 input.204 +Split splitncnn_32 1 2 input.204 input.204_splitncnn_0 input.204_splitncnn_1 +LayerNorm LayerNorm_177 1 1 input.204_splitncnn_1 A.41 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_178 1 1 A.41 mmdeploy::Gelu_331 0=3072 1=1 2=2359296 8=2 +GELU Gelu_179 1 1 mmdeploy::Gelu_331 A.43 0=1 +InnerProduct Gemm_180 1 1 A.43 input.216 0=768 1=1 2=2359296 8=2 +BinaryOp Add_181 2 1 input.204_splitncnn_0 input.216 input.220 +Split splitncnn_33 1 2 input.220 input.220_splitncnn_0 input.220_splitncnn_1 +LayerNorm LayerNorm_182 1 1 input.220_splitncnn_1 qkv_input.43 0=768 1=1.000000e-06 3=101 Split splitncnn_34 1 3 qkv_input.43 qkv_input.43_splitncnn_0 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_2 MultiHeadAttention MultiHeadAttention_191 3 1 qkv_input.43_splitncnn_2 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_0 onnx::Add_344 0=768 1=12 2=589824 3=1 -BinaryOp Add_192 2 1 input.176_splitncnn_0 onnx::Add_344 input.180 -Split splitncnn_35 1 2 input.180 input.180_splitncnn_0 input.180_splitncnn_1 -LayerNorm LayerNorm_193 1 1 input.180_splitncnn_1 onnx::Gemm_346 0=768 1=1.000000e-06 -InnerProduct Gemm_194 1 1 onnx::Gemm_346 mmdeploy::Gelu_347 0=3072 1=1 2=2359296 8=2 -GELU Gelu_195 1 1 mmdeploy::Gelu_347 input.184 0=1 -InnerProduct Gemm_196 1 1 input.184 input.188 0=768 1=1 2=2359296 8=2 -BinaryOp Add_197 2 1 input.180_splitncnn_0 input.188 input.192 -LayerNorm LayerNorm_198 1 1 input.192 onnx::Gather_351 0=768 1=1.000000e-06 -Crop Gather_200 1 1 onnx::Gather_351 onnx::Gemm_353 -23309=1,0 -23310=1,1 -23311=1,0 -InnerProduct Gemm_201 1 1 onnx::Gemm_353 cls_score 0=1000 1=1 2=768000 8=2 +BinaryOp Add_192 2 1 input.220_splitncnn_0 onnx::Add_344 input.224 +Split splitncnn_35 1 2 input.224 input.224_splitncnn_0 input.224_splitncnn_1 +LayerNorm LayerNorm_193 1 1 input.224_splitncnn_1 A.45 0=768 1=1.000000e-06 3=101 +InnerProduct Gemm_194 1 1 A.45 mmdeploy::Gelu_347 0=3072 1=1 2=2359296 8=2 +GELU Gelu_195 1 1 mmdeploy::Gelu_347 A.47 0=1 +InnerProduct Gemm_196 1 1 A.47 input.236 0=768 1=1 2=2359296 8=2 +BinaryOp Add_197 2 1 input.224_splitncnn_0 input.236 input.240 +LayerNorm LayerNorm_198 1 1 input.240 onnx::Gather_351 0=768 1=1.000000e-06 3=1 +Crop Gather_200 1 1 onnx::Gather_351 A -23309=1,0 -23310=1,1 -23311=1,0 +InnerProduct Gemm_201 1 1 A cls_score 0=1000 1=1 2=768000 8=2 Softmax Softmax_202 1 1 cls_score output diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a7739be27e51..0aa2f42ca0a2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -65,6 +65,7 @@ if(NCNN_PIXEL) ncnn_add_example(nanodetplus_pnnx) ncnn_add_example(scrfd) ncnn_add_example(scrfd_crowdhuman) + ncnn_add_example(vision_transformer_int8) if(OpenCV_FOUND) ncnn_add_example(yolov4) ncnn_add_example(rvm) diff --git a/examples/vision_transformer_int8.cpp b/examples/vision_transformer_int8.cpp new file mode 100644 index 000000000000..7ca087d0048b --- /dev/null +++ b/examples/vision_transformer_int8.cpp @@ -0,0 +1,83 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <algorithm>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+static int classify_vit(const cv::Mat& bgr)
+{
+    ncnn::Net vit;
+
+    vit.opt.use_vulkan_compute = false;
+
+    // the ncnn model https://github.com/tpoisonooo/mmdeploy-onnx2ncnn-testdata/tree/main/vit-int8-20220811
+    vit.load_param("vit8.param");
+    vit.load_model("vit8.bin");
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 384, 384);
+
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {0.007843f, 0.007843f, 0.007843f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = vit.create_extractor();
+
+    ex.input("input", in);
+
+    ncnn::Mat multiHeadOut;
+
+    ex.extract("output", multiHeadOut);
+
+    float max_value = multiHeadOut[0];
+    int max_index = 0;
+    for (int j = 0; j < multiHeadOut.w; j++)
+    {
+        if (max_value < multiHeadOut[j]) {
+            max_value = multiHeadOut[j];
+            max_index = j;
+        }
+    }
+    fprintf(stdout, "softmax result: %d %f\n", max_index, max_value);
+
+    return 0;
+}
+
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    classify_vit(m);
+    return 0;
+}
diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp
index b67d2923e10c..53eb234bc923 100644
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -15,7 +15,6 @@
 #include "binaryop.h"
 
 #include <math.h>
-#include "mathfun.h"
 
 namespace ncnn {
 
@@ -30,12 +29,6 @@ int BinaryOp::load_param(const ParamDict& pd)
     op_type = pd.get(0, 0);
     with_scalar = pd.get(1, 0);
     b = pd.get(2, 0.f);
-    int8_scale_term = pd.get(3, 0);
-#ifdef NCNN_INT8
-    in_scale0 = pd.get(4, 1.f);
-    in_scale1 = pd.get(5, 1.f);
-    out_scale = pd.get(6, 1.f);
-#endif
 
     if (with_scalar != 0)
     {
@@ -925,157 +918,6 @@ struct binary_op_rdiv
     }
 };
 
-#ifdef NCNN_INT8
-template<typename Op>
-int BinaryOp::binary_op_int8(const Mat& a, const Mat& b, Mat& c, const Option& opt) const
-{
-    if (a.w != b.w || a.h != b.h || a.c != b.c || a.d != b.d || a.d != 1)
-    {
-        // binaryop int8 only support input same shape, not support packing layout
-        return -100;
-    }
-
-    Op op;
-    const int channels = a.c;
-    const int size = a.w * a.h;
-
-    if (int8_scale_term > 100){
-        // requant
-        c.create(a.w, a.h, a.c, 1u, opt.workspace_allocator);
-
-        if (a.elemsize == 1u && b.elemsize == 1u)
-        {
-            // #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                int8_t* ptr0 = (int8_t*)(a.channel(q).data);
-                int8_t* ptr1 = (int8_t*)(b.channel(q).data);
-                int8_t* pout = (int8_t*)(c.channel(q).data);
-
-                for (int i = 0; i < size; i++)
-                {
-                    int32_t v = op(ptr0[i] / in_scale0, ptr1[i] / in_scale1);
-                    pout[i] = float2int8(v * out_scale);
-                }
-            }
-            return 0;
-        }
-
-        if (a.elemsize == 1u && b.elemsize == 4u)
-        {
-            // #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q <
channels; q++) - { - int8_t* ptr0 = (int8_t*)(a.channel(q).data); - const float* ptr1 = b.channel(q); - int8_t* pout = (int8_t*)(c.channel(q).data); - - for (int i = 0; i < size; i++) - { - int32_t v = op(ptr0[i] / in_scale0, ptr1[i]); - pout[i] = float2int8(v * out_scale); - } - } - return 0; - } - - if (a.elemsize == 4u && b.elemsize == 1u) - { - // #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = a.channel(q); - int8_t* ptr1 = (int8_t*)(b.channel(q).data); - int8_t* pout = (int8_t*)(c.channel(q).data); - - for (int i = 0; i < size; i++) - { - int32_t v = op(ptr0[i], ptr1[i] / in_scale1); - pout[i] = float2int8(v * out_scale); - } - } - return 0; - } - - if (a.elemsize == 4u && b.elemsize == 4u) - { - // #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = a.channel(q); - const float* ptr1 = b.channel(q); - int8_t* pout = (int8_t*)(c.channel(q).data); - - for (int i = 0; i < size; i++) - { - float v = op(ptr0[i], ptr1[i]); - pout[i] = float2int8(v * out_scale); - } - } - return 0; - } - - } else { - // dequant - c.create(a.w, a.h, a.c, 4u, opt.workspace_allocator); - - if (a.elemsize == 1u && b.elemsize == 1u) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - int8_t* ptr0 = (int8_t*)(a.channel(q).data); - int8_t* ptr1 = (int8_t*)(b.channel(q).data); - float* pout = c.channel(q); - - for (int i = 0; i < size; i++) - { - pout[i] = op(ptr0[i] / in_scale0, ptr1[i] / in_scale1); - } - } - return 0; - } - - if (a.elemsize == 1u && b.elemsize == 4u) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - int8_t* ptr0 = (int8_t*)(a.channel(q).data); - const float* ptr1 = b.channel(q); - float* pout = c.channel(q); - - for (int i = 0; i < size; i++) - { - pout[i] = op(ptr0[i] / in_scale0, ptr1[i]); - } - } - return 0; - } - - if (a.elemsize == 4u && b.elemsize == 1u) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr0 = a.channel(q); - int8_t* ptr1 = (int8_t*)(b.channel(q).data); - float* pout = c.channel(q); - - for (int i = 0; i < size; i++) - { - pout[i] = op(ptr0[i], ptr1[i] / in_scale1); - } - } - return 0; - } - } - - return 0; -} - -#endif - int BinaryOp::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; @@ -1083,85 +925,38 @@ int BinaryOp::forward(const std::vector& bottom_blobs, std::vector& to Mat& top_blob = top_blobs[0]; -#ifdef NCNN_INT8 - if (int8_scale_term > 0) - { - // requant - if (op_type == Operation_ADD) - return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_SUB) - return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_ADD) + return binary_op(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_MUL) - return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_SUB) + return binary_op(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_DIV) - return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_MUL) + return binary_op(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_MAX) - return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_DIV) + return 
binary_op(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_MIN) - return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_MAX) + return binary_op(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_POW) - return binary_op_int8(bottom_blob, bottom_blob1, top_blob, opt); + if (op_type == Operation_MIN) + return binary_op(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_RSUB) - return binary_op_int8(bottom_blob1, bottom_blob, top_blob, opt); + if (op_type == Operation_POW) + return binary_op(bottom_blob, bottom_blob1, top_blob, opt); - if (op_type == Operation_RDIV) - return binary_op_int8(bottom_blob1, bottom_blob, top_blob, opt); - } -#endif + if (op_type == Operation_RSUB) + return binary_op(bottom_blob1, bottom_blob, top_blob, opt); - switch (op_type) - { - case Operation_ADD: - binary_op(bottom_blob, bottom_blob1, top_blob, opt); - break; - case Operation_SUB: - binary_op(bottom_blob, bottom_blob1, top_blob, opt); - break; - case Operation_MUL: - binary_op(bottom_blob, bottom_blob1, top_blob, opt); - break; - case Operation_DIV: - binary_op(bottom_blob, bottom_blob1, top_blob, opt); - break; - case Operation_MAX: - binary_op(bottom_blob, bottom_blob1, top_blob, opt); - break; - case Operation_MIN: - binary_op(bottom_blob, bottom_blob1, top_blob, opt); - break; - case Operation_POW: - binary_op(bottom_blob, bottom_blob1, top_blob, opt); - break; - case Operation_RSUB: - binary_op(bottom_blob1, bottom_blob, top_blob, opt); - break; - case Operation_RDIV: - binary_op(bottom_blob1, bottom_blob, top_blob, opt); - break; - default: - return -100; - } + if (op_type == Operation_RDIV) + return binary_op(bottom_blob1, bottom_blob, top_blob, opt); return 0; } int BinaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#ifdef NCNN_INT8 - if (int8_scale_term > 0) - { - // int8 BinaryOp with scalar not implemented - return -100; - } -#endif - if (op_type == Operation_ADD) return binary_op_scalar_inplace(bottom_top_blob, b, opt); diff --git a/src/layer/binaryop.h b/src/layer/binaryop.h index e111c7b88014..74798f906aa8 100644 --- a/src/layer/binaryop.h +++ b/src/layer/binaryop.h @@ -32,11 +32,6 @@ class BinaryOp : public Layer virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; -#ifdef NCNN_INT8 - template - int binary_op_int8(const Mat& a, const Mat& b, Mat& c, const Option& opt) const; -#endif - enum OperationType { Operation_ADD = 0, @@ -55,13 +50,6 @@ class BinaryOp : public Layer int op_type; int with_scalar; float b; - - int int8_scale_term; -#ifdef NCNN_INT8 - float in_scale0; - float in_scale1; - float out_scale; -#endif }; } // namespace ncnn diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index 4d4d09f52d70..5aec4cfb177b 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -177,10 +177,11 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < bottom_top_blob.h; ++i) { - int32_t* ptr = xq.row(i); + int8_t* from = bottom_top_blob.row(i); + int32_t* to = xq.row(i); for (int j = 0; j < bottom_top_blob.w; ++j) { - ptr[j] = round(ptr[j] * in_scale_max / input_scales[j]); + to[j] = round(from[j] * in_scale_max / input_scales[j]); } } } diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index c722c43ea3d5..b377e4cc35f1 100644 --- a/src/layer/multiheadattention.cpp +++ 
b/src/layer/multiheadattention.cpp @@ -162,6 +162,9 @@ int MultiHeadAttention::affine_input( if (input.elemsize != 1) { quantize_to_int8(input, input_int8, input_scale, opt); + } else + { + input_int8 = input; } Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp deleted file mode 100644 index 6d6a09fd71c1..000000000000 --- a/src/layer/x86/binaryop_x86.cpp +++ /dev/null @@ -1,2791 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#include "binaryop_x86.h" - -#if __SSE2__ -#include -#include "sse_mathfun.h" -#if __AVX__ -#include -#include "avx_mathfun.h" -#if __AVX512F__ -#include "avx512_mathfun.h" -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ - -#include - -namespace ncnn { - -BinaryOp_x86::BinaryOp_x86() -{ -#if __SSE2__ - support_packing = true; -#endif // __SSE2__ -} - -template -static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& opt) -{ - Op op; - - int w = b.w; - int h = b.h; - int d = b.d; - int channels = b.c; - int elempack = b.elempack; - int size = w * h * d * elempack; - - // type 2 3 4 20 - c.create_like(b, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float a0 = a[0]; - const float* ptr = b.channel(q); - float* outptr = c.channel(q); - - int i = 0; -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ - __m512 _a0_avx512 = _mm512_set1_ps(a0); - for (; i + 15 < size; i += 16) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = op.func_pack16(_a0_avx512, _p); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - outptr += 16; - } -#endif // __AVX512F__ - __m256 _a0_avx = _mm256_set1_ps(a0); - for (; i + 7 < size; i += 8) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_a0_avx, _p); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - outptr += 8; - } -#endif // __AVX__ - __m128 _a0 = _mm_set1_ps(a0); - for (; i + 3 < size; i += 4) - { - __m128 _p = _mm_load_ps(ptr); - __m128 _outp = op.func_pack4(_a0, _p); - _mm_store_ps(outptr, _outp); - ptr += 4; - outptr += 4; - } -#endif // __SSE2__ - for (; i < size; i++) - { - *outptr = op.func(a0, *ptr); - ptr += 1; - outptr += 1; - } - } - - return 0; -} - -template -static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option& opt) -{ - Op op; - - int w = a.w; - int h = a.h; - int d = a.d; - int channels = a.c; - int elempack = a.elempack; - int size = w * h * d * elempack; - - // type 6 11 16 25 - c.create_like(a, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float b0 = b[0]; - float* outptr = c.channel(q); - - int i = 0; -#if 
__SSE2__ -#if __AVX__ -#if __AVX512F__ - __m512 _b0_avx512 = _mm512_set1_ps(b0); - for (; i + 15 < size; i += 16) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = op.func_pack16(_p, _b0_avx512); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - outptr += 16; - } -#endif // __AVX512F__ - __m256 _b0_avx = _mm256_set1_ps(b0); - for (; i + 7 < size; i += 8) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_p, _b0_avx); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - outptr += 8; - } -#endif // __AVX__ - __m128 _b0 = _mm_set1_ps(b0); - for (; i + 3 < size; i += 4) - { - __m128 _p = _mm_load_ps(ptr); - __m128 _outp = op.func_pack4(_p, _b0); - _mm_store_ps(outptr, _outp); - ptr += 4; - outptr += 4; - } -#endif // __SSE2__ - for (; i < size; i++) - { - *outptr = op.func(*ptr, b0); - ptr += 1; - outptr += 1; - } - } - - return 0; -} - -template -static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option& opt) -{ - Op op; - - int w = a.w; - int h = a.h; - int d = a.d; - int channels = a.c; - int elempack = a.elempack; - int size = w * h * d * elempack; - - // type 7 13 19 29 - c.create_like(a, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - int i = 0; -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ - for (; i + 15 < size; i += 16) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _p1 = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_p, _p1); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - ptr1 += 16; - outptr += 16; - } -#endif // __AVX512F__ - for (; i + 7 < size; i += 8) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _p1 = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_p, _p1); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - ptr1 += 8; - outptr += 8; - } -#endif // __AVX__ - for (; i + 3 < size; i += 4) - { - __m128 _p = _mm_load_ps(ptr); - __m128 _p1 = _mm_load_ps(ptr1); - __m128 _outp = op.func_pack4(_p, _p1); - _mm_store_ps(outptr, _outp); - ptr += 4; - ptr1 += 4; - outptr += 4; - } -#endif // __SSE2__ - for (; i < size; i++) - { - *outptr = op.func(*ptr, *ptr1); - ptr += 1; - ptr1 += 1; - outptr += 1; - } - } - - return 0; -} - -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ -// broadcasting rule -// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting - -template -static int binary_op_pack16(const Mat& a, const Mat& b, Mat& c, const Option& opt) -{ - Op op; - - int w = a.w; - int h = a.h; - int d = a.d; - int channels = a.c; - int size = w * h * d; - size_t elemsize = a.elemsize; - int elempack = a.elempack; - - int w1 = b.w; - int h1 = b.h; - int d1 = b.d; - int channels1 = b.c; - int size1 = w1 * h1 * d1; - size_t elemsize1 = b.elemsize; - int elempack1 = b.elempack; - - if (a.dims == 4) - { - if (b.dims == 4) - { - // type 29 - return binary_op_7_13_19_29(a, b, c, opt); - } - - c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.dims == 3) - { - // type 28 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d; z++) - { - for (int y = 0; y < h; y++) - { - __m512 _b0 = _mm512_loadu_ps(ptr1); - for (int x = 0; x < w; x++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = 
op.func_pack16(_p, _b0); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - outptr += 16; - } - - ptr1 += 16; - } - } - } - - return 0; - } - - if (b.dims == 2) - { - // type 27 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.row(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d; z++) - { - __m512 _b0 = _mm512_loadu_ps(ptr1); - for (int y = 0; y < h; y++) - { - for (int x = 0; x < w; x++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = op.func_pack16(_p, _b0); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - outptr += 16; - } - } - - ptr1 += 16; - } - } - - return 0; - } - - if (b.dims == 1) - { - if (b.w == 1 && elempack1 == 1) - { - // type 25 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 26 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - __m512 _b0 = _mm512_loadu_ps((const float*)b + q * 16); - float* outptr = c.channel(q); - - for (int i = 0; i < size; i++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = op.func_pack16(_p, _b0); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - outptr += 16; - } - } - - return 0; - } - } - else if (a.dims == 3) - { - if (b.dims == 4) - { - // type 23 - c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d1; z++) - { - for (int y = 0; y < h1; y++) - { - __m512 _a0 = _mm512_loadu_ps(ptr); - for (int x = 0; x < w1; x++) - { - __m512 _p = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_a0, _p); - _mm512_storeu_ps(outptr, _outp); - ptr1 += 16; - outptr += 16; - } - - ptr += 16; - } - } - } - - return 0; - } - - if (b.dims == 3) - { - if (w1 == 1 && h1 == 1 && channels1 == channels) - { - // special type 1 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* b0 = b.channel(q); - float* outptr = c.channel(q); - __m512 _b0 = _mm512_loadu_ps(b0); - for (int i = 0; i < size; i++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = op.func_pack16(_p, _b0); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - outptr += 16; - } - } - - return 0; - } - - if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1) - { - // special type 2 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b; - float* outptr = c.channel(q); - for (int i = 0; i < size; i++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _p1 = _mm512_set1_ps(*ptr1); - __m512 _outp = op.func_pack16(_p, _p1); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - ptr1 += 1; - outptr += 16; - } - } - - return 0; - } - - if (w == 1 && h == 1 && channels1 == channels) - { - // special type 3 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - 
const float* a0 = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - __m512 _a0 = _mm512_loadu_ps(a0); - for (int i = 0; i < size1; i++) - { - __m512 _p1 = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_a0, _p1); - _mm512_storeu_ps(outptr, _outp); - ptr1 += 16; - outptr += 16; - } - } - - return 0; - } - - if (w1 == w && h1 == h && channels == 1 && elempack == 1) - { - // special type 4 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a; - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - for (int i = 0; i < size1; i++) - { - __m512 _p = _mm512_set1_ps(*ptr); - __m512 _p1 = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_p, _p1); - _mm512_storeu_ps(outptr, _outp); - ptr += 1; - ptr1 += 16; - outptr += 16; - } - } - - return 0; - } - - if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) - { - // special type 5 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - __m512 _p1 = _mm512_loadu_ps(ptr1 + y * 16); - for (int x = 0; x < w; x++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = op.func_pack16(_p, _p1); - _mm512_storeu_ps(outptr, _outp); - - ptr += 16; - outptr += 16; - } - } - } - - return 0; - } - - if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) - { - // special type 6 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - for (int x = 0; x < w; x++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _p1 = _mm512_loadu_ps(ptr1 + x * 16); - __m512 _outp = op.func_pack16(_p, _p1); - _mm512_storeu_ps(outptr, _outp); - - ptr += 16; - outptr += 16; - } - } - } - - return 0; - } - - if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) - { - // special type 7 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - __m512 _p = _mm512_loadu_ps(ptr + y * 16); - for (int x = 0; x < w1; x++) - { - __m512 _p1 = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_p, _p1); - _mm512_storeu_ps(outptr, _outp); - - ptr1 += 16; - outptr += 16; - } - } - } - - return 0; - } - - if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) - { - // special type 8 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - for (int x = 0; x < w1; x++) - { - __m512 _p = _mm512_loadu_ps(ptr + x * 16); - 
__m512 _p1 = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_p, _p1); - _mm512_storeu_ps(outptr, _outp); - - ptr1 += 16; - outptr += 16; - } - } - } - - return 0; - } - - // type 19 - return binary_op_7_13_19_29(a, b, c, opt); - } - - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.dims == 2) - { - // type 18 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.row(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - __m512 _b0 = _mm512_loadu_ps(ptr1); - for (int x = 0; x < w; x++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = op.func_pack16(_p, _b0); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - outptr += 16; - } - - ptr1 += 16; - } - } - - return 0; - } - - if (b.dims == 1) - { - if (b.w == 1 && elempack1 == 1) - { - // type 16 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 17 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - __m512 _b0 = _mm512_loadu_ps((const float*)b + q * 16); - float* outptr = c.channel(q); - - for (int i = 0; i < size; i++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = op.func_pack16(_p, _b0); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - outptr += 16; - } - } - - return 0; - } - } - else if (a.dims == 2) - { - if (b.dims == 4) - { - // type 22 - c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.row(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d1; z++) - { - __m512 _a0 = _mm512_loadu_ps(ptr); - for (int y = 0; y < h1; y++) - { - for (int x = 0; x < w1; x++) - { - __m512 _p = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_a0, _p); - _mm512_storeu_ps(outptr, _outp); - ptr1 += 16; - outptr += 16; - } - } - - ptr += 16; - } - } - - return 0; - } - - if (b.dims == 3) - { - // type 14 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.row(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - __m512 _a0 = _mm512_loadu_ps(ptr); - for (int x = 0; x < w1; x++) - { - __m512 _p1 = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_a0, _p1); - _mm512_storeu_ps(outptr, _outp); - ptr1 += 16; - outptr += 16; - } - - ptr += 16; - } - } - - return 0; - } - - c.create(w, h, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.dims == 2) - { - // type 13 - return binary_op_7_13_19_29(a, b, c, opt); - } - - if (b.dims == 1) - { - c.create(w, h, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.w == 1 && elempack1 == 1) - { - // type 11 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 12 - const float* ptr = a; - const float* ptr1 = b; - float* outptr = c; - - for (int y = 0; y < h; y++) - { - __m512 _b0 = _mm512_loadu_ps(ptr1); - for (int x = 0; x < w; x++) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _outp = op.func_pack16(_p, _b0); - _mm512_storeu_ps(outptr, _outp); - ptr += 16; - outptr += 16; - } - - ptr1 += 16; - } - - 
return 0; - } - } - else if (a.dims == 1) - { - if (a.w == 1 && elempack == 1) - { - // type 2 3 4 20 - return binary_op_2_3_4_20(a, b, c, opt); - } - - if (b.dims == 4) - { - // type 21 - c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - __m512 _a0 = _mm512_loadu_ps((const float*)a + q * 16); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int i = 0; i < size1; i++) - { - __m512 _p1 = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_a0, _p1); - _mm512_storeu_ps(outptr, _outp); - ptr1 += 16; - outptr += 16; - } - } - - return 0; - } - - if (b.dims == 3) - { - // type 9 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - __m512 _a0 = _mm512_loadu_ps((const float*)a + q * 16); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int i = 0; i < size1; i++) - { - __m512 _p1 = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_a0, _p1); - _mm512_storeu_ps(outptr, _outp); - ptr1 += 16; - outptr += 16; - } - } - - return 0; - } - - if (b.dims == 2) - { - // type 8 - c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - const float* ptr = a; - const float* ptr1 = b; - float* outptr = c; - - for (int y = 0; y < h1; y++) - { - __m512 _a0 = _mm512_loadu_ps(ptr); - for (int x = 0; x < w1; x++) - { - __m512 _p1 = _mm512_loadu_ps(ptr1); - __m512 _outp = op.func_pack16(_a0, _p1); - _mm512_storeu_ps(outptr, _outp); - ptr1 += 16; - outptr += 16; - } - - ptr += 16; - } - - return 0; - } - - if (b.dims == 1) - { - c.create(w, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.w == 1 && elempack1 == 1) - { - // type 6 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 7 - binary_op_7_13_19_29(a, b, c, opt); - } - } - - return 0; -} -#endif // __AVX512F__ - -template -static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt) -{ - Op op; - - int w = a.w; - int h = a.h; - int d = a.d; - int channels = a.c; - int size = w * h * d; - size_t elemsize = a.elemsize; - int elempack = a.elempack; - - int w1 = b.w; - int h1 = b.h; - int d1 = b.d; - int channels1 = b.c; - int size1 = w1 * h1 * d1; - size_t elemsize1 = b.elemsize; - int elempack1 = b.elempack; - - if (a.dims == 4) - { - if (b.dims == 4) - { - // type 29 - return binary_op_7_13_19_29(a, b, c, opt); - } - - c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.dims == 3) - { - // type 28 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d; z++) - { - for (int y = 0; y < h; y++) - { - __m256 _b0 = _mm256_loadu_ps(ptr1); - for (int x = 0; x < w; x++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_p, _b0); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - outptr += 8; - } - - ptr1 += 8; - } - } - } - - return 0; - } - - if (b.dims == 2) - { - // type 27 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.row(q); - float* outptr = 
c.channel(q); - - for (int z = 0; z < d; z++) - { - __m256 _b0 = _mm256_loadu_ps(ptr1); - for (int y = 0; y < h; y++) - { - for (int x = 0; x < w; x++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_p, _b0); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - outptr += 8; - } - } - - ptr1 += 8; - } - } - - return 0; - } - - if (b.dims == 1) - { - if (b.w == 1 && elempack1 == 1) - { - // type 25 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 26 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - __m256 _b0 = _mm256_loadu_ps((const float*)b + q * 8); - float* outptr = c.channel(q); - - for (int i = 0; i < size; i++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_p, _b0); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - outptr += 8; - } - } - - return 0; - } - } - else if (a.dims == 3) - { - if (b.dims == 4) - { - // type 23 - c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d1; z++) - { - for (int y = 0; y < h1; y++) - { - __m256 _a0 = _mm256_loadu_ps(ptr); - for (int x = 0; x < w1; x++) - { - __m256 _p = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_a0, _p); - _mm256_storeu_ps(outptr, _outp); - ptr1 += 8; - outptr += 8; - } - - ptr += 8; - } - } - } - - return 0; - } - - if (b.dims == 3) - { - if (w1 == 1 && h1 == 1 && channels1 == channels) - { - // special type 1 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* b0 = b.channel(q); - float* outptr = c.channel(q); - __m256 _b0 = _mm256_loadu_ps(b0); - for (int i = 0; i < size; i++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_p, _b0); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - outptr += 8; - } - } - - return 0; - } - - if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1) - { - // special type 2 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b; - float* outptr = c.channel(q); - for (int i = 0; i < size; i++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _p1 = _mm256_broadcast_ss(ptr1); - __m256 _outp = op.func_pack8(_p, _p1); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - ptr1 += 1; - outptr += 8; - } - } - - return 0; - } - - if (w == 1 && h == 1 && channels1 == channels) - { - // special type 3 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* a0 = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - __m256 _a0 = _mm256_loadu_ps(a0); - for (int i = 0; i < size1; i++) - { - __m256 _p1 = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_a0, _p1); - _mm256_storeu_ps(outptr, _outp); - ptr1 += 8; - outptr += 8; - } - } - - return 0; - } - - if (w1 == w && h1 == h && 
channels == 1 && elempack == 1) - { - // special type 4 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a; - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - for (int i = 0; i < size1; i++) - { - __m256 _p = _mm256_broadcast_ss(ptr); - __m256 _p1 = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_p, _p1); - _mm256_storeu_ps(outptr, _outp); - ptr += 1; - ptr1 += 8; - outptr += 8; - } - } - - return 0; - } - - if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) - { - // special type 5 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - __m256 _p1 = _mm256_loadu_ps(ptr1 + y * 8); - for (int x = 0; x < w; x++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_p, _p1); - _mm256_storeu_ps(outptr, _outp); - - ptr += 8; - outptr += 8; - } - } - } - - return 0; - } - - if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) - { - // special type 6 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - for (int x = 0; x < w; x++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _p1 = _mm256_loadu_ps(ptr1 + x * 8); - __m256 _outp = op.func_pack8(_p, _p1); - _mm256_storeu_ps(outptr, _outp); - - ptr += 8; - outptr += 8; - } - } - } - - return 0; - } - - if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) - { - // special type 7 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - __m256 _p = _mm256_loadu_ps(ptr + y * 8); - for (int x = 0; x < w1; x++) - { - __m256 _p1 = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_p, _p1); - _mm256_storeu_ps(outptr, _outp); - - ptr1 += 8; - outptr += 8; - } - } - } - - return 0; - } - - if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) - { - // special type 8 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - for (int x = 0; x < w1; x++) - { - __m256 _p = _mm256_loadu_ps(ptr + x * 8); - __m256 _p1 = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_p, _p1); - _mm256_storeu_ps(outptr, _outp); - - ptr1 += 8; - outptr += 8; - } - } - } - - return 0; - } - - // type 19 - return binary_op_7_13_19_29(a, b, c, opt); - } - - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.dims == 2) - { - // type 18 - #pragma omp 
parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.row(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - __m256 _b0 = _mm256_loadu_ps(ptr1); - for (int x = 0; x < w; x++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_p, _b0); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - outptr += 8; - } - - ptr1 += 8; - } - } - - return 0; - } - - if (b.dims == 1) - { - if (b.w == 1 && elempack1 == 1) - { - // type 16 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 17 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - __m256 _b0 = _mm256_loadu_ps((const float*)b + q * 8); - float* outptr = c.channel(q); - - for (int i = 0; i < size; i++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_p, _b0); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - outptr += 8; - } - } - - return 0; - } - } - else if (a.dims == 2) - { - if (b.dims == 4) - { - // type 22 - c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.row(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d1; z++) - { - __m256 _a0 = _mm256_loadu_ps(ptr); - for (int y = 0; y < h1; y++) - { - for (int x = 0; x < w1; x++) - { - __m256 _p = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_a0, _p); - _mm256_storeu_ps(outptr, _outp); - ptr1 += 8; - outptr += 8; - } - } - - ptr += 8; - } - } - - return 0; - } - - if (b.dims == 3) - { - // type 14 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.row(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - __m256 _a0 = _mm256_loadu_ps(ptr); - for (int x = 0; x < w1; x++) - { - __m256 _p1 = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_a0, _p1); - _mm256_storeu_ps(outptr, _outp); - ptr1 += 8; - outptr += 8; - } - - ptr += 8; - } - } - - return 0; - } - - c.create(w, h, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.dims == 2) - { - // type 13 - return binary_op_7_13_19_29(a, b, c, opt); - } - - if (b.dims == 1) - { - c.create(w, h, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.w == 1 && elempack1 == 1) - { - // type 11 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 12 - const float* ptr = a; - const float* ptr1 = b; - float* outptr = c; - - for (int y = 0; y < h; y++) - { - __m256 _b0 = _mm256_loadu_ps(ptr1); - for (int x = 0; x < w; x++) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _outp = op.func_pack8(_p, _b0); - _mm256_storeu_ps(outptr, _outp); - ptr += 8; - outptr += 8; - } - - ptr1 += 8; - } - - return 0; - } - } - else if (a.dims == 1) - { - if (a.w == 1 && elempack == 1) - { - // type 2 3 4 20 - return binary_op_2_3_4_20(a, b, c, opt); - } - - if (b.dims == 4) - { - // type 21 - c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - __m256 _a0 = 
_mm256_loadu_ps((const float*)a + q * 8); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int i = 0; i < size1; i++) - { - __m256 _p1 = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_a0, _p1); - _mm256_storeu_ps(outptr, _outp); - ptr1 += 8; - outptr += 8; - } - } - - return 0; - } - - if (b.dims == 3) - { - // type 9 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - __m256 _a0 = _mm256_loadu_ps((const float*)a + q * 8); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int i = 0; i < size1; i++) - { - __m256 _p1 = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_a0, _p1); - _mm256_storeu_ps(outptr, _outp); - ptr1 += 8; - outptr += 8; - } - } - - return 0; - } - - if (b.dims == 2) - { - // type 8 - c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - const float* ptr = a; - const float* ptr1 = b; - float* outptr = c; - - for (int y = 0; y < h1; y++) - { - __m256 _a0 = _mm256_loadu_ps(ptr); - for (int x = 0; x < w1; x++) - { - __m256 _p1 = _mm256_loadu_ps(ptr1); - __m256 _outp = op.func_pack8(_a0, _p1); - _mm256_storeu_ps(outptr, _outp); - ptr1 += 8; - outptr += 8; - } - - ptr += 8; - } - - return 0; - } - - if (b.dims == 1) - { - c.create(w, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.w == 1 && elempack1 == 1) - { - // type 6 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 7 - binary_op_7_13_19_29(a, b, c, opt); - } - } - - return 0; -} -#endif // __AVX__ - -template -static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt) -{ - Op op; - - int w = a.w; - int h = a.h; - int d = a.d; - int channels = a.c; - int size = w * h * d; - size_t elemsize = a.elemsize; - int elempack = a.elempack; - - int w1 = b.w; - int h1 = b.h; - int d1 = b.d; - int channels1 = b.c; - int size1 = w1 * h1 * d1; - size_t elemsize1 = b.elemsize; - int elempack1 = b.elempack; - - if (a.dims == 4) - { - if (b.dims == 4) - { - // type 29 - return binary_op_7_13_19_29(a, b, c, opt); - } - - c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.dims == 3) - { - // type 28 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d; z++) - { - for (int y = 0; y < h; y++) - { - __m128 _b0 = _mm_loadu_ps(ptr1); - for (int x = 0; x < w; x++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _outp = op.func_pack4(_p, _b0); - _mm_storeu_ps(outptr, _outp); - ptr += 4; - outptr += 4; - } - - ptr1 += 4; - } - } - } - - return 0; - } - - if (b.dims == 2) - { - // type 27 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.row(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d; z++) - { - __m128 _b0 = _mm_loadu_ps(ptr1); - for (int y = 0; y < h; y++) - { - for (int x = 0; x < w; x++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _outp = op.func_pack4(_p, _b0); - _mm_storeu_ps(outptr, _outp); - ptr += 4; - outptr += 4; - } - } - - ptr1 += 4; - } - } - - return 0; - } - - if (b.dims == 1) - { - if (b.w == 1 && elempack1 == 1) - { - // type 25 - return 
binary_op_6_11_16_25(a, b, c, opt); - } - - // type 26 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - __m128 _b0 = _mm_loadu_ps((const float*)b + q * 4); - float* outptr = c.channel(q); - - for (int i = 0; i < size; i++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _outp = op.func_pack4(_p, _b0); - _mm_storeu_ps(outptr, _outp); - ptr += 4; - outptr += 4; - } - } - - return 0; - } - } - else if (a.dims == 3) - { - if (b.dims == 4) - { - // type 23 - c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d1; z++) - { - for (int y = 0; y < h1; y++) - { - __m128 _a0 = _mm_loadu_ps(ptr); - for (int x = 0; x < w1; x++) - { - __m128 _p = _mm_loadu_ps(ptr1); - __m128 _outp = op.func_pack4(_a0, _p); - _mm_storeu_ps(outptr, _outp); - ptr1 += 4; - outptr += 4; - } - - ptr += 4; - } - } - } - - return 0; - } - - if (b.dims == 3) - { - if (w1 == 1 && h1 == 1 && channels1 == channels) - { - // special type 1 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - float* outptr = c.channel(q); - const float* b0 = b.channel(q); - __m128 _b0 = _mm_loadu_ps(b0); - for (int i = 0; i < size; i++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _outp = op.func_pack4(_p, _b0); - _mm_storeu_ps(outptr, _outp); - ptr += 4; - outptr += 4; - } - } - - return 0; - } - - if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1) - { - // special type 2 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b; - float* outptr = c.channel(q); - for (int i = 0; i < size; i++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _p1 = _mm_set1_ps(*ptr1); - __m128 _outp = op.func_pack4(_p, _p1); - _mm_storeu_ps(outptr, _outp); - ptr += 4; - ptr1 += 1; - outptr += 4; - } - } - - return 0; - } - - if (w == 1 && h == 1 && channels1 == channels) - { - // special type 3 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* a0 = a.channel(q); - float* outptr = c.channel(q); - const float* ptr1 = b.channel(q); - __m128 _a0 = _mm_loadu_ps(a0); - for (int i = 0; i < size1; i++) - { - __m128 _p1 = _mm_loadu_ps(ptr1); - __m128 _outp = op.func_pack4(_a0, _p1); - _mm_storeu_ps(outptr, _outp); - ptr1 += 4; - outptr += 4; - } - } - - return 0; - } - - if (w1 == w && h1 == h && channels == 1 && elempack == 1) - { - // special type 4 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a; - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - for (int i = 0; i < size1; i++) - { - __m128 _p = _mm_set1_ps(*ptr); - __m128 _p1 = _mm_loadu_ps(ptr1); - __m128 _outp = 
op.func_pack4(_p, _p1); - _mm_storeu_ps(outptr, _outp); - ptr += 1; - ptr1 += 4; - outptr += 4; - } - } - - return 0; - } - - if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) - { - // special type 5 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - __m128 _p1 = _mm_loadu_ps(ptr1 + y * 4); - for (int x = 0; x < w; x++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _outp = op.func_pack4(_p, _p1); - _mm_storeu_ps(outptr, _outp); - - ptr += 4; - outptr += 4; - } - } - } - - return 0; - } - - if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) - { - // special type 6 - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - for (int x = 0; x < w; x++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _p1 = _mm_loadu_ps(ptr1 + x * 4); - __m128 _outp = op.func_pack4(_p, _p1); - _mm_storeu_ps(outptr, _outp); - - ptr += 4; - outptr += 4; - } - } - } - - return 0; - } - - if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) - { - // special type 7 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - __m128 _p = _mm_loadu_ps(ptr + y * 4); - for (int x = 0; x < w1; x++) - { - __m128 _p1 = _mm_loadu_ps(ptr1); - __m128 _outp = op.func_pack4(_p, _p1); - _mm_storeu_ps(outptr, _outp); - - ptr1 += 4; - outptr += 4; - } - } - } - - return 0; - } - - if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) - { - // special type 8 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - for (int x = 0; x < w1; x++) - { - __m128 _p = _mm_loadu_ps(ptr + x * 4); - __m128 _p1 = _mm_loadu_ps(ptr1); - __m128 _outp = op.func_pack4(_p, _p1); - _mm_storeu_ps(outptr, _outp); - - ptr1 += 4; - outptr += 4; - } - } - } - - return 0; - } - - // type 19 - return binary_op_7_13_19_29(a, b, c, opt); - } - - c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.dims == 2) - { - // type 18 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - const float* ptr1 = b.row(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h; y++) - { - __m128 _b0 = _mm_loadu_ps(ptr1); - for (int x = 0; x < w; x++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _outp = op.func_pack4(_p, _b0); - _mm_storeu_ps(outptr, _outp); - ptr += 4; - outptr += 4; - } - - ptr1 += 4; - } - } - - return 0; - } - - if (b.dims == 1) - { - if (b.w == 1 && elempack1 == 1) - 
{ - // type 16 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 17 - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - __m128 _b0 = _mm_loadu_ps((const float*)b + q * 4); - float* outptr = c.channel(q); - - for (int i = 0; i < size; i++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _outp = op.func_pack4(_p, _b0); - _mm_storeu_ps(outptr, _outp); - ptr += 4; - outptr += 4; - } - } - - return 0; - } - } - else if (a.dims == 2) - { - if (b.dims == 4) - { - // type 22 - c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.row(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int z = 0; z < d1; z++) - { - __m128 _a0 = _mm_loadu_ps(ptr); - for (int y = 0; y < h1; y++) - { - for (int x = 0; x < w1; x++) - { - __m128 _p = _mm_loadu_ps(ptr1); - __m128 _outp = op.func_pack4(_a0, _p); - _mm_storeu_ps(outptr, _outp); - ptr1 += 4; - outptr += 4; - } - } - - ptr += 4; - } - } - - return 0; - } - - if (b.dims == 3) - { - // type 14 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - const float* ptr = a.row(q); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int y = 0; y < h1; y++) - { - __m128 _a0 = _mm_loadu_ps(ptr); - for (int x = 0; x < w1; x++) - { - __m128 _p1 = _mm_loadu_ps(ptr1); - __m128 _outp = op.func_pack4(_a0, _p1); - _mm_storeu_ps(outptr, _outp); - ptr1 += 4; - outptr += 4; - } - - ptr += 4; - } - } - - return 0; - } - - c.create(w, h, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.dims == 2) - { - // type 13 - return binary_op_7_13_19_29(a, b, c, opt); - } - - if (b.dims == 1) - { - c.create(w, h, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.w == 1 && elempack1 == 1) - { - // type 11 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 12 - const float* ptr = a; - const float* ptr1 = b; - float* outptr = c; - - for (int y = 0; y < h; y++) - { - __m128 _b0 = _mm_loadu_ps(ptr1); - for (int x = 0; x < w; x++) - { - __m128 _p = _mm_loadu_ps(ptr); - __m128 _outp = op.func_pack4(_p, _b0); - _mm_storeu_ps(outptr, _outp); - ptr += 4; - outptr += 4; - } - - ptr1 += 4; - } - - return 0; - } - } - else if (a.dims == 1) - { - if (a.w == 1 && elempack == 1) - { - // type 2 3 4 20 - return binary_op_2_3_4_20(a, b, c, opt); - } - - if (b.dims == 4) - { - // type 21 - c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - __m128 _a0 = _mm_loadu_ps((const float*)a + q * 4); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int i = 0; i < size1; i++) - { - __m128 _p1 = _mm_loadu_ps(ptr1); - __m128 _outp = op.func_pack4(_a0, _p1); - _mm_storeu_ps(outptr, _outp); - ptr1 += 4; - outptr += 4; - } - } - - return 0; - } - - if (b.dims == 3) - { - // type 9 - c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels1; q++) - { - __m128 _a0 = 
_mm_loadu_ps((const float*)a + q * 4); - const float* ptr1 = b.channel(q); - float* outptr = c.channel(q); - - for (int i = 0; i < size1; i++) - { - __m128 _p1 = _mm_loadu_ps(ptr1); - __m128 _outp = op.func_pack4(_a0, _p1); - _mm_storeu_ps(outptr, _outp); - ptr1 += 4; - outptr += 4; - } - } - - return 0; - } - - if (b.dims == 2) - { - // type 8 - c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); - if (c.empty()) - return -100; - - const float* ptr = a; - const float* ptr1 = b; - float* outptr = c; - - for (int y = 0; y < h1; y++) - { - __m128 _a0 = _mm_loadu_ps(ptr); - for (int x = 0; x < w1; x++) - { - __m128 _p1 = _mm_loadu_ps(ptr1); - __m128 _outp = op.func_pack4(_a0, _p1); - _mm_storeu_ps(outptr, _outp); - ptr1 += 4; - outptr += 4; - } - - ptr += 4; - } - - return 0; - } - - if (b.dims == 1) - { - c.create(w, elemsize, elempack, opt.blob_allocator); - if (c.empty()) - return -100; - - if (b.w == 1 && elempack1 == 1) - { - // type 6 - return binary_op_6_11_16_25(a, b, c, opt); - } - - // type 7 - binary_op_7_13_19_29(a, b, c, opt); - } - } - - return 0; -} -#endif // __SSE2__ - -template -static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt) -{ - Op op; - - int w = a.w; - int h = a.h; - int d = a.d; - int channels = a.c; - int elempack = a.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* ptr = a.channel(q); - - int i = 0; -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ - __m512 _b_avx512 = _mm512_set1_ps(b); - for (; i + 15 < size; i += 16) - { - __m512 _p = _mm512_loadu_ps(ptr); - _p = op.func_pack16(_p, _b_avx512); - _mm512_storeu_ps(ptr, _p); - ptr += 16; - } -#endif // __AVX512F__ - __m256 _b_avx = _mm256_set1_ps(b); - for (; i + 7 < size; i += 8) - { - __m256 _p = _mm256_loadu_ps(ptr); - _p = op.func_pack8(_p, _b_avx); - _mm256_storeu_ps(ptr, _p); - ptr += 8; - } -#endif // __AVX__ - __m128 _b = _mm_set1_ps((float)b); - for (; i + 3 < size; i += 4) - { - __m128 _p = _mm_load_ps(ptr); - _p = op.func_pack4(_p, _b); - _mm_store_ps(ptr, _p); - ptr += 4; - } -#endif // __SSE2__ - for (; i < size; i++) - { - *ptr = op.func(*ptr, b); - ptr++; - } - } - - return 0; -} - -namespace BinaryOp_x86_functor { - -struct binary_op_add -{ - float func(const float& x, const float& y) const - { - return x + y; - } -#if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const - { - return _mm_add_ps(x, y); - } -#if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const - { - return _mm256_add_ps(x, y); - } -#if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const - { - return _mm512_add_ps(x, y); - } -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ -}; - -struct binary_op_sub -{ - float func(const float& x, const float& y) const - { - return x - y; - } -#if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const - { - return _mm_sub_ps(x, y); - } -#if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const - { - return _mm256_sub_ps(x, y); - } -#if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const - { - return _mm512_sub_ps(x, y); - } -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ -}; - -struct binary_op_mul -{ - float func(const float& x, const float& y) const - { - return x * y; - } -#if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const - { - return _mm_mul_ps(x, y); - } -#if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) 
const - { - return _mm256_mul_ps(x, y); - } -#if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const - { - return _mm512_mul_ps(x, y); - } -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ -}; - -struct binary_op_div -{ - float func(const float& x, const float& y) const - { - return x / y; - } -#if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const - { - return _mm_div_ps(x, y); - } -#if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const - { - return _mm256_div_ps(x, y); - } -#if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const - { - return _mm512_div_ps(x, y); - } -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ -}; - -struct binary_op_max -{ - float func(const float& x, const float& y) const - { - return std::max(x, y); - } -#if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const - { - return _mm_max_ps(x, y); - } -#if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const - { - return _mm256_max_ps(x, y); - } -#if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const - { - return _mm512_max_ps(x, y); - } -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ -}; - -struct binary_op_min -{ - float func(const float& x, const float& y) const - { - return std::min(x, y); - } -#if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const - { - return _mm_min_ps(x, y); - } -#if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const - { - return _mm256_min_ps(x, y); - } -#if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const - { - return _mm512_min_ps(x, y); - } -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ -}; - -struct binary_op_pow -{ - float func(const float& x, const float& y) const - { - return (float)pow(x, y); - } -#if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const - { - return pow_ps(x, y); - } -#if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const - { - return pow256_ps(x, y); - } -#if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const - { - return pow512_ps(x, y); - } -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ -}; - -struct binary_op_rsub -{ - float func(const float& x, const float& y) const - { - return y - x; - } -#if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const - { - return _mm_sub_ps(y, x); - } -#if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const - { - return _mm256_sub_ps(y, x); - } -#if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const - { - return _mm512_sub_ps(y, x); - } -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ -}; - -struct binary_op_rdiv -{ - float func(const float& x, const float& y) const - { - return y / x; - } -#if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const - { - return _mm_div_ps(y, x); - } -#if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const - { - return _mm256_div_ps(y, x); - } -#if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const - { - return _mm512_div_ps(y, x); - } -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ -}; - -} // namespace BinaryOp_x86_functor - -int BinaryOp_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ -#if __SSE2__ - using namespace BinaryOp_x86_functor; - - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& bottom_blob1 = 
bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - - int elempack = bottom_blob.elempack; - int elempack1 = bottom_blob1.elempack; - -#if __AVX__ -#if __AVX512F__ - if (elempack == 16 || elempack1 == 16) - { - if (op_type == Operation_ADD) - return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_SUB) - return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_MUL) - return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_DIV) - return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_MAX) - return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_MIN) - return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_POW) - return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_RSUB) - return binary_op_pack16(bottom_blob1, bottom_blob, top_blob, opt); - - if (op_type == Operation_RDIV) - return binary_op_pack16(bottom_blob1, bottom_blob, top_blob, opt); - } -#endif // __AVX512F__ - - if (elempack == 8 || elempack1 == 8) - { - if (op_type == Operation_ADD) - return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_SUB) - return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_MUL) - return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_DIV) - return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_MAX) - return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_MIN) - return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_POW) - return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_RSUB) - return binary_op_pack8(bottom_blob1, bottom_blob, top_blob, opt); - - if (op_type == Operation_RDIV) - return binary_op_pack8(bottom_blob1, bottom_blob, top_blob, opt); - } -#endif // __AVX__ - - if (elempack == 4 || elempack1 == 4) - { - if (op_type == Operation_ADD) - return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_SUB) - return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_MUL) - return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_DIV) - return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_MAX) - return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_MIN) - return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_POW) - return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); - - if (op_type == Operation_RSUB) - return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); - - if (op_type == Operation_RDIV) - return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); - } -#endif // __SSE2__ - - return BinaryOp::forward(bottom_blobs, top_blobs, opt); -} - -int BinaryOp_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const -{ - using namespace BinaryOp_x86_functor; - - if (op_type == Operation_ADD) - return binary_op_scalar_inplace(bottom_top_blob, b, opt); - - if (op_type == Operation_SUB) - return binary_op_scalar_inplace(bottom_top_blob, b, opt); - - if (op_type == Operation_MUL) - 
return binary_op_scalar_inplace(bottom_top_blob, b, opt); - - if (op_type == Operation_DIV) - return binary_op_scalar_inplace(bottom_top_blob, b, opt); - - if (op_type == Operation_MAX) - return binary_op_scalar_inplace(bottom_top_blob, b, opt); - - if (op_type == Operation_MIN) - return binary_op_scalar_inplace(bottom_top_blob, b, opt); - - if (op_type == Operation_POW) - return binary_op_scalar_inplace(bottom_top_blob, b, opt); - - if (op_type == Operation_RSUB) - return binary_op_scalar_inplace(bottom_top_blob, b, opt); - - if (op_type == Operation_RDIV) - return binary_op_scalar_inplace(bottom_top_blob, b, opt); - - return 0; -} - -} // namespace ncnn diff --git a/src/layer/x86/binaryop_x86.h b/src/layer/x86/binaryop_x86.h deleted file mode 100644 index 9f3ebb3cac9f..000000000000 --- a/src/layer/x86/binaryop_x86.h +++ /dev/null @@ -1,34 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#ifndef LAYER_BINARYOP_X86_H -#define LAYER_BINARYOP_X86_H - -#include "binaryop.h" - -namespace ncnn { - -class BinaryOp_x86 : virtual public BinaryOp -{ -public: - BinaryOp_x86(); - - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; - - virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; -}; - -} // namespace ncnn - -#endif // LAYER_BINARYOP_X86_H diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index 30c2a31e127f..47609d00415e 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -109,6 +109,9 @@ int MultiHeadAttention_x86::affine_input( if (input.elemsize != 1) { quantize_to_int8(input, input_int8, input_scale, opt); + } else + { + input_int8 = input; } Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); diff --git a/tools/modelwriter.cpp b/tools/modelwriter.cpp index 274bf4dd1717..2c003d8478a7 100644 --- a/tools/modelwriter.cpp +++ b/tools/modelwriter.cpp @@ -660,10 +660,6 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 0=%d", op_type) fprintf_param_value(" 1=%d", with_scalar) fprintf_param_value(" 2=%e", b) - fprintf_param_value(" 3=%d", int8_scale_term) - fprintf_param_value(" 4=%e", in_scale0) - fprintf_param_value(" 5=%e", in_scale1) - fprintf_param_value(" 6=%e", out_scale) } else if (layer->type == "Clip") { diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 1013e3b7e338..7147f1acf377 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -91,11 +91,9 @@ int main(int argc, char** argv) quantizer.quantize_convolutiondepthwise(); quantizer.quantize_innerproduct(); quantizer.quantize_layernorm(); - quantizer.quantize_binaryop(); quantizer.fuse_conv_requantize(); quantizer.fuse_layernorm_requantize(); - 
quantizer.fuse_binaryop_requantize(); quantizer.save(outparam, outbin); diff --git a/tools/quantize/net_quantize.cpp b/tools/quantize/net_quantize.cpp index 4be076bb9059..b3805195b218 100644 --- a/tools/quantize/net_quantize.cpp +++ b/tools/quantize/net_quantize.cpp @@ -476,66 +476,6 @@ int NetQuantize::quantize_layernorm() return 0; } - -int NetQuantize::quantize_binaryop() -{ - const int layer_count = static_cast(layers.size()); - auto base_opt = opt; - - for (int i = 0; i < layer_count; i++) - { - // find add layer - if (layers[i]->type != "BinaryOp") - continue; - - ncnn::BinaryOp* op = (ncnn::BinaryOp*)layers[i]; - - if (op->bottoms.size() != 2) - { - // binaryop with scalar, skip - continue; - } - - if (binaryop_table.find(op->name) == binaryop_table.end()) - { - fprintf(stderr, "cannot find %s quant param.\n", op->name.c_str()); - continue; - } - - auto& table = binaryop_table.at(op->name); - { - std::vector scales = table->get_list("input_scales"); - if (scales.size() != 2) - { - fprintf(stderr, "quantize_binaryop input scales len mismatch.\n"); - return -100; - } - op->in_scale0 = scales[0]; - op->in_scale1 = scales[1]; - - op->out_scale = table->get("output_scale"); - if (std::abs(op->out_scale) <= 1e-6) - { - fprintf(stderr, "quantize_binaryop output scale too small.\n"); - return -100; - } - - op->int8_scale_term = 1; - } - - // print some tips - switch (op->op_type) - { - case ncnn::BinaryOp::Operation_DIV: - case ncnn::BinaryOp::Operation_RDIV: - case ncnn::BinaryOp::Operation_POW: - fprintf(stderr, "please make sure that you really want to quantize div/rdiv/pow operation +_+ \n"); - break; - } - } - return 0; -} - int NetQuantize::fuse_conv_requantize() { const size_t layer_count = layers.size(); @@ -745,54 +685,6 @@ int NetQuantize::fuse_conv_requantize() * @return int */ int NetQuantize::fuse_layernorm_requantize() -{ - const size_t layer_count = layers.size(); - for (size_t i = 0; i < layer_count; i++) - { - if (layers[i]->type != "LayerNorm") - continue; - - // LayerNorm --> quantizable_node - int top_blob_index = layers[i]->tops[0]; - - size_t j = i + 1; - for (; j < layer_count; j++) - { - if (quantizable_node.find(layers[j]->type) == quantizable_node.end()) - { - continue; - } - - if (layers[j]->bottoms.size() != 1) - continue; - - if (layers[j]->bottoms[0] == top_blob_index) - break; - } - - if (j == layer_count) - continue; - - // fuse requantize - fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), layers[j]->name.c_str()); - - ncnn::LayerNorm* ln = (ncnn::LayerNorm*)layers[i]; - // layernorm_int8 quantized by , so do not need to update next node's output_scale. 
- ln->int8_scale_term += 100; - } - - return 0; -} - -/** - * @brief - * - * if all of output is quantized, binaryop use requant - * if none of input and output can be quantize, binaryop skip quant - * - * @return int - */ -int NetQuantize::fuse_binaryop_requantize() { const size_t layer_count = layers.size(); @@ -813,7 +705,7 @@ int NetQuantize::fuse_binaryop_requantize() return outputs; }; - auto all_outputs = [&](ncnn::BinaryOp* op, int cur) -> std::vector + auto all_outputs = [&](ncnn::Layer* op, int cur) -> std::vector { auto directs = direct_connected_outputs(op, cur); std::vector outputs; @@ -830,40 +722,6 @@ int NetQuantize::fuse_binaryop_requantize() return outputs; }; - auto direct_connected_inputs = [=](ncnn::Layer* op, int cur) -> std::vector - { - std::vector inputs; - for (size_t j = 0; j tops) { - if (index == op->bottoms[0] || index == op->bottoms[1]) - { - inputs.emplace_back(last); - break; - } - } - } - return inputs; - }; - - auto all_inputs = [&](ncnn::BinaryOp* op, int cur) -> std::vector - { - auto directs = direct_connected_inputs(op, cur); - std::vector inputs; - for (auto node: directs) - { - if (node->type == "Split") - { - auto lasts = direct_connected_inputs(node, cur); - inputs.insert(inputs.end(), lasts.begin(), lasts.end()); - continue; - } - inputs.emplace_back(node); - } - return inputs; - }; - auto is_quantized = [=](ncnn::Layer* layer) -> bool { if (layer->type == "Convolution") @@ -887,59 +745,33 @@ int NetQuantize::fuse_binaryop_requantize() return false; }; - + for (size_t i = 0; i < layer_count; i++) { - if (layers[i]->type != "BinaryOp") - continue; - - ncnn::BinaryOp* op = (ncnn::BinaryOp*)layers[i]; - - if (op->int8_scale_term == 0) - { + if (layers[i]->type != "LayerNorm") continue; - } - - auto outputs = all_outputs(op, i); - auto inputs = all_inputs(op, i); - - // if binaryop outputs are all quantized, requant and return - bool all_output_support_quant = true; - // if none of nodes could be quantized, give up quantize binaryop - bool non_can_quantize = true; - for (ncnn::Layer* node: outputs) - { - if (is_quantized(node)) - { - non_can_quantize = false; - } else - { - all_output_support_quant = false; - } - } - for (ncnn::Layer* node: inputs) + ncnn::LayerNorm* ln = (ncnn::LayerNorm*)layers[i]; + auto outputs = all_outputs(ln, i); + bool all_support_quant = true; + for (auto node: outputs) { - if (is_quantized(node)) + if (! is_quantized(node)) { - non_can_quantize = false; + all_support_quant = false; + break; } } - if (all_output_support_quant) + if (all_support_quant) { - // enable requant - op->int8_scale_term += 100; - } else if (non_can_quantize){ - // cancel quant - op->int8_scale_term = 0; - op->in_scale0 = 1.f; - op->in_scale1 = 1.f; - op->out_scale = 1.f; - } else { - op->int8_scale_term = 1; + // fuse requantize + + // layernorm_int8 quantized by , so do not need to update next node's output_scale. 
+ ln->int8_scale_term += 100; + fprintf(stderr, "fuse_layernorm_requantize %s %s, int8_scale_term %d\n", layers[i]->name.c_str(), outputs[0]->name.c_str(), ln->int8_scale_term); } - fprintf(stderr, "quantize_binaryop %s int8_scale_term %d\n", op->name.c_str(), op->int8_scale_term); } + return 0; } diff --git a/tools/quantize/net_quantize.h b/tools/quantize/net_quantize.h index e48d54fd1b50..aacf93e4a71e 100644 --- a/tools/quantize/net_quantize.h +++ b/tools/quantize/net_quantize.h @@ -24,7 +24,6 @@ class NetQuantize : public ModelWriter public: NetQuantize() { - quantizable_node = {"LayerNorm", "Convolution", "ConvolutionDepthWise", "MultiHeadAttention", "BinaryOp"}; } // conv and gemm quant param std::map blob_int8scale_table; @@ -36,8 +35,6 @@ class NetQuantize : public ModelWriter std::map > layernorm_table; // BinaryOp quant param std::map > binaryop_table; - // supported quantizable node - std::set quantizable_node; public: bool read_txt_format(const char* path); From e7f84d04a4bdd4ea3b8d89f16ecae7d10a9ffa7f Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 11 Aug 2022 09:23:24 +0000 Subject: [PATCH 24/36] apply code-format changes --- examples/vision_transformer_int8.cpp | 3 +- src/layer/multiheadattention.cpp | 3 +- src/layer/x86/multiheadattention_x86.cpp | 3 +- tools/quantize/net_quantize.cpp | 50 ++++++++++++------------ 4 files changed, 30 insertions(+), 29 deletions(-) diff --git a/examples/vision_transformer_int8.cpp b/examples/vision_transformer_int8.cpp index 7ca087d0048b..49ab9195ca66 100644 --- a/examples/vision_transformer_int8.cpp +++ b/examples/vision_transformer_int8.cpp @@ -52,7 +52,8 @@ static int classify_vit(const cv::Mat& bgr) int max_index = 0; for (int j = 0; j < multiHeadOut.w; j++) { - if (max_value < multiHeadOut[j]) { + if (max_value < multiHeadOut[j]) + { max_value = multiHeadOut[j]; max_index = j; } diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index b377e4cc35f1..1c2cee417a02 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -162,7 +162,8 @@ int MultiHeadAttention::affine_input( if (input.elemsize != 1) { quantize_to_int8(input, input_int8, input_scale, opt); - } else + } + else { input_int8 = input; } diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index 47609d00415e..b7e42f162709 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -109,7 +109,8 @@ int MultiHeadAttention_x86::affine_input( if (input.elemsize != 1) { quantize_to_int8(input, input_int8, input_scale, opt); - } else + } + else { input_int8 = input; } diff --git a/tools/quantize/net_quantize.cpp b/tools/quantize/net_quantize.cpp index b3805195b218..16d358f230b8 100644 --- a/tools/quantize/net_quantize.cpp +++ b/tools/quantize/net_quantize.cpp @@ -681,20 +681,20 @@ int NetQuantize::fuse_conv_requantize() /** * @brief if [LayerNorm --> X] and X is a type of quantizable layer, then requant layernorm.output, AKA X.input is quantized tensor - * + * * @return int */ int NetQuantize::fuse_layernorm_requantize() { const size_t layer_count = layers.size(); - auto direct_connected_outputs = [&](ncnn::Layer* op, int cur) -> std::vector - { + auto direct_connected_outputs = [&](ncnn::Layer* op, int cur) -> std::vector { std::vector outputs; - for (size_t j = cur; j bottoms) { + for (auto index : next->bottoms) + { if (index == op->tops[0] || index == op->tops[1]) { outputs.emplace_back(next); @@ -705,11 +705,10 @@ int 
NetQuantize::fuse_layernorm_requantize() return outputs; }; - auto all_outputs = [&](ncnn::Layer* op, int cur) -> std::vector - { + auto all_outputs = [&](ncnn::Layer* op, int cur) -> std::vector { auto directs = direct_connected_outputs(op, cur); std::vector outputs; - for (auto node: directs) + for (auto node : directs) { if (node->type == "Split") { @@ -722,28 +721,27 @@ int NetQuantize::fuse_layernorm_requantize() return outputs; }; - auto is_quantized = [=](ncnn::Layer* layer) -> bool - { - if (layer->type == "Convolution") - return ((ncnn::Convolution*)layer)->int8_scale_term > 0; + auto is_quantized = [=](ncnn::Layer* layer) -> bool { + if (layer->type == "Convolution") + return ((ncnn::Convolution*)layer)->int8_scale_term > 0; + + if (layer->type == "MultiHeadAttention") + return ((ncnn::MultiHeadAttention*)layer)->int8_scale_term > 0; - if (layer->type == "MultiHeadAttention") - return ((ncnn::MultiHeadAttention*)layer)->int8_scale_term > 0; + if (layer->type == "InnerProduct") + return ((ncnn::InnerProduct*)layer)->int8_scale_term > 0; - if (layer->type == "InnerProduct") - return ((ncnn::InnerProduct*)layer)->int8_scale_term > 0; - - if (layer->type == "ConvolutionDepthWise") - return ((ncnn::ConvolutionDepthWise*)layer)->int8_scale_term > 0; + if (layer->type == "ConvolutionDepthWise") + return ((ncnn::ConvolutionDepthWise*)layer)->int8_scale_term > 0; - if (layer->type == "LayerNorm") + if (layer->type == "LayerNorm") return ((ncnn::LayerNorm*)layer)->int8_scale_term > 0; - if (layer->type == "BinaryOp") - // suppose that future binaryop could be quantized - return true; + if (layer->type == "BinaryOp") + // suppose that future binaryop could be quantized + return true; - return false; + return false; }; for (size_t i = 0; i < layer_count; i++) @@ -754,9 +752,9 @@ int NetQuantize::fuse_layernorm_requantize() ncnn::LayerNorm* ln = (ncnn::LayerNorm*)layers[i]; auto outputs = all_outputs(ln, i); bool all_support_quant = true; - for (auto node: outputs) + for (auto node : outputs) { - if (! is_quantized(node)) + if (!is_quantized(node)) { all_support_quant = false; break; From 4588921b4165040bdd8bba24af4310f917d709f1 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 11 Aug 2022 17:23:47 +0800 Subject: [PATCH 25/36] feat(src/layer/x86): revert binaryop --- src/layer/x86/binaryop_x86.cpp | 2791 ++++++++++++++++++++++++++++++++ src/layer/x86/binaryop_x86.h | 34 + 2 files changed, 2825 insertions(+) create mode 100644 src/layer/x86/binaryop_x86.cpp create mode 100644 src/layer/x86/binaryop_x86.h diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp new file mode 100644 index 000000000000..6d6a09fd71c1 --- /dev/null +++ b/src/layer/x86/binaryop_x86.cpp @@ -0,0 +1,2791 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
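// Illustrative sketch (not part of this patch): the restored binaryop_x86.cpp below
// follows ncnn's usual pattern of one small functor per operation (func / func_pack4 /
// func_pack8 / func_pack16) plus templated loops that dispatch on elempack and on the
// broadcasting shape. A minimal, self-contained SSE2-only sketch of that pattern, using
// a hypothetical add_op functor and apply_elementwise helper (both names invented here
// for illustration only), could look like this:
//
//   #include <emmintrin.h> // SSE2 intrinsics
//
//   struct add_op
//   {
//       float func(float x, float y) const { return x + y; }
//       __m128 func_pack4(__m128 x, __m128 y) const { return _mm_add_ps(x, y); }
//   };
//
//   template<typename Op>
//   static void apply_elementwise(const float* a, const float* b, float* c, int size)
//   {
//       Op op;
//       int i = 0;
//       for (; i + 3 < size; i += 4) // 4-wide SIMD main loop
//       {
//           __m128 x = _mm_loadu_ps(a + i);
//           __m128 y = _mm_loadu_ps(b + i);
//           _mm_storeu_ps(c + i, op.func_pack4(x, y));
//       }
//       for (; i < size; i++) // scalar tail for the remaining elements
//           c[i] = op.func(a[i], b[i]);
//   }
//
// The actual layer extends this idea to AVX/AVX-512 packs and to the broadcasting cases
// (see https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting) handled by the
// binary_op_pack4/8/16 functions further down in this file.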
+ +#include "binaryop_x86.h" + +#if __SSE2__ +#include +#include "sse_mathfun.h" +#if __AVX__ +#include +#include "avx_mathfun.h" +#if __AVX512F__ +#include "avx512_mathfun.h" +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + +#include + +namespace ncnn { + +BinaryOp_x86::BinaryOp_x86() +{ +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ +} + +template +static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = b.w; + int h = b.h; + int d = b.d; + int channels = b.c; + int elempack = b.elempack; + int size = w * h * d * elempack; + + // type 2 3 4 20 + c.create_like(b, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float a0 = a[0]; + const float* ptr = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + __m512 _a0_avx512 = _mm512_set1_ps(a0); + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_a0_avx512, _p); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + outptr += 16; + } +#endif // __AVX512F__ + __m256 _a0_avx = _mm256_set1_ps(a0); + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_a0_avx, _p); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + outptr += 8; + } +#endif // __AVX__ + __m128 _a0 = _mm_set1_ps(a0); + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_load_ps(ptr); + __m128 _outp = op.func_pack4(_a0, _p); + _mm_store_ps(outptr, _outp); + ptr += 4; + outptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + *outptr = op.func(a0, *ptr); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 6 11 16 25 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float b0 = b[0]; + float* outptr = c.channel(q); + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + __m512 _b0_avx512 = _mm512_set1_ps(b0); + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_p, _b0_avx512); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + outptr += 16; + } +#endif // __AVX512F__ + __m256 _b0_avx = _mm256_set1_ps(b0); + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_p, _b0_avx); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + outptr += 8; + } +#endif // __AVX__ + __m128 _b0 = _mm_set1_ps(b0); + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_load_ps(ptr); + __m128 _outp = op.func_pack4(_p, _b0); + _mm_store_ps(outptr, _outp); + ptr += 4; + outptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + *outptr = op.func(*ptr, b0); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 7 13 19 29 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma 
omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _p1 = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_p, _p1); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + ptr1 += 16; + outptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _p1 = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_p, _p1); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + ptr1 += 8; + outptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_load_ps(ptr); + __m128 _p1 = _mm_load_ps(ptr1); + __m128 _outp = op.func_pack4(_p, _p1); + _mm_store_ps(outptr, _outp); + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + *outptr = op.func(*ptr, *ptr1); + ptr += 1; + ptr1 += 1; + outptr += 1; + } + } + + return 0; +} + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ +// broadcasting rule +// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting + +template +static int binary_op_pack16(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int size = w * h * d; + size_t elemsize = a.elemsize; + int elempack = a.elempack; + + int w1 = b.w; + int h1 = b.h; + int d1 = b.d; + int channels1 = b.c; + int size1 = w1 * h1 * d1; + size_t elemsize1 = b.elemsize; + int elempack1 = b.elempack; + + if (a.dims == 4) + { + if (b.dims == 4) + { + // type 29 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 3) + { + // type 28 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + for (int y = 0; y < h; y++) + { + __m512 _b0 = _mm512_loadu_ps(ptr1); + for (int x = 0; x < w; x++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_p, _b0); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + outptr += 16; + } + + ptr1 += 16; + } + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 27 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + __m512 _b0 = _mm512_loadu_ps(ptr1); + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_p, _b0); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + outptr += 16; + } + } + + ptr1 += 16; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 25 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 26 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m512 _b0 = _mm512_loadu_ps((const float*)b + q * 16); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_p, _b0); + _mm512_storeu_ps(outptr, _outp); + ptr += 
16; + outptr += 16; + } + } + + return 0; + } + } + else if (a.dims == 3) + { + if (b.dims == 4) + { + // type 23 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + for (int y = 0; y < h1; y++) + { + __m512 _a0 = _mm512_loadu_ps(ptr); + for (int x = 0; x < w1; x++) + { + __m512 _p = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_a0, _p); + _mm512_storeu_ps(outptr, _outp); + ptr1 += 16; + outptr += 16; + } + + ptr += 16; + } + } + } + + return 0; + } + + if (b.dims == 3) + { + if (w1 == 1 && h1 == 1 && channels1 == channels) + { + // special type 1 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* b0 = b.channel(q); + float* outptr = c.channel(q); + __m512 _b0 = _mm512_loadu_ps(b0); + for (int i = 0; i < size; i++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_p, _b0); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + outptr += 16; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1) + { + // special type 2 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b; + float* outptr = c.channel(q); + for (int i = 0; i < size; i++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _p1 = _mm512_set1_ps(*ptr1); + __m512 _outp = op.func_pack16(_p, _p1); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + ptr1 += 1; + outptr += 16; + } + } + + return 0; + } + + if (w == 1 && h == 1 && channels1 == channels) + { + // special type 3 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* a0 = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + __m512 _a0 = _mm512_loadu_ps(a0); + for (int i = 0; i < size1; i++) + { + __m512 _p1 = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_a0, _p1); + _mm512_storeu_ps(outptr, _outp); + ptr1 += 16; + outptr += 16; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels == 1 && elempack == 1) + { + // special type 4 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a; + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + for (int i = 0; i < size1; i++) + { + __m512 _p = _mm512_set1_ps(*ptr); + __m512 _p1 = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_p, _p1); + _mm512_storeu_ps(outptr, _outp); + ptr += 1; + ptr1 += 16; + outptr += 16; + } + } + + return 0; + } + + if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) + { + // special type 5 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + 
for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m512 _p1 = _mm512_loadu_ps(ptr1 + y * 16); + for (int x = 0; x < w; x++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_p, _p1); + _mm512_storeu_ps(outptr, _outp); + + ptr += 16; + outptr += 16; + } + } + } + + return 0; + } + + if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) + { + // special type 6 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _p1 = _mm512_loadu_ps(ptr1 + x * 16); + __m512 _outp = op.func_pack16(_p, _p1); + _mm512_storeu_ps(outptr, _outp); + + ptr += 16; + outptr += 16; + } + } + } + + return 0; + } + + if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) + { + // special type 7 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m512 _p = _mm512_loadu_ps(ptr + y * 16); + for (int x = 0; x < w1; x++) + { + __m512 _p1 = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_p, _p1); + _mm512_storeu_ps(outptr, _outp); + + ptr1 += 16; + outptr += 16; + } + } + } + + return 0; + } + + if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) + { + // special type 8 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __m512 _p = _mm512_loadu_ps(ptr + x * 16); + __m512 _p1 = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_p, _p1); + _mm512_storeu_ps(outptr, _outp); + + ptr1 += 16; + outptr += 16; + } + } + } + + return 0; + } + + // type 19 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 18 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m512 _b0 = _mm512_loadu_ps(ptr1); + for (int x = 0; x < w; x++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_p, _b0); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + outptr += 16; + } + + ptr1 += 16; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 16 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 17 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m512 _b0 = _mm512_loadu_ps((const float*)b + q * 16); + float* outptr = c.channel(q); + + 
for (int i = 0; i < size; i++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_p, _b0); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + outptr += 16; + } + } + + return 0; + } + } + else if (a.dims == 2) + { + if (b.dims == 4) + { + // type 22 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + __m512 _a0 = _mm512_loadu_ps(ptr); + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __m512 _p = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_a0, _p); + _mm512_storeu_ps(outptr, _outp); + ptr1 += 16; + outptr += 16; + } + } + + ptr += 16; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 14 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m512 _a0 = _mm512_loadu_ps(ptr); + for (int x = 0; x < w1; x++) + { + __m512 _p1 = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_a0, _p1); + _mm512_storeu_ps(outptr, _outp); + ptr1 += 16; + outptr += 16; + } + + ptr += 16; + } + } + + return 0; + } + + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 13 + return binary_op_7_13_19_29(a, b, c, opt); + } + + if (b.dims == 1) + { + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 11 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 12 + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h; y++) + { + __m512 _b0 = _mm512_loadu_ps(ptr1); + for (int x = 0; x < w; x++) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _outp = op.func_pack16(_p, _b0); + _mm512_storeu_ps(outptr, _outp); + ptr += 16; + outptr += 16; + } + + ptr1 += 16; + } + + return 0; + } + } + else if (a.dims == 1) + { + if (a.w == 1 && elempack == 1) + { + // type 2 3 4 20 + return binary_op_2_3_4_20(a, b, c, opt); + } + + if (b.dims == 4) + { + // type 21 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m512 _a0 = _mm512_loadu_ps((const float*)a + q * 16); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __m512 _p1 = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_a0, _p1); + _mm512_storeu_ps(outptr, _outp); + ptr1 += 16; + outptr += 16; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 9 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m512 _a0 = _mm512_loadu_ps((const float*)a + q * 16); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __m512 _p1 = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_a0, _p1); + 
_mm512_storeu_ps(outptr, _outp); + ptr1 += 16; + outptr += 16; + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 8 + c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h1; y++) + { + __m512 _a0 = _mm512_loadu_ps(ptr); + for (int x = 0; x < w1; x++) + { + __m512 _p1 = _mm512_loadu_ps(ptr1); + __m512 _outp = op.func_pack16(_a0, _p1); + _mm512_storeu_ps(outptr, _outp); + ptr1 += 16; + outptr += 16; + } + + ptr += 16; + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 6 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 7 + binary_op_7_13_19_29(a, b, c, opt); + } + } + + return 0; +} +#endif // __AVX512F__ + +template +static int binary_op_pack8(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int size = w * h * d; + size_t elemsize = a.elemsize; + int elempack = a.elempack; + + int w1 = b.w; + int h1 = b.h; + int d1 = b.d; + int channels1 = b.c; + int size1 = w1 * h1 * d1; + size_t elemsize1 = b.elemsize; + int elempack1 = b.elempack; + + if (a.dims == 4) + { + if (b.dims == 4) + { + // type 29 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 3) + { + // type 28 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + for (int y = 0; y < h; y++) + { + __m256 _b0 = _mm256_loadu_ps(ptr1); + for (int x = 0; x < w; x++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_p, _b0); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + outptr += 8; + } + + ptr1 += 8; + } + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 27 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + __m256 _b0 = _mm256_loadu_ps(ptr1); + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_p, _b0); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + outptr += 8; + } + } + + ptr1 += 8; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 25 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 26 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m256 _b0 = _mm256_loadu_ps((const float*)b + q * 8); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_p, _b0); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + outptr += 8; + } + } + + return 0; + } + } + else if (a.dims == 3) + { + if (b.dims == 4) + { + // type 23 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + 
const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + for (int y = 0; y < h1; y++) + { + __m256 _a0 = _mm256_loadu_ps(ptr); + for (int x = 0; x < w1; x++) + { + __m256 _p = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_a0, _p); + _mm256_storeu_ps(outptr, _outp); + ptr1 += 8; + outptr += 8; + } + + ptr += 8; + } + } + } + + return 0; + } + + if (b.dims == 3) + { + if (w1 == 1 && h1 == 1 && channels1 == channels) + { + // special type 1 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* b0 = b.channel(q); + float* outptr = c.channel(q); + __m256 _b0 = _mm256_loadu_ps(b0); + for (int i = 0; i < size; i++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_p, _b0); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + outptr += 8; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1) + { + // special type 2 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b; + float* outptr = c.channel(q); + for (int i = 0; i < size; i++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _p1 = _mm256_broadcast_ss(ptr1); + __m256 _outp = op.func_pack8(_p, _p1); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + ptr1 += 1; + outptr += 8; + } + } + + return 0; + } + + if (w == 1 && h == 1 && channels1 == channels) + { + // special type 3 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* a0 = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + __m256 _a0 = _mm256_loadu_ps(a0); + for (int i = 0; i < size1; i++) + { + __m256 _p1 = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_a0, _p1); + _mm256_storeu_ps(outptr, _outp); + ptr1 += 8; + outptr += 8; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels == 1 && elempack == 1) + { + // special type 4 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a; + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + for (int i = 0; i < size1; i++) + { + __m256 _p = _mm256_broadcast_ss(ptr); + __m256 _p1 = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_p, _p1); + _mm256_storeu_ps(outptr, _outp); + ptr += 1; + ptr1 += 8; + outptr += 8; + } + } + + return 0; + } + + if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) + { + // special type 5 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m256 _p1 = _mm256_loadu_ps(ptr1 + y * 8); + for (int x = 0; x < w; x++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_p, _p1); + _mm256_storeu_ps(outptr, 
_outp); + + ptr += 8; + outptr += 8; + } + } + } + + return 0; + } + + if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) + { + // special type 6 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _p1 = _mm256_loadu_ps(ptr1 + x * 8); + __m256 _outp = op.func_pack8(_p, _p1); + _mm256_storeu_ps(outptr, _outp); + + ptr += 8; + outptr += 8; + } + } + } + + return 0; + } + + if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) + { + // special type 7 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m256 _p = _mm256_loadu_ps(ptr + y * 8); + for (int x = 0; x < w1; x++) + { + __m256 _p1 = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_p, _p1); + _mm256_storeu_ps(outptr, _outp); + + ptr1 += 8; + outptr += 8; + } + } + } + + return 0; + } + + if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) + { + // special type 8 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __m256 _p = _mm256_loadu_ps(ptr + x * 8); + __m256 _p1 = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_p, _p1); + _mm256_storeu_ps(outptr, _outp); + + ptr1 += 8; + outptr += 8; + } + } + } + + return 0; + } + + // type 19 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 18 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m256 _b0 = _mm256_loadu_ps(ptr1); + for (int x = 0; x < w; x++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_p, _b0); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + outptr += 8; + } + + ptr1 += 8; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 16 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 17 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m256 _b0 = _mm256_loadu_ps((const float*)b + q * 8); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_p, _b0); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + outptr += 8; + } + } + + return 0; + } + } + else if (a.dims == 2) + { + if (b.dims == 4) + { + // type 22 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + 
#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + __m256 _a0 = _mm256_loadu_ps(ptr); + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __m256 _p = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_a0, _p); + _mm256_storeu_ps(outptr, _outp); + ptr1 += 8; + outptr += 8; + } + } + + ptr += 8; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 14 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m256 _a0 = _mm256_loadu_ps(ptr); + for (int x = 0; x < w1; x++) + { + __m256 _p1 = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_a0, _p1); + _mm256_storeu_ps(outptr, _outp); + ptr1 += 8; + outptr += 8; + } + + ptr += 8; + } + } + + return 0; + } + + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 13 + return binary_op_7_13_19_29(a, b, c, opt); + } + + if (b.dims == 1) + { + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 11 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 12 + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h; y++) + { + __m256 _b0 = _mm256_loadu_ps(ptr1); + for (int x = 0; x < w; x++) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _outp = op.func_pack8(_p, _b0); + _mm256_storeu_ps(outptr, _outp); + ptr += 8; + outptr += 8; + } + + ptr1 += 8; + } + + return 0; + } + } + else if (a.dims == 1) + { + if (a.w == 1 && elempack == 1) + { + // type 2 3 4 20 + return binary_op_2_3_4_20(a, b, c, opt); + } + + if (b.dims == 4) + { + // type 21 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m256 _a0 = _mm256_loadu_ps((const float*)a + q * 8); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __m256 _p1 = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_a0, _p1); + _mm256_storeu_ps(outptr, _outp); + ptr1 += 8; + outptr += 8; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 9 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m256 _a0 = _mm256_loadu_ps((const float*)a + q * 8); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __m256 _p1 = _mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_a0, _p1); + _mm256_storeu_ps(outptr, _outp); + ptr1 += 8; + outptr += 8; + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 8 + c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h1; y++) + { + __m256 _a0 = _mm256_loadu_ps(ptr); + for (int x = 0; x < w1; x++) + { + __m256 _p1 = 
_mm256_loadu_ps(ptr1); + __m256 _outp = op.func_pack8(_a0, _p1); + _mm256_storeu_ps(outptr, _outp); + ptr1 += 8; + outptr += 8; + } + + ptr += 8; + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 6 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 7 + binary_op_7_13_19_29(a, b, c, opt); + } + } + + return 0; +} +#endif // __AVX__ + +template +static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int size = w * h * d; + size_t elemsize = a.elemsize; + int elempack = a.elempack; + + int w1 = b.w; + int h1 = b.h; + int d1 = b.d; + int channels1 = b.c; + int size1 = w1 * h1 * d1; + size_t elemsize1 = b.elemsize; + int elempack1 = b.elempack; + + if (a.dims == 4) + { + if (b.dims == 4) + { + // type 29 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 3) + { + // type 28 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + for (int y = 0; y < h; y++) + { + __m128 _b0 = _mm_loadu_ps(ptr1); + for (int x = 0; x < w; x++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _outp = op.func_pack4(_p, _b0); + _mm_storeu_ps(outptr, _outp); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 27 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + __m128 _b0 = _mm_loadu_ps(ptr1); + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _outp = op.func_pack4(_p, _b0); + _mm_storeu_ps(outptr, _outp); + ptr += 4; + outptr += 4; + } + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 25 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 26 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = _mm_loadu_ps((const float*)b + q * 4); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _outp = op.func_pack4(_p, _b0); + _mm_storeu_ps(outptr, _outp); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 3) + { + if (b.dims == 4) + { + // type 23 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + for (int y = 0; y < h1; y++) + { + __m128 _a0 = _mm_loadu_ps(ptr); + for (int x = 0; x < w1; x++) + { + __m128 _p = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_a0, _p); + _mm_storeu_ps(outptr, _outp); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + } + + return 0; + } + + if (b.dims == 3) + { + if (w1 == 1 && h1 == 1 && channels1 
== channels) + { + // special type 1 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + float* outptr = c.channel(q); + const float* b0 = b.channel(q); + __m128 _b0 = _mm_loadu_ps(b0); + for (int i = 0; i < size; i++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _outp = op.func_pack4(_p, _b0); + _mm_storeu_ps(outptr, _outp); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1) + { + // special type 2 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b; + float* outptr = c.channel(q); + for (int i = 0; i < size; i++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _p1 = _mm_set1_ps(*ptr1); + __m128 _outp = op.func_pack4(_p, _p1); + _mm_storeu_ps(outptr, _outp); + ptr += 4; + ptr1 += 1; + outptr += 4; + } + } + + return 0; + } + + if (w == 1 && h == 1 && channels1 == channels) + { + // special type 3 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* a0 = a.channel(q); + float* outptr = c.channel(q); + const float* ptr1 = b.channel(q); + __m128 _a0 = _mm_loadu_ps(a0); + for (int i = 0; i < size1; i++) + { + __m128 _p1 = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_a0, _p1); + _mm_storeu_ps(outptr, _outp); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels == 1 && elempack == 1) + { + // special type 4 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a; + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + for (int i = 0; i < size1; i++) + { + __m128 _p = _mm_set1_ps(*ptr); + __m128 _p1 = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_p, _p1); + _mm_storeu_ps(outptr, _outp); + ptr += 1; + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) + { + // special type 5 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _p1 = _mm_loadu_ps(ptr1 + y * 4); + for (int x = 0; x < w; x++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _outp = op.func_pack4(_p, _p1); + _mm_storeu_ps(outptr, _outp); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) + { + // special type 6 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; 
x++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _p1 = _mm_loadu_ps(ptr1 + x * 4); + __m128 _outp = op.func_pack4(_p, _p1); + _mm_storeu_ps(outptr, _outp); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) + { + // special type 7 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _p = _mm_loadu_ps(ptr + y * 4); + for (int x = 0; x < w1; x++) + { + __m128 _p1 = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_p, _p1); + _mm_storeu_ps(outptr, _outp); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) + { + // special type 8 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __m128 _p = _mm_loadu_ps(ptr + x * 4); + __m128 _p1 = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_p, _p1); + _mm_storeu_ps(outptr, _outp); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + // type 19 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 18 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _b0 = _mm_loadu_ps(ptr1); + for (int x = 0; x < w; x++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _outp = op.func_pack4(_p, _b0); + _mm_storeu_ps(outptr, _outp); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 16 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 17 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = _mm_loadu_ps((const float*)b + q * 4); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _outp = op.func_pack4(_p, _b0); + _mm_storeu_ps(outptr, _outp); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 2) + { + if (b.dims == 4) + { + // type 22 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + __m128 _a0 = _mm_loadu_ps(ptr); + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __m128 _p = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_a0, _p); + _mm_storeu_ps(outptr, _outp); + ptr1 += 4; + outptr += 4; + } + } + + ptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 14 + 
c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = _mm_loadu_ps(ptr); + for (int x = 0; x < w1; x++) + { + __m128 _p1 = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_a0, _p1); + _mm_storeu_ps(outptr, _outp); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + + return 0; + } + + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 13 + return binary_op_7_13_19_29(a, b, c, opt); + } + + if (b.dims == 1) + { + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 11 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 12 + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h; y++) + { + __m128 _b0 = _mm_loadu_ps(ptr1); + for (int x = 0; x < w; x++) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _outp = op.func_pack4(_p, _b0); + _mm_storeu_ps(outptr, _outp); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + + return 0; + } + } + else if (a.dims == 1) + { + if (a.w == 1 && elempack == 1) + { + // type 2 3 4 20 + return binary_op_2_3_4_20(a, b, c, opt); + } + + if (b.dims == 4) + { + // type 21 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = _mm_loadu_ps((const float*)a + q * 4); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __m128 _p1 = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_a0, _p1); + _mm_storeu_ps(outptr, _outp); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 9 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = _mm_loadu_ps((const float*)a + q * 4); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __m128 _p1 = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_a0, _p1); + _mm_storeu_ps(outptr, _outp); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 8 + c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = _mm_loadu_ps(ptr); + for (int x = 0; x < w1; x++) + { + __m128 _p1 = _mm_loadu_ps(ptr1); + __m128 _outp = op.func_pack4(_a0, _p1); + _mm_storeu_ps(outptr, _outp); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 6 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 7 + binary_op_7_13_19_29(a, b, c, opt); + } + } + + return 0; +} +#endif // __SSE2__ + +template +static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int 
d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = a.channel(q); + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + __m512 _b_avx512 = _mm512_set1_ps(b); + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + _p = op.func_pack16(_p, _b_avx512); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + } +#endif // __AVX512F__ + __m256 _b_avx = _mm256_set1_ps(b); + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + _p = op.func_pack8(_p, _b_avx); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + } +#endif // __AVX__ + __m128 _b = _mm_set1_ps((float)b); + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_load_ps(ptr); + _p = op.func_pack4(_p, _b); + _mm_store_ps(ptr, _p); + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + *ptr = op.func(*ptr, b); + ptr++; + } + } + + return 0; +} + +namespace BinaryOp_x86_functor { + +struct binary_op_add +{ + float func(const float& x, const float& y) const + { + return x + y; + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return _mm_add_ps(x, y); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return _mm256_add_ps(x, y); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return _mm512_add_ps(x, y); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +}; + +struct binary_op_sub +{ + float func(const float& x, const float& y) const + { + return x - y; + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return _mm_sub_ps(x, y); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return _mm256_sub_ps(x, y); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return _mm512_sub_ps(x, y); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +}; + +struct binary_op_mul +{ + float func(const float& x, const float& y) const + { + return x * y; + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return _mm_mul_ps(x, y); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return _mm256_mul_ps(x, y); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return _mm512_mul_ps(x, y); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +}; + +struct binary_op_div +{ + float func(const float& x, const float& y) const + { + return x / y; + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return _mm_div_ps(x, y); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return _mm256_div_ps(x, y); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return _mm512_div_ps(x, y); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +}; + +struct binary_op_max +{ + float func(const float& x, const float& y) const + { + return std::max(x, y); + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return _mm_max_ps(x, y); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return _mm256_max_ps(x, y); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return _mm512_max_ps(x, y); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ 
+}; + +struct binary_op_min +{ + float func(const float& x, const float& y) const + { + return std::min(x, y); + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return _mm_min_ps(x, y); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return _mm256_min_ps(x, y); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return _mm512_min_ps(x, y); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +}; + +struct binary_op_pow +{ + float func(const float& x, const float& y) const + { + return (float)pow(x, y); + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return pow_ps(x, y); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return pow256_ps(x, y); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return pow512_ps(x, y); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +}; + +struct binary_op_rsub +{ + float func(const float& x, const float& y) const + { + return y - x; + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return _mm_sub_ps(y, x); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return _mm256_sub_ps(y, x); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return _mm512_sub_ps(y, x); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +}; + +struct binary_op_rdiv +{ + float func(const float& x, const float& y) const + { + return y / x; + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return _mm_div_ps(y, x); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return _mm256_div_ps(y, x); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return _mm512_div_ps(y, x); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +}; + +} // namespace BinaryOp_x86_functor + +int BinaryOp_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#if __SSE2__ + using namespace BinaryOp_x86_functor; + + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& bottom_blob1 = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int elempack = bottom_blob.elempack; + int elempack1 = bottom_blob1.elempack; + +#if __AVX__ +#if __AVX512F__ + if (elempack == 16 || elempack1 == 16) + { + if (op_type == Operation_ADD) + return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_SUB) + return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MUL) + return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_DIV) + return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MAX) + return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MIN) + return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_POW) + return binary_op_pack16(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_RSUB) + return binary_op_pack16(bottom_blob1, bottom_blob, top_blob, opt); + + if (op_type == Operation_RDIV) + return binary_op_pack16(bottom_blob1, bottom_blob, top_blob, opt); + } +#endif // __AVX512F__ + + if (elempack == 8 || elempack1 == 8) + { + if (op_type == Operation_ADD) + 
return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_SUB) + return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MUL) + return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_DIV) + return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MAX) + return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MIN) + return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_POW) + return binary_op_pack8(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_RSUB) + return binary_op_pack8(bottom_blob1, bottom_blob, top_blob, opt); + + if (op_type == Operation_RDIV) + return binary_op_pack8(bottom_blob1, bottom_blob, top_blob, opt); + } +#endif // __AVX__ + + if (elempack == 4 || elempack1 == 4) + { + if (op_type == Operation_ADD) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_SUB) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MUL) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_DIV) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MAX) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MIN) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_POW) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_RSUB) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + + if (op_type == Operation_RDIV) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + } +#endif // __SSE2__ + + return BinaryOp::forward(bottom_blobs, top_blobs, opt); +} + +int BinaryOp_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + using namespace BinaryOp_x86_functor; + + if (op_type == Operation_ADD) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_SUB) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MUL) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_DIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MAX) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MIN) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_POW) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RSUB) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RDIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/binaryop_x86.h b/src/layer/x86/binaryop_x86.h new file mode 100644 index 000000000000..9f3ebb3cac9f --- /dev/null +++ b/src/layer/x86/binaryop_x86.h @@ -0,0 +1,34 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BINARYOP_X86_H +#define LAYER_BINARYOP_X86_H + +#include "binaryop.h" + +namespace ncnn { + +class BinaryOp_x86 : virtual public BinaryOp +{ +public: + BinaryOp_x86(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BINARYOP_X86_H From 9277a1fe04ca7d71daf11584e3464ea38348ce25 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 11 Aug 2022 17:58:07 +0800 Subject: [PATCH 26/36] docs(src): update docs --- docs/developer-guide/operators.md | 3 ++ src/layer/layernorm.cpp | 82 ------------------------------- src/layer/multiheadattention.cpp | 1 - 3 files changed, 3 insertions(+), 83 deletions(-) diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 578a726fc916..61aa1a2f3dbe 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -971,11 +971,14 @@ y = x * gamma + beta by elementwise | 0 | affine_size | int | 0 | | | 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | | 2 | affine | int | 1 | | +| 3 | int8_scale_term | int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | gamma_data | float | [affine_size] | | beta_data | float | [affine_size] | +| input_scales | float | [affine_size] | +| output_scale | float | [affine_size] | # Log ``` diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index 5aec4cfb177b..5f7def053e11 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -17,8 +17,6 @@ #include #include "mathfun.h" -// #include "npy.h" - namespace ncnn { LayerNorm::LayerNorm() @@ -89,71 +87,6 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con return -200; } - // { - // // setup input for debug - // { - // // write input - // std::vector shape; - // std::vector data; - // std::string typestr; - // std::string filename = "/home/PJLAB/konghuanjun/GitProjects/FQ-ViT/in_1_197_768.npy"; - // npy::LoadArrayFromNumpy(filename, typestr, shape, data); - - // float* ptr = (float*)bottom_top_blob.data; - // for (int i = 0; i < 151296; ++i) - // { - // ptr[i] = data[i]; - // } - // } - // { - // // write input scales - // std::vector shape; - // std::vector data; - // std::string typestr; - // std::string filename = "/home/PJLAB/konghuanjun/GitProjects/FQ-ViT/input_scales_768.npy"; - // npy::LoadArrayFromNumpy(filename, typestr, shape, data); - - // float* ptr = (float*)input_scales.data; - // for (int i = 0; i < 768; ++i) - // { - // ptr[i] =1.0f / data[i]; - // } - // } - // { - // // write output scale - // float* ptr = (float*)output_scale.data; - // ptr[0] = 1.0f / 0.0833f; - // } - // { - // // write gamma - // std::vector shape; - // std::vector data; - // std::string typestr; - // std::string filename = "/home/PJLAB/konghuanjun/GitProjects/FQ-ViT/gamma_768.npy"; - // npy::LoadArrayFromNumpy(filename, typestr, shape, data); - - // float* ptr = (float*)gamma_data.data; - // for (int i = 0; i < 768; ++i) - // { - // ptr[i] 
=data[i]; - // } - // } - // { - // // write beta - // std::vector shape; - // std::vector data; - // std::string typestr; - // std::string filename = "/home/PJLAB/konghuanjun/GitProjects/FQ-ViT/beta_768.npy"; - // npy::LoadArrayFromNumpy(filename, typestr, shape, data); - - // float* ptr = (float*)beta_data.data; - // for (int i = 0; i < 768; ++i) - // { - // ptr[i] =data[i]; - // } - // } - // } - // Transformer using BNC format float in_scale_max = -FLT_MAX; const float out_scale = output_scale[0]; @@ -187,7 +120,6 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con } else { - // else fuse ((in * in_scale).round() * (in_scale_max / in_scale)).round to (in*in_scale_max).round() int32_t* ptr = (int32_t*)xq.data; #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < elem_count; ++i) @@ -196,8 +128,6 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con } } - // std::vector A_save, B_save, result_save; - // get mean and std #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < xq.h; ++i) @@ -229,21 +159,9 @@ int LayerNorm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) con int32_t B = round((beta_data[j] - scale_b * gamma_data[j]) * out_scale * N); ptr[j] = round((sign * M * ptr[j] + B) / N); - - // A_save.emplace_back(A); - // B_save.emplace_back(B); - // result_save.emplace_back(ptr[j]); } } - // { - // // save to numpy - // const unsigned long shape[] = {197 * 768}; - // npy::SaveArrayAsNumpy("A.npy", false, 1, shape, A_save); - // npy::SaveArrayAsNumpy("B.npy", false, 1, shape, B_save); - // npy::SaveArrayAsNumpy("result.npy", false, 1, shape, result_save); - // } - if (int8_scale_term >= 100) { // output int8 diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 1c2cee417a02..7cf8665ea86c 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -320,7 +320,6 @@ int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std:: } } - // xqkv int4 @ int8, implement by shift Mat xqkv(embed_dim_per_head, num_head, seqlen, 1u, opt.workspace_allocator); const float xqkv_out_scale = internal_scales[4] / internal_scales[2]; From a666997f0ab04a0d99a7c3ec9663bf50c1a4726b Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 10:50:20 +0800 Subject: [PATCH 27/36] fix(CI): layernorm int8 build error --- src/layer/layernorm.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index 5f7def053e11..02b2b35802cb 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -60,7 +60,10 @@ int LayerNorm::load_model(const ModelBin& mb) #ifdef NCNN_INT8 static inline void get_MN(const float x, uint32_t& M, uint32_t& N) { - int bit = 7 - round(floor(log2(x))); +#define LOG2 (0.693147180f) + // log2(x) = log(x) / log(2) + int bit = 7 - round(floor(log(x) / LOG2)); +#undef LOG2 bit = bit < 0 ? 0 : bit; bit = bit > 31 ? 
31 : bit; From 8226eee9aa27d9b1c33672b1dad1c4cbd2f7f47c Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 10:52:02 +0800 Subject: [PATCH 28/36] fix(src/layer): fixme warning --- src/layer/x86/multiheadattention_x86.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index b7e42f162709..b8013e489602 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -55,14 +55,6 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) #endif // for fp32 inference, const fold inv_sqrt_embed_dim_per_head into `q_w` and `q_bias` -#if 0 - // FIXME! - float scale_vals[1] = {inv_sqrt_embed_dim_per_head}; - q_weight_fold_data = q_weight_data.clone(); - q_weight_fold_data.substract_mean_normalize(0, scale_vals); - q_bias_fold_data = q_bias_data.clone(); - q_bias_fold_data.substract_mean_normalize(0, scale_vals); -#else q_weight_fold_data = q_weight_data.clone(); for (int i = 0; i < q_weight_fold_data.w; ++i) { @@ -73,7 +65,6 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) { q_bias_fold_data[i] *= inv_sqrt_embed_dim_per_head; } -#endif if (opt.lightmode) { From 259ca0b3f33d8359cfbb98622f65673df0a1aa49 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 12:09:07 +0800 Subject: [PATCH 29/36] fix(CI): sse2 run error --- src/layer/x86/x86_usability.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index c1865e211c73..7e52b4216213 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -551,13 +551,12 @@ static NCNN_FORCEINLINE float mul_add_reduce_no_align(const float* a, const floa #elif __SSE2__ align = (size >> 2) << 2; __m128 _sum = _mm_set1_ps(0.f); - for (int i = 0; i < align; i += 8) + for (int i = 0; i < align; i += 4) { __m128 val0 = _mm_loadu_ps(a + i); __m128 val1 = _mm_loadu_ps(b + i); _sum = _mm_add_ps(_sum, _mm_mul_ps(val0, val1)); } - sum += _mm_reduce_add_ps(_sum); #endif From 2c1d9b08d89b47bdd1096bd1bc3bb297238a9b6b Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 12:15:40 +0800 Subject: [PATCH 30/36] fix(CI): build error --- src/layer/layernorm.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp index 02b2b35802cb..9270087fb1fd 100644 --- a/src/layer/layernorm.cpp +++ b/src/layer/layernorm.cpp @@ -16,6 +16,7 @@ #include #include #include "mathfun.h" +#include namespace ncnn { From 45f920775ef04f6f103e9a0dc579215b446bcbb0 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 14:48:06 +0800 Subject: [PATCH 31/36] fix(CI): test_layernorm --- tests/test_layernorm.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_layernorm.cpp b/tests/test_layernorm.cpp index f638d299cd4c..5cfa79d27c35 100644 --- a/tests/test_layernorm.cpp +++ b/tests/test_layernorm.cpp @@ -121,10 +121,10 @@ static int test_layernorm_3() static int test_layernorm_4() { return 0 - || test_layernorm_int8(RandomMat(768, 197, 1), 768, 0.0001f, 101) - || test_layernorm_int8(RandomMat(768, 127, 1), 6, 0.01f, 101) - || test_layernorm_int8(RandomMat(6, 7, 1), 6, 0.001f, 1) - || test_layernorm_int8(RandomMat(768, 127, 1), 6, 0.01f, 1); + || test_layernorm_int8(RandomMat(768, 197), 768, 0.0001f, 101) + || test_layernorm_int8(RandomMat(768, 127), 6, 0.01f, 101) + || test_layernorm_int8(RandomMat(6, 7), 6, 0.001f, 1) + || 
test_layernorm_int8(RandomMat(768, 127), 6, 0.01f, 1); } int main() From a2c48aab9e7e49df4aa0ac75f0796e01e928e89a Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 14:58:27 +0800 Subject: [PATCH 32/36] fix(CI): test_layernorm --- tests/test_layernorm.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_layernorm.cpp b/tests/test_layernorm.cpp index 5cfa79d27c35..5b916853b8fc 100644 --- a/tests/test_layernorm.cpp +++ b/tests/test_layernorm.cpp @@ -122,9 +122,8 @@ static int test_layernorm_4() { return 0 || test_layernorm_int8(RandomMat(768, 197), 768, 0.0001f, 101) - || test_layernorm_int8(RandomMat(768, 127), 6, 0.01f, 101) - || test_layernorm_int8(RandomMat(6, 7), 6, 0.001f, 1) - || test_layernorm_int8(RandomMat(768, 127), 6, 0.01f, 1); + || test_layernorm_int8(RandomMat(127, 127), 127, 0.01f, 101) + || test_layernorm_int8(RandomMat(6, 7), 6, 0.001f, 1); } int main() From 102e1dd9e7b37e72f2691bc23ac16e1dc7ff8706 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 15:16:20 +0800 Subject: [PATCH 33/36] fix(CI): add hook --- .github/workflows/linux-x64-cpu-clang.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/linux-x64-cpu-clang.yml b/.github/workflows/linux-x64-cpu-clang.yml index 82e176559941..56ffcf4462dd 100644 --- a/.github/workflows/linux-x64-cpu-clang.yml +++ b/.github/workflows/linux-x64-cpu-clang.yml @@ -51,6 +51,8 @@ jobs: mkdir build-sse2 && cd build-sse2 cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j 2 + - name: Setup Debug Session + uses: csexton/debugger-action@master - name: test-sse2 run: cd build-sse2 && ctest --output-on-failure -j 2 - name: build-shared From 4c9c906230a1d970316e222d7ec1cd781a89a1c0 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 15:39:35 +0800 Subject: [PATCH 34/36] fix(layernorm): add debug hook --- .github/workflows/linux-x86-cpu-clang.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/linux-x86-cpu-clang.yml b/.github/workflows/linux-x86-cpu-clang.yml index 92544f4e4741..0d90618b86fa 100644 --- a/.github/workflows/linux-x86-cpu-clang.yml +++ b/.github/workflows/linux-x86-cpu-clang.yml @@ -45,6 +45,8 @@ jobs: mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j 2 + - name: Setup Debug Session + uses: csexton/debugger-action@master - name: test run: cd build && ctest --output-on-failure -j 2 - name: build-shared From e9c6792b7cbb6862f1b8fd98c7a06bd1df027b50 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 16:10:06 +0800 Subject: [PATCH 35/36] fix(CI): update --- .github/workflows/linux-x64-cpu-clang.yml | 2 -- .github/workflows/linux-x86-cpu-clang.yml | 2 -- tests/test_multiheadattention.cpp | 8 ++++---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/workflows/linux-x64-cpu-clang.yml b/.github/workflows/linux-x64-cpu-clang.yml index 56ffcf4462dd..82e176559941 100644 --- a/.github/workflows/linux-x64-cpu-clang.yml +++ b/.github/workflows/linux-x64-cpu-clang.yml @@ -51,8 +51,6 @@ jobs: mkdir build-sse2 && cd build-sse2 cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . 
-j 2 - - name: Setup Debug Session - uses: csexton/debugger-action@master - name: test-sse2 run: cd build-sse2 && ctest --output-on-failure -j 2 - name: build-shared diff --git a/.github/workflows/linux-x86-cpu-clang.yml b/.github/workflows/linux-x86-cpu-clang.yml index 0d90618b86fa..92544f4e4741 100644 --- a/.github/workflows/linux-x86-cpu-clang.yml +++ b/.github/workflows/linux-x86-cpu-clang.yml @@ -45,8 +45,6 @@ jobs: mkdir build && cd build cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. cmake --build . -j 2 - - name: Setup Debug Session - uses: csexton/debugger-action@master - name: test run: cd build && ctest --output-on-failure -j 2 - name: build-shared diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index bbf71c4d9c64..2cc00ab099d3 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -118,10 +118,10 @@ static int test_multiheadattention_int8(const ncnn::Mat& a, int num_heads) weights[9] = RandomMat(1, 1.f, 10.f); weights[10] = RandomMat(1, 1.f, 10.f); - weights[11] = RandomMat(embed_dim, 1.f, 10.f); - weights[12] = RandomMat(embed_dim, 1.f, 10.f); - weights[13] = RandomMat(embed_dim, 1.f, 10.f); - weights[14] = RandomMat(embed_dim, 1.f, 10.f); + weights[11] = scales_mat(weights[0], embed_dim, embed_dim, embed_dim); + weights[12] = scales_mat(weights[2], embed_dim, embed_dim, embed_dim); + weights[13] = scales_mat(weights[4], embed_dim, embed_dim, embed_dim); + weights[14] = scales_mat(weights[6], embed_dim, embed_dim, embed_dim); weights[15] = RandomMat(5, 1.f, 10.f); From 81a114996d8b7c98e50e89df6b181914e80e408a Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 15 Aug 2022 16:24:51 +0800 Subject: [PATCH 36/36] fix(test): update --- tests/test_multiheadattention.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index 2cc00ab099d3..ec26971cd105 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -141,7 +141,7 @@ static int test_multiheadattention_2() { return 0 || test_multiheadattention_int8(RandomMat(64, 128), 8) - || test_multiheadattention_int8(RandomMat(512, 512), 32); + || test_multiheadattention_int8(RandomMat(64, 127), 32); } #endif
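
Note on the final test change above: the int8 scale blobs for the q/k/v/out projection weights (weights[11] through weights[14]) are now derived from the corresponding weight mats via scales_mat() instead of being drawn at random. Weight scales that do not reflect the actual weight magnitudes make the quantized path diverge from the fp32 reference well beyond the test tolerance, so deriving them from the weights themselves keeps the int8/fp32 comparison meaningful. Below is a minimal sketch of what such a helper is assumed to do (one scale per output channel, mapping that row's largest magnitude to 127); the helper name, signature, and rule are illustrative only and the real scales_mat() in the patch may differ.

    // Illustrative sketch, not part of the patch: derive one int8 scale per
    // output channel of a weight mat laid out as outch rows of inch floats,
    // using scale = 127 / absmax of the row. All-zero rows fall back to 1.
    #include <cmath>
    #include "mat.h" // ncnn::Mat

    static ncnn::Mat weight_scales_per_channel(const ncnn::Mat& weight, int outch, int inch)
    {
        ncnn::Mat scales(outch);
        for (int i = 0; i < outch; i++)
        {
            const float* w = (const float*)weight + i * inch;

            float absmax = 0.f;
            for (int j = 0; j < inch; j++)
            {
                if (std::fabs(w[j]) > absmax)
                    absmax = std::fabs(w[j]);
            }

            scales[i] = absmax == 0.f ? 1.f : 127.f / absmax;
        }
        return scales;
    }

With scales computed this way, quantizing and dequantizing the randomly generated test weights stays close to the original values, which is presumably why the random scale mats in the earlier version of the test were replaced.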