From 66d1104930a93bbe05dbdeddd986c14652ca06ae Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 13 Jun 2022 17:14:01 +0800 Subject: [PATCH 01/15] feat(tools/quantize): support .ini format feat(src/layer): add mha int8 --- benchmark/benchncnn.cpp | 3 + benchmark/vision_transformer_int8.param | 146 ++ docs/developer-guide/operators.md | 13 +- .../quantized-int8-inference.md | 28 +- src/layer/multiheadattention.cpp | 333 ++- src/layer/multiheadattention.h | 30 + tests/test_multiheadattention.cpp | 60 +- tools/CMakeLists.txt | 2 +- tools/modelwriter.cpp | 2069 +++++++++++++++++ tools/modelwriter.h | 2054 +--------------- tools/quantize/CMakeLists.txt | 9 +- tools/quantize/imreadwrite.h | 2 +- tools/quantize/ini_config.cpp | 224 ++ tools/quantize/ini_config.h | 323 +++ tools/quantize/ncnn2int8.cpp | 497 +--- tools/quantize/ncnn2table.cpp | 103 +- tools/quantize/net_quantize.cpp | 625 +++++ tools/quantize/net_quantize.h | 48 + 18 files changed, 3998 insertions(+), 2571 deletions(-) create mode 100644 benchmark/vision_transformer_int8.param create mode 100644 tools/modelwriter.cpp create mode 100644 tools/quantize/ini_config.cpp create mode 100644 tools/quantize/ini_config.h create mode 100644 tools/quantize/net_quantize.cpp create mode 100644 tools/quantize/net_quantize.h diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 032e3f9fbc42..283c76a340e2 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -321,6 +321,9 @@ int main(int argc, char** argv) benchmark("vision_transformer", ncnn::Mat(384, 384, 3), opt); benchmark("FastestDet", ncnn::Mat(352, 352, 3), opt); + + benchmark("vision_transformer_int8", ncnn::Mat(384, 384, 3), opt); + #if NCNN_VULKAN delete g_blob_vkallocator; delete g_staging_vkallocator; diff --git a/benchmark/vision_transformer_int8.param b/benchmark/vision_transformer_int8.param new file mode 100644 index 000000000000..0a7e92383add --- /dev/null +++ b/benchmark/vision_transformer_int8.param @@ -0,0 +1,146 @@ +7767517 +144 192 +Input input 0 1 input +MemoryData backbone.cls_token 0 1 backbone.cls_token 0=768 1=1 +MemoryData backbone.pos_embed 0 1 backbone.pos_embed 0=768 1=145 +Convolution Conv_0 1 1 input onnx::Reshape_153 0=768 1=32 3=32 5=1 6=2359296 8=2 +Reshape Reshape_2 1 1 onnx::Reshape_153 onnx::Transpose_155 0=-1 1=768 +Permute Transpose_3 1 1 onnx::Transpose_155 onnx::Concat_156 0=1 +Concat Concat_4 2 1 backbone.cls_token onnx::Concat_156 onnx::Add_157 +BinaryOp Add_5 2 1 onnx::Add_157 backbone.pos_embed input.1 +Split splitncnn_0 1 2 input.1 input.1_splitncnn_0 input.1_splitncnn_1 +LayerNorm LayerNorm_6 1 1 input.1_splitncnn_1 qkv_input 0=768 1=1.000000e-06 +Split splitncnn_1 1 3 qkv_input qkv_input_splitncnn_0 qkv_input_splitncnn_1 qkv_input_splitncnn_2 +MultiHeadAttention MultiHeadAttention_15 3 1 qkv_input_splitncnn_2 qkv_input_splitncnn_1 qkv_input_splitncnn_0 onnx::Add_168 0=768 1=12 2=589824 3=1 +BinaryOp Add_16 2 1 input.1_splitncnn_0 onnx::Add_168 input.4 +Split splitncnn_2 1 2 input.4 input.4_splitncnn_0 input.4_splitncnn_1 +LayerNorm LayerNorm_17 1 1 input.4_splitncnn_1 onnx::Gemm_170 0=768 1=1.000000e-06 +InnerProduct Gemm_18 1 1 onnx::Gemm_170 mmdeploy::Gelu_171 0=3072 1=1 2=2359296 8=2 +GELU Gelu_19 1 1 mmdeploy::Gelu_171 input.8 0=1 +InnerProduct Gemm_20 1 1 input.8 input.12 0=768 1=1 2=2359296 8=2 +BinaryOp Add_21 2 1 input.4_splitncnn_0 input.12 input.16 +Split splitncnn_3 1 2 input.16 input.16_splitncnn_0 input.16_splitncnn_1 +LayerNorm LayerNorm_22 1 1 input.16_splitncnn_1 qkv_input.3 0=768 1=1.000000e-06 
+Split splitncnn_4 1 3 qkv_input.3 qkv_input.3_splitncnn_0 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_2 +MultiHeadAttention MultiHeadAttention_31 3 1 qkv_input.3_splitncnn_2 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_0 onnx::Add_184 0=768 1=12 2=589824 3=1 +BinaryOp Add_32 2 1 input.16_splitncnn_0 onnx::Add_184 input.20 +Split splitncnn_5 1 2 input.20 input.20_splitncnn_0 input.20_splitncnn_1 +LayerNorm LayerNorm_33 1 1 input.20_splitncnn_1 onnx::Gemm_186 0=768 1=1.000000e-06 +InnerProduct Gemm_34 1 1 onnx::Gemm_186 mmdeploy::Gelu_187 0=3072 1=1 2=2359296 8=2 +GELU Gelu_35 1 1 mmdeploy::Gelu_187 input.24 0=1 +InnerProduct Gemm_36 1 1 input.24 input.28 0=768 1=1 2=2359296 8=2 +BinaryOp Add_37 2 1 input.20_splitncnn_0 input.28 input.32 +Split splitncnn_6 1 2 input.32 input.32_splitncnn_0 input.32_splitncnn_1 +LayerNorm LayerNorm_38 1 1 input.32_splitncnn_1 qkv_input.7 0=768 1=1.000000e-06 +Split splitncnn_7 1 3 qkv_input.7 qkv_input.7_splitncnn_0 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_2 +MultiHeadAttention MultiHeadAttention_47 3 1 qkv_input.7_splitncnn_2 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_0 onnx::Add_200 0=768 1=12 2=589824 3=1 +BinaryOp Add_48 2 1 input.32_splitncnn_0 onnx::Add_200 input.36 +Split splitncnn_8 1 2 input.36 input.36_splitncnn_0 input.36_splitncnn_1 +LayerNorm LayerNorm_49 1 1 input.36_splitncnn_1 onnx::Gemm_202 0=768 1=1.000000e-06 +InnerProduct Gemm_50 1 1 onnx::Gemm_202 mmdeploy::Gelu_203 0=3072 1=1 2=2359296 8=2 +GELU Gelu_51 1 1 mmdeploy::Gelu_203 input.40 0=1 +InnerProduct Gemm_52 1 1 input.40 input.44 0=768 1=1 2=2359296 8=2 +BinaryOp Add_53 2 1 input.36_splitncnn_0 input.44 input.48 +Split splitncnn_9 1 2 input.48 input.48_splitncnn_0 input.48_splitncnn_1 +LayerNorm LayerNorm_54 1 1 input.48_splitncnn_1 qkv_input.11 0=768 1=1.000000e-06 +Split splitncnn_10 1 3 qkv_input.11 qkv_input.11_splitncnn_0 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_2 +MultiHeadAttention MultiHeadAttention_63 3 1 qkv_input.11_splitncnn_2 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_0 onnx::Add_216 0=768 1=12 2=589824 3=1 +BinaryOp Add_64 2 1 input.48_splitncnn_0 onnx::Add_216 input.52 +Split splitncnn_11 1 2 input.52 input.52_splitncnn_0 input.52_splitncnn_1 +LayerNorm LayerNorm_65 1 1 input.52_splitncnn_1 onnx::Gemm_218 0=768 1=1.000000e-06 +InnerProduct Gemm_66 1 1 onnx::Gemm_218 mmdeploy::Gelu_219 0=3072 1=1 2=2359296 8=2 +GELU Gelu_67 1 1 mmdeploy::Gelu_219 input.56 0=1 +InnerProduct Gemm_68 1 1 input.56 input.60 0=768 1=1 2=2359296 8=2 +BinaryOp Add_69 2 1 input.52_splitncnn_0 input.60 input.64 +Split splitncnn_12 1 2 input.64 input.64_splitncnn_0 input.64_splitncnn_1 +LayerNorm LayerNorm_70 1 1 input.64_splitncnn_1 qkv_input.15 0=768 1=1.000000e-06 +Split splitncnn_13 1 3 qkv_input.15 qkv_input.15_splitncnn_0 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_2 +MultiHeadAttention MultiHeadAttention_79 3 1 qkv_input.15_splitncnn_2 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_0 onnx::Add_232 0=768 1=12 2=589824 3=1 +BinaryOp Add_80 2 1 input.64_splitncnn_0 onnx::Add_232 input.68 +Split splitncnn_14 1 2 input.68 input.68_splitncnn_0 input.68_splitncnn_1 +LayerNorm LayerNorm_81 1 1 input.68_splitncnn_1 onnx::Gemm_234 0=768 1=1.000000e-06 +InnerProduct Gemm_82 1 1 onnx::Gemm_234 mmdeploy::Gelu_235 0=3072 1=1 2=2359296 8=2 +GELU Gelu_83 1 1 mmdeploy::Gelu_235 input.72 0=1 +InnerProduct Gemm_84 1 1 input.72 input.76 0=768 1=1 2=2359296 8=2 +BinaryOp Add_85 2 1 input.68_splitncnn_0 input.76 input.80 +Split splitncnn_15 1 2 input.80 input.80_splitncnn_0 
input.80_splitncnn_1 +LayerNorm LayerNorm_86 1 1 input.80_splitncnn_1 qkv_input.19 0=768 1=1.000000e-06 +Split splitncnn_16 1 3 qkv_input.19 qkv_input.19_splitncnn_0 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_2 +MultiHeadAttention MultiHeadAttention_95 3 1 qkv_input.19_splitncnn_2 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_0 onnx::Add_248 0=768 1=12 2=589824 3=1 +BinaryOp Add_96 2 1 input.80_splitncnn_0 onnx::Add_248 input.84 +Split splitncnn_17 1 2 input.84 input.84_splitncnn_0 input.84_splitncnn_1 +LayerNorm LayerNorm_97 1 1 input.84_splitncnn_1 onnx::Gemm_250 0=768 1=1.000000e-06 +InnerProduct Gemm_98 1 1 onnx::Gemm_250 mmdeploy::Gelu_251 0=3072 1=1 2=2359296 8=2 +GELU Gelu_99 1 1 mmdeploy::Gelu_251 input.88 0=1 +InnerProduct Gemm_100 1 1 input.88 input.92 0=768 1=1 2=2359296 8=2 +BinaryOp Add_101 2 1 input.84_splitncnn_0 input.92 input.96 +Split splitncnn_18 1 2 input.96 input.96_splitncnn_0 input.96_splitncnn_1 +LayerNorm LayerNorm_102 1 1 input.96_splitncnn_1 qkv_input.23 0=768 1=1.000000e-06 +Split splitncnn_19 1 3 qkv_input.23 qkv_input.23_splitncnn_0 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_2 +MultiHeadAttention MultiHeadAttention_111 3 1 qkv_input.23_splitncnn_2 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_0 onnx::Add_264 0=768 1=12 2=589824 3=1 +BinaryOp Add_112 2 1 input.96_splitncnn_0 onnx::Add_264 input.100 +Split splitncnn_20 1 2 input.100 input.100_splitncnn_0 input.100_splitncnn_1 +LayerNorm LayerNorm_113 1 1 input.100_splitncnn_1 onnx::Gemm_266 0=768 1=1.000000e-06 +InnerProduct Gemm_114 1 1 onnx::Gemm_266 mmdeploy::Gelu_267 0=3072 1=1 2=2359296 8=2 +GELU Gelu_115 1 1 mmdeploy::Gelu_267 input.104 0=1 +InnerProduct Gemm_116 1 1 input.104 input.108 0=768 1=1 2=2359296 8=2 +BinaryOp Add_117 2 1 input.100_splitncnn_0 input.108 input.112 +Split splitncnn_21 1 2 input.112 input.112_splitncnn_0 input.112_splitncnn_1 +LayerNorm LayerNorm_118 1 1 input.112_splitncnn_1 qkv_input.27 0=768 1=1.000000e-06 +Split splitncnn_22 1 3 qkv_input.27 qkv_input.27_splitncnn_0 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_2 +MultiHeadAttention MultiHeadAttention_127 3 1 qkv_input.27_splitncnn_2 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_0 onnx::Add_280 0=768 1=12 2=589824 3=1 +BinaryOp Add_128 2 1 input.112_splitncnn_0 onnx::Add_280 input.116 +Split splitncnn_23 1 2 input.116 input.116_splitncnn_0 input.116_splitncnn_1 +LayerNorm LayerNorm_129 1 1 input.116_splitncnn_1 onnx::Gemm_282 0=768 1=1.000000e-06 +InnerProduct Gemm_130 1 1 onnx::Gemm_282 mmdeploy::Gelu_283 0=3072 1=1 2=2359296 8=2 +GELU Gelu_131 1 1 mmdeploy::Gelu_283 input.120 0=1 +InnerProduct Gemm_132 1 1 input.120 input.124 0=768 1=1 2=2359296 8=2 +BinaryOp Add_133 2 1 input.116_splitncnn_0 input.124 input.128 +Split splitncnn_24 1 2 input.128 input.128_splitncnn_0 input.128_splitncnn_1 +LayerNorm LayerNorm_134 1 1 input.128_splitncnn_1 qkv_input.31 0=768 1=1.000000e-06 +Split splitncnn_25 1 3 qkv_input.31 qkv_input.31_splitncnn_0 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_2 +MultiHeadAttention MultiHeadAttention_143 3 1 qkv_input.31_splitncnn_2 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_0 onnx::Add_296 0=768 1=12 2=589824 3=1 +BinaryOp Add_144 2 1 input.128_splitncnn_0 onnx::Add_296 input.132 +Split splitncnn_26 1 2 input.132 input.132_splitncnn_0 input.132_splitncnn_1 +LayerNorm LayerNorm_145 1 1 input.132_splitncnn_1 onnx::Gemm_298 0=768 1=1.000000e-06 +InnerProduct Gemm_146 1 1 onnx::Gemm_298 mmdeploy::Gelu_299 0=3072 1=1 2=2359296 8=2 +GELU Gelu_147 1 1 mmdeploy::Gelu_299 input.136 0=1 
+InnerProduct Gemm_148 1 1 input.136 input.140 0=768 1=1 2=2359296 8=2 +BinaryOp Add_149 2 1 input.132_splitncnn_0 input.140 input.144 +Split splitncnn_27 1 2 input.144 input.144_splitncnn_0 input.144_splitncnn_1 +LayerNorm LayerNorm_150 1 1 input.144_splitncnn_1 qkv_input.35 0=768 1=1.000000e-06 +Split splitncnn_28 1 3 qkv_input.35 qkv_input.35_splitncnn_0 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_2 +MultiHeadAttention MultiHeadAttention_159 3 1 qkv_input.35_splitncnn_2 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_0 onnx::Add_312 0=768 1=12 2=589824 3=1 +BinaryOp Add_160 2 1 input.144_splitncnn_0 onnx::Add_312 input.148 +Split splitncnn_29 1 2 input.148 input.148_splitncnn_0 input.148_splitncnn_1 +LayerNorm LayerNorm_161 1 1 input.148_splitncnn_1 onnx::Gemm_314 0=768 1=1.000000e-06 +InnerProduct Gemm_162 1 1 onnx::Gemm_314 mmdeploy::Gelu_315 0=3072 1=1 2=2359296 8=2 +GELU Gelu_163 1 1 mmdeploy::Gelu_315 input.152 0=1 +InnerProduct Gemm_164 1 1 input.152 input.156 0=768 1=1 2=2359296 8=2 +BinaryOp Add_165 2 1 input.148_splitncnn_0 input.156 input.160 +Split splitncnn_30 1 2 input.160 input.160_splitncnn_0 input.160_splitncnn_1 +LayerNorm LayerNorm_166 1 1 input.160_splitncnn_1 qkv_input.39 0=768 1=1.000000e-06 +Split splitncnn_31 1 3 qkv_input.39 qkv_input.39_splitncnn_0 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_2 +MultiHeadAttention MultiHeadAttention_175 3 1 qkv_input.39_splitncnn_2 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_0 onnx::Add_328 0=768 1=12 2=589824 3=1 +BinaryOp Add_176 2 1 input.160_splitncnn_0 onnx::Add_328 input.164 +Split splitncnn_32 1 2 input.164 input.164_splitncnn_0 input.164_splitncnn_1 +LayerNorm LayerNorm_177 1 1 input.164_splitncnn_1 onnx::Gemm_330 0=768 1=1.000000e-06 +InnerProduct Gemm_178 1 1 onnx::Gemm_330 mmdeploy::Gelu_331 0=3072 1=1 2=2359296 8=2 +GELU Gelu_179 1 1 mmdeploy::Gelu_331 input.168 0=1 +InnerProduct Gemm_180 1 1 input.168 input.172 0=768 1=1 2=2359296 8=2 +BinaryOp Add_181 2 1 input.164_splitncnn_0 input.172 input.176 +Split splitncnn_33 1 2 input.176 input.176_splitncnn_0 input.176_splitncnn_1 +LayerNorm LayerNorm_182 1 1 input.176_splitncnn_1 qkv_input.43 0=768 1=1.000000e-06 +Split splitncnn_34 1 3 qkv_input.43 qkv_input.43_splitncnn_0 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_2 +MultiHeadAttention MultiHeadAttention_191 3 1 qkv_input.43_splitncnn_2 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_0 onnx::Add_344 0=768 1=12 2=589824 3=1 +BinaryOp Add_192 2 1 input.176_splitncnn_0 onnx::Add_344 input.180 +Split splitncnn_35 1 2 input.180 input.180_splitncnn_0 input.180_splitncnn_1 +LayerNorm LayerNorm_193 1 1 input.180_splitncnn_1 onnx::Gemm_346 0=768 1=1.000000e-06 +InnerProduct Gemm_194 1 1 onnx::Gemm_346 mmdeploy::Gelu_347 0=3072 1=1 2=2359296 8=2 +GELU Gelu_195 1 1 mmdeploy::Gelu_347 input.184 0=1 +InnerProduct Gemm_196 1 1 input.184 input.188 0=768 1=1 2=2359296 8=2 +BinaryOp Add_197 2 1 input.180_splitncnn_0 input.188 input.192 +LayerNorm LayerNorm_198 1 1 input.192 onnx::Gather_351 0=768 1=1.000000e-06 +Crop Gather_200 1 1 onnx::Gather_351 onnx::Gemm_353 -23309=1,0 -23310=1,1 -23311=1,0 +InnerProduct Gemm_201 1 1 onnx::Gemm_353 cls_score 0=1000 1=1 2=768000 8=2 +Softmax Softmax_202 1 1 cls_score output diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 5366da1e112c..578a726fc916 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -1084,9 +1084,10 @@ y = affine(out) | 0 | embed_dim | int | 0 | | | 1 | num_head | int | 1 | | | 2 | 
weight_data_size| int | 0 | | +| 3 | int8_scale_term| int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | +| weight | type | shape | description | +| ------------- | ----- | --- | --------------------- | | q_weight_data | float/fp16/int8 | [weight_data_size] | | q_bias_data | float | [embed_dim] | | k_weight_data | float/fp16/int8 | [weight_data_size] | @@ -1095,6 +1096,14 @@ y = affine(out) | v_bias_data | float | [embed_dim] | | out_weight_data| float/fp16/int8 | [weight_data_size] | | out_bias_data | float | [embed_dim] | +| q_input_scale | float | [1] | +| k_input_scale | float | [1] | +| v_input_scale | float | [1] | +| q_weight_scales | float | [embed_dim] | +| k_weight_scales | float | [embed_dim] | +| v_weight_scales | float | [embed_dim] | +| internal_scales | float | [5] | scales for xq/xk/xv/before_softmax/before_output | + # MVN ``` diff --git a/docs/how-to-use-and-FAQ/quantized-int8-inference.md b/docs/how-to-use-and-FAQ/quantized-int8-inference.md index cf8e05c20952..a8846fc96ebf 100644 --- a/docs/how-to-use-and-FAQ/quantized-int8-inference.md +++ b/docs/how-to-use-and-FAQ/quantized-int8-inference.md @@ -20,7 +20,7 @@ Some imagenet sample images here https://github.com/nihui/imagenet-sample-images ```shell find images/ -type f > imagelist.txt -./ncnn2table mobilenet-opt.param mobilenet-opt.bin imagelist.txt mobilenet.table mean=[104,117,123] norm=[0.017,0.017,0.017] shape=[224,224,3] pixel=BGR thread=8 method=kl +./ncnn2table mobilenet-opt.param mobilenet-opt.bin imagelist.txt mobilenet.table mean=[104,117,123] norm=[0.017,0.017,0.017] shape=[224,224,3] pixel=BGR thread=8 method=kl format=txt ``` * mean and norm are the values you passed to ```Mat::substract_mean_normalize()``` @@ -35,6 +35,7 @@ find images/ -type f > imagelist.txt * pixel is the pixel format of your model, image pixels will be converted to this type before ```Extractor::input()``` * thread is the CPU thread count that could be used for parallel inference * method is the post training quantization algorithm, kl and aciq are currently supported +* format is the output file type of quantization parameters, choose `ini` for `txt`. 
Using `txt` by default If your model has multiple input nodes, you can use multiple list files and other parameters @@ -60,7 +61,7 @@ mobilenet.load_model("mobilenet-int8.bin"); ## mixed precision inference -Before quantize your model, comment the layer weight scale line in table file, then the layer will do the float32 inference +Before quantize your model, comment layer weight scale line in the table file with `txt` format, then the layer will do the float32 inference ``` conv1_param_0 156.639840536 @@ -69,3 +70,26 @@ conv1_param_0 156.639840536 ``` #conv1_param_0 156.639840536 ``` + +If you are using `ini` format, just remove whole quantization parameters of the layer, for example: + +``` +[conv0] +type = "Conv" +weight = [ 156.639840536 ] +input_scale = 1.23 + +[fire] +type = "Gemm" +weight = [ 156.639840536 ] +input_scale = 1.23 +``` + +to + +``` +[fire] +type = "Gemm" +weight = [ 156.639840536 ] +input_scale = 1.23 +``` diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index ac26f599f048..80ec43518b25 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -15,6 +15,9 @@ #include "multiheadattention.h" #include +#ifdef NCNN_INT8 +#include +#endif namespace ncnn { @@ -27,50 +30,332 @@ int MultiHeadAttention::load_param(const ParamDict& pd) embed_dim = pd.get(0, 0); num_head = pd.get(1, 1); weight_data_size = pd.get(2, 0); + int8_scale_term = pd.get(3, 0); + if (int8_scale_term) + { +#if NCNN_INT8 + support_int8_storage = true; +#else + NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference"); + return -1; +#endif + } return 0; } int MultiHeadAttention::load_model(const ModelBin& mb) { - q_weight_data = mb.load(weight_data_size, 0); - if (q_weight_data.empty()) - return -100; +#define LOAD_MAT(name, len) \ + name = mb.load(len, 0); \ + if (name.empty()) \ + { \ + return -100; \ + } - q_bias_data = mb.load(embed_dim, 1); - if (q_bias_data.empty()) - return -100; +#define LOAD_FLOAT_MAT(name, len) \ + name = mb.load(len, 1); \ + if (name.empty()) \ + { \ + return -100; \ + } - k_weight_data = mb.load(weight_data_size, 0); - if (k_weight_data.empty()) - return -100; + LOAD_MAT(q_weight_data, weight_data_size); + LOAD_FLOAT_MAT(q_bias_data, embed_dim); - k_bias_data = mb.load(embed_dim, 1); - if (k_bias_data.empty()) - return -100; + LOAD_MAT(k_weight_data, weight_data_size); + LOAD_FLOAT_MAT(k_bias_data, embed_dim); - v_weight_data = mb.load(weight_data_size, 0); - if (v_weight_data.empty()) - return -100; + LOAD_MAT(v_weight_data, weight_data_size); + LOAD_FLOAT_MAT(v_bias_data, embed_dim); - v_bias_data = mb.load(embed_dim, 1); - if (v_bias_data.empty()) - return -100; + LOAD_MAT(out_weight_data, weight_data_size); + LOAD_FLOAT_MAT(out_bias_data, embed_dim); - out_weight_data = mb.load(weight_data_size, 0); - if (out_weight_data.empty()) - return -100; +#if NCNN_INT8 + if (int8_scale_term) + { + LOAD_FLOAT_MAT(q_input_scale, 1); + LOAD_FLOAT_MAT(k_input_scale, 1); + LOAD_FLOAT_MAT(v_input_scale, 1); - out_bias_data = mb.load(embed_dim, 1); - if (out_bias_data.empty()) - return -100; + LOAD_FLOAT_MAT(q_weight_scales, embed_dim); + LOAD_FLOAT_MAT(k_weight_scales, embed_dim); + LOAD_FLOAT_MAT(v_weight_scales, embed_dim); + LOAD_FLOAT_MAT(o_weight_scales, embed_dim); + + LOAD_FLOAT_MAT(internal_scales, 5); + } +#endif // NCNN_INT8 + +#undef LOAD_MAT +#undef LOAD_FLOAT_MAT return 0; } +#ifdef NCNN_INT8 +static int affine_input( + const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, + const Mat& 
input_scale, const Mat& weight_scales, const float transform_scale, + const int num_head, const Option& opt, bool transpose) +{ + const int embed_dim = input.w; + const int seqlen = input.h; + const int embed_dim_per_head = embed_dim / num_head; + const float scale = 1.0 / input_scale[0]; + + Mat input_int8; + if (input.elemsize != 1) + { + quantize_to_int8(input, input_int8, input_scale, opt); + } + + Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); + + if (transpose) + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < embed_dim_per_head; i++) + { + for (int j = 0; j < seqlen; j++) + { + const int8_t* ptr = input_int8.row(j); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + i); + + int32_t sum = 0; + const int32_t index = q * embed_dim_per_head + i; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + float* outptr = outm.row(i); + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + else + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const int8_t* ptr = input_int8.row(i); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + j); + + int32_t sum = 0; + const int32_t index = q * embed_dim_per_head + j; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + + Mat transform(1, 4u, opt.workspace_allocator); + transform[0] = transform_scale; + quantize_to_int8(buffer, out_int8, transform, opt); + return 0; +} + +static inline int32_t float2int8(float v) +{ + int int32 = static_cast(round(v)); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return int32; +} + +int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& q_blob = bottom_blobs[0]; + const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; + const Mat& v_blob = bottom_blobs.size() == 1 ? 
q_blob : bottom_blobs[2]; + + const int seqlen = q_blob.h; + const int embed_dim_per_head = embed_dim / num_head; + + Option opt_g = opt; + opt_g.blob_allocator = opt.workspace_allocator; + opt_g.use_packing_layout = false; + + Mat xq(embed_dim_per_head, seqlen, num_head, 1u, opt.workspace_allocator); + Mat xk(embed_dim_per_head, seqlen, num_head, 1u, opt.workspace_allocator); + Mat xv(seqlen, embed_dim_per_head, num_head, 1u, opt.workspace_allocator); + + affine_input(q_blob, q_weight_data, q_bias_data, xq, q_input_scale, q_weight_scales, internal_scales[0], num_head, opt_g, false); + affine_input(k_blob, k_weight_data, k_bias_data, xk, k_input_scale, k_weight_scales, internal_scales[1], num_head, opt_g, false); + affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); + + // transpose(v) for better gemm performance + // Mat xv(seqlen, embed_dim_per_head, num_head, 1u, opt.workspace_allocator); + // Mat debug_xv; + // transform_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], opt_g, debug_xv, true); + + // xq @ qk * inv_sqrt_embed_dim_per_head + const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + + Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); + { + // xqk = xq * xk + // xq (embed_dim_per_head, seqlen) + // xk (embed_dim_per_head, seqlen) + const float out_scale = inv_sqrt_embed_dim_per_head / (internal_scales[0] * internal_scales[1]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; ++q) + { + const Mat xqm = xq.channel(q); + const Mat xkm = xk.channel(q); + + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < seqlen; j++) + { + const int8_t* qptr = xqm.row(i); + const int8_t* kptr = xkm.row(j); + + int32_t sum = 0; + for (int k = 0; k < embed_dim_per_head; k++) + { + sum += *qptr++ * *kptr++; + } + + outptr[j] = sum * out_scale; + } + } + } + + // fp32_softmax(xqk) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; q++) + { + // softmax(xqk) + { + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* ptr = outm.row(i); + + float max = -FLT_MAX; + for (int j = 0; j < seqlen; j++) + { + max = std::max(max, ptr[j]); + } + + float sum = 0.f; + for (int j = 0; j < seqlen; j++) + { + ptr[j] = (float)(exp(ptr[j] - max)); + sum += ptr[j]; + } + + for (int j = 0; j < seqlen; j++) + { + ptr[j] = ptr[j] / sum; + } + } + } + } + } + + // xqkv int4 @ int8, implement by shift + Mat xqkv(embed_dim_per_head, num_head, seqlen, 1u, opt.workspace_allocator); + + const float xqkv_out_scale = internal_scales[4] / internal_scales[2]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; ++q) + { + // xqkv = xqk * xv + // xqk (seqlen, seqlen) + // xv (seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, seqlen) + const Mat xqkm = xqk.channel(q); + const Mat xvm = xv.channel(q); + + for (int i = 0; i < seqlen; i++) + { + int8_t* outptr = xqkv.channel(i).row(q); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* qkptr = xqkm.row(i); + const int8_t* vptr = xvm.row(j); + + float sum = 0; + for (int k = 0; k < seqlen; k++) + { + sum += (*vptr++) * (*qkptr++); + } + + outptr[j] = float2int8(sum * xqkv_out_scale); + } + } + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(embed_dim, seqlen, 4u, opt.blob_allocator); 
+ if (top_blob.empty()) + return -1; + + const float out_scale = 1.0f / internal_scales[4]; + // out = affine(xqkv) + // xqkv (embed_dim, seqlen) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < seqlen; i++) + { + float* outptr = top_blob.row(i); + + for (int j = 0; j < embed_dim; j++) + { + const int8_t* ptr = xqkv.channel(i); + const int8_t* kptr = (const int8_t*)out_weight_data + embed_dim * j; + + int32_t sum = 0; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum * out_scale / o_weight_scales[j] + out_bias_data[j]; + } + } + + return 0; +} + +#endif + // refers to https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { +#if NCNN_INT8 + if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) + { + return forward_int8(bottom_blobs, top_blobs, opt); + } +#endif + const Mat& q_blob = bottom_blobs[0]; const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[2]; diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index b878055385d0..31a967804391 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -30,10 +30,15 @@ class MultiHeadAttention : public Layer virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#ifdef NCNN_INT8 + int forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif + public: int embed_dim; int num_head; int weight_data_size; + int int8_scale_term; Mat q_weight_data; Mat q_bias_data; @@ -43,6 +48,31 @@ class MultiHeadAttention : public Layer Mat v_bias_data; Mat out_weight_data; Mat out_bias_data; + +#ifdef NCNN_INT8 + Mat q_input_scale; + Mat k_input_scale; + Mat v_input_scale; + + Mat q_weight_scales; + Mat k_weight_scales; + Mat v_weight_scales; + Mat o_weight_scales; + + /** + * @brief mha consists of multiple GEMM, they also have input scale + * + * internal_scales = [ + * q_affine_scale, + * k_affine_scale, + * v_affine_scale, + * energy_scale, + * feat_scael + * ] + * + */ + Mat internal_scales; +#endif }; } // namespace ncnn diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index f4e0b1b44f58..ba6e8d32e899 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -93,11 +93,69 @@ static int test_multiheadattention_1() || test_multiheadattention_sameqkv(RandomMat(64, 127), 32); } +#ifdef NCNN_INT8 +static int test_multiheadattention_int8(const ncnn::Mat& a, int num_heads) +{ + int embed_dim = a.w; + + ncnn::ParamDict pd; + pd.set(0, embed_dim); + pd.set(1, num_heads); + pd.set(2, embed_dim * embed_dim); + pd.set(3, 1); + + std::vector weights(16); + weights[0] = RandomIntMat(embed_dim * embed_dim); + weights[1] = RandomIntMat(embed_dim); + weights[2] = RandomIntMat(embed_dim * embed_dim); + weights[3] = RandomIntMat(embed_dim); + weights[4] = RandomIntMat(embed_dim * embed_dim); + weights[5] = RandomIntMat(embed_dim); + weights[6] = RandomIntMat(embed_dim * embed_dim); + weights[7] = RandomIntMat(embed_dim); + + weights[8] = RandomMat(1); + weights[9] = RandomMat(1); + weights[10] = RandomMat(1); + + weights[11] = 
RandomMat(embed_dim); + weights[12] = RandomMat(embed_dim); + weights[13] = RandomMat(embed_dim); + weights[14] = RandomMat(embed_dim); + + weights[15] = RandomMat(5); + + std::vector as(1); + as[0] = a; + + int ret = test_layer("MultiHeadAttention", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_multiheadattention failed a=(%d %d)\n", a.w, a.h); + } + + return ret; +} + +static int test_multiheadattention_2() +{ + return 0 + || test_multiheadattention_int8(RandomMat(64, 128), 8) + || test_multiheadattention_int8(RandomMat(64, 127), 32); +} +#endif + int main() { SRAND(7767517); - +#ifdef NCNN_INT8 + return 0 + || test_multiheadattention_0() + || test_multiheadattention_1() + || test_multiheadattention_2(); +#else return 0 || test_multiheadattention_0() || test_multiheadattention_1(); +#endif } diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 965046be48d0..31399e79f97d 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -29,7 +29,7 @@ if(NCNN_VULKAN) target_link_libraries(ncnn2mem PRIVATE ${Vulkan_LIBRARY}) endif() -add_executable(ncnnoptimize ncnnoptimize.cpp) +add_executable(ncnnoptimize ncnnoptimize.cpp modelwriter.cpp) target_link_libraries(ncnnoptimize PRIVATE ncnn) if(NCNN_VULKAN) target_link_libraries(ncnnoptimize PRIVATE ${Vulkan_LIBRARY}) diff --git a/tools/modelwriter.cpp b/tools/modelwriter.cpp new file mode 100644 index 000000000000..6096f7ea1cc1 --- /dev/null +++ b/tools/modelwriter.cpp @@ -0,0 +1,2069 @@ +#include "modelwriter.h" + +MemoryFootprintAllocator::MemoryFootprintAllocator() +{ + current_memory_usage = 0; + memory_footprint = 0; +} + +void* MemoryFootprintAllocator::fastMalloc(size_t size) +{ + ncnn::MutexLockGuard g(lock); + void* ptr = ncnn::fastMalloc(size); + bookkeeper[ptr] = size; + current_memory_usage += size; + memory_footprint = std::max(memory_footprint, current_memory_usage); + return ptr; +} + +void MemoryFootprintAllocator::fastFree(void* ptr) +{ + ncnn::MutexLockGuard g(lock); + size_t size = bookkeeper[ptr]; + current_memory_usage -= size; + bookkeeper.erase(bookkeeper.find(ptr)); + ncnn::fastFree(ptr); +} + +int CustomLayer::load_param(const ncnn::ParamDict& pd) +{ + mpd = pd; + return 0; +} + +void CustomLayer::write_param(FILE* pp) +{ + for (int i = 0; i < NCNN_MAX_PARAM_COUNT; i++) + { + int type = mpd.type(i); + if (type == 0) + continue; + + if (type == 2) + { + fprintf(pp, " %d=%d", i, mpd.get(i, 0)); + } + if (type == 3) + { + fprintf(pp, " %d=%e", i, mpd.get(i, 0.f)); + } + if (type == 5) + { + ncnn::Mat v = mpd.get(i, ncnn::Mat()); + int len = v.w; + fprintf(pp, " %d=%d", -i - 23300, len); + const int* p = v; + for (int j = 0; j < len; j++) + { + fprintf(pp, ",%d", p[j]); + } + } + if (type == 6) + { + ncnn::Mat v = mpd.get(i, ncnn::Mat()); + int len = v.w; + fprintf(pp, " %d=%d", -i - 23300, len); + const float* p = v; + for (int j = 0; j < len; j++) + { + fprintf(pp, ",%e", p[j]); + } + } + } +} + +DEFINE_LAYER_CREATOR(CustomLayer) + +ModelWriter::ModelWriter() + : blobs(mutable_blobs()), layers(mutable_layers()) +{ + opt.lightmode = false; + has_custom_layer = false; + gen_random_weight = false; + cutstart = -1; + cutend = -1; + + SRAND(7767517); +} + +ncnn::Layer* ModelWriter::create_custom_layer(const char* type) +{ + ncnn::Layer* layer = Net::create_custom_layer(type); + if (layer) + return layer; + + fprintf(stderr, "create_custom_layer %s\n", type); + + register_custom_layer(type, CustomLayer_layer_creator); + + has_custom_layer = true; + + return Net::create_custom_layer(type); +} + +int 
ModelWriter::set_cutparam(const char* cutstartname, const char* cutendname) +{ + if (cutstartname != nullptr) + { + int layindex = find_layer_index_by_name(cutstartname); + if (layindex >= 0) + { + cutstart = layindex; + fprintf(stderr, "cutstart layer %d:%s\n", layindex, cutstartname); + } + else + { + fprintf(stderr, "not find target cutstart layer %s\n", cutstartname); + return -1; + } + } + + if (cutendname != nullptr) + { + int layindex = find_layer_index_by_name(cutendname); + if (layindex >= 0) + { + cutend = layindex; + fprintf(stderr, "cutend layer %d:%s\n", layindex, cutendname); + } + else + { + fprintf(stderr, "not find target cutend layer %s\n", cutendname); + return -1; + } + } + + return 0; +} + +int ModelWriter::shape_inference() +{ + if (has_custom_layer) + { + fprintf(stderr, "model has custom layer, shape_inference skipped\n"); + return -1; + } + + const size_t layer_count = layers.size(); + const size_t blob_count = blobs.size(); + + // recreate layer pipeline for param and weight changes + for (size_t i = 0; i < layer_count; i++) + { + ncnn::Layer* layer = layers[i]; + + layer->destroy_pipeline(opt); + + int cret = layer->create_pipeline(opt); + if (cret != 0) + { + NCNN_LOGE("layer create_pipeline %d %s failed", (int)i, layer->name.c_str()); + return -1; + } + } + + ncnn::Extractor ex = create_extractor(); + ex.set_light_mode(true); + + // prepare Input blobs + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + if (layer->type != "Input") + continue; + + ncnn::Input* input = (ncnn::Input*)layer; + + int w = input->w; + int h = input->h; + int c = input->c; + + int dims = 0; + if (w == 0 && h == 0 && c == 0) dims = 0; + if (w != 0 && h == 0 && c == 0) dims = 1; + if (w != 0 && h != 0 && c == 0) dims = 2; + if (w != 0 && h != 0 && c != 0) dims = 3; + + if (dims == 0) + { + fprintf(stderr, "Input layer %s without shape info, shape_inference skipped\n", layer->name.c_str()); + return -1; + } + + ncnn::Mat m; + if (dims == 1) m.create(w); + if (dims == 2) m.create(w, h); + if (dims == 3) m.create(w, h, c); + + ex.input(layer->tops[0], m); + } + + // prepare blobs with predefined shape + for (size_t i = 0; i < blob_count; i++) + { + const ncnn::Blob& blob = blobs[i]; + + int dims = blob.shape.dims; + int w = blob.shape.w; + int h = blob.shape.h; + int c = blob.shape.c; + + if (dims == 0) + continue; + + ncnn::Mat m; + if (dims == 1) m.create(w); + if (dims == 2) m.create(w, h); + if (dims == 3) m.create(w, h, c); + + m.fill(0.f); + + ex.input(int(i), m); + } + + fprintf(stderr, "shape_inference\n"); + + // resolve all layer output blob shape + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + for (size_t j = 0; j < layer->tops.size(); j++) + { + int top_blob_index = layer->tops[j]; + + ncnn::Mat m; + ex.extract(top_blob_index, m); + + blobs[top_blob_index].shape = m; + } + } + + // assign all layer blob shape + for (size_t i = 0; i < layer_count; i++) + { + ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + layer->bottom_shapes.resize(layer->bottoms.size()); + for (size_t j = 0; j < layer->bottoms.size(); j++) + { + int bottom_blob_index = layer->bottoms[j]; + + layer->bottom_shapes[j] = blobs[bottom_blob_index].shape; + } + + layer->top_shapes.resize(layer->tops.size()); + for (size_t j = 0; j < layer->tops.size(); j++) + { + int top_blob_index = layer->tops[j]; + + 
layer->top_shapes[j] = blobs[top_blob_index].shape; + + // fprintf(stderr, "%d %4d %4d %4d | %2d %s\n", blobs[top_blob_index].shape.dims, blobs[top_blob_index].shape.w, blobs[top_blob_index].shape.h, blobs[top_blob_index].shape.c, top_blob_index, blobs[top_blob_index].name.c_str()); + } + } + + return 0; +} + +int ModelWriter::estimate_memory_footprint() +{ + if (has_custom_layer) + { + fprintf(stderr, "model has custom layer, estimate_memory_footprint skipped\n"); + return -1; + } + + const size_t layer_count = layers.size(); + const size_t blob_count = blobs.size(); + + MemoryFootprintAllocator allocator; + + ncnn::Extractor ex = create_extractor(); + ex.set_light_mode(true); + + ex.set_blob_allocator(&allocator); + ex.set_workspace_allocator(&allocator); + + // prepare Input blobs + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + if (layer->type != "Input") + continue; + + ncnn::Input* input = (ncnn::Input*)layer; + + int w = input->w; + int h = input->h; + int c = input->c; + + int dims = 0; + if (w == 0 && h == 0 && c == 0) dims = 0; + if (w != 0 && h == 0 && c == 0) dims = 1; + if (w != 0 && h != 0 && c == 0) dims = 2; + if (w != 0 && h != 0 && c != 0) dims = 3; + + if (dims == 0) + { + fprintf(stderr, "Input layer %s without shape info, estimate_memory_footprint skipped\n", layer->name.c_str()); + return -1; + } + + ncnn::Mat m; + if (dims == 1) m.create(w, 4u, &allocator); + if (dims == 2) m.create(w, h, 4u, &allocator); + if (dims == 3) m.create(w, h, c, 4u, &allocator); + + ex.input(layer->tops[0], m); + + fprintf(stderr, "input = %s\n", blobs[layer->tops[0]].name.c_str()); + } + + // find output blobs and do inference + std::vector outputs; + for (size_t i = 0; i < blob_count; i++) + { + const ncnn::Blob& blob = blobs[i]; + + if (blob.producer == -1 || blob.consumer != -1) + continue; + + if (layers[blob.producer]->type == "ncnnfused") + continue; + + // treat blob without any consumers as output + ncnn::Mat m; + ex.extract(int(i), m); + outputs.push_back(m); + + fprintf(stderr, "extract = %s\n", blob.name.c_str()); + } + + fprintf(stderr, "estimated memory footprint = %.2f KB = %.2f MB\n", allocator.memory_footprint / 1024.f, allocator.memory_footprint / 1024.f / 1024.f); + + return 0; +} + +int ModelWriter::fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp) +{ + const int count = m.w; + const int* ptr = m; + + fprintf(pp, " -%d=%d", 23300 + id, count); + for (int i = 0; i < count; i++) + { + fprintf(pp, ",%d", ptr[i]); + } + + return 0; +} + +int ModelWriter::fprintf_param_float_array(int id, const ncnn::Mat& m, FILE* pp) +{ + const int count = m.w; + const float* ptr = m; + + fprintf(pp, " -%d=%d", 23300 + id, count); + for (int i = 0; i < count; i++) + { + fprintf(pp, ",%e", ptr[i]); + } + + return 0; +} + +static inline size_t alignSize(size_t sz, int n) +{ + return (sz + n - 1) & -n; +} + +static void replace_denormals_with_zero(float* data, size_t data_length) +{ + const int total = static_cast(data_length); + for (size_t i = 0; i < data_length; ++i) + { + float value = data[i]; + + if (fabsf(value) < 1e-30 && fabsf(value) != 0.f) + { + data[i] = 0.f; + } + } +} + +static float RandomFloat(float a = -1.2f, float b = 1.2f) +{ + float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX; + float diff = b - a; + float r = random * diff; + return a + r; +} + +static void Randomize(ncnn::Mat& m, float a = -1.2f, float b = 1.2f) +{ + if (m.elemsize == 4) + { + for (size_t i = 
0; i < m.total(); i++) + { + m[i] = RandomFloat(a, b); + } + } + else if (m.elemsize == 2) + { + unsigned short* p = m; + for (size_t i = 0; i < m.total(); i++) + { + p[i] = ncnn::float32_to_float16(RandomFloat(a, b)); + } + } + else if (m.elemsize == 1) + { + signed char* p = m; + for (size_t i = 0; i < m.total(); i++) + { + p[i] = (signed char)RandomFloat(-127, 127); + } + } +} + +int ModelWriter::fwrite_weight_tag_data(const ncnn::Mat& data, FILE* bp, float a, float b) +{ + int p0 = ftell(bp); + + ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.d * data.c); + if (gen_random_weight) + Randomize(data_flattened, a, b); + + if (data_flattened.elemsize == 4) + { + if (storage_type == 1) + { + const int tag = 0x01306B47; // fp16 magic + fwrite(&tag, sizeof(int), 1, bp); + ncnn::Mat data_flattened_fp16; + ncnn::cast_float32_to_float16(data_flattened, data_flattened_fp16); + fwrite(data_flattened_fp16.data, data_flattened_fp16.elemsize, data_flattened_fp16.w, bp); + } + else + { + const int tag = 0; // fp32 magic + fwrite(&tag, sizeof(int), 1, bp); + replace_denormals_with_zero(data_flattened, data_flattened.w); + fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); + } + } + else if (data_flattened.elemsize == 2) + { + const int tag = 0x01306B47; // fp16 magic + fwrite(&tag, sizeof(int), 1, bp); + fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); + } + else if (data_flattened.elemsize == 1) + { + const int tag = 0x000D4B38; // int8 magic + fwrite(&tag, sizeof(int), 1, bp); + fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); + } + else + { + fprintf(stderr, "unknown weight data type %d\n", (int)data_flattened.elemsize); + } + + // padding to 32bit align + int nwrite = ftell(bp) - p0; + size_t nalign = alignSize(nwrite, 4); + unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; + fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); + + return 0; +} + +int ModelWriter::fwrite_weight_data(const ncnn::Mat& data, FILE* bp, float a, float b) +{ + int p0 = ftell(bp); + + ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.d * data.c); + if (gen_random_weight) + Randomize(data_flattened, a, b); + + if (data_flattened.elemsize == 4) // fp32 + { + replace_denormals_with_zero(data_flattened, data_flattened.w); + } + + fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); + + // padding to 32bit align + int nwrite = ftell(bp) - p0; + size_t nalign = alignSize(nwrite, 4); + unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; + fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); + + return 0; +} + +int ModelWriter::save(const char* parampath, const char* binpath) +{ + uint64_t mac = 0; + + FILE* pp = fopen(parampath, "wb"); + FILE* bp = fopen(binpath, "wb"); + + fprintf(pp, "7767517\n"); + + const size_t layer_count = layers.size(); + + int layer_count_fused = 0; + std::set blob_names; + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + layer_count_fused++; + + size_t bottom_count = layer->bottoms.size(); + for (size_t j = 0; j < bottom_count; j++) + { + int bottom_blob_index = layer->bottoms[j]; + blob_names.insert(blobs[bottom_blob_index].name); + } + + size_t top_count = layer->tops.size(); + for (size_t j = 0; j < top_count; j++) + { + int top_blob_index = layer->tops[j]; + blob_names.insert(blobs[top_blob_index].name); + } + } + + size_t blob_count_fused = blob_names.size(); + + 
fprintf(pp, "%d %zd\n", layer_count_fused, blob_count_fused); + + for (size_t i = 0; i < layer_count; i++) + { + const ncnn::Layer* layer = layers[i]; + if (layer->type == "ncnnfused") + continue; + + if (cutstart > 0 && i < cutstart) + continue; + + if (cutend > 0 && i > cutend) + continue; + + size_t bottom_count = layer->bottoms.size(); + size_t top_count = layer->tops.size(); + + fprintf(pp, "%-24s %-24s %zd %zd", layer->type.c_str(), layer->name.c_str(), bottom_count, top_count); + + for (size_t j = 0; j < bottom_count; j++) + { + int bottom_blob_index = layer->bottoms[j]; + fprintf(pp, " %s", blobs[bottom_blob_index].name.c_str()); + } + for (size_t j = 0; j < top_count; j++) + { + int top_blob_index = layer->tops[j]; + fprintf(pp, " %s", blobs[top_blob_index].name.c_str()); + } + + // write shape hints + bool shape_ready = true; + for (size_t j = 0; j < top_count; j++) + { + int top_blob_index = layer->tops[j]; + + int dims = blobs[top_blob_index].shape.dims; + if (dims == 0) + { + shape_ready = false; + break; + } + } + if (shape_ready) + { + fprintf(pp, " -23330=%zd", top_count * 4); + for (size_t j = 0; j < top_count; j++) + { + int top_blob_index = layer->tops[j]; + + int dims = blobs[top_blob_index].shape.dims; + int w = blobs[top_blob_index].shape.w; + int h = blobs[top_blob_index].shape.h; + int c = blobs[top_blob_index].shape.c; + + fprintf(pp, ",%d,%d,%d,%d", dims, w, h, c); + } + } + + // custom op + if (layer->typeindex & ncnn::LayerType::CustomBit) + { + ((CustomLayer*)layer)->write_param(pp); + + fprintf(pp, "\n"); + + continue; + } + + ncnn::Layer* layer_default = ncnn::create_layer(layer->typeindex); + + ncnn::ParamDict pd; + layer_default->load_param(pd); + +#define fprintf_param_value(format, phase) \ + { \ + if (op->phase != op_default->phase) fprintf(pp, format, op->phase); \ + } + + if (layer->type == "BatchNorm") + { + ncnn::BatchNorm* op = (ncnn::BatchNorm*)layer; + ncnn::BatchNorm* op_default = (ncnn::BatchNorm*)layer_default; + + fprintf_param_value(" 0=%d", channels) + fprintf_param_value(" 1=%e", eps) + + fwrite_weight_data(op->slope_data, bp); + fwrite_weight_data(op->mean_data, bp); + fwrite_weight_data(op->var_data, bp); + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "Bias") + { + ncnn::Bias* op = (ncnn::Bias*)layer; + ncnn::Bias* op_default = (ncnn::Bias*)layer_default; + + fprintf_param_value(" 0=%d", bias_data_size) + + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "BinaryOp") + { + ncnn::BinaryOp* op = (ncnn::BinaryOp*)layer; + ncnn::BinaryOp* op_default = (ncnn::BinaryOp*)layer_default; + + fprintf_param_value(" 0=%d", op_type) + fprintf_param_value(" 1=%d", with_scalar) + fprintf_param_value(" 2=%e", b) + } + else if (layer->type == "Clip") + { + ncnn::Clip* op = (ncnn::Clip*)layer; + ncnn::Clip* op_default = (ncnn::Clip*)layer_default; + + fprintf_param_value(" 0=%e", min) + fprintf_param_value(" 1=%e", max) + } + else if (layer->type == "Concat") + { + ncnn::Concat* op = (ncnn::Concat*)layer; + ncnn::Concat* op_default = (ncnn::Concat*)layer_default; + + fprintf_param_value(" 0=%d", axis) + } + else if (layer->type == "Convolution") + { + ncnn::Convolution* op = (ncnn::Convolution*)layer; + ncnn::Convolution* op_default = (ncnn::Convolution*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != 
op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 8=%d", int8_scale_term) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term) + { + fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); + fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); + fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1); + } +#endif // NCNN_INT8 + + if (shape_ready) + { + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_h * op->kernel_w * outw * outh * outc * inc; + } + } + else if (layer->type == "Convolution1D") + { + ncnn::Convolution1D* op = (ncnn::Convolution1D*)layer; + ncnn::Convolution1D* op_default = (ncnn::Convolution1D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", dilation_w) + fprintf_param_value(" 3=%d", stride_w) + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inh = blobs[layer->bottoms[0]].shape.h; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + + mac += (uint64_t)op->kernel_w * outw * outh * inh; + } + } + else if (layer->type == "Convolution3D") + { + ncnn::Convolution3D* op = (ncnn::Convolution3D*)layer; + ncnn::Convolution3D* op_default = (ncnn::Convolution3D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); + 
} + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + int outd = blobs[layer->tops[0]].shape.d; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * outw * outh * outd * outc * inc; + } + } + else if (layer->type == "ConvolutionDepthWise") + { + ncnn::ConvolutionDepthWise* op = (ncnn::ConvolutionDepthWise*)layer; + ncnn::ConvolutionDepthWise* op_default = (ncnn::ConvolutionDepthWise*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 8=%d", int8_scale_term) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term == 1 || op->int8_scale_term == 101) + { + op->bottom_blob_int8_scales.w = 1; + } + if (op->int8_scale_term == 2 || op->int8_scale_term == 102) + { + op->weight_data_int8_scales.w = 1; + op->bottom_blob_int8_scales.w = 1; + } + if (op->int8_scale_term > 100) + { + op->top_blob_int8_scales.w = 1; + } + + if (op->int8_scale_term) + { + fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); + fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); + fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1); + } +#endif // NCNN_INT8 + + if (shape_ready) + { + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_h * op->kernel_w * outw * outh * (outc / op->group) * (inc / op->group) * op->group; + } + } + else if (layer->type == "ConvolutionDepthWise1D") + { + ncnn::ConvolutionDepthWise1D* op = (ncnn::ConvolutionDepthWise1D*)layer; + ncnn::ConvolutionDepthWise1D* op_default = (ncnn::ConvolutionDepthWise1D*)layer_default; + + 
fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", dilation_w) + fprintf_param_value(" 3=%d", stride_w) + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inh = blobs[layer->bottoms[0]].shape.h; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + + mac += (uint64_t)op->kernel_w * outw * (outh / op->group) * (inh / op->group) * op->group; + } + } + else if (layer->type == "ConvolutionDepthWise3D") + { + ncnn::ConvolutionDepthWise3D* op = (ncnn::ConvolutionDepthWise3D*)layer; + ncnn::ConvolutionDepthWise3D* op_default = (ncnn::ConvolutionDepthWise3D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); + } + fprintf_param_value(" 18=%e", pad_value) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + int outh = blobs[layer->tops[0]].shape.h; + int outd = blobs[layer->tops[0]].shape.d; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * outw * outh * outd * (outc / op->group) * (inc / op->group) * op->group; + } + } + else if (layer->type == "Crop") + { + ncnn::Crop* op = (ncnn::Crop*)layer; + ncnn::Crop* op_default = (ncnn::Crop*)layer_default; + + fprintf_param_value(" 0=%d", woffset) + fprintf_param_value(" 1=%d", hoffset) + fprintf_param_value(" 2=%d", coffset) + fprintf_param_value(" 3=%d", outw) + fprintf_param_value(" 4=%d", outh) + fprintf_param_value(" 5=%d", outc) + fprintf_param_value(" 6=%d", woffset2) + fprintf_param_value(" 7=%d", hoffset2) + 
fprintf_param_value(" 8=%d", coffset2) + { + if (!op->starts.empty()) fprintf_param_int_array(9, op->starts, pp); + } + { + if (!op->ends.empty()) fprintf_param_int_array(10, op->ends, pp); + } + { + if (!op->axes.empty()) fprintf_param_int_array(11, op->axes, pp); + } + } + else if (layer->type == "Deconvolution") + { + ncnn::Deconvolution* op = (ncnn::Deconvolution*)layer; + ncnn::Deconvolution* op_default = (ncnn::Deconvolution*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + fprintf_param_value(" 18=%d", output_pad_right) + { + if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); + } + fprintf_param_value(" 20=%d", output_w) + { + if (op->output_h != op->output_w) fprintf(pp, " 21=%d", op->output_h); + } + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int inc = blobs[layer->bottoms[0]].shape.c; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_h * op->kernel_w * inw * inh * outc * inc; + } + } + else if (layer->type == "Deconvolution1D") + { + ncnn::Deconvolution1D* op = (ncnn::Deconvolution1D*)layer; + ncnn::Deconvolution1D* op_default = (ncnn::Deconvolution1D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", dilation_w) + fprintf_param_value(" 3=%d", stride_w) + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + fprintf_param_value(" 18=%d", output_pad_right) + fprintf_param_value(" 20=%d", output_w) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int outh = blobs[layer->tops[0]].shape.h; + + mac += (uint64_t)op->kernel_w * inw * outh * inh; + } + } + else if (layer->type == "Deconvolution3D") + { + ncnn::Deconvolution3D* op = (ncnn::Deconvolution3D*)layer; + ncnn::Deconvolution3D* op_default = (ncnn::Deconvolution3D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if 
(op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); + } + fprintf_param_value(" 18=%d", output_pad_right) + { + if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); + if (op->output_pad_behind != op->output_pad_right) fprintf(pp, " 20=%d", op->output_pad_behind); + } + fprintf_param_value(" 25=%d", output_w) + { + if (op->output_h != op->output_w) fprintf(pp, " 26=%d", op->output_h); + if (op->output_d != op->output_w) fprintf(pp, " 27=%d", op->output_d); + } + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int ind = blobs[layer->bottoms[0]].shape.d; + int inc = blobs[layer->bottoms[0]].shape.c; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * inw * inh * ind * outc * inc; + } + } + else if (layer->type == "DeconvolutionDepthWise") + { + ncnn::DeconvolutionDepthWise* op = (ncnn::DeconvolutionDepthWise*)layer; + ncnn::DeconvolutionDepthWise* op_default = (ncnn::DeconvolutionDepthWise*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + fprintf_param_value(" 18=%d", output_pad_right) + { + if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); + } + fprintf_param_value(" 20=%d", output_w) + { + if (op->output_h != op->output_w) fprintf(pp, " 21=%d", op->output_h); + } + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } 
+ + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int inc = blobs[layer->bottoms[0]].shape.c; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_h * op->kernel_w * inw * inh * (outc / op->group) * (inc / op->group) * op->group; + } + } + else if (layer->type == "DeconvolutionDepthWise1D") + { + ncnn::DeconvolutionDepthWise1D* op = (ncnn::DeconvolutionDepthWise1D*)layer; + ncnn::DeconvolutionDepthWise1D* op_default = (ncnn::DeconvolutionDepthWise1D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", dilation_w) + fprintf_param_value(" 3=%d", stride_w) + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + fprintf_param_value(" 18=%d", output_pad_right) + fprintf_param_value(" 20=%d", output_w) + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int outh = blobs[layer->tops[0]].shape.h; + + mac += (uint64_t)op->kernel_w * inw * (outh / op->group) * (inh / op->group) * op->group; + } + } + else if (layer->type == "DeconvolutionDepthWise3D") + { + ncnn::DeconvolutionDepthWise3D* op = (ncnn::DeconvolutionDepthWise3D*)layer; + ncnn::DeconvolutionDepthWise3D* op_default = (ncnn::DeconvolutionDepthWise3D*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", dilation_w) + { + if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); + if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); + } + fprintf_param_value(" 3=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); + } + fprintf_param_value(" 4=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); + } + fprintf_param_value(" 18=%d", output_pad_right) + { + if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); + if (op->output_pad_behind != op->output_pad_right) fprintf(pp, " 20=%d", op->output_pad_behind); + } + fprintf_param_value(" 25=%d", output_w) + { + if (op->output_h != op->output_w) fprintf(pp, " 26=%d", op->output_h); + if (op->output_d != op->output_w) fprintf(pp, " 27=%d", op->output_d); + } + fprintf_param_value(" 5=%d", bias_term) + fprintf_param_value(" 6=%d", weight_data_size) + 
fprintf_param_value(" 7=%d", group) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int ind = blobs[layer->bottoms[0]].shape.d; + int inc = blobs[layer->bottoms[0]].shape.c; + int outc = blobs[layer->tops[0]].shape.c; + + mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * inw * inh * ind * (outc / op->group) * (inc / op->group) * op->group; + } + } + else if (layer->type == "DetectionOutput") + { + ncnn::DetectionOutput* op = (ncnn::DetectionOutput*)layer; + ncnn::DetectionOutput* op_default = (ncnn::DetectionOutput*)layer_default; + + fprintf_param_value(" 0=%d", num_class) + fprintf_param_value(" 1=%e", nms_threshold) + fprintf_param_value(" 2=%d", nms_top_k) + fprintf_param_value(" 3=%d", keep_top_k) + fprintf_param_value(" 4=%e", confidence_threshold) + fprintf_param_value(" 5=%e", variances[0]) + fprintf_param_value(" 6=%e", variances[1]) + fprintf_param_value(" 7=%e", variances[2]) + fprintf_param_value(" 8=%e", variances[3]) + } + else if (layer->type == "Dropout") + { + ncnn::Dropout* op = (ncnn::Dropout*)layer; + ncnn::Dropout* op_default = (ncnn::Dropout*)layer_default; + + fprintf_param_value(" 0=%e", scale) + } + else if (layer->type == "Eltwise") + { + ncnn::Eltwise* op = (ncnn::Eltwise*)layer; + ncnn::Eltwise* op_default = (ncnn::Eltwise*)layer_default; + + fprintf_param_value(" 0=%d", op_type) + { + if (!op->coeffs.empty()) fprintf_param_float_array(1, op->coeffs, pp); + } + } + else if (layer->type == "ELU") + { + ncnn::ELU* op = (ncnn::ELU*)layer; + ncnn::ELU* op_default = (ncnn::ELU*)layer_default; + + fprintf_param_value(" 0=%e", alpha) + } + else if (layer->type == "Embed") + { + ncnn::Embed* op = (ncnn::Embed*)layer; + ncnn::Embed* op_default = (ncnn::Embed*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", input_dim) + fprintf_param_value(" 2=%d", bias_term) + fprintf_param_value(" 3=%d", weight_data_size) + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "Exp") + { + ncnn::Exp* op = (ncnn::Exp*)layer; + ncnn::Exp* op_default = (ncnn::Exp*)layer_default; + + fprintf_param_value(" 0=%e", base) + fprintf_param_value(" 1=%e", scale) + fprintf_param_value(" 2=%e", shift) + } + else if (layer->type == "ExpandDims") + { + ncnn::ExpandDims* op = (ncnn::ExpandDims*)layer; + ncnn::ExpandDims* op_default = (ncnn::ExpandDims*)layer_default; + + fprintf_param_value(" 0=%d", expand_w) + fprintf_param_value(" 1=%d", expand_h) + fprintf_param_value(" 2=%d", expand_c) + { + if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); + } + } + else if (layer->type == "GELU") + { + ncnn::GELU* op = (ncnn::GELU*)layer; + ncnn::GELU* op_default = (ncnn::GELU*)layer_default; + + fprintf_param_value(" 0=%d", fast_gelu) + } + else if (layer->type == "Gemm") + { + ncnn::Gemm* op = (ncnn::Gemm*)layer; + ncnn::Gemm* op_default = (ncnn::Gemm*)layer_default; + + fprintf_param_value(" 0=%e", alpha) + fprintf_param_value(" 1=%e", beta) + fprintf_param_value(" 2=%d", transA) + fprintf_param_value(" 3=%d", transB) + } + else if (layer->type == "GroupNorm") + { + ncnn::GroupNorm* op = (ncnn::GroupNorm*)layer; + ncnn::GroupNorm* op_default = 
(ncnn::GroupNorm*)layer_default; + + fprintf_param_value(" 0=%d", group) + fprintf_param_value(" 1=%d", channels) + fprintf_param_value(" 2=%e", eps) + fprintf_param_value(" 3=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + fwrite_weight_data(op->beta_data, bp); + } + else if (layer->type == "GRU") + { + ncnn::GRU* op = (ncnn::GRU*)layer; + ncnn::GRU* op_default = (ncnn::GRU*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", weight_data_size) + fprintf_param_value(" 2=%d", direction) + + fwrite_weight_tag_data(op->weight_xc_data, bp); + fwrite_weight_tag_data(op->bias_c_data, bp); + fwrite_weight_tag_data(op->weight_hc_data, bp); + } + else if (layer->type == "HardSigmoid") + { + ncnn::HardSigmoid* op = (ncnn::HardSigmoid*)layer; + ncnn::HardSigmoid* op_default = (ncnn::HardSigmoid*)layer_default; + + fprintf_param_value(" 0=%e", alpha) + fprintf_param_value(" 1=%e", beta) + } + else if (layer->type == "HardSwish") + { + ncnn::HardSwish* op = (ncnn::HardSwish*)layer; + ncnn::HardSwish* op_default = (ncnn::HardSwish*)layer_default; + + fprintf_param_value(" 0=%e", alpha) + fprintf_param_value(" 1=%e", beta) + } + else if (layer->type == "InnerProduct") + { + ncnn::InnerProduct* op = (ncnn::InnerProduct*)layer; + ncnn::InnerProduct* op_default = (ncnn::InnerProduct*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", bias_term) + fprintf_param_value(" 2=%d", weight_data_size) + fprintf_param_value(" 8=%d", int8_scale_term) + fprintf_param_value(" 9=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); + } + + fwrite_weight_tag_data(op->weight_data, bp); + fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term) + { + fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); + fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); + } +#endif // NCNN_INT8 + + if (shape_ready) + { + int inw = blobs[layer->bottoms[0]].shape.w; + int inh = blobs[layer->bottoms[0]].shape.h; + int inc = blobs[layer->bottoms[0]].shape.c; + int outw = blobs[layer->tops[0]].shape.w; + + mac += (uint64_t)inw * inh * inc * outw; + } + } + else if (layer->type == "Input") + { + ncnn::Input* op = (ncnn::Input*)layer; + ncnn::Input* op_default = (ncnn::Input*)layer_default; + + fprintf_param_value(" 0=%d", w) + fprintf_param_value(" 1=%d", h) + fprintf_param_value(" 2=%d", c) + } + else if (layer->type == "InstanceNorm") + { + ncnn::InstanceNorm* op = (ncnn::InstanceNorm*)layer; + ncnn::InstanceNorm* op_default = (ncnn::InstanceNorm*)layer_default; + + fprintf_param_value(" 0=%d", channels) + fprintf_param_value(" 1=%e", eps) + fprintf_param_value(" 2=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + fwrite_weight_data(op->beta_data, bp); + } + else if (layer->type == "Interp") + { + ncnn::Interp* op = (ncnn::Interp*)layer; + ncnn::Interp* op_default = (ncnn::Interp*)layer_default; + + fprintf_param_value(" 0=%d", resize_type) + fprintf_param_value(" 1=%e", height_scale) + fprintf_param_value(" 2=%e", width_scale) + fprintf_param_value(" 3=%d", output_height) + fprintf_param_value(" 4=%d", output_width) + fprintf_param_value(" 5=%d", dynamic_target_size) + fprintf_param_value(" 6=%d", align_corner) + } + else if (layer->type == "LayerNorm") + { + ncnn::LayerNorm* op = (ncnn::LayerNorm*)layer; + ncnn::LayerNorm* op_default = (ncnn::LayerNorm*)layer_default; + + fprintf_param_value(" 0=%d", 
affine_size) + fprintf_param_value(" 1=%e", eps) + fprintf_param_value(" 2=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + fwrite_weight_data(op->beta_data, bp); + } + else if (layer->type == "Log") + { + ncnn::Log* op = (ncnn::Log*)layer; + ncnn::Log* op_default = (ncnn::Log*)layer_default; + + fprintf_param_value(" 0=%e", base) + fprintf_param_value(" 1=%e", scale) + fprintf_param_value(" 2=%e", shift) + } + else if (layer->type == "LRN") + { + ncnn::LRN* op = (ncnn::LRN*)layer; + ncnn::LRN* op_default = (ncnn::LRN*)layer_default; + + fprintf_param_value(" 0=%d", region_type) + fprintf_param_value(" 1=%d", local_size) + fprintf_param_value(" 2=%e", alpha) + fprintf_param_value(" 3=%e", beta) + fprintf_param_value(" 4=%e", bias) + } + else if (layer->type == "LSTM") + { + ncnn::LSTM* op = (ncnn::LSTM*)layer; + ncnn::LSTM* op_default = (ncnn::LSTM*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", weight_data_size) + fprintf_param_value(" 2=%d", direction) + + fwrite_weight_tag_data(op->weight_xc_data, bp); + fwrite_weight_tag_data(op->bias_c_data, bp); + fwrite_weight_tag_data(op->weight_hc_data, bp); + } + else if (layer->type == "MatMul") + { + ncnn::MatMul* op = (ncnn::MatMul*)layer; + ncnn::MatMul* op_default = (ncnn::MatMul*)layer_default; + + fprintf_param_value(" 0=%d", transB) + } + else if (layer->type == "MemoryData") + { + ncnn::MemoryData* op = (ncnn::MemoryData*)layer; + ncnn::MemoryData* op_default = (ncnn::MemoryData*)layer_default; + + fprintf_param_value(" 0=%d", w) + fprintf_param_value(" 1=%d", h) + fprintf_param_value(" 2=%d", c) + fprintf_param_value(" 11=%d", d) + fwrite_weight_data(op->data, bp); + } + else if (layer->type == "MultiHeadAttention") + { + ncnn::MultiHeadAttention* op = (ncnn::MultiHeadAttention*)layer; + ncnn::MultiHeadAttention* op_default = (ncnn::MultiHeadAttention*)layer_default; + + fprintf_param_value(" 0=%d", embed_dim) + fprintf_param_value(" 1=%d", num_head) + fprintf_param_value(" 2=%d", weight_data_size) + fprintf_param_value(" 3=%d", int8_scale_term); + + fwrite_weight_tag_data(op->q_weight_data, bp); + fwrite_weight_data(op->q_bias_data, bp); + fwrite_weight_tag_data(op->k_weight_data, bp); + fwrite_weight_data(op->k_bias_data, bp); + fwrite_weight_tag_data(op->v_weight_data, bp); + fwrite_weight_data(op->v_bias_data, bp); + fwrite_weight_tag_data(op->out_weight_data, bp); + fwrite_weight_data(op->out_bias_data, bp); + +#ifdef NCNN_INT8 + if (op->int8_scale_term) + { + fwrite_weight_data(op->q_input_scale, bp, 90, 100); + fwrite_weight_data(op->k_input_scale, bp, 90, 100); + fwrite_weight_data(op->v_input_scale, bp, 90, 100); + + fwrite_weight_data(op->q_weight_scales, bp, 0.001, 1); + fwrite_weight_data(op->k_weight_scales, bp, 0.001, 1); + fwrite_weight_data(op->v_weight_scales, bp, 0.001, 1); + fwrite_weight_data(op->o_weight_scales, bp, 0.001, 1); + fwrite_weight_data(op->internal_scales, bp, 0.001, 1); + } +#endif + } + else if (layer->type == "MVN") + { + ncnn::MVN* op = (ncnn::MVN*)layer; + ncnn::MVN* op_default = (ncnn::MVN*)layer_default; + + fprintf_param_value(" 0=%d", normalize_variance) + fprintf_param_value(" 1=%d", across_channels) + fprintf_param_value(" 2=%e", eps) + } + else if (layer->type == "Normalize") + { + ncnn::Normalize* op = (ncnn::Normalize*)layer; + ncnn::Normalize* op_default = (ncnn::Normalize*)layer_default; + + fprintf_param_value(" 0=%d", across_spatial) + fprintf_param_value(" 1=%d", channel_shared) + fprintf_param_value(" 2=%e", eps) + 
fprintf_param_value(" 3=%d", scale_data_size) + fprintf_param_value(" 4=%d", across_channel) + fprintf_param_value(" 9=%d", eps_mode) + + fwrite_weight_data(op->scale_data, bp); + } + else if (layer->type == "Padding") + { + ncnn::Padding* op = (ncnn::Padding*)layer; + ncnn::Padding* op_default = (ncnn::Padding*)layer_default; + + fprintf_param_value(" 0=%d", top) + fprintf_param_value(" 1=%d", bottom) + fprintf_param_value(" 2=%d", left) + fprintf_param_value(" 3=%d", right) + fprintf_param_value(" 4=%d", type) + fprintf_param_value(" 5=%e", value) + fprintf_param_value(" 6=%d", per_channel_pad_data_size) + fprintf_param_value(" 7=%d", front) + fprintf_param_value(" 8=%d", behind) + + fwrite_weight_data(op->per_channel_pad_data, bp); + } + else if (layer->type == "Permute") + { + ncnn::Permute* op = (ncnn::Permute*)layer; + ncnn::Permute* op_default = (ncnn::Permute*)layer_default; + + fprintf_param_value(" 0=%d", order_type) + } + else if (layer->type == "PixelShuffle") + { + ncnn::PixelShuffle* op = (ncnn::PixelShuffle*)layer; + ncnn::PixelShuffle* op_default = (ncnn::PixelShuffle*)layer_default; + + fprintf_param_value(" 0=%d", upscale_factor) + fprintf_param_value(" 1=%d", mode) + } + else if (layer->type == "Pooling") + { + ncnn::Pooling* op = (ncnn::Pooling*)layer; + ncnn::Pooling* op_default = (ncnn::Pooling*)layer_default; + + fprintf_param_value(" 0=%d", pooling_type) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + } + fprintf_param_value(" 2=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 12=%d", op->stride_h); + } + fprintf_param_value(" 3=%d", pad_left) + { + if (op->pad_top != op->pad_left) fprintf(pp, " 13=%d", op->pad_top); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 15=%d", op->pad_bottom); + } + fprintf_param_value(" 4=%d", global_pooling) + fprintf_param_value(" 5=%d", pad_mode) + fprintf_param_value(" 6=%d", avgpool_count_include_pad) + fprintf_param_value(" 7=%d", adaptive_pooling) + fprintf_param_value(" 8=%d", out_w) + { + if (op->out_h != op->out_w) fprintf(pp, " 18=%d", op->out_h); + } + } + else if (layer->type == "Pooling1D") + { + ncnn::Pooling1D* op = (ncnn::Pooling1D*)layer; + ncnn::Pooling1D* op_default = (ncnn::Pooling1D*)layer_default; + + fprintf_param_value(" 0=%d", pooling_type) + fprintf_param_value(" 1=%d", kernel_w) + fprintf_param_value(" 2=%d", stride_w) + fprintf_param_value(" 3=%d", pad_left) + { + if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); + } + fprintf_param_value(" 4=%d", global_pooling) + fprintf_param_value(" 5=%d", pad_mode) + fprintf_param_value(" 6=%d", avgpool_count_include_pad) + fprintf_param_value(" 7=%d", adaptive_pooling) + fprintf_param_value(" 8=%d", out_w) + } + else if (layer->type == "Pooling3D") + { + ncnn::Pooling3D* op = (ncnn::Pooling3D*)layer; + ncnn::Pooling3D* op_default = (ncnn::Pooling3D*)layer_default; + + fprintf_param_value(" 0=%d", pooling_type) + fprintf_param_value(" 1=%d", kernel_w) + { + if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); + if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); + } + fprintf_param_value(" 2=%d", stride_w) + { + if (op->stride_h != op->stride_w) fprintf(pp, " 12=%d", op->stride_h); + if (op->stride_d != op->stride_w) fprintf(pp, " 22=%d", op->stride_d); + } + fprintf_param_value(" 3=%d", pad_left) + { + if (op->pad_top != 
op->pad_left) fprintf(pp, " 13=%d", op->pad_top); + if (op->pad_front != op->pad_left) fprintf(pp, " 23=%d", op->pad_front); + } + { + if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); + } + { + if (op->pad_bottom != op->pad_top) fprintf(pp, " 15=%d", op->pad_bottom); + } + { + if (op->pad_behind != op->pad_front) fprintf(pp, " 16=%d", op->pad_behind); + } + fprintf_param_value(" 4=%d", global_pooling) + fprintf_param_value(" 5=%d", pad_mode) + fprintf_param_value(" 6=%d", avgpool_count_include_pad) + fprintf_param_value(" 7=%d", adaptive_pooling) + fprintf_param_value(" 8=%d", out_w) + { + if (op->out_h != op->out_w) fprintf(pp, " 18=%d", op->out_h); + if (op->out_d != op->out_w) fprintf(pp, " 28=%d", op->out_d); + } + } + else if (layer->type == "Power") + { + ncnn::Power* op = (ncnn::Power*)layer; + ncnn::Power* op_default = (ncnn::Power*)layer_default; + + fprintf_param_value(" 0=%e", power) + fprintf_param_value(" 1=%e", scale) + fprintf_param_value(" 2=%e", shift) + } + else if (layer->type == "PReLU") + { + ncnn::PReLU* op = (ncnn::PReLU*)layer; + ncnn::PReLU* op_default = (ncnn::PReLU*)layer_default; + + fprintf_param_value(" 0=%d", num_slope) + + fwrite_weight_data(op->slope_data, bp); + } + else if (layer->type == "PriorBox") + { + ncnn::PriorBox* op = (ncnn::PriorBox*)layer; + ncnn::PriorBox* op_default = (ncnn::PriorBox*)layer_default; + + { + if (!op->min_sizes.empty()) fprintf_param_float_array(0, op->min_sizes, pp); + } + { + if (!op->max_sizes.empty()) fprintf_param_float_array(1, op->max_sizes, pp); + } + { + if (!op->aspect_ratios.empty()) fprintf_param_float_array(2, op->aspect_ratios, pp); + } + fprintf_param_value(" 3=%e", variances[0]) + fprintf_param_value(" 4=%e", variances[1]) + fprintf_param_value(" 5=%e", variances[2]) + fprintf_param_value(" 6=%e", variances[3]) + fprintf_param_value(" 7=%d", flip) + fprintf_param_value(" 8=%d", clip) + fprintf_param_value(" 9=%d", image_width) + fprintf_param_value(" 10=%d", image_height) + fprintf_param_value(" 11=%e", step_width) + fprintf_param_value(" 12=%e", step_height) + fprintf_param_value(" 13=%e", offset) + } + else if (layer->type == "Proposal") + { + ncnn::Proposal* op = (ncnn::Proposal*)layer; + ncnn::Proposal* op_default = (ncnn::Proposal*)layer_default; + + fprintf_param_value(" 0=%d", feat_stride) + fprintf_param_value(" 1=%d", base_size) + fprintf_param_value(" 2=%d", pre_nms_topN) + fprintf_param_value(" 3=%d", after_nms_topN) + fprintf_param_value(" 4=%e", nms_thresh) + fprintf_param_value(" 5=%d", min_size) + } + else if (layer->type == "PSROIPooling") + { + ncnn::PSROIPooling* op = (ncnn::PSROIPooling*)layer; + ncnn::PSROIPooling* op_default = (ncnn::PSROIPooling*)layer_default; + + fprintf_param_value(" 0=%d", pooled_width) + fprintf_param_value(" 1=%d", pooled_height) + fprintf_param_value(" 2=%e", spatial_scale) + fprintf_param_value(" 3=%d", output_dim) + } + else if (layer->type == "Quantize") + { + ncnn::Quantize* op = (ncnn::Quantize*)layer; + ncnn::Quantize* op_default = (ncnn::Quantize*)layer_default; + + fprintf_param_value(" 0=%d", scale_data_size) + + fwrite_weight_data(op->scale_data, bp); + } + else if (layer->type == "Reduction") + { + ncnn::Reduction* op = (ncnn::Reduction*)layer; + ncnn::Reduction* op_default = (ncnn::Reduction*)layer_default; + + fprintf_param_value(" 0=%d", operation) + fprintf_param_value(" 1=%d", reduce_all) + fprintf_param_value(" 2=%e", coeff) + { + if (!op->axes.empty()) fprintf_param_int_array(3, op->axes, pp); + } + fprintf_param_value(" 
4=%d", keepdims) + + // HACK + if (!op->axes.empty()) + { + int fixbug0 = 1; + fprintf(pp, " 5=%d", fixbug0); + } + } + else if (layer->type == "ReLU") + { + ncnn::ReLU* op = (ncnn::ReLU*)layer; + ncnn::ReLU* op_default = (ncnn::ReLU*)layer_default; + + fprintf_param_value(" 0=%e", slope) + } + else if (layer->type == "Reorg") + { + ncnn::Reorg* op = (ncnn::Reorg*)layer; + ncnn::Reorg* op_default = (ncnn::Reorg*)layer_default; + + fprintf_param_value(" 0=%d", stride) + fprintf_param_value(" 1=%d", mode) + } + else if (layer->type == "Requantize") + { + ncnn::Requantize* op = (ncnn::Requantize*)layer; + ncnn::Requantize* op_default = (ncnn::Requantize*)layer_default; + + fprintf_param_value(" 0=%d", scale_in_data_size) + fprintf_param_value(" 1=%d", scale_out_data_size) + fprintf_param_value(" 2=%d", bias_data_size) + fprintf_param_value(" 3=%d", activation_type) + { + if (!op->activation_params.empty()) fprintf_param_float_array(4, op->activation_params, pp); + } + + fwrite_weight_data(op->scale_in_data, bp); + fwrite_weight_data(op->scale_out_data, bp); + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "Reshape") + { + ncnn::Reshape* op = (ncnn::Reshape*)layer; + ncnn::Reshape* op_default = (ncnn::Reshape*)layer_default; + + fprintf_param_value(" 0=%d", w) + fprintf_param_value(" 1=%d", h) + fprintf_param_value(" 2=%d", c) + fprintf_param_value(" 3=%d", permute) + } + else if (layer->type == "RNN") + { + ncnn::RNN* op = (ncnn::RNN*)layer; + ncnn::RNN* op_default = (ncnn::RNN*)layer_default; + + fprintf_param_value(" 0=%d", num_output) + fprintf_param_value(" 1=%d", weight_data_size) + fprintf_param_value(" 2=%d", direction) + + fwrite_weight_tag_data(op->weight_xc_data, bp); + fwrite_weight_tag_data(op->bias_c_data, bp); + fwrite_weight_tag_data(op->weight_hc_data, bp); + } + else if (layer->type == "ROIAlign") + { + ncnn::ROIAlign* op = (ncnn::ROIAlign*)layer; + ncnn::ROIAlign* op_default = (ncnn::ROIAlign*)layer_default; + + fprintf_param_value(" 0=%d", pooled_width) + fprintf_param_value(" 1=%d", pooled_height) + fprintf_param_value(" 2=%e", spatial_scale) + fprintf_param_value(" 3=%d", sampling_ratio) + fprintf_param_value(" 4=%d", aligned) + fprintf_param_value(" 5=%d", version) + } + else if (layer->type == "ROIPooling") + { + ncnn::ROIPooling* op = (ncnn::ROIPooling*)layer; + ncnn::ROIPooling* op_default = (ncnn::ROIPooling*)layer_default; + + fprintf_param_value(" 0=%d", pooled_width) + fprintf_param_value(" 1=%d", pooled_height) + fprintf_param_value(" 2=%e", spatial_scale) + } + else if (layer->type == "Scale") + { + ncnn::Scale* op = (ncnn::Scale*)layer; + ncnn::Scale* op_default = (ncnn::Scale*)layer_default; + + fprintf_param_value(" 0=%d", scale_data_size) + fprintf_param_value(" 1=%d", bias_term) + + fwrite_weight_data(op->scale_data, bp); + fwrite_weight_data(op->bias_data, bp); + } + else if (layer->type == "ShuffleChannel") + { + ncnn::ShuffleChannel* op = (ncnn::ShuffleChannel*)layer; + ncnn::ShuffleChannel* op_default = (ncnn::ShuffleChannel*)layer_default; + + fprintf_param_value(" 0=%d", group) + fprintf_param_value(" 1=%d", reverse) + } + else if (layer->type == "Slice") + { + ncnn::Slice* op = (ncnn::Slice*)layer; + ncnn::Slice* op_default = (ncnn::Slice*)layer_default; + + { + if (!op->slices.empty()) fprintf_param_int_array(0, op->slices, pp); + } + fprintf_param_value(" 1=%d", axis) + } + else if (layer->type == "Softmax") + { + ncnn::Softmax* op = (ncnn::Softmax*)layer; + ncnn::Softmax* op_default = (ncnn::Softmax*)layer_default; + + 
fprintf_param_value(" 0=%d", axis) + + // HACK + if (op->axis != 0) + { + int fixbug0 = 1; + fprintf(pp, " 1=%d", fixbug0); + } + } + else if (layer->type == "Squeeze") + { + ncnn::Squeeze* op = (ncnn::Squeeze*)layer; + ncnn::Squeeze* op_default = (ncnn::Squeeze*)layer_default; + + fprintf_param_value(" 0=%d", squeeze_w) + fprintf_param_value(" 1=%d", squeeze_h) + fprintf_param_value(" 2=%d", squeeze_c) + { + if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); + } + } + else if (layer->type == "Threshold") + { + ncnn::Threshold* op = (ncnn::Threshold*)layer; + ncnn::Threshold* op_default = (ncnn::Threshold*)layer_default; + + fprintf_param_value(" 0=%e", threshold) + } + else if (layer->type == "UnaryOp") + { + ncnn::UnaryOp* op = (ncnn::UnaryOp*)layer; + ncnn::UnaryOp* op_default = (ncnn::UnaryOp*)layer_default; + + fprintf_param_value(" 0=%d", op_type) + } + else if (layer->type == "YoloDetectionOutput") + { + ncnn::YoloDetectionOutput* op = (ncnn::YoloDetectionOutput*)layer; + ncnn::YoloDetectionOutput* op_default = (ncnn::YoloDetectionOutput*)layer_default; + + fprintf_param_value(" 0=%d", num_class) + fprintf_param_value(" 1=%d", num_box) + fprintf_param_value(" 2=%e", confidence_threshold) + fprintf_param_value(" 3=%e", nms_threshold) + { + if (!op->biases.empty()) fprintf_param_float_array(4, op->biases, pp); + } + } + else if (layer->type == "Yolov3DetectionOutput") + { + ncnn::Yolov3DetectionOutput* op = (ncnn::Yolov3DetectionOutput*)layer; + ncnn::Yolov3DetectionOutput* op_default = (ncnn::Yolov3DetectionOutput*)layer_default; + + fprintf_param_value(" 0=%d", num_class) + fprintf_param_value(" 1=%d", num_box) + fprintf_param_value(" 2=%e", confidence_threshold) + fprintf_param_value(" 3=%e", nms_threshold) + { + if (!op->biases.empty()) fprintf_param_float_array(4, op->biases, pp); + } + { + if (!op->mask.empty()) fprintf_param_int_array(5, op->mask, pp); + } + { + if (!op->anchors_scale.empty()) fprintf_param_float_array(6, op->anchors_scale, pp); + } + } + +#undef fprintf_param_value + + fprintf(pp, "\n"); + + delete layer_default; + } + + fclose(pp); + fclose(bp); + + if (mac) + { + fprintf(stderr, "mac = %llu = %.2f M\n", static_cast(mac), mac / 1000000.0); + } + + return 0; +} diff --git a/tools/modelwriter.h b/tools/modelwriter.h index e9ff979176a7..844c6d4f6efa 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -11,7 +11,7 @@ // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
- +#pragma once #ifdef _MSC_VER #define _CRT_SECURE_NO_DEPRECATE #endif @@ -115,30 +115,11 @@ static struct prng_rand_t g_prng_rand_state; class MemoryFootprintAllocator : public ncnn::Allocator { public: - MemoryFootprintAllocator() - { - current_memory_usage = 0; - memory_footprint = 0; - } + MemoryFootprintAllocator(); - virtual void* fastMalloc(size_t size) - { - ncnn::MutexLockGuard g(lock); - void* ptr = ncnn::fastMalloc(size); - bookkeeper[ptr] = size; - current_memory_usage += size; - memory_footprint = std::max(memory_footprint, current_memory_usage); - return ptr; - } + virtual void* fastMalloc(size_t size); - virtual void fastFree(void* ptr) - { - ncnn::MutexLockGuard g(lock); - size_t size = bookkeeper[ptr]; - current_memory_usage -= size; - bookkeeper.erase(bookkeeper.find(ptr)); - ncnn::fastFree(ptr); - } + virtual void fastFree(void* ptr); public: int current_memory_usage; @@ -150,59 +131,14 @@ class MemoryFootprintAllocator : public ncnn::Allocator class CustomLayer : public ncnn::Layer { public: - virtual int load_param(const ncnn::ParamDict& pd) - { - mpd = pd; - return 0; - } - - void write_param(FILE* pp) - { - for (int i = 0; i < NCNN_MAX_PARAM_COUNT; i++) - { - int type = mpd.type(i); - if (type == 0) - continue; + virtual int load_param(const ncnn::ParamDict& pd); - if (type == 2) - { - fprintf(pp, " %d=%d", i, mpd.get(i, 0)); - } - if (type == 3) - { - fprintf(pp, " %d=%e", i, mpd.get(i, 0.f)); - } - if (type == 5) - { - ncnn::Mat v = mpd.get(i, ncnn::Mat()); - int len = v.w; - fprintf(pp, " %d=%d", -i - 23300, len); - const int* p = v; - for (int j = 0; j < len; j++) - { - fprintf(pp, ",%d", p[j]); - } - } - if (type == 6) - { - ncnn::Mat v = mpd.get(i, ncnn::Mat()); - int len = v.w; - fprintf(pp, " %d=%d", -i - 23300, len); - const float* p = v; - for (int j = 0; j < len; j++) - { - fprintf(pp, ",%e", p[j]); - } - } - } - } + void write_param(FILE* pp); public: ncnn::ParamDict mpd; }; -DEFINE_LAYER_CREATOR(CustomLayer) - class ModelWriter : public ncnn::Net { public: @@ -240,1981 +176,3 @@ class ModelWriter : public ncnn::Net int save(const char* parampath, const char* binpath); }; - -ModelWriter::ModelWriter() - : blobs(mutable_blobs()), layers(mutable_layers()) -{ - opt.lightmode = false; - has_custom_layer = false; - gen_random_weight = false; - cutstart = -1; - cutend = -1; - - SRAND(7767517); -} - -ncnn::Layer* ModelWriter::create_custom_layer(const char* type) -{ - ncnn::Layer* layer = Net::create_custom_layer(type); - if (layer) - return layer; - - fprintf(stderr, "create_custom_layer %s\n", type); - - register_custom_layer(type, CustomLayer_layer_creator); - - has_custom_layer = true; - - return Net::create_custom_layer(type); -} - -int ModelWriter::set_cutparam(const char* cutstartname, const char* cutendname) -{ - if (cutstartname != nullptr) - { - int layindex = find_layer_index_by_name(cutstartname); - if (layindex >= 0) - { - cutstart = layindex; - fprintf(stderr, "cutstart layer %d:%s\n", layindex, cutstartname); - } - else - { - fprintf(stderr, "not find target cutstart layer %s\n", cutstartname); - return -1; - } - } - - if (cutendname != nullptr) - { - int layindex = find_layer_index_by_name(cutendname); - if (layindex >= 0) - { - cutend = layindex; - fprintf(stderr, "cutend layer %d:%s\n", layindex, cutendname); - } - else - { - fprintf(stderr, "not find target cutend layer %s\n", cutendname); - return -1; - } - } - - return 0; -} - -int ModelWriter::shape_inference() -{ - if (has_custom_layer) - { - fprintf(stderr, "model has custom layer, 
shape_inference skipped\n"); - return -1; - } - - const size_t layer_count = layers.size(); - const size_t blob_count = blobs.size(); - - // recreate layer pipeline for param and weight changes - for (size_t i = 0; i < layer_count; i++) - { - ncnn::Layer* layer = layers[i]; - - layer->destroy_pipeline(opt); - - int cret = layer->create_pipeline(opt); - if (cret != 0) - { - NCNN_LOGE("layer create_pipeline %d %s failed", (int)i, layer->name.c_str()); - return -1; - } - } - - ncnn::Extractor ex = create_extractor(); - ex.set_light_mode(true); - - // prepare Input blobs - for (size_t i = 0; i < layer_count; i++) - { - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - if (layer->type != "Input") - continue; - - ncnn::Input* input = (ncnn::Input*)layer; - - int w = input->w; - int h = input->h; - int c = input->c; - - int dims = 0; - if (w == 0 && h == 0 && c == 0) dims = 0; - if (w != 0 && h == 0 && c == 0) dims = 1; - if (w != 0 && h != 0 && c == 0) dims = 2; - if (w != 0 && h != 0 && c != 0) dims = 3; - - if (dims == 0) - { - fprintf(stderr, "Input layer %s without shape info, shape_inference skipped\n", layer->name.c_str()); - return -1; - } - - ncnn::Mat m; - if (dims == 1) m.create(w); - if (dims == 2) m.create(w, h); - if (dims == 3) m.create(w, h, c); - - ex.input(layer->tops[0], m); - } - - // prepare blobs with predefined shape - for (size_t i = 0; i < blob_count; i++) - { - const ncnn::Blob& blob = blobs[i]; - - int dims = blob.shape.dims; - int w = blob.shape.w; - int h = blob.shape.h; - int c = blob.shape.c; - - if (dims == 0) - continue; - - ncnn::Mat m; - if (dims == 1) m.create(w); - if (dims == 2) m.create(w, h); - if (dims == 3) m.create(w, h, c); - - m.fill(0.f); - - ex.input(int(i), m); - } - - fprintf(stderr, "shape_inference\n"); - - // resolve all layer output blob shape - for (size_t i = 0; i < layer_count; i++) - { - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - for (size_t j = 0; j < layer->tops.size(); j++) - { - int top_blob_index = layer->tops[j]; - - ncnn::Mat m; - ex.extract(top_blob_index, m); - - blobs[top_blob_index].shape = m; - } - } - - // assign all layer blob shape - for (size_t i = 0; i < layer_count; i++) - { - ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - layer->bottom_shapes.resize(layer->bottoms.size()); - for (size_t j = 0; j < layer->bottoms.size(); j++) - { - int bottom_blob_index = layer->bottoms[j]; - - layer->bottom_shapes[j] = blobs[bottom_blob_index].shape; - } - - layer->top_shapes.resize(layer->tops.size()); - for (size_t j = 0; j < layer->tops.size(); j++) - { - int top_blob_index = layer->tops[j]; - - layer->top_shapes[j] = blobs[top_blob_index].shape; - - // fprintf(stderr, "%d %4d %4d %4d | %2d %s\n", blobs[top_blob_index].shape.dims, blobs[top_blob_index].shape.w, blobs[top_blob_index].shape.h, blobs[top_blob_index].shape.c, top_blob_index, blobs[top_blob_index].name.c_str()); - } - } - - return 0; -} - -int ModelWriter::estimate_memory_footprint() -{ - if (has_custom_layer) - { - fprintf(stderr, "model has custom layer, estimate_memory_footprint skipped\n"); - return -1; - } - - const size_t layer_count = layers.size(); - const size_t blob_count = blobs.size(); - - MemoryFootprintAllocator allocator; - - ncnn::Extractor ex = create_extractor(); - ex.set_light_mode(true); - - ex.set_blob_allocator(&allocator); - ex.set_workspace_allocator(&allocator); - - // prepare Input blobs - for (size_t i = 0; i < layer_count; i++) - 
{ - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - if (layer->type != "Input") - continue; - - ncnn::Input* input = (ncnn::Input*)layer; - - int w = input->w; - int h = input->h; - int c = input->c; - - int dims = 0; - if (w == 0 && h == 0 && c == 0) dims = 0; - if (w != 0 && h == 0 && c == 0) dims = 1; - if (w != 0 && h != 0 && c == 0) dims = 2; - if (w != 0 && h != 0 && c != 0) dims = 3; - - if (dims == 0) - { - fprintf(stderr, "Input layer %s without shape info, estimate_memory_footprint skipped\n", layer->name.c_str()); - return -1; - } - - ncnn::Mat m; - if (dims == 1) m.create(w, 4u, &allocator); - if (dims == 2) m.create(w, h, 4u, &allocator); - if (dims == 3) m.create(w, h, c, 4u, &allocator); - - ex.input(layer->tops[0], m); - - fprintf(stderr, "input = %s\n", blobs[layer->tops[0]].name.c_str()); - } - - // find output blobs and do inference - std::vector<ncnn::Mat> outputs; - for (size_t i = 0; i < blob_count; i++) - { - const ncnn::Blob& blob = blobs[i]; - - if (blob.producer == -1 || blob.consumer != -1) - continue; - - if (layers[blob.producer]->type == "ncnnfused") - continue; - - // treat blob without any consumers as output - ncnn::Mat m; - ex.extract(int(i), m); - outputs.push_back(m); - - fprintf(stderr, "extract = %s\n", blob.name.c_str()); - } - - fprintf(stderr, "estimated memory footprint = %.2f KB = %.2f MB\n", allocator.memory_footprint / 1024.f, allocator.memory_footprint / 1024.f / 1024.f); - - return 0; -} - -int ModelWriter::fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp) -{ - const int count = m.w; - const int* ptr = m; - - fprintf(pp, " -%d=%d", 23300 + id, count); - for (int i = 0; i < count; i++) - { - fprintf(pp, ",%d", ptr[i]); - } - - return 0; -} - -int ModelWriter::fprintf_param_float_array(int id, const ncnn::Mat& m, FILE* pp) -{ - const int count = m.w; - const float* ptr = m; - - fprintf(pp, " -%d=%d", 23300 + id, count); - for (int i = 0; i < count; i++) - { - fprintf(pp, ",%e", ptr[i]); - } - - return 0; -} - -static inline size_t alignSize(size_t sz, int n) -{ - return (sz + n - 1) & -n; -} - -static void replace_denormals_with_zero(float* data, size_t data_length) -{ - const int total = static_cast<int>(data_length); - for (size_t i = 0; i < data_length; ++i) - { - float value = data[i]; - - if (fabsf(value) < 1e-30 && fabsf(value) != 0.f) - { - data[i] = 0.f; - } - } -} - -static float RandomFloat(float a = -1.2f, float b = 1.2f) -{ - float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX; - float diff = b - a; - float r = random * diff; - return a + r; -} - -static void Randomize(ncnn::Mat& m, float a = -1.2f, float b = 1.2f) -{ - if (m.elemsize == 4) - { - for (size_t i = 0; i < m.total(); i++) - { - m[i] = RandomFloat(a, b); - } - } - else if (m.elemsize == 2) - { - unsigned short* p = m; - for (size_t i = 0; i < m.total(); i++) - { - p[i] = ncnn::float32_to_float16(RandomFloat(a, b)); - } - } - else if (m.elemsize == 1) - { - signed char* p = m; - for (size_t i = 0; i < m.total(); i++) - { - p[i] = (signed char)RandomFloat(-127, 127); - } - } -} - -int ModelWriter::fwrite_weight_tag_data(const ncnn::Mat& data, FILE* bp, float a, float b) -{ - int p0 = ftell(bp); - - ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.d * data.c); - if (gen_random_weight) - Randomize(data_flattened, a, b); - - if (data_flattened.elemsize == 4) - { - if (storage_type == 1) - { - const int tag = 0x01306B47; // fp16 magic - fwrite(&tag, sizeof(int), 1, bp); - ncnn::Mat data_flattened_fp16; -
ncnn::cast_float32_to_float16(data_flattened, data_flattened_fp16); - fwrite(data_flattened_fp16.data, data_flattened_fp16.elemsize, data_flattened_fp16.w, bp); - } - else - { - const int tag = 0; // fp32 magic - fwrite(&tag, sizeof(int), 1, bp); - replace_denormals_with_zero(data_flattened, data_flattened.w); - fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); - } - } - else if (data_flattened.elemsize == 2) - { - const int tag = 0x01306B47; // fp16 magic - fwrite(&tag, sizeof(int), 1, bp); - fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); - } - else if (data_flattened.elemsize == 1) - { - const int tag = 0x000D4B38; // int8 magic - fwrite(&tag, sizeof(int), 1, bp); - fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); - } - else - { - fprintf(stderr, "unknown weight data type %d\n", (int)data_flattened.elemsize); - } - - // padding to 32bit align - int nwrite = ftell(bp) - p0; - size_t nalign = alignSize(nwrite, 4); - unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; - fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); - - return 0; -} - -int ModelWriter::fwrite_weight_data(const ncnn::Mat& data, FILE* bp, float a, float b) -{ - int p0 = ftell(bp); - - ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.d * data.c); - if (gen_random_weight) - Randomize(data_flattened, a, b); - - if (data_flattened.elemsize == 4) // fp32 - { - replace_denormals_with_zero(data_flattened, data_flattened.w); - } - - fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); - - // padding to 32bit align - int nwrite = ftell(bp) - p0; - size_t nalign = alignSize(nwrite, 4); - unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; - fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); - - return 0; -} - -int ModelWriter::save(const char* parampath, const char* binpath) -{ - uint64_t mac = 0; - - FILE* pp = fopen(parampath, "wb"); - FILE* bp = fopen(binpath, "wb"); - - fprintf(pp, "7767517\n"); - - const size_t layer_count = layers.size(); - - int layer_count_fused = 0; - std::set<std::string> blob_names; - for (size_t i = 0; i < layer_count; i++) - { - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - layer_count_fused++; - - size_t bottom_count = layer->bottoms.size(); - for (size_t j = 0; j < bottom_count; j++) - { - int bottom_blob_index = layer->bottoms[j]; - blob_names.insert(blobs[bottom_blob_index].name); - } - - size_t top_count = layer->tops.size(); - for (size_t j = 0; j < top_count; j++) - { - int top_blob_index = layer->tops[j]; - blob_names.insert(blobs[top_blob_index].name); - } - } - - size_t blob_count_fused = blob_names.size(); - - fprintf(pp, "%d %zd\n", layer_count_fused, blob_count_fused); - - for (size_t i = 0; i < layer_count; i++) - { - const ncnn::Layer* layer = layers[i]; - if (layer->type == "ncnnfused") - continue; - - if (cutstart > 0 && i < cutstart) - continue; - - if (cutend > 0 && i > cutend) - continue; - - size_t bottom_count = layer->bottoms.size(); - size_t top_count = layer->tops.size(); - - fprintf(pp, "%-24s %-24s %zd %zd", layer->type.c_str(), layer->name.c_str(), bottom_count, top_count); - - for (size_t j = 0; j < bottom_count; j++) - { - int bottom_blob_index = layer->bottoms[j]; - fprintf(pp, " %s", blobs[bottom_blob_index].name.c_str()); - } - for (size_t j = 0; j < top_count; j++) - { - int top_blob_index = layer->tops[j]; - fprintf(pp, " %s", blobs[top_blob_index].name.c_str()); - } - - // write shape hints - bool
shape_ready = true; - for (size_t j = 0; j < top_count; j++) - { - int top_blob_index = layer->tops[j]; - - int dims = blobs[top_blob_index].shape.dims; - if (dims == 0) - { - shape_ready = false; - break; - } - } - if (shape_ready) - { - fprintf(pp, " -23330=%zd", top_count * 4); - for (size_t j = 0; j < top_count; j++) - { - int top_blob_index = layer->tops[j]; - - int dims = blobs[top_blob_index].shape.dims; - int w = blobs[top_blob_index].shape.w; - int h = blobs[top_blob_index].shape.h; - int c = blobs[top_blob_index].shape.c; - - fprintf(pp, ",%d,%d,%d,%d", dims, w, h, c); - } - } - - // custom op - if (layer->typeindex & ncnn::LayerType::CustomBit) - { - ((CustomLayer*)layer)->write_param(pp); - - fprintf(pp, "\n"); - - continue; - } - - ncnn::Layer* layer_default = ncnn::create_layer(layer->typeindex); - - ncnn::ParamDict pd; - layer_default->load_param(pd); - -#define fprintf_param_value(format, phase) \ - { \ - if (op->phase != op_default->phase) fprintf(pp, format, op->phase); \ - } - - if (layer->type == "BatchNorm") - { - ncnn::BatchNorm* op = (ncnn::BatchNorm*)layer; - ncnn::BatchNorm* op_default = (ncnn::BatchNorm*)layer_default; - - fprintf_param_value(" 0=%d", channels) - fprintf_param_value(" 1=%e", eps) - - fwrite_weight_data(op->slope_data, bp); - fwrite_weight_data(op->mean_data, bp); - fwrite_weight_data(op->var_data, bp); - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "Bias") - { - ncnn::Bias* op = (ncnn::Bias*)layer; - ncnn::Bias* op_default = (ncnn::Bias*)layer_default; - - fprintf_param_value(" 0=%d", bias_data_size) - - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "BinaryOp") - { - ncnn::BinaryOp* op = (ncnn::BinaryOp*)layer; - ncnn::BinaryOp* op_default = (ncnn::BinaryOp*)layer_default; - - fprintf_param_value(" 0=%d", op_type) - fprintf_param_value(" 1=%d", with_scalar) - fprintf_param_value(" 2=%e", b) - } - else if (layer->type == "Clip") - { - ncnn::Clip* op = (ncnn::Clip*)layer; - ncnn::Clip* op_default = (ncnn::Clip*)layer_default; - - fprintf_param_value(" 0=%e", min) - fprintf_param_value(" 1=%e", max) - } - else if (layer->type == "Concat") - { - ncnn::Concat* op = (ncnn::Concat*)layer; - ncnn::Concat* op_default = (ncnn::Concat*)layer_default; - - fprintf_param_value(" 0=%d", axis) - } - else if (layer->type == "Convolution") - { - ncnn::Convolution* op = (ncnn::Convolution*)layer; - ncnn::Convolution* op_default = (ncnn::Convolution*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 8=%d", int8_scale_term) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - 
fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - -#if NCNN_INT8 - // write int8_scale data - if (op->int8_scale_term) - { - fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); - fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); - fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1); - } -#endif // NCNN_INT8 - - if (shape_ready) - { - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_h * op->kernel_w * outw * outh * outc * inc; - } - } - else if (layer->type == "Convolution1D") - { - ncnn::Convolution1D* op = (ncnn::Convolution1D*)layer; - ncnn::Convolution1D* op_default = (ncnn::Convolution1D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", dilation_w) - fprintf_param_value(" 3=%d", stride_w) - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inh = blobs[layer->bottoms[0]].shape.h; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - - mac += (uint64_t)op->kernel_w * outw * outh * inh; - } - } - else if (layer->type == "Convolution3D") - { - ncnn::Convolution3D* op = (ncnn::Convolution3D*)layer; - ncnn::Convolution3D* op_default = (ncnn::Convolution3D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - { - if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - int outd = 
blobs[layer->tops[0]].shape.d; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * outw * outh * outd * outc * inc; - } - } - else if (layer->type == "ConvolutionDepthWise") - { - ncnn::ConvolutionDepthWise* op = (ncnn::ConvolutionDepthWise*)layer; - ncnn::ConvolutionDepthWise* op_default = (ncnn::ConvolutionDepthWise*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 8=%d", int8_scale_term) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - -#if NCNN_INT8 - // write int8_scale data - if (op->int8_scale_term == 1 || op->int8_scale_term == 101) - { - op->bottom_blob_int8_scales.w = 1; - } - if (op->int8_scale_term == 2 || op->int8_scale_term == 102) - { - op->weight_data_int8_scales.w = 1; - op->bottom_blob_int8_scales.w = 1; - } - if (op->int8_scale_term > 100) - { - op->top_blob_int8_scales.w = 1; - } - - if (op->int8_scale_term) - { - fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); - fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); - fwrite_weight_data(op->top_blob_int8_scales, bp, 0.001, 1); - } -#endif // NCNN_INT8 - - if (shape_ready) - { - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_h * op->kernel_w * outw * outh * (outc / op->group) * (inc / op->group) * op->group; - } - } - else if (layer->type == "ConvolutionDepthWise1D") - { - ncnn::ConvolutionDepthWise1D* op = (ncnn::ConvolutionDepthWise1D*)layer; - ncnn::ConvolutionDepthWise1D* op_default = (ncnn::ConvolutionDepthWise1D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", dilation_w) - fprintf_param_value(" 3=%d", stride_w) - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inh = blobs[layer->bottoms[0]].shape.h; - int outw = blobs[layer->tops[0]].shape.w; 
- int outh = blobs[layer->tops[0]].shape.h; - - mac += (uint64_t)op->kernel_w * outw * (outh / op->group) * (inh / op->group) * op->group; - } - } - else if (layer->type == "ConvolutionDepthWise3D") - { - ncnn::ConvolutionDepthWise3D* op = (ncnn::ConvolutionDepthWise3D*)layer; - ncnn::ConvolutionDepthWise3D* op_default = (ncnn::ConvolutionDepthWise3D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - { - if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); - } - fprintf_param_value(" 18=%e", pad_value) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - int outh = blobs[layer->tops[0]].shape.h; - int outd = blobs[layer->tops[0]].shape.d; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * outw * outh * outd * (outc / op->group) * (inc / op->group) * op->group; - } - } - else if (layer->type == "Crop") - { - ncnn::Crop* op = (ncnn::Crop*)layer; - ncnn::Crop* op_default = (ncnn::Crop*)layer_default; - - fprintf_param_value(" 0=%d", woffset) - fprintf_param_value(" 1=%d", hoffset) - fprintf_param_value(" 2=%d", coffset) - fprintf_param_value(" 3=%d", outw) - fprintf_param_value(" 4=%d", outh) - fprintf_param_value(" 5=%d", outc) - fprintf_param_value(" 6=%d", woffset2) - fprintf_param_value(" 7=%d", hoffset2) - fprintf_param_value(" 8=%d", coffset2) - { - if (!op->starts.empty()) fprintf_param_int_array(9, op->starts, pp); - } - { - if (!op->ends.empty()) fprintf_param_int_array(10, op->ends, pp); - } - { - if (!op->axes.empty()) fprintf_param_int_array(11, op->axes, pp); - } - } - else if (layer->type == "Deconvolution") - { - ncnn::Deconvolution* op = (ncnn::Deconvolution*)layer; - ncnn::Deconvolution* op_default = (ncnn::Deconvolution*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) 
fprintf(pp, " 13=%d", op->stride_h); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - fprintf_param_value(" 18=%d", output_pad_right) - { - if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); - } - fprintf_param_value(" 20=%d", output_w) - { - if (op->output_h != op->output_w) fprintf(pp, " 21=%d", op->output_h); - } - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int inc = blobs[layer->bottoms[0]].shape.c; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_h * op->kernel_w * inw * inh * outc * inc; - } - } - else if (layer->type == "Deconvolution1D") - { - ncnn::Deconvolution1D* op = (ncnn::Deconvolution1D*)layer; - ncnn::Deconvolution1D* op_default = (ncnn::Deconvolution1D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", dilation_w) - fprintf_param_value(" 3=%d", stride_w) - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - fprintf_param_value(" 18=%d", output_pad_right) - fprintf_param_value(" 20=%d", output_w) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int outh = blobs[layer->tops[0]].shape.h; - - mac += (uint64_t)op->kernel_w * inw * outh * inh; - } - } - else if (layer->type == "Deconvolution3D") - { - ncnn::Deconvolution3D* op = (ncnn::Deconvolution3D*)layer; - ncnn::Deconvolution3D* op_default = (ncnn::Deconvolution3D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - 
{ - if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); - } - fprintf_param_value(" 18=%d", output_pad_right) - { - if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); - if (op->output_pad_behind != op->output_pad_right) fprintf(pp, " 20=%d", op->output_pad_behind); - } - fprintf_param_value(" 25=%d", output_w) - { - if (op->output_h != op->output_w) fprintf(pp, " 26=%d", op->output_h); - if (op->output_d != op->output_w) fprintf(pp, " 27=%d", op->output_d); - } - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int ind = blobs[layer->bottoms[0]].shape.d; - int inc = blobs[layer->bottoms[0]].shape.c; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * inw * inh * ind * outc * inc; - } - } - else if (layer->type == "DeconvolutionDepthWise") - { - ncnn::DeconvolutionDepthWise* op = (ncnn::DeconvolutionDepthWise*)layer; - ncnn::DeconvolutionDepthWise* op_default = (ncnn::DeconvolutionDepthWise*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - fprintf_param_value(" 18=%d", output_pad_right) - { - if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); - } - fprintf_param_value(" 20=%d", output_w) - { - if (op->output_h != op->output_w) fprintf(pp, " 21=%d", op->output_h); - } - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int inc = blobs[layer->bottoms[0]].shape.c; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_h * op->kernel_w * inw * inh * (outc / op->group) * (inc / op->group) * op->group; - } - } - else if (layer->type == "DeconvolutionDepthWise1D") - { - ncnn::DeconvolutionDepthWise1D* op = (ncnn::DeconvolutionDepthWise1D*)layer; - ncnn::DeconvolutionDepthWise1D* op_default = (ncnn::DeconvolutionDepthWise1D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", dilation_w) - fprintf_param_value(" 3=%d", stride_w) - 
fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - fprintf_param_value(" 18=%d", output_pad_right) - fprintf_param_value(" 20=%d", output_w) - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int outh = blobs[layer->tops[0]].shape.h; - - mac += (uint64_t)op->kernel_w * inw * (outh / op->group) * (inh / op->group) * op->group; - } - } - else if (layer->type == "DeconvolutionDepthWise3D") - { - ncnn::DeconvolutionDepthWise3D* op = (ncnn::DeconvolutionDepthWise3D*)layer; - ncnn::DeconvolutionDepthWise3D* op_default = (ncnn::DeconvolutionDepthWise3D*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", dilation_w) - { - if (op->dilation_h != op->dilation_w) fprintf(pp, " 12=%d", op->dilation_h); - if (op->dilation_d != op->dilation_w) fprintf(pp, " 22=%d", op->dilation_d); - } - fprintf_param_value(" 3=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 13=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 23=%d", op->stride_d); - } - fprintf_param_value(" 4=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 14=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 24=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 15=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 16=%d", op->pad_bottom); - } - { - if (op->pad_behind != op->pad_front) fprintf(pp, " 17=%d", op->pad_behind); - } - fprintf_param_value(" 18=%d", output_pad_right) - { - if (op->output_pad_bottom != op->output_pad_right) fprintf(pp, " 19=%d", op->output_pad_bottom); - if (op->output_pad_behind != op->output_pad_right) fprintf(pp, " 20=%d", op->output_pad_behind); - } - fprintf_param_value(" 25=%d", output_w) - { - if (op->output_h != op->output_w) fprintf(pp, " 26=%d", op->output_h); - if (op->output_d != op->output_w) fprintf(pp, " 27=%d", op->output_d); - } - fprintf_param_value(" 5=%d", bias_term) - fprintf_param_value(" 6=%d", weight_data_size) - fprintf_param_value(" 7=%d", group) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int ind = blobs[layer->bottoms[0]].shape.d; - int inc = blobs[layer->bottoms[0]].shape.c; - int outc = blobs[layer->tops[0]].shape.c; - - mac += (uint64_t)op->kernel_d * op->kernel_h * op->kernel_w * inw * inh * ind * (outc / op->group) * (inc / op->group) * op->group; - } - } - else if (layer->type == "DetectionOutput") - { - ncnn::DetectionOutput* op = (ncnn::DetectionOutput*)layer; - ncnn::DetectionOutput* op_default = 
(ncnn::DetectionOutput*)layer_default; - - fprintf_param_value(" 0=%d", num_class) - fprintf_param_value(" 1=%e", nms_threshold) - fprintf_param_value(" 2=%d", nms_top_k) - fprintf_param_value(" 3=%d", keep_top_k) - fprintf_param_value(" 4=%e", confidence_threshold) - fprintf_param_value(" 5=%e", variances[0]) - fprintf_param_value(" 6=%e", variances[1]) - fprintf_param_value(" 7=%e", variances[2]) - fprintf_param_value(" 8=%e", variances[3]) - } - else if (layer->type == "Dropout") - { - ncnn::Dropout* op = (ncnn::Dropout*)layer; - ncnn::Dropout* op_default = (ncnn::Dropout*)layer_default; - - fprintf_param_value(" 0=%e", scale) - } - else if (layer->type == "Eltwise") - { - ncnn::Eltwise* op = (ncnn::Eltwise*)layer; - ncnn::Eltwise* op_default = (ncnn::Eltwise*)layer_default; - - fprintf_param_value(" 0=%d", op_type) - { - if (!op->coeffs.empty()) fprintf_param_float_array(1, op->coeffs, pp); - } - } - else if (layer->type == "ELU") - { - ncnn::ELU* op = (ncnn::ELU*)layer; - ncnn::ELU* op_default = (ncnn::ELU*)layer_default; - - fprintf_param_value(" 0=%e", alpha) - } - else if (layer->type == "Embed") - { - ncnn::Embed* op = (ncnn::Embed*)layer; - ncnn::Embed* op_default = (ncnn::Embed*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", input_dim) - fprintf_param_value(" 2=%d", bias_term) - fprintf_param_value(" 3=%d", weight_data_size) - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "Exp") - { - ncnn::Exp* op = (ncnn::Exp*)layer; - ncnn::Exp* op_default = (ncnn::Exp*)layer_default; - - fprintf_param_value(" 0=%e", base) - fprintf_param_value(" 1=%e", scale) - fprintf_param_value(" 2=%e", shift) - } - else if (layer->type == "ExpandDims") - { - ncnn::ExpandDims* op = (ncnn::ExpandDims*)layer; - ncnn::ExpandDims* op_default = (ncnn::ExpandDims*)layer_default; - - fprintf_param_value(" 0=%d", expand_w) - fprintf_param_value(" 1=%d", expand_h) - fprintf_param_value(" 2=%d", expand_c) - { - if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); - } - } - else if (layer->type == "GELU") - { - ncnn::GELU* op = (ncnn::GELU*)layer; - ncnn::GELU* op_default = (ncnn::GELU*)layer_default; - - fprintf_param_value(" 0=%d", fast_gelu) - } - else if (layer->type == "Gemm") - { - ncnn::Gemm* op = (ncnn::Gemm*)layer; - ncnn::Gemm* op_default = (ncnn::Gemm*)layer_default; - - fprintf_param_value(" 0=%e", alpha) - fprintf_param_value(" 1=%e", beta) - fprintf_param_value(" 2=%d", transA) - fprintf_param_value(" 3=%d", transB) - } - else if (layer->type == "GroupNorm") - { - ncnn::GroupNorm* op = (ncnn::GroupNorm*)layer; - ncnn::GroupNorm* op_default = (ncnn::GroupNorm*)layer_default; - - fprintf_param_value(" 0=%d", group) - fprintf_param_value(" 1=%d", channels) - fprintf_param_value(" 2=%e", eps) - fprintf_param_value(" 3=%d", affine) - - fwrite_weight_data(op->gamma_data, bp); - fwrite_weight_data(op->beta_data, bp); - } - else if (layer->type == "GRU") - { - ncnn::GRU* op = (ncnn::GRU*)layer; - ncnn::GRU* op_default = (ncnn::GRU*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", weight_data_size) - fprintf_param_value(" 2=%d", direction) - - fwrite_weight_tag_data(op->weight_xc_data, bp); - fwrite_weight_tag_data(op->bias_c_data, bp); - fwrite_weight_tag_data(op->weight_hc_data, bp); - } - else if (layer->type == "HardSigmoid") - { - ncnn::HardSigmoid* op = (ncnn::HardSigmoid*)layer; - ncnn::HardSigmoid* op_default = 
(ncnn::HardSigmoid*)layer_default; - - fprintf_param_value(" 0=%e", alpha) - fprintf_param_value(" 1=%e", beta) - } - else if (layer->type == "HardSwish") - { - ncnn::HardSwish* op = (ncnn::HardSwish*)layer; - ncnn::HardSwish* op_default = (ncnn::HardSwish*)layer_default; - - fprintf_param_value(" 0=%e", alpha) - fprintf_param_value(" 1=%e", beta) - } - else if (layer->type == "InnerProduct") - { - ncnn::InnerProduct* op = (ncnn::InnerProduct*)layer; - ncnn::InnerProduct* op_default = (ncnn::InnerProduct*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", bias_term) - fprintf_param_value(" 2=%d", weight_data_size) - fprintf_param_value(" 8=%d", int8_scale_term) - fprintf_param_value(" 9=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(10, op->activation_params, pp); - } - - fwrite_weight_tag_data(op->weight_data, bp); - fwrite_weight_data(op->bias_data, bp); - -#if NCNN_INT8 - // write int8_scale data - if (op->int8_scale_term) - { - fwrite_weight_data(op->weight_data_int8_scales, bp, 90, 100); - fwrite_weight_data(op->bottom_blob_int8_scales, bp, 0.001, 1); - } -#endif // NCNN_INT8 - - if (shape_ready) - { - int inw = blobs[layer->bottoms[0]].shape.w; - int inh = blobs[layer->bottoms[0]].shape.h; - int inc = blobs[layer->bottoms[0]].shape.c; - int outw = blobs[layer->tops[0]].shape.w; - - mac += (uint64_t)inw * inh * inc * outw; - } - } - else if (layer->type == "Input") - { - ncnn::Input* op = (ncnn::Input*)layer; - ncnn::Input* op_default = (ncnn::Input*)layer_default; - - fprintf_param_value(" 0=%d", w) - fprintf_param_value(" 1=%d", h) - fprintf_param_value(" 2=%d", c) - } - else if (layer->type == "InstanceNorm") - { - ncnn::InstanceNorm* op = (ncnn::InstanceNorm*)layer; - ncnn::InstanceNorm* op_default = (ncnn::InstanceNorm*)layer_default; - - fprintf_param_value(" 0=%d", channels) - fprintf_param_value(" 1=%e", eps) - fprintf_param_value(" 2=%d", affine) - - fwrite_weight_data(op->gamma_data, bp); - fwrite_weight_data(op->beta_data, bp); - } - else if (layer->type == "Interp") - { - ncnn::Interp* op = (ncnn::Interp*)layer; - ncnn::Interp* op_default = (ncnn::Interp*)layer_default; - - fprintf_param_value(" 0=%d", resize_type) - fprintf_param_value(" 1=%e", height_scale) - fprintf_param_value(" 2=%e", width_scale) - fprintf_param_value(" 3=%d", output_height) - fprintf_param_value(" 4=%d", output_width) - fprintf_param_value(" 5=%d", dynamic_target_size) - fprintf_param_value(" 6=%d", align_corner) - } - else if (layer->type == "LayerNorm") - { - ncnn::LayerNorm* op = (ncnn::LayerNorm*)layer; - ncnn::LayerNorm* op_default = (ncnn::LayerNorm*)layer_default; - - fprintf_param_value(" 0=%d", affine_size) - fprintf_param_value(" 1=%e", eps) - fprintf_param_value(" 2=%d", affine) - - fwrite_weight_data(op->gamma_data, bp); - fwrite_weight_data(op->beta_data, bp); - } - else if (layer->type == "Log") - { - ncnn::Log* op = (ncnn::Log*)layer; - ncnn::Log* op_default = (ncnn::Log*)layer_default; - - fprintf_param_value(" 0=%e", base) - fprintf_param_value(" 1=%e", scale) - fprintf_param_value(" 2=%e", shift) - } - else if (layer->type == "LRN") - { - ncnn::LRN* op = (ncnn::LRN*)layer; - ncnn::LRN* op_default = (ncnn::LRN*)layer_default; - - fprintf_param_value(" 0=%d", region_type) - fprintf_param_value(" 1=%d", local_size) - fprintf_param_value(" 2=%e", alpha) - fprintf_param_value(" 3=%e", beta) - fprintf_param_value(" 4=%e", bias) - } - else if (layer->type == "LSTM") - { - ncnn::LSTM* op = 
(ncnn::LSTM*)layer; - ncnn::LSTM* op_default = (ncnn::LSTM*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", weight_data_size) - fprintf_param_value(" 2=%d", direction) - - fwrite_weight_tag_data(op->weight_xc_data, bp); - fwrite_weight_tag_data(op->bias_c_data, bp); - fwrite_weight_tag_data(op->weight_hc_data, bp); - } - else if (layer->type == "MatMul") - { - ncnn::MatMul* op = (ncnn::MatMul*)layer; - ncnn::MatMul* op_default = (ncnn::MatMul*)layer_default; - - fprintf_param_value(" 0=%d", transB) - } - else if (layer->type == "MemoryData") - { - ncnn::MemoryData* op = (ncnn::MemoryData*)layer; - ncnn::MemoryData* op_default = (ncnn::MemoryData*)layer_default; - - fprintf_param_value(" 0=%d", w) - fprintf_param_value(" 1=%d", h) - fprintf_param_value(" 2=%d", c) - fprintf_param_value(" 11=%d", d) - fwrite_weight_data(op->data, bp); - } - else if (layer->type == "MultiHeadAttention") - { - ncnn::MultiHeadAttention* op = (ncnn::MultiHeadAttention*)layer; - ncnn::MultiHeadAttention* op_default = (ncnn::MultiHeadAttention*)layer_default; - - fprintf_param_value(" 0=%d", embed_dim) - fprintf_param_value(" 1=%d", num_head) - fprintf_param_value(" 2=%d", weight_data_size) - - fwrite_weight_tag_data(op->q_weight_data, bp); - fwrite_weight_data(op->q_bias_data, bp); - fwrite_weight_tag_data(op->k_weight_data, bp); - fwrite_weight_data(op->k_bias_data, bp); - fwrite_weight_tag_data(op->v_weight_data, bp); - fwrite_weight_data(op->v_bias_data, bp); - fwrite_weight_tag_data(op->out_weight_data, bp); - fwrite_weight_data(op->out_bias_data, bp); - } - else if (layer->type == "MVN") - { - ncnn::MVN* op = (ncnn::MVN*)layer; - ncnn::MVN* op_default = (ncnn::MVN*)layer_default; - - fprintf_param_value(" 0=%d", normalize_variance) - fprintf_param_value(" 1=%d", across_channels) - fprintf_param_value(" 2=%e", eps) - } - else if (layer->type == "Normalize") - { - ncnn::Normalize* op = (ncnn::Normalize*)layer; - ncnn::Normalize* op_default = (ncnn::Normalize*)layer_default; - - fprintf_param_value(" 0=%d", across_spatial) - fprintf_param_value(" 1=%d", channel_shared) - fprintf_param_value(" 2=%e", eps) - fprintf_param_value(" 3=%d", scale_data_size) - fprintf_param_value(" 4=%d", across_channel) - fprintf_param_value(" 9=%d", eps_mode) - - fwrite_weight_data(op->scale_data, bp); - } - else if (layer->type == "Padding") - { - ncnn::Padding* op = (ncnn::Padding*)layer; - ncnn::Padding* op_default = (ncnn::Padding*)layer_default; - - fprintf_param_value(" 0=%d", top) - fprintf_param_value(" 1=%d", bottom) - fprintf_param_value(" 2=%d", left) - fprintf_param_value(" 3=%d", right) - fprintf_param_value(" 4=%d", type) - fprintf_param_value(" 5=%e", value) - fprintf_param_value(" 6=%d", per_channel_pad_data_size) - fprintf_param_value(" 7=%d", front) - fprintf_param_value(" 8=%d", behind) - - fwrite_weight_data(op->per_channel_pad_data, bp); - } - else if (layer->type == "Permute") - { - ncnn::Permute* op = (ncnn::Permute*)layer; - ncnn::Permute* op_default = (ncnn::Permute*)layer_default; - - fprintf_param_value(" 0=%d", order_type) - } - else if (layer->type == "PixelShuffle") - { - ncnn::PixelShuffle* op = (ncnn::PixelShuffle*)layer; - ncnn::PixelShuffle* op_default = (ncnn::PixelShuffle*)layer_default; - - fprintf_param_value(" 0=%d", upscale_factor) - fprintf_param_value(" 1=%d", mode) - } - else if (layer->type == "Pooling") - { - ncnn::Pooling* op = (ncnn::Pooling*)layer; - ncnn::Pooling* op_default = (ncnn::Pooling*)layer_default; - - fprintf_param_value(" 0=%d", 
pooling_type) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - } - fprintf_param_value(" 2=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 12=%d", op->stride_h); - } - fprintf_param_value(" 3=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 13=%d", op->pad_top); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 15=%d", op->pad_bottom); - } - fprintf_param_value(" 4=%d", global_pooling) - fprintf_param_value(" 5=%d", pad_mode) - fprintf_param_value(" 6=%d", avgpool_count_include_pad) - fprintf_param_value(" 7=%d", adaptive_pooling) - fprintf_param_value(" 8=%d", out_w) - { - if (op->out_h != op->out_w) fprintf(pp, " 18=%d", op->out_h); - } - } - else if (layer->type == "Pooling1D") - { - ncnn::Pooling1D* op = (ncnn::Pooling1D*)layer; - ncnn::Pooling1D* op_default = (ncnn::Pooling1D*)layer_default; - - fprintf_param_value(" 0=%d", pooling_type) - fprintf_param_value(" 1=%d", kernel_w) - fprintf_param_value(" 2=%d", stride_w) - fprintf_param_value(" 3=%d", pad_left) - { - if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); - } - fprintf_param_value(" 4=%d", global_pooling) - fprintf_param_value(" 5=%d", pad_mode) - fprintf_param_value(" 6=%d", avgpool_count_include_pad) - fprintf_param_value(" 7=%d", adaptive_pooling) - fprintf_param_value(" 8=%d", out_w) - } - else if (layer->type == "Pooling3D") - { - ncnn::Pooling3D* op = (ncnn::Pooling3D*)layer; - ncnn::Pooling3D* op_default = (ncnn::Pooling3D*)layer_default; - - fprintf_param_value(" 0=%d", pooling_type) - fprintf_param_value(" 1=%d", kernel_w) - { - if (op->kernel_h != op->kernel_w) fprintf(pp, " 11=%d", op->kernel_h); - if (op->kernel_d != op->kernel_w) fprintf(pp, " 21=%d", op->kernel_d); - } - fprintf_param_value(" 2=%d", stride_w) - { - if (op->stride_h != op->stride_w) fprintf(pp, " 12=%d", op->stride_h); - if (op->stride_d != op->stride_w) fprintf(pp, " 22=%d", op->stride_d); - } - fprintf_param_value(" 3=%d", pad_left) - { - if (op->pad_top != op->pad_left) fprintf(pp, " 13=%d", op->pad_top); - if (op->pad_front != op->pad_left) fprintf(pp, " 23=%d", op->pad_front); - } - { - if (op->pad_right != op->pad_left) fprintf(pp, " 14=%d", op->pad_right); - } - { - if (op->pad_bottom != op->pad_top) fprintf(pp, " 15=%d", op->pad_bottom); - } - { - if (op->pad_behind != op->pad_front) fprintf(pp, " 16=%d", op->pad_behind); - } - fprintf_param_value(" 4=%d", global_pooling) - fprintf_param_value(" 5=%d", pad_mode) - fprintf_param_value(" 6=%d", avgpool_count_include_pad) - fprintf_param_value(" 7=%d", adaptive_pooling) - fprintf_param_value(" 8=%d", out_w) - { - if (op->out_h != op->out_w) fprintf(pp, " 18=%d", op->out_h); - if (op->out_d != op->out_w) fprintf(pp, " 28=%d", op->out_d); - } - } - else if (layer->type == "Power") - { - ncnn::Power* op = (ncnn::Power*)layer; - ncnn::Power* op_default = (ncnn::Power*)layer_default; - - fprintf_param_value(" 0=%e", power) - fprintf_param_value(" 1=%e", scale) - fprintf_param_value(" 2=%e", shift) - } - else if (layer->type == "PReLU") - { - ncnn::PReLU* op = (ncnn::PReLU*)layer; - ncnn::PReLU* op_default = (ncnn::PReLU*)layer_default; - - fprintf_param_value(" 0=%d", num_slope) - - fwrite_weight_data(op->slope_data, bp); - } - else if (layer->type == "PriorBox") - { - ncnn::PriorBox* op = (ncnn::PriorBox*)layer; - ncnn::PriorBox* op_default = (ncnn::PriorBox*)layer_default; 
- - { - if (!op->min_sizes.empty()) fprintf_param_float_array(0, op->min_sizes, pp); - } - { - if (!op->max_sizes.empty()) fprintf_param_float_array(1, op->max_sizes, pp); - } - { - if (!op->aspect_ratios.empty()) fprintf_param_float_array(2, op->aspect_ratios, pp); - } - fprintf_param_value(" 3=%e", variances[0]) - fprintf_param_value(" 4=%e", variances[1]) - fprintf_param_value(" 5=%e", variances[2]) - fprintf_param_value(" 6=%e", variances[3]) - fprintf_param_value(" 7=%d", flip) - fprintf_param_value(" 8=%d", clip) - fprintf_param_value(" 9=%d", image_width) - fprintf_param_value(" 10=%d", image_height) - fprintf_param_value(" 11=%e", step_width) - fprintf_param_value(" 12=%e", step_height) - fprintf_param_value(" 13=%e", offset) - } - else if (layer->type == "Proposal") - { - ncnn::Proposal* op = (ncnn::Proposal*)layer; - ncnn::Proposal* op_default = (ncnn::Proposal*)layer_default; - - fprintf_param_value(" 0=%d", feat_stride) - fprintf_param_value(" 1=%d", base_size) - fprintf_param_value(" 2=%d", pre_nms_topN) - fprintf_param_value(" 3=%d", after_nms_topN) - fprintf_param_value(" 4=%e", nms_thresh) - fprintf_param_value(" 5=%d", min_size) - } - else if (layer->type == "PSROIPooling") - { - ncnn::PSROIPooling* op = (ncnn::PSROIPooling*)layer; - ncnn::PSROIPooling* op_default = (ncnn::PSROIPooling*)layer_default; - - fprintf_param_value(" 0=%d", pooled_width) - fprintf_param_value(" 1=%d", pooled_height) - fprintf_param_value(" 2=%e", spatial_scale) - fprintf_param_value(" 3=%d", output_dim) - } - else if (layer->type == "Quantize") - { - ncnn::Quantize* op = (ncnn::Quantize*)layer; - ncnn::Quantize* op_default = (ncnn::Quantize*)layer_default; - - fprintf_param_value(" 0=%d", scale_data_size) - - fwrite_weight_data(op->scale_data, bp); - } - else if (layer->type == "Reduction") - { - ncnn::Reduction* op = (ncnn::Reduction*)layer; - ncnn::Reduction* op_default = (ncnn::Reduction*)layer_default; - - fprintf_param_value(" 0=%d", operation) - fprintf_param_value(" 1=%d", reduce_all) - fprintf_param_value(" 2=%e", coeff) - { - if (!op->axes.empty()) fprintf_param_int_array(3, op->axes, pp); - } - fprintf_param_value(" 4=%d", keepdims) - - // HACK - if (!op->axes.empty()) - { - int fixbug0 = 1; - fprintf(pp, " 5=%d", fixbug0); - } - } - else if (layer->type == "ReLU") - { - ncnn::ReLU* op = (ncnn::ReLU*)layer; - ncnn::ReLU* op_default = (ncnn::ReLU*)layer_default; - - fprintf_param_value(" 0=%e", slope) - } - else if (layer->type == "Reorg") - { - ncnn::Reorg* op = (ncnn::Reorg*)layer; - ncnn::Reorg* op_default = (ncnn::Reorg*)layer_default; - - fprintf_param_value(" 0=%d", stride) - fprintf_param_value(" 1=%d", mode) - } - else if (layer->type == "Requantize") - { - ncnn::Requantize* op = (ncnn::Requantize*)layer; - ncnn::Requantize* op_default = (ncnn::Requantize*)layer_default; - - fprintf_param_value(" 0=%d", scale_in_data_size) - fprintf_param_value(" 1=%d", scale_out_data_size) - fprintf_param_value(" 2=%d", bias_data_size) - fprintf_param_value(" 3=%d", activation_type) - { - if (!op->activation_params.empty()) fprintf_param_float_array(4, op->activation_params, pp); - } - - fwrite_weight_data(op->scale_in_data, bp); - fwrite_weight_data(op->scale_out_data, bp); - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "Reshape") - { - ncnn::Reshape* op = (ncnn::Reshape*)layer; - ncnn::Reshape* op_default = (ncnn::Reshape*)layer_default; - - fprintf_param_value(" 0=%d", w) - fprintf_param_value(" 1=%d", h) - fprintf_param_value(" 2=%d", c) - fprintf_param_value(" 3=%d", 
permute) - } - else if (layer->type == "RNN") - { - ncnn::RNN* op = (ncnn::RNN*)layer; - ncnn::RNN* op_default = (ncnn::RNN*)layer_default; - - fprintf_param_value(" 0=%d", num_output) - fprintf_param_value(" 1=%d", weight_data_size) - fprintf_param_value(" 2=%d", direction) - - fwrite_weight_tag_data(op->weight_xc_data, bp); - fwrite_weight_tag_data(op->bias_c_data, bp); - fwrite_weight_tag_data(op->weight_hc_data, bp); - } - else if (layer->type == "ROIAlign") - { - ncnn::ROIAlign* op = (ncnn::ROIAlign*)layer; - ncnn::ROIAlign* op_default = (ncnn::ROIAlign*)layer_default; - - fprintf_param_value(" 0=%d", pooled_width) - fprintf_param_value(" 1=%d", pooled_height) - fprintf_param_value(" 2=%e", spatial_scale) - fprintf_param_value(" 3=%d", sampling_ratio) - fprintf_param_value(" 4=%d", aligned) - fprintf_param_value(" 5=%d", version) - } - else if (layer->type == "ROIPooling") - { - ncnn::ROIPooling* op = (ncnn::ROIPooling*)layer; - ncnn::ROIPooling* op_default = (ncnn::ROIPooling*)layer_default; - - fprintf_param_value(" 0=%d", pooled_width) - fprintf_param_value(" 1=%d", pooled_height) - fprintf_param_value(" 2=%e", spatial_scale) - } - else if (layer->type == "Scale") - { - ncnn::Scale* op = (ncnn::Scale*)layer; - ncnn::Scale* op_default = (ncnn::Scale*)layer_default; - - fprintf_param_value(" 0=%d", scale_data_size) - fprintf_param_value(" 1=%d", bias_term) - - fwrite_weight_data(op->scale_data, bp); - fwrite_weight_data(op->bias_data, bp); - } - else if (layer->type == "ShuffleChannel") - { - ncnn::ShuffleChannel* op = (ncnn::ShuffleChannel*)layer; - ncnn::ShuffleChannel* op_default = (ncnn::ShuffleChannel*)layer_default; - - fprintf_param_value(" 0=%d", group) - fprintf_param_value(" 1=%d", reverse) - } - else if (layer->type == "Slice") - { - ncnn::Slice* op = (ncnn::Slice*)layer; - ncnn::Slice* op_default = (ncnn::Slice*)layer_default; - - { - if (!op->slices.empty()) fprintf_param_int_array(0, op->slices, pp); - } - fprintf_param_value(" 1=%d", axis) - } - else if (layer->type == "Softmax") - { - ncnn::Softmax* op = (ncnn::Softmax*)layer; - ncnn::Softmax* op_default = (ncnn::Softmax*)layer_default; - - fprintf_param_value(" 0=%d", axis) - - // HACK - if (op->axis != 0) - { - int fixbug0 = 1; - fprintf(pp, " 1=%d", fixbug0); - } - } - else if (layer->type == "Squeeze") - { - ncnn::Squeeze* op = (ncnn::Squeeze*)layer; - ncnn::Squeeze* op_default = (ncnn::Squeeze*)layer_default; - - fprintf_param_value(" 0=%d", squeeze_w) - fprintf_param_value(" 1=%d", squeeze_h) - fprintf_param_value(" 2=%d", squeeze_c) - { - if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); - } - } - else if (layer->type == "Threshold") - { - ncnn::Threshold* op = (ncnn::Threshold*)layer; - ncnn::Threshold* op_default = (ncnn::Threshold*)layer_default; - - fprintf_param_value(" 0=%e", threshold) - } - else if (layer->type == "UnaryOp") - { - ncnn::UnaryOp* op = (ncnn::UnaryOp*)layer; - ncnn::UnaryOp* op_default = (ncnn::UnaryOp*)layer_default; - - fprintf_param_value(" 0=%d", op_type) - } - else if (layer->type == "YoloDetectionOutput") - { - ncnn::YoloDetectionOutput* op = (ncnn::YoloDetectionOutput*)layer; - ncnn::YoloDetectionOutput* op_default = (ncnn::YoloDetectionOutput*)layer_default; - - fprintf_param_value(" 0=%d", num_class) - fprintf_param_value(" 1=%d", num_box) - fprintf_param_value(" 2=%e", confidence_threshold) - fprintf_param_value(" 3=%e", nms_threshold) - { - if (!op->biases.empty()) fprintf_param_float_array(4, op->biases, pp); - } - } - else if (layer->type == 
"Yolov3DetectionOutput") - { - ncnn::Yolov3DetectionOutput* op = (ncnn::Yolov3DetectionOutput*)layer; - ncnn::Yolov3DetectionOutput* op_default = (ncnn::Yolov3DetectionOutput*)layer_default; - - fprintf_param_value(" 0=%d", num_class) - fprintf_param_value(" 1=%d", num_box) - fprintf_param_value(" 2=%e", confidence_threshold) - fprintf_param_value(" 3=%e", nms_threshold) - { - if (!op->biases.empty()) fprintf_param_float_array(4, op->biases, pp); - } - { - if (!op->mask.empty()) fprintf_param_int_array(5, op->mask, pp); - } - { - if (!op->anchors_scale.empty()) fprintf_param_float_array(6, op->anchors_scale, pp); - } - } - -#undef fprintf_param_value - - fprintf(pp, "\n"); - - delete layer_default; - } - - fclose(pp); - fclose(bp); - - if (mac) - { - fprintf(stderr, "mac = %llu = %.2f M\n", static_cast(mac), mac / 1000000.0); - } - - return 0; -} diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index 72c76d135015..3cf1460592ae 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -1,3 +1,4 @@ +set(CMAKE_CXX_STANDARD 11) if(NCNN_PIXEL) if(NOT NCNN_SIMPLEOCV) @@ -17,15 +18,15 @@ if(NCNN_PIXEL) set(OpenCV_FOUND FALSE) if(OpenCV_FOUND) - add_executable(ncnn2table ncnn2table.cpp) + add_executable(ncnn2table ncnn2table.cpp ini_config.cpp) target_include_directories(ncnn2table PRIVATE ${OpenCV_INCLUDE_DIRS}) target_link_libraries(ncnn2table PRIVATE ncnn ${OpenCV_LIBS}) elseif(NCNN_SIMPLEOCV) - add_executable(ncnn2table ncnn2table.cpp) + add_executable(ncnn2table ncnn2table.cpp ini_config.cpp) target_compile_definitions(ncnn2table PUBLIC USE_NCNN_SIMPLEOCV) target_link_libraries(ncnn2table PRIVATE ncnn) else() - add_executable(ncnn2table ncnn2table.cpp imreadwrite.cpp) + add_executable(ncnn2table ncnn2table.cpp imreadwrite.cpp ini_config.cpp) target_compile_definitions(ncnn2table PUBLIC USE_LOCAL_IMREADWRITE) target_link_libraries(ncnn2table PRIVATE ncnn) endif() @@ -34,7 +35,7 @@ if(NCNN_PIXEL) set_property(TARGET ncnn2table PROPERTY FOLDER "tools/optimization") endif() -add_executable(ncnn2int8 ncnn2int8.cpp) +add_executable(ncnn2int8 ncnn2int8.cpp ini_config.cpp net_quantize.cpp ../modelwriter.cpp) target_link_libraries(ncnn2int8 PRIVATE ncnn) # add ncnn2int8 tool to a virtual project group diff --git a/tools/quantize/imreadwrite.h b/tools/quantize/imreadwrite.h index 5a955dfd35e1..f25286502194 100644 --- a/tools/quantize/imreadwrite.h +++ b/tools/quantize/imreadwrite.h @@ -11,7 +11,7 @@ // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
- +#pragma once #ifndef IMREADWRITE_H #define IMREADWRITE_H diff --git a/tools/quantize/ini_config.cpp b/tools/quantize/ini_config.cpp new file mode 100644 index 000000000000..9da3ae9fcb86 --- /dev/null +++ b/tools/quantize/ini_config.cpp @@ -0,0 +1,224 @@ +#include "ini_config.h" +#include +#include + +namespace ini { + +template +void Value::set(T val) +{ + text = std::to_string(f); +} + +void Value::set(std::string str) +{ + text = '\"' + str + '\"'; +} + +template +void Value::set(const std::vector& data) +{ + text = "[ "; + + size_t len = data.size(); + if (len > 0) + { + size_t i = 0; + for (; i < len - 1; ++i) + { + text += std::to_string(data[i]); + text += ", "; + } + text += std::to_string(data[i]); + text += " "; + } + + text += "]"; +} + +template +T Value::get() +{ + T result; + std::stringstream ss; + ss << text; + ss >> result; + return result; +} + +template +std::vector Value::get() +{ + std::vector result; + + std::string no_brace; + { + // remove brace + auto start = text.find('['); + auto end = text.find(']'); + no_brace = text.substr(start + 1, end); + } + + { + // split with the separator ',' + std::stringstream ss; + size_t end = 0, start = 0; + while (true) + { + end = no_brace.find(',', start); + if (end == std::string::npos) + { + break; + } + + std::string val_str = no_brace.substr(start, end); + start = end + 1; + + T val; + ss << val_str; + ss >> val; + ss.clear(); + result.emplace_back(val); + } + + // parse the last one + std::string val_str = no_brace.substr(start); + T val; + ss << val_str; + ss >> val; + result.emplace_back(val); + } + + return result; +} + +std::string Value::stringify() +{ + return text; +} + +void Table::feed(std::string line) +{ + auto pos = line.find(':'); + assert(pos != std::string::npos); + + std::string key = line.substr(0, pos - 1); + std::string value_str = line.substr(pos + 1); + + values[key] = std::make_shared(value_str); +} + +void Table::feed(const std::vector& lines) +{ + for (auto& line : lines) + { + feed(line); + } +} + +void Table::append(std::string key, float data) +{ + auto pVal = std::make_shared(); + pVal->set(data); + values[key] = pVal; +} + +void Table::append(std::string key, const std::vector& data) +{ + auto pVal = std::make_shared(); + pVal->set(data); + values[key] = pVal; +} + +void Table::append(std::string key, std::string data) +{ + auto pVal = std::make_shared(); + pVal->set(data); + values[key] = pVal; +} + +std::shared_ptr Table::operator[](std::string key) +{ + return values[key]; +} + +std::string Table::stringify() +{ + std::string result; + for (auto itra = values.begin(); itra != values.end(); ++itra) + { + result += itra->first; + result += " = "; + result += itra->second->stringify(); + result += '\n'; + } + return result; +} + +void Config::read(std::string path) +{ + std::ifstream fin; + fin.open(path, std::ios::in); + + if (!fin.is_open()) + { + fprintf(stderr, "open %s failed\n", path.c_str()); + return; + } + + bool recoding = false; + std::shared_ptr pTable = nullptr; + + std::string line; + while (fin >> line) + { + if (nullptr == pTable) + { + auto start = line.find('['); + auto end = line.find(']'); + assert(start != std::string::npos); + assert(end != std::string::npos); + + std::string key = line.substr(start + 1, end); + pTable = std::make_shared
<Table>(); + tables[key] = pTable; + continue; + } + + if (line.length() <= 2) + { + pTable = nullptr; + continue; + } + + pTable->feed(line); + } +} + +std::vector<std::string> Config::list_all() +{ + std::vector<std::string> result; + for (auto itra = tables.begin(); itra != tables.end(); ++itra) + { + result.push_back(itra->first); + } + return result; +} + +std::shared_ptr<Table>
Config::operator[](std::string key) +{ + return tables[key]; +} + +void Config::append(std::string key, std::shared_ptr
table) +{ + tables[key] = table; +} + +void Config::write(std::string path) +{ + // TODO +} + +} // namespace ini diff --git a/tools/quantize/ini_config.h b/tools/quantize/ini_config.h new file mode 100644 index 000000000000..7509b48efeb5 --- /dev/null +++ b/tools/quantize/ini_config.h @@ -0,0 +1,323 @@ +// tpoisonooo is pleased to support the open source community by making ncnn available. +// +// author:tpoisonooo (https://github.com/tpoisonooo/) . +// +// Copyright (C) 2022 tpoisonooo. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ini format table reader and writer +// file example: +// +// [Conv_0] +// type = "Conv" +// input_scale = 127.0 +// weight = [ 1117.265625, 8819.232421875 ] +// +// [LayerNorm_66] +// type = "LayerNorm" +// zero_point = -24 + +namespace ini { + +template +std::string value_set(T data) +{ + return std::to_string(data); +} + +template<> +std::string value_set(std::string data); + +template<> +std::string value_set(const char* data); + +template +std::string value_set(const std::vector& data) +{ + std::string text = "[ "; + size_t len = data.size(); + if (len > 0) + { + size_t i = 0; + for (; i < len - 1; ++i) + { + text += std::to_string(data[i]); + text += ", "; + } + text += std::to_string(data[i]); + text += " "; + } + text += "]"; + return text; +} + +template +T value_get(std::string text) +{ + T result; + std::stringstream ss; + ss << text; + ss >> result; + return result; +} + +template<> +std::string value_get(std::string text); + +/** + * @brief parse `[1, 2.2]` format to value list + * + * @tparam T + * @param text + * @return std::vector + */ +template +std::vector value_get_list(std::string text) +{ + std::vector result; + std::string no_brace; + { + // remove brace + auto start = text.find('['); + auto end = text.find(']'); + no_brace = text.substr(start + 1, end - start - 1); + } + + { + // split with the separator ',' + std::stringstream ss; + size_t end = 0, start = 0; + while (true) + { + end = no_brace.find(',', start); + if (end == std::string::npos) + { + break; + } + + std::string val_str = no_brace.substr(start, end - start); + start = end + 1; + + T val; + ss << val_str; + ss >> val; + ss.clear(); + result.emplace_back(val); + } + + // parse the last one + std::string val_str = no_brace.substr(start); + T val; + ss << val_str; + ss >> val; + result.emplace_back(val); + } + return result; +} + +/** + * @brief contains multiple `key=value` lines + * + */ +class Table +{ +public: + Table() + { + } + + void feed(std::string line) + { + auto pos = line.find('='); + assert(pos != std::string::npos); + + std::string key = line.substr(0, pos - 1); + std::string value_str = line.substr(pos + 2); + + values[key] = value_str; + } + + void feed(const std::vector& lines) + { + for (auto& line : lines) + { + feed(line); + } + } + + std::string operator[](std::string key) + { + return values[key]; + } + + template + T 
get(std::string key) + { + std::string text = values.at(key); + return value_get<T>(text); + } + + template<typename T> + std::vector<T> get_list(std::string key) + { + std::string text = values[key]; + return value_get_list<T>(text); + } + + template<typename T> + void append(std::string key, T data) + { + values[key] = value_set(data); + } + + template<typename T> + void append(std::string key, const std::vector<T>& data) + { + values[key] = value_set(data); + } + + std::string stringify() + { + std::string result; + for (auto itra = values.begin(); itra != values.end(); ++itra) + { + result += itra->first; + result += " = "; + result += itra->second; + result += '\n'; + } + return result; + } + +private: + std::map<std::string, std::string> values; +}; + +/** + * @brief `Config` consists of multiple key-table pairs + * + */ +class Config +{ +public: + Config() + { + } + + void read(std::string path) + { + std::ifstream fin; + fin.open(path, std::ios::in); + + if (!fin.is_open()) + { + fprintf(stderr, "open %s failed\n", path.c_str()); + return; + } + + std::shared_ptr<Table>
pTable = nullptr; + constexpr int BUF_LEN = 1024 * 1024; + char buf[BUF_LEN] = {0}; + std::string line; + while (!fin.eof()) + { + fin.getline(buf, BUF_LEN); + line = std::string(buf); + + if (line.length() <= 2) + { + pTable = nullptr; + continue; + } + + if (nullptr == pTable) + { + auto start = line.find('['); + auto end = line.find(']'); + assert(start != std::string::npos); + assert(end != std::string::npos); + + std::string key = line.substr(start + 1, end - start - 1); + + pTable = std::make_shared
<Table>(); + append(key, pTable); + continue; + } + + pTable->feed(line); + } + + fin.close(); + } + + std::vector<std::string> keys() + { + std::vector<std::string> result; + for (auto& pair : tables) + { + result.push_back(std::get<0>(pair)); + } + return result; + } + + size_t size() + { + return tables.size(); + } + + std::tuple<std::string, std::shared_ptr<Table> > operator[](size_t i) + { + return tables[i]; + } + + void append(const std::string& key, std::shared_ptr<Table>
table) + { + tables.emplace_back(std::make_pair(key, table)); + } + + void write(const std::string& path) + { + std::ofstream fout; + fout.open(path, std::ios::out); + if (!fout.is_open()) + { + fprintf(stderr, "open %s failed\n", path.c_str()); + return; + } + + for (auto& pair : tables) + { + std::string name = std::get<0>(pair); + std::shared_ptr<Table>
ptable = std::get<1>(pair); + fout << "[" << name << "]\n"; + fout << ptable->stringify(); + fout << "\n"; + } + fout.flush(); + fout.close(); + } + +private: + std::vector > > tables; +}; + +} // namespace ini diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index f712306b0228..e820217392b3 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -24,12 +24,9 @@ // ncnn public header #include "datareader.h" -#include "layer.h" -#include "layer_type.h" -#include "net.h" // ncnn private header -#include "../modelwriter.h" +#include "net_quantize.h" class DataReaderFromEmpty : public ncnn::DataReader { @@ -45,476 +42,6 @@ class DataReaderFromEmpty : public ncnn::DataReader } }; -static bool read_int8scale_table(const char* filepath, std::map& blob_int8scale_table, std::map& weight_int8scale_table) -{ - blob_int8scale_table.clear(); - weight_int8scale_table.clear(); - - FILE* fp = fopen(filepath, "rb"); - if (!fp) - { - fprintf(stderr, "Open %s failed.\n", filepath); - return false; - } - - std::string key_str; - std::vector scales; - - std::vector line(10240000); - char* pch = NULL; - size_t len = 0; - - while (!feof(fp)) - { - char* s = fgets(line.data(), (int)line.size(), fp); - if (!s) - break; - - float scale = 1.f; - char key[256]; - line[strcspn(line.data(), "\r\n")] = 0; - - pch = strtok(line.data(), " "); - - if (pch == NULL) break; - - bool is_key = true; - while (pch != NULL) - { - if (is_key) - { - sscanf(pch, "%255s", key); - - key_str = key; - is_key = false; - } - else - { - sscanf(pch, "%f", &scale); - - scales.push_back(scale); - } - - pch = strtok(NULL, " "); - } - - // XYZ_param_N pattern - if (strstr(key_str.c_str(), "_param_")) - { - weight_int8scale_table[key_str] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); - } - else - { - blob_int8scale_table[key_str] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); - } - key_str.clear(); - scales.clear(); - } - - fclose(fp); - - return true; -} - -class NetQuantize : public ModelWriter -{ -public: - NetQuantize(); - - std::map blob_int8scale_table; - std::map weight_int8scale_table; - -public: - int quantize_convolution(); - int quantize_convolutiondepthwise(); - int quantize_innerproduct(); - - int fuse_requantize(); -}; - -NetQuantize::NetQuantize() - : ModelWriter() -{ -} - -int NetQuantize::quantize_convolution() -{ - const int layer_count = static_cast(layers.size()); - for (int i = 0; i < layer_count; i++) - { - // find convolution layer - if (layers[i]->type != "Convolution") - continue; - - // find convolution layer - std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); - if (iter_data == blob_int8scale_table.end()) - continue; - - char key[256]; - sprintf(key, "%s_param_0", layers[i]->name.c_str()); - - std::map::iterator iter = weight_int8scale_table.find(key); - if (iter == weight_int8scale_table.end()) - { - fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); - return -1; - } - - // Convolution - quantize weight from fp32 to int8 - ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i]; - - ncnn::Mat bottom_blob_int8_scales = iter_data->second; - ncnn::Mat weight_data_int8_scales = iter->second; - - fprintf(stderr, "quantize_convolution %s\n", convolution->name.c_str()); - - { - const int maxk = convolution->kernel_w * convolution->kernel_h; - const int num_input = convolution->weight_data_size / convolution->num_output / maxk; - - ncnn::Mat weight_data_r2 = 
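For reference, the following is a minimal, self-contained sketch (not part of the patch) of how the ini::Config / ini::Table API declared in ini_config.h above can round-trip a calibration table. The section and key names mirror the example in the header comment; the file name model.ini and the standalone main() are made up for illustration only.

#include "ini_config.h"

#include <cstdio>
#include <memory>
#include <string>
#include <tuple>
#include <vector>

int main()
{
    // build one [Conv_0] section and write it to disk
    ini::Config cfg;
    auto tbl = std::make_shared<ini::Table>();
    tbl->append("type", std::string("Conv"));
    tbl->append("input_scale", 127.0f);
    tbl->append("weight", std::vector<float>{1117.265625f, 8819.232421875f});
    cfg.append("Conv_0", tbl);
    cfg.write("model.ini");

    // read the table back and query the stored scales
    ini::Config loaded;
    loaded.read("model.ini");
    for (size_t i = 0; i < loaded.size(); i++)
    {
        auto section = loaded[i]; // (section name, table) tuple
        std::shared_ptr<ini::Table> t = std::get<1>(section);
        float input_scale = t->get<float>("input_scale");
        std::vector<float> weight_scales = t->get_list<float>("weight");
        fprintf(stderr, "%s: input_scale=%f weights=%d\n",
                std::get<0>(section).c_str(), input_scale, (int)weight_scales.size());
    }
    return 0;
}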
convolution->weight_data.reshape(maxk, num_input, convolution->num_output); - - ncnn::Mat weight_data_int8; - - ncnn::Option opt_q = opt; - opt_q.blob_allocator = convolution->weight_data.allocator; - opt_q.use_packing_layout = false; - ncnn::quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); - if (weight_data_int8.empty()) - return -100; - - convolution->weight_data = weight_data_int8.reshape(convolution->weight_data_size); - } - - convolution->int8_scale_term = 2; - convolution->weight_data_int8_scales = weight_data_int8_scales; - convolution->bottom_blob_int8_scales = bottom_blob_int8_scales; - } - - return 0; -} - -int NetQuantize::quantize_convolutiondepthwise() -{ - const int layer_count = static_cast(layers.size()); - for (int i = 0; i < layer_count; i++) - { - // find convolution layer - if (layers[i]->type != "ConvolutionDepthWise") - continue; - - // find convolutiondepthwise layer - std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); - if (iter_data == blob_int8scale_table.end()) - continue; - - char key[256]; - sprintf(key, "%s_param_0", layers[i]->name.c_str()); - - std::map::iterator iter = weight_int8scale_table.find(key); - if (iter == weight_int8scale_table.end()) - { - fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); - return -1; - } - - // Convolution - quantize weight from fp32 to int8 - ncnn::ConvolutionDepthWise* convdw = (ncnn::ConvolutionDepthWise*)layers[i]; - - ncnn::Mat bottom_blob_int8_scales = iter_data->second; - ncnn::Mat weight_data_int8_scales = iter->second; - - fprintf(stderr, "quantize_convolutiondepthwise %s\n", convdw->name.c_str()); - - { - ncnn::Mat int8_weight_data(convdw->weight_data_size, (size_t)1u); - if (int8_weight_data.empty()) - return -100; - - const int weight_data_size_g = convdw->weight_data_size / convdw->group; - - for (int g = 0; g < convdw->group; g++) - { - ncnn::Option opt_q = opt; - opt_q.blob_allocator = int8_weight_data.allocator; - opt_q.use_packing_layout = false; - - const ncnn::Mat weight_data_g = convdw->weight_data.range(weight_data_size_g * g, weight_data_size_g); - ncnn::Mat int8_weight_data_g = int8_weight_data.range(weight_data_size_g * g, weight_data_size_g); - const ncnn::Mat weight_data_int8_scales_g = weight_data_int8_scales.range(g, 1); - ncnn::quantize_to_int8(weight_data_g, int8_weight_data_g, weight_data_int8_scales_g, opt_q); - } - - convdw->weight_data = int8_weight_data; - } - - convdw->int8_scale_term = 1; - convdw->weight_data_int8_scales = weight_data_int8_scales; - convdw->bottom_blob_int8_scales = bottom_blob_int8_scales; - } - - return 0; -} - -int NetQuantize::quantize_innerproduct() -{ - const int layer_count = static_cast(layers.size()); - for (int i = 0; i < layer_count; i++) - { - // find convolution layer - if (layers[i]->type != "InnerProduct") - continue; - - // find InnerProduct layer - std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); - if (iter_data == blob_int8scale_table.end()) - continue; - - char key[256]; - sprintf(key, "%s_param_0", layers[i]->name.c_str()); - - std::map::iterator iter = weight_int8scale_table.find(key); - if (iter == weight_int8scale_table.end()) - { - fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); - return -1; - } - - // InnerProduct - quantize weight from fp32 to int8 - ncnn::InnerProduct* fc = (ncnn::InnerProduct*)layers[i]; - - ncnn::Mat bottom_blob_int8_scales = iter_data->second; - ncnn::Mat weight_data_int8_scales = 
iter->second; - - fprintf(stderr, "quantize_innerproduct %s\n", fc->name.c_str()); - - { - const int num_input = fc->weight_data_size / fc->num_output; - - ncnn::Mat weight_data_r2 = fc->weight_data.reshape(num_input, fc->num_output); - - ncnn::Mat weight_data_int8; - ncnn::Option opt_q = opt; - opt_q.use_packing_layout = false; - ncnn::quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); - if (weight_data_int8.empty()) - return -100; - - fc->weight_data = weight_data_int8.reshape(fc->weight_data_size); - } - - fc->int8_scale_term = 2; - fc->weight_data_int8_scales = weight_data_int8_scales; - fc->bottom_blob_int8_scales = bottom_blob_int8_scales; - } - - return 0; -} - -int NetQuantize::fuse_requantize() -{ - const size_t layer_count = layers.size(); - for (size_t i = 0; i < layer_count; i++) - { - if (layers[i]->type != "Convolution" && layers[i]->type != "ConvolutionDepthWise") - continue; - - // Convolution/ConvolutionDepthWise - Convolution/ConvolutionDepthWise - int top_blob_index = layers[i]->tops[0]; - - size_t j = i + 1; - for (; j < layer_count; j++) - { - if (layers[j]->type != "Convolution" && layers[j]->type != "ConvolutionDepthWise") - continue; - - if (layers[j]->bottoms.size() != 1) - continue; - - if (layers[j]->bottoms[0] == top_blob_index) - break; - } - - if (j == layer_count) - continue; - - // fuse requantize - fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), layers[j]->name.c_str()); - - if (layers[i]->type == "Convolution" && layers[j]->type == "Convolution") - { - ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; - ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "Convolution" && layers[j]->type == "ConvolutionDepthWise") - { - ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; - ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "Convolution") - { - ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; - ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "ConvolutionDepthWise") - { - ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; - ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - } - - for (size_t i = 0; i < layer_count; i++) - { - if (layers[i]->type != "Convolution" && layers[i]->type != "ConvolutionDepthWise") - continue; - - // 
Convolution/ConvolutionDepthWise - Split - Convolution/ConvolutionDepthWise - int top_blob_index = layers[i]->tops[0]; - - size_t j = i + 1; - for (; j < layer_count; j++) - { - if (layers[j]->type != "Split") - continue; - - if (layers[j]->bottoms.size() != 1) - continue; - - if (layers[j]->bottoms[0] == top_blob_index) - break; - } - - if (j == layer_count) - continue; - - ncnn::Split* split = (ncnn::Split*)layers[j]; - - bool all_conv = true; - for (size_t p = 0; p < split->tops.size(); p++) - { - int split_top_blob_index = split->tops[p]; - - size_t k = j + 1; - for (; k < layer_count; k++) - { - if (layers[k]->type != "Convolution" && layers[k]->type != "ConvolutionDepthWise") - continue; - - if (layers[k]->bottoms.size() != 1) - continue; - - if (layers[k]->bottoms[0] == split_top_blob_index) - break; - } - - if (k == layer_count) - { - all_conv = false; - break; - } - - if (layers[k]->type == "Convolution") - { - ncnn::Convolution* convolution = (ncnn::Convolution*)layers[k]; - if (convolution->weight_data.elemsize != 1u) - { - all_conv = false; - break; - } - } - if (layers[k]->type == "ConvolutionDepthWise") - { - ncnn::ConvolutionDepthWise* convolution = (ncnn::ConvolutionDepthWise*)layers[k]; - if (convolution->weight_data.elemsize != 1u) - { - all_conv = false; - break; - } - } - } - - if (!all_conv) - continue; - - j = blobs[split->tops[0]].consumer; - - // fuse requantize - fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), split->name.c_str()); - - if (layers[i]->type == "Convolution" && layers[j]->type == "Convolution") - { - ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; - ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "Convolution" && layers[j]->type == "ConvolutionDepthWise") - { - ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; - ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "Convolution") - { - ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; - ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "ConvolutionDepthWise") - { - ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; - ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; - - if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) - continue; - - convolution1->int8_scale_term += 100; - convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; - } - } - - return 0; -} - int main(int argc, char** argv) { if (argc != 6) @@ -532,14 +59,21 @@ int main(int argc, char** argv) NetQuantize quantizer; // parse the 
calibration scale table - if (int8scale_table_path) + bool success = false; + if (std::string(int8scale_table_path).find(".ini") == std::string::npos) + { + quantizer.set_weight_suffix("_param_0"); + success = quantizer.read_txt_format(int8scale_table_path); + } + else { - bool s2 = read_int8scale_table(int8scale_table_path, quantizer.blob_int8scale_table, quantizer.weight_int8scale_table); - if (!s2) - { - fprintf(stderr, "read_int8scale_table failed\n"); - return -1; - } + success = quantizer.read_ini_format(int8scale_table_path); + } + + if (!success) + { + fprintf(stderr, "read_int8scale_table failed\n"); + return -1; } quantizer.load_param(inparam); @@ -552,6 +86,7 @@ int main(int argc, char** argv) else quantizer.load_model(inbin); + quantizer.quantize_mha(); quantizer.quantize_convolution(); quantizer.quantize_convolutiondepthwise(); quantizer.quantize_innerproduct(); diff --git a/tools/quantize/ncnn2table.cpp b/tools/quantize/ncnn2table.cpp index 9fbafa2d1813..5f83706a0b9b 100644 --- a/tools/quantize/ncnn2table.cpp +++ b/tools/quantize/ncnn2table.cpp @@ -38,6 +38,7 @@ #endif #include #include +#include "ini_config.h" // ncnn public header #include "benchmark.h" @@ -91,6 +92,7 @@ class QuantNet : public ncnn::Net int init(); void print_quant_info() const; int save_table(const char* tablepath); + int save_ini(const char* filepath); int quantize_KL(); int quantize_ACIQ(); int quantize_EQ(); @@ -98,6 +100,7 @@ class QuantNet : public ncnn::Net public: std::vector input_blobs; std::vector conv_layers; + std::vector type_list; std::vector conv_bottom_blobs; std::vector conv_top_blobs; @@ -132,6 +135,7 @@ int QuantNet::init() if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct") { conv_layers.push_back(i); + type_list.push_back(layer->type); conv_bottom_blobs.push_back(layer->bottoms[0]); conv_top_blobs.push_back(layer->tops[0]); } @@ -190,6 +194,62 @@ int QuantNet::save_table(const char* tablepath) return 0; } +int QuantNet::save_ini(const char* filepath) +{ + auto root = ini::Config(); + + const int conv_layer_count = static_cast(conv_layers.size()); + const int conv_bottom_blob_count = static_cast(conv_bottom_blobs.size()); + + for (int i = 0; i < conv_layer_count; i++) + { + auto tbl = std::make_shared(); + + // write opr type + auto type = type_list[i]; + if (type == "Convolution" || type == "ConvolutionDepthWise") + { + tbl->append("type", std::string("Conv")); + } + else if (type == "InnerProduct") + { + tbl->append("type", std::string("Gemm")); + } + else + { + fprintf(stderr, "unknown type %s\n", type.c_str()); + } + + // write weight scales + { + const ncnn::Mat& weight_scale = weight_scales[i]; + + std::vector scales = {}; + for (int j = 0; j < weight_scale.w; j++) + { + scales.push_back(static_cast(weight_scale[j])); + } + tbl->append("weight", scales); + } + + // write input scale + { + const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i]; + if (bottom_blob_scale.w != 1) + { + fprintf(stderr, "not support conv input scale length=%d\n", bottom_blob_scale.w); + return -1; + } + tbl->append("input_scale", static_cast(bottom_blob_scale[0])); + } + + const std::string name = layers[conv_layers[i]]->name; + root.append(name, tbl); + } + root.write(std::string(filepath)); + return 0; +} + void QuantNet::print_quant_info() const { for (int i = 0; i < (int)conv_bottom_blobs.size(); i++) @@ -1586,7 +1646,8 @@ static void show_usage() fprintf(stderr, " pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n"); fprintf(stderr, " thread=8\n"); 
fprintf(stderr, " method=kl/aciq/eq\n"); - fprintf(stderr, "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n"); + fprintf(stderr, " format=txt/ini\n"); + fprintf(stderr, "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl format=txt\n"); } int main(int argc, char** argv) @@ -1629,6 +1690,7 @@ int main(int argc, char** argv) net.listspaths = parse_comma_path_list(lists); std::string method = "kl"; + std::string format = "txt"; for (int i = 5; i < argc; i++) { @@ -1649,17 +1711,37 @@ int main(int argc, char** argv) // load mean norm shape if (memcmp(key, "mean", 4) == 0) + { net.means = parse_comma_float_array_list(value); - if (memcmp(key, "norm", 4) == 0) + } + else if (memcmp(key, "norm", 4) == 0) + { net.norms = parse_comma_float_array_list(value); - if (memcmp(key, "shape", 5) == 0) + } + else if (memcmp(key, "shape", 5) == 0) + { net.shapes = parse_comma_int_array_list(value); - if (memcmp(key, "pixel", 5) == 0) + } + else if (memcmp(key, "pixel", 5) == 0) + { net.type_to_pixels = parse_comma_pixel_type_list(value); - if (memcmp(key, "thread", 6) == 0) + } + else if (memcmp(key, "thread", 6) == 0) + { net.quantize_num_threads = atoi(value); - if (memcmp(key, "method", 6) == 0) + } + else if (memcmp(key, "method", 6) == 0) + { method = std::string(value); + } + else if (memcmp(key, "format", 6) == 0) + { + format = std::string(value); + } + else + { + fprintf(stderr, "unknown key=%s\n", key); + } } // sanity check @@ -1735,7 +1817,14 @@ int main(int argc, char** argv) net.print_quant_info(); - net.save_table(outtable); + if (format == "ini") + { + net.save_ini(outtable); + } + else + { + net.save_table(outtable); + } return 0; } diff --git a/tools/quantize/net_quantize.cpp b/tools/quantize/net_quantize.cpp new file mode 100644 index 000000000000..b77925ffd235 --- /dev/null +++ b/tools/quantize/net_quantize.cpp @@ -0,0 +1,625 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
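+
+// For orientation, read_ini_format() below expects one ini section per layer,
+// keyed by the layer name, using the keys referenced in this file: "type",
+// "weight" and "input_scale" for Conv/Gemm layers, and "weight_q/k/v/o",
+// "input_scale_q/k/v" plus the five "internal_scale_*" entries for
+// MultiHeadAttention. A rough sketch of such a table follows; the layer names
+// and scale values are made up, and the exact quoting/list syntax is whatever
+// ini_config.h/.cpp actually emits:
+//
+//   [Conv_0]
+//   type = "Conv"
+//   weight = [ 128.3, 97.1, 105.6 ]
+//   input_scale = 2.54
+//
+//   [MultiHeadAttention_15]
+//   type = "MultiHeadAttention"
+//   weight_q = [ 101.2, 99.7, 100.4 ]
+//   weight_k = [ 98.8, 102.3, 97.5 ]
+//   weight_v = [ 100.1, 96.9, 103.0 ]
+//   weight_o = [ 99.4, 101.8, 98.2 ]
+//   input_scale_q = 1.23
+//   input_scale_k = 1.23
+//   input_scale_v = 1.23
+//   internal_scale_q = 64.0
+//   internal_scale_k = 64.0
+//   internal_scale_v = 64.0
+//   internal_scale_energy = 64.0
+//   internal_scale_feat = 64.0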
+ +#include "layer.h" +#include "layer_type.h" +#include "net.h" +#include "net_quantize.h" +#include +#include +#include + +void NetQuantize::set_weight_suffix(std::string str) +{ + suffix = str; +} + +bool NetQuantize::read_txt_format(const char* filepath) +{ + blob_int8scale_table.clear(); + weight_int8scale_table.clear(); + + FILE* fp = fopen(filepath, "rb"); + if (!fp) + { + fprintf(stderr, "Open %s failed.\n", filepath); + return false; + } + + std::string key_str; + std::vector scales; + + std::vector line(10240000); + char* pch = NULL; + size_t len = 0; + + while (!feof(fp)) + { + char* s = fgets(line.data(), (int)line.size(), fp); + if (!s) + break; + + float scale = 1.f; + char key[256]; + line[strcspn(line.data(), "\r\n")] = 0; + + pch = strtok(line.data(), " "); + + if (pch == NULL) break; + + bool is_key = true; + while (pch != NULL) + { + if (is_key) + { + sscanf(pch, "%255s", key); + + key_str = key; + is_key = false; + } + else + { + sscanf(pch, "%f", &scale); + + scales.push_back(scale); + } + + pch = strtok(NULL, " "); + } + + // XYZ_param_N pattern + if (strstr(key_str.c_str(), "_param_")) + { + weight_int8scale_table[key_str] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + } + else + { + blob_int8scale_table[key_str] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + } + key_str.clear(); + scales.clear(); + } + + fclose(fp); + + return true; +} + +bool NetQuantize::read_ini_format(const char* path) +{ + blob_int8scale_table.clear(); + weight_int8scale_table.clear(); + mha_table.clear(); + + ini::Config root; + root.read(std::string(path)); + + size_t len = root.size(); + std::string name, type; + std::shared_ptr ptable; + for (size_t i = 0; i < len; ++i) + { + std::tie(name, ptable) = root[i]; + type = ptable->get("type"); + + if (type == "Conv" || type == "Gemm") + { + // load weight scales + { + std::vector scales = ptable->get_list("weight"); + weight_int8scale_table[name] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + } + + // load input scale + { + std::vector scales = {ptable->get("input_scale")}; + blob_int8scale_table[name] = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + } + } + else if (type == "MultiHeadAttention") + { + mha_table[name] = ptable; + } + } + + return true; +} + +int NetQuantize::quantize_mha() +{ + const int layer_count = static_cast(layers.size()); + auto base_opt = opt; + + for (int i = 0; i < layer_count; i++) + { + // find convolution layer + if (layers[i]->type != "MultiHeadAttention") + continue; + + std::string name = layers[i]->name; + if (mha_table.find(name) == mha_table.end()) + { + fprintf(stderr, "cannot find %s quant param.\n", name.c_str()); + continue; + } + + ncnn::MultiHeadAttention* mha = (ncnn::MultiHeadAttention*)layers[i]; + fprintf(stderr, "quantize_multiheadattention %s\n", mha->name.c_str()); + + auto& table = mha_table.at(name); + { + // write weights + // convert fp32 mat to int8 mat with the scales + auto convert = [table, mha, base_opt](ncnn::Mat& weight, std::string key, ncnn::Mat& w_scales) -> int { + ncnn::Option opt_q = base_opt; + opt_q.blob_allocator = weight.allocator; + opt_q.use_packing_layout = false; + + auto scales = table->get_list(key); + if (scales.empty()) + { + return -100; + } + w_scales = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + + { + ncnn::Mat weight_int8; + const int num_input = mha->embed_dim; + const int num_output = mha->weight_data_size / num_input; + + ncnn::Mat weight_data_r2 = weight.reshape(num_input, 
num_output); + ncnn::quantize_to_int8(weight_data_r2, weight_int8, w_scales, opt_q); + if (weight_int8.empty()) + return -100; + + weight = weight_int8.reshape(mha->weight_data_size).clone(); + } + return 0; + }; + + int success = 0; + success += convert(mha->q_weight_data, "weight_q", mha->q_weight_scales); + success += convert(mha->k_weight_data, "weight_k", mha->k_weight_scales); + success += convert(mha->v_weight_data, "weight_v", mha->v_weight_scales); + success += convert(mha->out_weight_data, "weight_o", mha->o_weight_scales); + + if (success != 0) + { + fprintf(stderr, "convert fp32 weight to int8 failed. \n"); + return -1; + } + } + + { + // write input scale + auto convert = [table, base_opt](const std::string key, ncnn::Mat& mat) -> int { + std::vector scales = {table->get(key)}; + if (scales.empty()) + { + return -100; + } + + mat = ncnn::Mat((int)scales.size(), (void*)scales.data()).clone(); + return 0; + }; + + int success = 0; + success += convert("input_scale_q", mha->q_input_scale); + success += convert("input_scale_k", mha->k_input_scale); + success += convert("input_scale_v", mha->v_input_scale); + if (success != 0) + { + fprintf(stderr, "load input scale failed. \n"); + return -100; + } + } + + { + // write internal scales + std::vector internal_scales; + internal_scales.emplace_back(table->get("internal_scale_q")); + internal_scales.emplace_back(table->get("internal_scale_k")); + internal_scales.emplace_back(table->get("internal_scale_v")); + internal_scales.emplace_back(table->get("internal_scale_energy")); + internal_scales.emplace_back(table->get("internal_scale_feat")); + + mha->internal_scales = ncnn::Mat((int)internal_scales.size(), (void*)internal_scales.data()).clone(); + } + + { + // write control variable + mha->int8_scale_term = 1; + } + } + + return 0; +} + +int NetQuantize::quantize_convolution() +{ + const int layer_count = static_cast(layers.size()); + for (int i = 0; i < layer_count; i++) + { + // find convolution layer + if (layers[i]->type != "Convolution") + continue; + + // find convolution layer + std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); + if (iter_data == blob_int8scale_table.end()) + continue; + + char key[256]; + sprintf(key, "%s%s", layers[i]->name.c_str(), suffix.c_str()); + + std::map::iterator iter = weight_int8scale_table.find(key); + if (iter == weight_int8scale_table.end()) + { + fprintf(stderr, "%s need to be quantized, but no scale param!\n", key); + return -1; + } + + // Convolution - quantize weight from fp32 to int8 + ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i]; + + ncnn::Mat bottom_blob_int8_scales = iter_data->second; + ncnn::Mat weight_data_int8_scales = iter->second; + + fprintf(stderr, "quantize_convolution %s\n", convolution->name.c_str()); + + { + const int maxk = convolution->kernel_w * convolution->kernel_h; + const int num_input = convolution->weight_data_size / convolution->num_output / maxk; + + ncnn::Mat weight_data_r2 = convolution->weight_data.reshape(maxk, num_input, convolution->num_output); + + ncnn::Mat weight_data_int8; + + ncnn::Option opt_q = opt; + opt_q.blob_allocator = convolution->weight_data.allocator; + opt_q.use_packing_layout = false; + ncnn::quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); + if (weight_data_int8.empty()) + return -100; + + convolution->weight_data = weight_data_int8.reshape(convolution->weight_data_size); + } + + convolution->int8_scale_term = 2; + convolution->weight_data_int8_scales = 
weight_data_int8_scales; + convolution->bottom_blob_int8_scales = bottom_blob_int8_scales; + } + + return 0; +} + +int NetQuantize::quantize_convolutiondepthwise() +{ + const int layer_count = static_cast(layers.size()); + for (int i = 0; i < layer_count; i++) + { + // find convolution layer + if (layers[i]->type != "ConvolutionDepthWise") + continue; + + // find convolutiondepthwise layer + std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); + if (iter_data == blob_int8scale_table.end()) + continue; + + char key[256]; + sprintf(key, "%s%s", layers[i]->name.c_str(), suffix.c_str()); + + std::map::iterator iter = weight_int8scale_table.find(key); + if (iter == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + // Convolution - quantize weight from fp32 to int8 + ncnn::ConvolutionDepthWise* convdw = (ncnn::ConvolutionDepthWise*)layers[i]; + + ncnn::Mat bottom_blob_int8_scales = iter_data->second; + ncnn::Mat weight_data_int8_scales = iter->second; + + fprintf(stderr, "quantize_convolutiondepthwise %s\n", convdw->name.c_str()); + + { + ncnn::Mat int8_weight_data(convdw->weight_data_size, (size_t)1u); + if (int8_weight_data.empty()) + return -100; + + const int weight_data_size_g = convdw->weight_data_size / convdw->group; + + for (int g = 0; g < convdw->group; g++) + { + ncnn::Option opt_q = opt; + opt_q.blob_allocator = int8_weight_data.allocator; + opt_q.use_packing_layout = false; + + const ncnn::Mat weight_data_g = convdw->weight_data.range(weight_data_size_g * g, weight_data_size_g); + ncnn::Mat int8_weight_data_g = int8_weight_data.range(weight_data_size_g * g, weight_data_size_g); + const ncnn::Mat weight_data_int8_scales_g = weight_data_int8_scales.range(g, 1); + ncnn::quantize_to_int8(weight_data_g, int8_weight_data_g, weight_data_int8_scales_g, opt_q); + } + + convdw->weight_data = int8_weight_data; + } + + convdw->int8_scale_term = 1; + convdw->weight_data_int8_scales = weight_data_int8_scales; + convdw->bottom_blob_int8_scales = bottom_blob_int8_scales; + } + + return 0; +} + +int NetQuantize::quantize_innerproduct() +{ + const int layer_count = static_cast(layers.size()); + for (int i = 0; i < layer_count; i++) + { + // find convolution layer + if (layers[i]->type != "InnerProduct") + continue; + + // find InnerProduct layer + std::map::iterator iter_data = blob_int8scale_table.find(layers[i]->name); + if (iter_data == blob_int8scale_table.end()) + continue; + + char key[256]; + sprintf(key, "%s%s", layers[i]->name.c_str(), suffix.c_str()); + + std::map::iterator iter = weight_int8scale_table.find(key); + if (iter == weight_int8scale_table.end()) + { + fprintf(stderr, "this layer need to be quantized, but no scale param!\n"); + return -1; + } + + // InnerProduct - quantize weight from fp32 to int8 + ncnn::InnerProduct* fc = (ncnn::InnerProduct*)layers[i]; + + ncnn::Mat bottom_blob_int8_scales = iter_data->second; + ncnn::Mat weight_data_int8_scales = iter->second; + + fprintf(stderr, "quantize_innerproduct %s\n", fc->name.c_str()); + + { + const int num_input = fc->weight_data_size / fc->num_output; + + ncnn::Mat weight_data_r2 = fc->weight_data.reshape(num_input, fc->num_output); + + ncnn::Mat weight_data_int8; + ncnn::Option opt_q = opt; + opt_q.use_packing_layout = false; + ncnn::quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); + if (weight_data_int8.empty()) + return -100; + + fc->weight_data = 
weight_data_int8.reshape(fc->weight_data_size); + } + + fc->int8_scale_term = 2; + fc->weight_data_int8_scales = weight_data_int8_scales; + fc->bottom_blob_int8_scales = bottom_blob_int8_scales; + } + + return 0; +} + +int NetQuantize::fuse_requantize() +{ + const size_t layer_count = layers.size(); + for (size_t i = 0; i < layer_count; i++) + { + if (layers[i]->type != "Convolution" && layers[i]->type != "ConvolutionDepthWise") + continue; + + // Convolution/ConvolutionDepthWise - Convolution/ConvolutionDepthWise + int top_blob_index = layers[i]->tops[0]; + + size_t j = i + 1; + for (; j < layer_count; j++) + { + if (layers[j]->type != "Convolution" && layers[j]->type != "ConvolutionDepthWise") + continue; + + if (layers[j]->bottoms.size() != 1) + continue; + + if (layers[j]->bottoms[0] == top_blob_index) + break; + } + + if (j == layer_count) + continue; + + // fuse requantize + fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), layers[j]->name.c_str()); + + if (layers[i]->type == "Convolution" && layers[j]->type == "Convolution") + { + ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; + ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "Convolution" && layers[j]->type == "ConvolutionDepthWise") + { + ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; + ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "Convolution") + { + ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; + ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "ConvolutionDepthWise") + { + ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; + ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + } + + for (size_t i = 0; i < layer_count; i++) + { + if (layers[i]->type != "Convolution" && layers[i]->type != "ConvolutionDepthWise") + continue; + + // Convolution/ConvolutionDepthWise - Split - Convolution/ConvolutionDepthWise + int top_blob_index = layers[i]->tops[0]; + + size_t j = i + 1; + for (; j < layer_count; j++) + { + if (layers[j]->type != "Split") + continue; + + if (layers[j]->bottoms.size() != 1) + continue; + + if (layers[j]->bottoms[0] == top_blob_index) + break; + } + + if (j == layer_count) + continue; + + ncnn::Split* split = (ncnn::Split*)layers[j]; + + bool all_conv = true; + for (size_t p = 0; p < split->tops.size(); p++) + { + int 
split_top_blob_index = split->tops[p]; + + size_t k = j + 1; + for (; k < layer_count; k++) + { + if (layers[k]->type != "Convolution" && layers[k]->type != "ConvolutionDepthWise") + continue; + + if (layers[k]->bottoms.size() != 1) + continue; + + if (layers[k]->bottoms[0] == split_top_blob_index) + break; + } + + if (k == layer_count) + { + all_conv = false; + break; + } + + if (layers[k]->type == "Convolution") + { + ncnn::Convolution* convolution = (ncnn::Convolution*)layers[k]; + if (convolution->weight_data.elemsize != 1u) + { + all_conv = false; + break; + } + } + if (layers[k]->type == "ConvolutionDepthWise") + { + ncnn::ConvolutionDepthWise* convolution = (ncnn::ConvolutionDepthWise*)layers[k]; + if (convolution->weight_data.elemsize != 1u) + { + all_conv = false; + break; + } + } + } + + if (!all_conv) + continue; + + j = blobs[split->tops[0]].consumer; + + // fuse requantize + fprintf(stderr, "fuse_requantize %s %s\n", layers[i]->name.c_str(), split->name.c_str()); + + if (layers[i]->type == "Convolution" && layers[j]->type == "Convolution") + { + ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; + ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "Convolution" && layers[j]->type == "ConvolutionDepthWise") + { + ncnn::Convolution* convolution1 = (ncnn::Convolution*)layers[i]; + ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "Convolution") + { + ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; + ncnn::Convolution* convolution2 = (ncnn::Convolution*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + if (layers[i]->type == "ConvolutionDepthWise" && layers[j]->type == "ConvolutionDepthWise") + { + ncnn::ConvolutionDepthWise* convolution1 = (ncnn::ConvolutionDepthWise*)layers[i]; + ncnn::ConvolutionDepthWise* convolution2 = (ncnn::ConvolutionDepthWise*)layers[j]; + + if (convolution1->weight_data.elemsize != 1u || convolution2->weight_data.elemsize != 1u) + continue; + + convolution1->int8_scale_term += 100; + convolution1->top_blob_int8_scales = convolution2->bottom_blob_int8_scales; + } + } + + return 0; +} diff --git a/tools/quantize/net_quantize.h b/tools/quantize/net_quantize.h new file mode 100644 index 000000000000..79c8a163b92e --- /dev/null +++ b/tools/quantize/net_quantize.h @@ -0,0 +1,48 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#pragma once +// ncnn private header +#include +#include "../modelwriter.h" +#include "ini_config.h" + +class NetQuantize : public ModelWriter +{ +public: + NetQuantize() + { + } + // conv and gemm quant param + std::map blob_int8scale_table; + std::map weight_int8scale_table; + + // MutiHeadAttention quant param + std::map > mha_table; + +public: + bool read_txt_format(const char* path); + bool read_ini_format(const char* path); + + int quantize_convolution(); + int quantize_convolutiondepthwise(); + int quantize_innerproduct(); + int quantize_mha(); + int fuse_requantize(); + + void set_weight_suffix(std::string s); + +private: + std::string suffix; +}; From e0a0ca690765cf820a7b749acb47dd558c2b72f0 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 28 Jul 2022 10:25:13 +0000 Subject: [PATCH 02/15] apply code-format changes --- benchmark/benchncnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 283c76a340e2..abb1bdef851d 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -321,7 +321,7 @@ int main(int argc, char** argv) benchmark("vision_transformer", ncnn::Mat(384, 384, 3), opt); benchmark("FastestDet", ncnn::Mat(352, 352, 3), opt); - + benchmark("vision_transformer_int8", ncnn::Mat(384, 384, 3), opt); #if NCNN_VULKAN From 7565af0c2771fab3df09fc6dee1a0a607d42578a Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 28 Jul 2022 18:43:27 +0800 Subject: [PATCH 03/15] fix(CI): rebase error --- src/layer/multiheadattention.cpp | 5 - tools/quantize/ini_config.cpp | 237 ++++--------------------------- 2 files changed, 27 insertions(+), 215 deletions(-) mode change 100644 => 100755 src/layer/multiheadattention.cpp mode change 100644 => 100755 tools/quantize/ini_config.cpp diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp old mode 100644 new mode 100755 index 80ec43518b25..14f9ddc83ad1 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -202,11 +202,6 @@ int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std:: affine_input(k_blob, k_weight_data, k_bias_data, xk, k_input_scale, k_weight_scales, internal_scales[1], num_head, opt_g, false); affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); - // transpose(v) for better gemm performance - // Mat xv(seqlen, embed_dim_per_head, num_head, 1u, opt.workspace_allocator); - // Mat debug_xv; - // transform_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], opt_g, debug_xv, true); - // xq @ qk * inv_sqrt_embed_dim_per_head const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); diff --git a/tools/quantize/ini_config.cpp b/tools/quantize/ini_config.cpp old mode 100644 new mode 100755 index 9da3ae9fcb86..227a5ea05b92 --- a/tools/quantize/ini_config.cpp +++ b/tools/quantize/ini_config.cpp @@ -1,224 +1,41 @@ +// tpoisonooo is pleased to support the open source community by making ncnn available. 
+// +// author:tpoisonooo (https://github.com/tpoisonooo/) . +// +// Copyright (C) 2022 tpoisonooo. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + #include "ini_config.h" -#include -#include namespace ini { - -template -void Value::set(T val) -{ - text = std::to_string(f); -} - -void Value::set(std::string str) -{ - text = '\"' + str + '\"'; -} - -template -void Value::set(const std::vector& data) -{ - text = "[ "; - - size_t len = data.size(); - if (len > 0) - { - size_t i = 0; - for (; i < len - 1; ++i) - { - text += std::to_string(data[i]); - text += ", "; - } - text += std::to_string(data[i]); - text += " "; - } - - text += "]"; -} - -template -T Value::get() -{ - T result; - std::stringstream ss; - ss << text; - ss >> result; - return result; -} - -template -std::vector Value::get() -{ - std::vector result; - - std::string no_brace; - { - // remove brace - auto start = text.find('['); - auto end = text.find(']'); - no_brace = text.substr(start + 1, end); - } - - { - // split with the separator ',' - std::stringstream ss; - size_t end = 0, start = 0; - while (true) - { - end = no_brace.find(',', start); - if (end == std::string::npos) - { - break; - } - - std::string val_str = no_brace.substr(start, end); - start = end + 1; - - T val; - ss << val_str; - ss >> val; - ss.clear(); - result.emplace_back(val); - } - - // parse the last one - std::string val_str = no_brace.substr(start); - T val; - ss << val_str; - ss >> val; - result.emplace_back(val); - } - - return result; -} - -std::string Value::stringify() -{ - return text; -} - -void Table::feed(std::string line) -{ - auto pos = line.find(':'); - assert(pos != std::string::npos); - - std::string key = line.substr(0, pos - 1); - std::string value_str = line.substr(pos + 1); - - values[key] = std::make_shared(value_str); -} - -void Table::feed(const std::vector& lines) -{ - for (auto& line : lines) - { - feed(line); - } -} - -void Table::append(std::string key, float data) -{ - auto pVal = std::make_shared(); - pVal->set(data); - values[key] = pVal; -} - -void Table::append(std::string key, const std::vector& data) +template<> +std::string value_set(std::string data) { - auto pVal = std::make_shared(); - pVal->set(data); - values[key] = pVal; + return "\"" + data + "\""; } -void Table::append(std::string key, std::string data) +template<> +std::string value_set(const char* data) { - auto pVal = std::make_shared(); - pVal->set(data); - values[key] = pVal; + return "\"" + std::string(data) + "\""; } -std::shared_ptr Table::operator[](std::string key) +template<> +std::string value_get(std::string text) { - return values[key]; -} - -std::string Table::stringify() -{ - std::string result; - for (auto itra = values.begin(); itra != values.end(); ++itra) - { - result += itra->first; - result += " = "; - result += itra->second->stringify(); - result += '\n'; - } - return result; -} - -void Config::read(std::string path) -{ - std::ifstream fin; - fin.open(path, std::ios::in); - - if (!fin.is_open()) - { - fprintf(stderr, 
"open %s failed\n", path.c_str()); - return; - } - - bool recoding = false; - std::shared_ptr
pTable = nullptr; + auto start = text.find('\"'); + auto end = text.find_last_of('\"'); - std::string line; - while (fin >> line) - { - if (nullptr == pTable) - { - auto start = line.find('['); - auto end = line.find(']'); - assert(start != std::string::npos); - assert(end != std::string::npos); - - std::string key = line.substr(start + 1, end); - pTable = std::make_shared
(); - tables[key] = pTable; - continue; - } - - if (line.length() <= 2) - { - pTable = nullptr; - continue; - ; - } - - pTable->feed(line); - } -} - -std::vector Config::list_all() -{ - std::vector result; - for (auto itra = tables.begin(); itra != tables.end(); ++itra) - { - result.push_back(itra->first); - } - return result; -} - -std::shared_ptr
Config::operator[](std::string key) -{ - return tables[key]; -} - -void Config::append(std::string key, std::shared_ptr
table) -{ - tables[key] = table; -} - -void Config::write(std::string path) -{ - // TODO + return text.substr(start + 1, end - start - 1); } } // namespace ini From d5f7835ea560e984703606227420b6e7afed9c8b Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 28 Jul 2022 13:25:06 +0000 Subject: [PATCH 04/15] apply code-format changes --- src/layer/multiheadattention.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 src/layer/multiheadattention.cpp diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp old mode 100755 new mode 100644 From fa8b0bc098b01a88bc8df5afbda5e12c00497c13 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 1 Aug 2022 18:01:12 +0800 Subject: [PATCH 05/15] fix(CI): test mha exceeding --- tests/test_multiheadattention.cpp | 38 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index ba6e8d32e899..acf3bba1d157 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -105,25 +105,25 @@ static int test_multiheadattention_int8(const ncnn::Mat& a, int num_heads) pd.set(3, 1); std::vector weights(16); - weights[0] = RandomIntMat(embed_dim * embed_dim); - weights[1] = RandomIntMat(embed_dim); - weights[2] = RandomIntMat(embed_dim * embed_dim); - weights[3] = RandomIntMat(embed_dim); - weights[4] = RandomIntMat(embed_dim * embed_dim); - weights[5] = RandomIntMat(embed_dim); - weights[6] = RandomIntMat(embed_dim * embed_dim); - weights[7] = RandomIntMat(embed_dim); - - weights[8] = RandomMat(1); - weights[9] = RandomMat(1); - weights[10] = RandomMat(1); - - weights[11] = RandomMat(embed_dim); - weights[12] = RandomMat(embed_dim); - weights[13] = RandomMat(embed_dim); - weights[14] = RandomMat(embed_dim); - - weights[15] = RandomMat(5); + weights[0] = RandomMat(embed_dim * embed_dim); + weights[1] = RandomMat(embed_dim); + weights[2] = RandomMat(embed_dim * embed_dim); + weights[3] = RandomMat(embed_dim); + weights[4] = RandomMat(embed_dim * embed_dim); + weights[5] = RandomMat(embed_dim); + weights[6] = RandomMat(embed_dim * embed_dim); + weights[7] = RandomMat(embed_dim); + + weights[8] = RandomMat(1, 1.f, 10.f); + weights[9] = RandomMat(1, 1.f, 10.f); + weights[10] = RandomMat(1, 1.f, 10.f); + + weights[11] = RandomMat(embed_dim, 1.f, 10.f); + weights[12] = RandomMat(embed_dim, 1.f, 10.f); + weights[13] = RandomMat(embed_dim, 1.f, 10.f); + weights[14] = RandomMat(embed_dim, 1.f, 10.f); + + weights[15] = RandomMat(5, 1.f, 10.f); std::vector as(1); as[0] = a; From 9d3fb108ba89dc0592219de9ff429d6538209c68 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 1 Aug 2022 19:04:07 +0800 Subject: [PATCH 06/15] improvement(src/mha): add file structure --- src/layer/x86/multiheadattention_x86.cpp | 517 +++++++++++++++++++++++ src/layer/x86/multiheadattention_x86.h | 44 ++ 2 files changed, 561 insertions(+) create mode 100644 src/layer/x86/multiheadattention_x86.cpp create mode 100644 src/layer/x86/multiheadattention_x86.h diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp new file mode 100644 index 000000000000..72084706b6a7 --- /dev/null +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -0,0 +1,517 @@ +// tpoisonooo is pleased to support the open source community by making ncnn available. +// +// author:tpoisonooo (https://github.com/tpoisonooo/) . +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "multiheadattention_x86.h" +#include +#ifdef NCNN_INT8 +#include +#endif + + +namespace ncnn { + +MultiHeadAttention_x86::MultiHeadAttention_x86() +{ + support_packing = false; +} + +int MultiHeadAttention_x86::create_pipeline(const Option& opt) +{ +#if NCNN_INT8 + if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_x86(opt); + } +#endif + return 0; +} + +#ifdef NCNN_INT8 +int MultiHeadAttention_x86::create_pipeline_int8_x86(const Option& opt) +{ + return 0; +} + +int MultiHeadAttention_x86::destroy_pipeline(const Option& opt) +{ + return 0; +} + +static int affine_input( + const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, + const Mat& input_scale, const Mat& weight_scales, const float transform_scale, + const int num_head, const Option& opt, bool transpose) +{ + const int embed_dim = input.w; + const int seqlen = input.h; + const int embed_dim_per_head = embed_dim / num_head; + const float scale = 1.0 / input_scale[0]; + + Mat input_int8; + if (input.elemsize != 1) + { + quantize_to_int8(input, input_int8, input_scale, opt); + } + + Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); + + if (transpose) + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < embed_dim_per_head; i++) + { + for (int j = 0; j < seqlen; j++) + { + const int8_t* ptr = input_int8.row(j); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + i); + + int32_t sum = 0; + const int32_t index = q * embed_dim_per_head + i; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + float* outptr = outm.row(i); + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + else + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const int8_t* ptr = input_int8.row(i); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + j); + + int32_t sum = 0; + const int32_t index = q * embed_dim_per_head + j; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + + Mat transform(1, 4u, opt.workspace_allocator); + transform[0] = transform_scale; + quantize_to_int8(buffer, out_int8, transform, opt); + return 0; +} + +static inline int32_t float2int8(float v) +{ + int int32 = static_cast(round(v)); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return int32; +} + +int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& q_blob = bottom_blobs[0]; + const Mat& 
k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; + const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[2]; + + const int seqlen = q_blob.h; + const int embed_dim_per_head = embed_dim / num_head; + + Option opt_g = opt; + opt_g.blob_allocator = opt.workspace_allocator; + opt_g.use_packing_layout = false; + + Mat xq(embed_dim_per_head, seqlen, num_head, 1u, opt.workspace_allocator); + Mat xk(embed_dim_per_head, seqlen, num_head, 1u, opt.workspace_allocator); + Mat xv(seqlen, embed_dim_per_head, num_head, 1u, opt.workspace_allocator); + + affine_input(q_blob, q_weight_data, q_bias_data, xq, q_input_scale, q_weight_scales, internal_scales[0], num_head, opt_g, false); + affine_input(k_blob, k_weight_data, k_bias_data, xk, k_input_scale, k_weight_scales, internal_scales[1], num_head, opt_g, false); + affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); + + // xq @ qk * inv_sqrt_embed_dim_per_head + const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + + Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); + { + // xqk = xq * xk + // xq (embed_dim_per_head, seqlen) + // xk (embed_dim_per_head, seqlen) + const float out_scale = inv_sqrt_embed_dim_per_head / (internal_scales[0] * internal_scales[1]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; ++q) + { + const Mat xqm = xq.channel(q); + const Mat xkm = xk.channel(q); + + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < seqlen; j++) + { + const int8_t* qptr = xqm.row(i); + const int8_t* kptr = xkm.row(j); + + int32_t sum = 0; + for (int k = 0; k < embed_dim_per_head; k++) + { + sum += *qptr++ * *kptr++; + } + + outptr[j] = sum * out_scale; + } + } + } + + // fp32_softmax(xqk) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; q++) + { + // softmax(xqk) + { + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* ptr = outm.row(i); + + float max = -FLT_MAX; + for (int j = 0; j < seqlen; j++) + { + max = std::max(max, ptr[j]); + } + + float sum = 0.f; + for (int j = 0; j < seqlen; j++) + { + ptr[j] = (float)(exp(ptr[j] - max)); + sum += ptr[j]; + } + + for (int j = 0; j < seqlen; j++) + { + ptr[j] = ptr[j] / sum; + } + } + } + } + } + + // xqkv int4 @ int8, implement by shift + Mat xqkv(embed_dim_per_head, num_head, seqlen, 1u, opt.workspace_allocator); + + const float xqkv_out_scale = internal_scales[4] / internal_scales[2]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; ++q) + { + // xqkv = xqk * xv + // xqk (seqlen, seqlen) + // xv (seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, seqlen) + const Mat xqkm = xqk.channel(q); + const Mat xvm = xv.channel(q); + + for (int i = 0; i < seqlen; i++) + { + int8_t* outptr = xqkv.channel(i).row(q); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* qkptr = xqkm.row(i); + const int8_t* vptr = xvm.row(j); + + float sum = 0; + for (int k = 0; k < seqlen; k++) + { + sum += (*vptr++) * (*qkptr++); + } + + outptr[j] = float2int8(sum * xqkv_out_scale); + } + } + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(embed_dim, seqlen, 4u, opt.blob_allocator); + if (top_blob.empty()) + return -1; + + const float out_scale = 1.0f / internal_scales[4]; + // out = affine(xqkv) + // xqkv (embed_dim, seqlen) + #pragma omp parallel for 
num_threads(opt.num_threads) + for (int i = 0; i < seqlen; i++) + { + float* outptr = top_blob.row(i); + + for (int j = 0; j < embed_dim; j++) + { + const int8_t* ptr = xqkv.channel(i); + const int8_t* kptr = (const int8_t*)out_weight_data + embed_dim * j; + + int32_t sum = 0; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum * out_scale / o_weight_scales[j] + out_bias_data[j]; + } + } + + return 0; +} + +#endif + +// refers to https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html +int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) + { + return forward_int8(bottom_blobs, top_blobs, opt); + } +#endif + + const Mat& q_blob = bottom_blobs[0]; + const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; + const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[2]; + + const int seqlen = q_blob.h; + const int embed_dim_per_head = embed_dim / num_head; + + Mat& top_blob = top_blobs[0]; + top_blob.create(embed_dim, seqlen, 4u, opt.blob_allocator); + if (top_blob.empty()) + return -1; + + Mat xq(embed_dim_per_head, seqlen, num_head, 4u, opt.workspace_allocator); + Mat xk(embed_dim_per_head, seqlen, num_head, 4u, opt.workspace_allocator); + Mat xv(seqlen, embed_dim_per_head, num_head, 4u, opt.workspace_allocator); + + Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); + + Mat xqkv(embed_dim_per_head, num_head, seqlen, 4u, opt.workspace_allocator); + + const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_head; q++) + { + // xq = affine(q) * inv_sqrt_embed_dim_per_head + { + Mat outm = xq.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* ptr = q_blob.row(i); + const float* kptr = (const float*)q_weight_data + embed_dim * (q * embed_dim_per_head + j); + + float sum = q_bias_data[q * embed_dim_per_head + j]; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum * inv_sqrt_embed_dim_per_head; + } + } + } + + // xk = affine(k) + { + Mat outm = xk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* ptr = k_blob.row(i); + const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); + + float sum = k_bias_data[q * embed_dim_per_head + j]; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum; + } + } + } + + // xv = affine(v) + { + Mat outm = xv.channel(q); + + for (int i = 0; i < embed_dim_per_head; i++) + { + for (int j = 0; j < seqlen; j++) + { + const float* ptr = v_blob.row(j); + const float* kptr = (const float*)v_weight_data + embed_dim * (q * embed_dim_per_head + i); + + float sum = v_bias_data[q * embed_dim_per_head + i]; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + float* outptr = outm.row(i); + + outptr[j] = sum; + } + } + } + + // xqk = xq * xk + // xq (embed_dim_per_head, seqlen) + // xk (embed_dim_per_head, seqlen) + { + const Mat xqm = xq.channel(q); + const Mat xkm = 
xk.channel(q); + + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < seqlen; j++) + { + const float* qptr = xqm.row(i); + const float* kptr = xkm.row(j); + + float sum = 0.f; + for (int k = 0; k < embed_dim_per_head; k++) + { + sum += *qptr++ * *kptr++; + } + + outptr[j] = sum; + } + } + } + + // softmax(xqk) + { + Mat outm = xqk.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* ptr = outm.row(i); + + float max = -FLT_MAX; + for (int j = 0; j < seqlen; j++) + { + max = std::max(max, ptr[j]); + } + + float sum = 0.f; + for (int j = 0; j < seqlen; j++) + { + ptr[j] = (float)(exp(ptr[j] - max)); + sum += ptr[j]; + } + + for (int j = 0; j < seqlen; j++) + { + ptr[j] /= sum; + } + } + } + + // xqkv = xqk * xv + // xqk (seqlen, seqlen) + // xv (seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, seqlen) + { + const Mat xqkm = xqk.channel(q); + const Mat xvm = xv.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = xqkv.channel(i).row(q); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const float* qkptr = xqkm.row(i); + const float* vptr = xvm.row(j); + + float sum = 0.f; + for (int k = 0; k < seqlen; k++) + { + sum += *qkptr++ * *vptr++; + } + + outptr[j] = sum; + } + } + } + } + + // out = affine(xqkv) + // xqkv (embed_dim, seqlen) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < seqlen; i++) + { + float* outptr = top_blob.row(i); + + for (int j = 0; j < embed_dim; j++) + { + const float* ptr = xqkv.channel(i); + const float* kptr = (const float*)out_weight_data + embed_dim * j; + + float sum = out_bias_data[j]; + for (int k = 0; k < embed_dim; k++) + { + sum += *ptr++ * *kptr++; + } + + outptr[j] = sum; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h new file mode 100644 index 000000000000..6f6982e89847 --- /dev/null +++ b/src/layer/x86/multiheadattention_x86.h @@ -0,0 +1,44 @@ +// tpoisonooo is pleased to support the open source community by making ncnn available. +// +// author:tpoisonooo (https://github.com/tpoisonooo/) . +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
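+
+// Note on the int8 path implemented in multiheadattention_x86.cpp: each of the
+// q/k/v projections dequantizes its int32 GEMM accumulator using the input
+// scale and the per-output-channel weight scale, then requantizes the result
+// with the matching internal_scales[] entry. A rough worked example of that
+// chain (all numbers made up for illustration):
+//
+//   int32 acc  = dot(int8(x), int8(W));                   // raw accumulator
+//   float val  = acc / input_scale / weight_scale + bias;  // back to fp32
+//   int8  q    = float2int8(val * internal_scale);         // requantized activation
+//
+// e.g. acc = 1000, input_scale = 2.0, weight_scale = 50.0, bias = 0.5 gives
+// val = 1000 / 2.0 / 50.0 + 0.5 = 10.5 before requantization.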
+ +#ifndef LAYER_MULTIHEADATTENTION_X86_H +#define LAYER_MULTIHEADATTENTION_X86_H + +#include "multiheadattention.h" + +namespace ncnn { + +class MultiHeadAttention_x86 : virtual public MultiHeadAttention +{ +public: + MultiHeadAttention_x86(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +protected: +#if NCNN_INT8 + int create_pipeline_int8_x86(const Option& opt); + int forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +#endif + +}; + +} // namespace ncnn + +#endif // LAYER_MULTIHEADATTENTION_X86_H From f38ca739497fa0c6a74bdf23c8d5001549e9c663 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Mon, 1 Aug 2022 11:06:30 +0000 Subject: [PATCH 07/15] apply code-format changes --- src/layer/x86/multiheadattention_x86.cpp | 1 - src/layer/x86/multiheadattention_x86.h | 1 - 2 files changed, 2 deletions(-) diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index 72084706b6a7..2ddf8a4d48b7 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -20,7 +20,6 @@ #include #endif - namespace ncnn { MultiHeadAttention_x86::MultiHeadAttention_x86() diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h index 6f6982e89847..284d1701e100 100644 --- a/src/layer/x86/multiheadattention_x86.h +++ b/src/layer/x86/multiheadattention_x86.h @@ -36,7 +36,6 @@ class MultiHeadAttention_x86 : virtual public MultiHeadAttention int create_pipeline_int8_x86(const Option& opt); int forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif - }; } // namespace ncnn From 4a05da2af0cf7a5ce13a9ea9ba82726bc66bfb7a Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Tue, 2 Aug 2022 16:56:01 +0800 Subject: [PATCH 08/15] improvement(src/layer/mha): add const fold --- src/layer/multiheadattention.cpp | 10 +- src/layer/multiheadattention.h | 3 + src/layer/x86/multiheadattention_x86.cpp | 128 ++++++++++------------- src/layer/x86/multiheadattention_x86.h | 4 + src/layer/x86/softmax_x86.cpp | 1 + src/layer/x86/x86_usability.h | 55 ++++++++++ 6 files changed, 122 insertions(+), 79 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 14f9ddc83ad1..0e9c54e93a79 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -32,6 +32,9 @@ int MultiHeadAttention::load_param(const ParamDict& pd) weight_data_size = pd.get(2, 0); int8_scale_term = pd.get(3, 0); + embed_dim_per_head = embed_dim / num_head; + inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + if (int8_scale_term) { #if NCNN_INT8 @@ -173,12 +176,12 @@ static int affine_input( return 0; } -static inline int32_t float2int8(float v) +static inline signed char float2int8(float v) { int int32 = static_cast(round(v)); if (int32 > 127) return 127; if (int32 < -127) return -127; - return int32; + return (signed char)int32; } int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -188,7 +191,6 @@ int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std:: const Mat& v_blob = bottom_blobs.size() == 1 ? 
q_blob : bottom_blobs[2]; const int seqlen = q_blob.h; - const int embed_dim_per_head = embed_dim / num_head; Option opt_g = opt; opt_g.blob_allocator = opt.workspace_allocator; @@ -203,8 +205,6 @@ int MultiHeadAttention::forward_int8(const std::vector& bottom_blobs, std:: affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); // xq @ qk * inv_sqrt_embed_dim_per_head - const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); - Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); { // xqk = xq * xk diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index 31a967804391..98a63e05a28e 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -40,6 +40,9 @@ class MultiHeadAttention : public Layer int weight_data_size; int int8_scale_term; + int embed_dim_per_head; + float inv_sqrt_embed_dim_per_head; + Mat q_weight_data; Mat q_bias_data; Mat k_weight_data; diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index 72084706b6a7..c0603f09787d 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -15,7 +15,10 @@ // specific language governing permissions and limitations under the License. #include "multiheadattention_x86.h" +#include "x86_usability.h" +#include "layer_type.h" #include + #ifdef NCNN_INT8 #include #endif @@ -26,16 +29,56 @@ namespace ncnn { MultiHeadAttention_x86::MultiHeadAttention_x86() { support_packing = false; + softmax = 0; } int MultiHeadAttention_x86::create_pipeline(const Option& opt) { + embed_dim_per_head = embed_dim / num_head; + inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + #if NCNN_INT8 if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) { return create_pipeline_int8_x86(opt); } #endif + + // for fp32 inference, const fold inv_sqrt_embed_dim_per_head into `q_w` and `q_bias` +#if 0 + // FIXME! + float scale_vals[1] = {inv_sqrt_embed_dim_per_head}; + q_weight_fold_data = q_weight_data.clone(); + q_weight_fold_data.substract_mean_normalize(0, scale_vals); + q_bias_fold_data = q_bias_data.clone(); + q_bias_fold_data.substract_mean_normalize(0, scale_vals); +#else + q_weight_fold_data = q_weight_data.clone(); + for (int i = 0; i < q_weight_fold_data.w; ++i) { + q_weight_fold_data[i] *= inv_sqrt_embed_dim_per_head; + } + q_bias_fold_data = q_bias_data.clone(); + for (int i = 0; i < q_bias_fold_data.w; ++i) { + q_bias_fold_data[i] *= inv_sqrt_embed_dim_per_head; + } +#endif + + { + softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + + ncnn::ParamDict pd; + pd.set(0, 1); + pd.set(1, 1); + + softmax->load_param(pd); + softmax->create_pipeline(opt); + } + + if (opt.lightmode) + { + q_weight_data.release(); + q_bias_data.release(); + } return 0; } @@ -143,7 +186,6 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob const Mat& v_blob = bottom_blobs.size() == 1 ? 
q_blob : bottom_blobs[2]; const int seqlen = q_blob.h; - const int embed_dim_per_head = embed_dim / num_head; Option opt_g = opt; opt_g.blob_allocator = opt.workspace_allocator; @@ -158,7 +200,6 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob affine_input(v_blob, v_weight_data, v_bias_data, xv, v_input_scale, v_weight_scales, internal_scales[2], num_head, opt_g, true); // xq @ qk * inv_sqrt_embed_dim_per_head - const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); { @@ -302,7 +343,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v #if NCNN_INT8 if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) { - return forward_int8(bottom_blobs, top_blobs, opt); + return forward_int8_x86(bottom_blobs, top_blobs, opt); } #endif @@ -326,9 +367,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v Mat xqkv(embed_dim_per_head, num_head, seqlen, 4u, opt.workspace_allocator); - const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); - #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < num_head; q++) { // xq = affine(q) * inv_sqrt_embed_dim_per_head @@ -342,15 +381,9 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v for (int j = 0; j < embed_dim_per_head; j++) { const float* ptr = q_blob.row(i); - const float* kptr = (const float*)q_weight_data + embed_dim * (q * embed_dim_per_head + j); + const float* kptr = (const float*)q_weight_fold_data + embed_dim * (q * embed_dim_per_head + j); - float sum = q_bias_data[q * embed_dim_per_head + j]; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - outptr[j] = sum * inv_sqrt_embed_dim_per_head; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + q_bias_fold_data[q * embed_dim_per_head + j]; } } } @@ -368,13 +401,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* ptr = k_blob.row(i); const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); - float sum = k_bias_data[q * embed_dim_per_head + j]; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + k_bias_data[q * embed_dim_per_head + j]; } } } @@ -385,20 +412,14 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v for (int i = 0; i < embed_dim_per_head; i++) { + float* outptr = outm.row(i); + for (int j = 0; j < seqlen; j++) { const float* ptr = v_blob.row(j); const float* kptr = (const float*)v_weight_data + embed_dim * (q * embed_dim_per_head + i); - float sum = v_bias_data[q * embed_dim_per_head + i]; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - float* outptr = outm.row(i); - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + v_bias_data[q * embed_dim_per_head + i]; } } } @@ -421,43 +442,14 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* qptr = xqm.row(i); const float* kptr = xkm.row(j); - float sum = 0.f; - for (int k = 0; k < embed_dim_per_head; k++) - { - sum += *qptr++ * *kptr++; - } - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(qptr, kptr, embed_dim_per_head); } } } - // softmax(xqk) { 
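// A note on the const fold used in this patch: because the query projection is
// affine, the 1/sqrt(embed_dim_per_head) scaling can be absorbed into the
// weights and bias once in create_pipeline() instead of being applied per
// element in forward(). A minimal sketch of the identity (illustrative only,
// not part of the patch):
//
//   const float s = 1.f / sqrt((float)embed_dim_per_head);
//   (x * W_q + b_q) * s  ==  x * (s * W_q) + (s * b_q)
//
// which is exactly what q_weight_fold_data and q_bias_fold_data store.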
Mat outm = xqk.channel(q); - - for (int i = 0; i < seqlen; i++) - { - float* ptr = outm.row(i); - - float max = -FLT_MAX; - for (int j = 0; j < seqlen; j++) - { - max = std::max(max, ptr[j]); - } - - float sum = 0.f; - for (int j = 0; j < seqlen; j++) - { - ptr[j] = (float)(exp(ptr[j] - max)); - sum += ptr[j]; - } - - for (int j = 0; j < seqlen; j++) - { - ptr[j] /= sum; - } - } + softmax->forward_inplace(outm, opt); } // xqkv = xqk * xv @@ -477,13 +469,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* qkptr = xqkm.row(i); const float* vptr = xvm.row(j); - float sum = 0.f; - for (int k = 0; k < seqlen; k++) - { - sum += *qkptr++ * *vptr++; - } - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(qkptr, vptr, seqlen); } } } @@ -501,13 +487,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* ptr = xqkv.channel(i); const float* kptr = (const float*)out_weight_data + embed_dim * j; - float sum = out_bias_data[j]; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - outptr[j] = sum; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + out_bias_data[j]; } } diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h index 6f6982e89847..ebc2b1cd8353 100644 --- a/src/layer/x86/multiheadattention_x86.h +++ b/src/layer/x86/multiheadattention_x86.h @@ -37,6 +37,10 @@ class MultiHeadAttention_x86 : virtual public MultiHeadAttention int forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif +private: + Mat q_weight_fold_data; + Mat q_bias_fold_data; + Layer* softmax; }; } // namespace ncnn diff --git a/src/layer/x86/softmax_x86.cpp b/src/layer/x86/softmax_x86.cpp index d1df7e446cf7..69091ad75e08 100644 --- a/src/layer/x86/softmax_x86.cpp +++ b/src/layer/x86/softmax_x86.cpp @@ -317,6 +317,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return -100; max.fill(-FLT_MAX); + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_top_blob.channel(q); diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 28ddfd50b952..dd05aa38a019 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -494,4 +494,59 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) #endif // __AVX__ #endif // __SSE2__ +/** + * @brief A wrapper for simd computation + * + * result = reduce(a[] * b[]) + * + * @param a + * @param b + * @param size + * @return float + */ + static NCNN_FORCEINLINE float mul_add_reduce_no_align(const float* a, const float* b, const int size) +{ + float ret = 0.f; + int align = 0; +#if __AVX512F__ + align = (size >> 4) << 4; + __m512 _sum = _mm512_set1_ps(0.f); + + for (int i = 0; i < align; i+=16) { + __m512 val0 = _mm512_loadu_ps(a + i); + __m512 val1 = _mm512_loadu_ps(b + i); + _sum = _mm512_add_ps(_sum, _mm512_mul_ps(val0, val1)); + } + ret += _mm512_reduce_add_ps(_sum); + +#elif __AVX__ + align = (size >> 3) << 3; + __m256 _sum = _mm256_set1_ps(0.f); + for (int i = 0; i < align; i+=8) { + __m256 val0 = _mm256_loadu_ps(a + i); + __m256 val1 = _mm256_loadu_ps(b + i); + _sum = _mm256_comp_fmadd_ps(val0, val1, _sum); + } + ret += _mm256_reduce_add_ps(_sum); + +#elif __SSE2__ + align = (size >> 2) << 2; + __m128 _sum = _mm_set1_ps(0.f); + for (int i = 0; i < align; i+=8) { + __m128 val0 = _mm_loadu_ps(a + i); + __m128 val1 = _mm_loadu_ps(b + 
i); + _sum = _mm_add_ps(_sum, _mm_mul_ps(val0, val1)); + } + + ret += _mm_reduce_add_ps(_sum); + +#endif + float sum = 0.f; + for (int i = align; i < size; ++i) { + sum += a[i] * b[i]; + } + ret += sum; + return ret; +} + #endif // X86_USABILITY_H From 49cbb14db339181319bb76650fffb178fef63b70 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Tue, 2 Aug 2022 09:14:00 +0000 Subject: [PATCH 09/15] apply code-format changes --- src/layer/x86/multiheadattention_x86.cpp | 13 +++++----- src/layer/x86/multiheadattention_x86.h | 6 ++--- src/layer/x86/x86_usability.h | 30 ++++++++++++++---------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index cff2cf9d0b43..42af2ad3f20c 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -53,11 +53,13 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) q_bias_fold_data.substract_mean_normalize(0, scale_vals); #else q_weight_fold_data = q_weight_data.clone(); - for (int i = 0; i < q_weight_fold_data.w; ++i) { + for (int i = 0; i < q_weight_fold_data.w; ++i) + { q_weight_fold_data[i] *= inv_sqrt_embed_dim_per_head; } q_bias_fold_data = q_bias_data.clone(); - for (int i = 0; i < q_bias_fold_data.w; ++i) { + for (int i = 0; i < q_bias_fold_data.w; ++i) + { q_bias_fold_data[i] *= inv_sqrt_embed_dim_per_head; } #endif @@ -73,7 +75,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) softmax->create_pipeline(opt); } - if (opt.lightmode) + if (opt.lightmode) { q_weight_data.release(); q_bias_data.release(); @@ -366,7 +368,6 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v Mat xqkv(embed_dim_per_head, num_head, seqlen, 4u, opt.workspace_allocator); - for (int q = 0; q < num_head; q++) { // xq = affine(q) * inv_sqrt_embed_dim_per_head @@ -382,7 +383,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* ptr = q_blob.row(i); const float* kptr = (const float*)q_weight_fold_data + embed_dim * (q * embed_dim_per_head + j); - outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + q_bias_fold_data[q * embed_dim_per_head + j]; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + q_bias_fold_data[q * embed_dim_per_head + j]; } } } @@ -400,7 +401,7 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v const float* ptr = k_blob.row(i); const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); - outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + k_bias_data[q * embed_dim_per_head + j]; + outptr[j] = mul_add_reduce_no_align(ptr, kptr, embed_dim) + k_bias_data[q * embed_dim_per_head + j]; } } } diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h index ebc2b1cd8353..966dbf558bdc 100644 --- a/src/layer/x86/multiheadattention_x86.h +++ b/src/layer/x86/multiheadattention_x86.h @@ -38,9 +38,9 @@ class MultiHeadAttention_x86 : virtual public MultiHeadAttention #endif private: - Mat q_weight_fold_data; - Mat q_bias_fold_data; - Layer* softmax; + Mat q_weight_fold_data; + Mat q_bias_fold_data; + Layer* softmax; }; } // namespace ncnn diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index d4512211abe0..10a14bf5fe4e 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -496,15 +496,15 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) /** * @brief A wrapper 
for simd computation - * - * result = reduce(a[] * b[]) - * - * @param a - * @param b - * @param size - * @return float + * + * result = reduce(a[] * b[]) + * + * @param a + * @param b + * @param size + * @return float */ - static NCNN_FORCEINLINE float mul_add_reduce_no_align(const float* a, const float* b, const int size) +static NCNN_FORCEINLINE float mul_add_reduce_no_align(const float* a, const float* b, const int size) { float sum = 0.f; int align = 0; @@ -512,7 +512,8 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) align = (size >> 4) << 4; __m512 _sum = _mm512_set1_ps(0.f); - for (int i = 0; i < align; i+=16) { + for (int i = 0; i < align; i += 16) + { __m512 val0 = _mm512_loadu_ps(a + i); __m512 val1 = _mm512_loadu_ps(b + i); _sum = _mm512_add_ps(_sum, _mm512_mul_ps(val0, val1)); @@ -522,7 +523,8 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) #elif __AVX__ align = (size >> 3) << 3; __m256 _sum = _mm256_set1_ps(0.f); - for (int i = 0; i < align; i+=8) { + for (int i = 0; i < align; i += 8) + { __m256 val0 = _mm256_loadu_ps(a + i); __m256 val1 = _mm256_loadu_ps(b + i); _sum = _mm256_comp_fmadd_ps(val0, val1, _sum); @@ -532,16 +534,18 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) #elif __SSE2__ align = (size >> 2) << 2; __m128 _sum = _mm_set1_ps(0.f); - for (int i = 0; i < align; i+=8) { + for (int i = 0; i < align; i += 8) + { __m128 val0 = _mm_loadu_ps(a + i); __m128 val1 = _mm_loadu_ps(b + i); _sum = _mm_add_ps(_sum, _mm_mul_ps(val0, val1)); } - sum += _mm_reduce_add_ps(_sum); + sum += _mm_reduce_add_ps(_sum); #endif - for (int i = align; i < size; ++i) { + for (int i = align; i < size; ++i) + { sum += a[i] * b[i]; } return sum; From 37848f11183937e0b9ded1d0235253be3fe3d872 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Tue, 2 Aug 2022 19:19:27 +0800 Subject: [PATCH 10/15] improvement(src/layer/mha): update --- src/layer/x86/multiheadattention_x86.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index cff2cf9d0b43..c0f4000099ae 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -18,6 +18,7 @@ #include "x86_usability.h" #include "layer_type.h" #include +#include #ifdef NCNN_INT8 #include @@ -66,7 +67,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) softmax = ncnn::create_layer(ncnn::LayerType::Softmax); ncnn::ParamDict pd; - pd.set(0, 1); + pd.set(0, 2); pd.set(1, 1); softmax->load_param(pd); @@ -386,7 +387,10 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v } } } + } + for (int q = 0; q < num_head; q++) + { // xk = affine(k) { Mat outm = xk.channel(q); @@ -404,7 +408,10 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v } } } + } + for (int q = 0; q < num_head; q++) + { // xv = affine(v) { Mat outm = xv.channel(q); @@ -422,7 +429,10 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v } } } + } + for (int q = 0; q < num_head; q++) + { // xqk = xq * xk // xq (embed_dim_per_head, seqlen) // xk (embed_dim_per_head, seqlen) @@ -445,12 +455,12 @@ int MultiHeadAttention_x86::forward(const std::vector& bottom_blobs, std::v } } } + } - { - Mat outm = xqk.channel(q); - softmax->forward_inplace(outm, opt); - } + softmax->forward_inplace(xqk, opt); + for (int q = 0; q < num_head; q++) + { // xqkv = xqk * xv // xqk (seqlen, seqlen) // xv (seqlen, 
embed_dim_per_head) From 95692d745af730d07df6f6afbab476189483b220 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Wed, 3 Aug 2022 19:57:57 +0800 Subject: [PATCH 11/15] fix(src/layer/mha): miss convert weight to int8 --- src/layer/multiheadattention.cpp | 53 ++++++++++++++++++++++++++++++-- src/layer/multiheadattention.h | 6 ++++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 0e9c54e93a79..2d582b0809de 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -25,6 +25,36 @@ MultiHeadAttention::MultiHeadAttention() { } +int MultiHeadAttention::create_pipeline(const Option& opt) +{ + // runtime quantize the weight data + int ret = 0; +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + if (q_weight_data.elemsize == (size_t) 4u) + { + ret += quantize_weight(q_weight_data, q_weight_scales, opt); + } + if (k_weight_data.elemsize == (size_t) 4u) + { + ret += quantize_weight(k_weight_data, k_weight_scales, opt); + } + if (v_weight_data.elemsize == (size_t) 4u) + { + ret += quantize_weight(v_weight_data, v_weight_scales, opt); + } + if (out_weight_data.elemsize == (size_t) 4u) + { + ret += quantize_weight(out_weight_data, o_weight_scales, opt); + } + } +#else + (void)(opt); +#endif // NCNN_INT8 + return ret; +} + int MultiHeadAttention::load_param(const ParamDict& pd) { embed_dim = pd.get(0, 0); @@ -98,10 +128,29 @@ int MultiHeadAttention::load_model(const ModelBin& mb) } #ifdef NCNN_INT8 -static int affine_input( + +int MultiHeadAttention::quantize_weight(Mat& weight_data, const Mat& weight_data_int8_scales, const Option& opt) +{ + const int num_output = embed_dim; + const int num_input = weight_data_size / num_output; + + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + Mat weight_data_int8; + Option opt_q = opt; + opt_q.use_packing_layout = false; + quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); + if (weight_data_int8.empty()) + return -100; + + weight_data = weight_data_int8.reshape(weight_data_size); + return 0; +} + +int MultiHeadAttention::affine_input( const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, - const int num_head, const Option& opt, bool transpose) + const int num_head, const Option& opt, bool transpose) const { const int embed_dim = input.w; const int seqlen = input.h; diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index 98a63e05a28e..a344c24a8f56 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -28,10 +28,16 @@ class MultiHeadAttention : public Layer virtual int load_model(const ModelBin& mb); + virtual int create_pipeline(const Option& opt); + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #ifdef NCNN_INT8 int forward_int8(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + int quantize_weight(Mat& weight, const Mat& scales, const Option& opt); + + int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8,const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; #endif public: From 07a1424503436a7074a24d19d166453cf7294b72 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Wed, 3 Aug 2022 19:58:52 +0800 Subject: [PATCH 12/15] 
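The runtime weight quantization above consumes one scale per output channel
(q_weight_scales, k_weight_scales, v_weight_scales, o_weight_scales). As a rough
sketch of how such per-channel scales are typically derived from the fp32
weights (a hypothetical helper for illustration; the real values come from the
quantization table produced by the tools, not from this code):

// hypothetical helper: absmax scale per output channel, so that
// round(w * scale) stays within [-127, 127]
static void compute_weight_scales(const float* w, int num_input, int num_output, float* scales)
{
    for (int oc = 0; oc < num_output; oc++)
    {
        float absmax = 0.f;
        for (int k = 0; k < num_input; k++)
        {
            float v = w[oc * num_input + k];
            if (v < 0.f) v = -v;
            if (v > absmax) absmax = v;
        }
        scales[oc] = absmax == 0.f ? 1.f : 127.f / absmax;
    }
}

quantize_weight() then multiplies each row by its scale and rounds to int8 via
quantize_to_int8(), so the int8 weights can later be dequantized by dividing the
int32 accumulator by the same per-channel scale.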
fix(src/layer/x86/mha): update int8 --- src/layer/x86/multiheadattention_x86.cpp | 86 ------------------------ 1 file changed, 86 deletions(-) diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index 6a5f6382bc03..80c82e6ef7fa 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -95,92 +95,6 @@ int MultiHeadAttention_x86::destroy_pipeline(const Option& opt) return 0; } -static int affine_input( - const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, - const Mat& input_scale, const Mat& weight_scales, const float transform_scale, - const int num_head, const Option& opt, bool transpose) -{ - const int embed_dim = input.w; - const int seqlen = input.h; - const int embed_dim_per_head = embed_dim / num_head; - const float scale = 1.0 / input_scale[0]; - - Mat input_int8; - if (input.elemsize != 1) - { - quantize_to_int8(input, input_int8, input_scale, opt); - } - - Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); - - if (transpose) - { - for (int q = 0; q < num_head; q++) - { - Mat outm = buffer.channel(q); - - for (int i = 0; i < embed_dim_per_head; i++) - { - for (int j = 0; j < seqlen; j++) - { - const int8_t* ptr = input_int8.row(j); - const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + i); - - int32_t sum = 0; - const int32_t index = q * embed_dim_per_head + i; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - float* outptr = outm.row(i); - outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; - } - } - } - } - else - { - for (int q = 0; q < num_head; q++) - { - Mat outm = buffer.channel(q); - - for (int i = 0; i < seqlen; i++) - { - float* outptr = outm.row(i); - - for (int j = 0; j < embed_dim_per_head; j++) - { - const int8_t* ptr = input_int8.row(i); - const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + j); - - int32_t sum = 0; - const int32_t index = q * embed_dim_per_head + j; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } - - outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; - } - } - } - } - - Mat transform(1, 4u, opt.workspace_allocator); - transform[0] = transform_scale; - quantize_to_int8(buffer, out_int8, transform, opt); - return 0; -} - -static inline int32_t float2int8(float v) -{ - int int32 = static_cast(round(v)); - if (int32 > 127) return 127; - if (int32 < -127) return -127; - return int32; -} - int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& q_blob = bottom_blobs[0]; From 9c1c2c9d4f0c4712048b42def367cc56ffd8cba0 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Wed, 3 Aug 2022 12:00:37 +0000 Subject: [PATCH 13/15] apply code-format changes --- src/layer/multiheadattention.cpp | 8 ++++---- src/layer/multiheadattention.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 2d582b0809de..4f77ff862b9d 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -32,19 +32,19 @@ int MultiHeadAttention::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - if (q_weight_data.elemsize == (size_t) 4u) + if (q_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(q_weight_data, q_weight_scales, opt); } - if (k_weight_data.elemsize == 
(size_t) 4u) + if (k_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(k_weight_data, k_weight_scales, opt); } - if (v_weight_data.elemsize == (size_t) 4u) + if (v_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(v_weight_data, v_weight_scales, opt); } - if (out_weight_data.elemsize == (size_t) 4u) + if (out_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(out_weight_data, o_weight_scales, opt); } diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index a344c24a8f56..4fc963d8feb8 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -37,7 +37,7 @@ class MultiHeadAttention : public Layer int quantize_weight(Mat& weight, const Mat& scales, const Option& opt); - int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8,const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; + int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; #endif public: From 9454c5105864f9eaaee1a912c29fb78c4bda3060 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 4 Aug 2022 17:09:26 +0800 Subject: [PATCH 14/15] improvement(src/x86/mha): int8 optimization --- src/layer/multiheadattention.cpp | 8 +- src/layer/multiheadattention.h | 2 +- src/layer/x86/multiheadattention_x86.cpp | 108 ++++++++++++++++++----- src/layer/x86/multiheadattention_x86.h | 3 + src/layer/x86/x86_usability.h | 69 ++++++++++++++- 5 files changed, 162 insertions(+), 28 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 4f77ff862b9d..2d582b0809de 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -32,19 +32,19 @@ int MultiHeadAttention::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - if (q_weight_data.elemsize == (size_t)4u) + if (q_weight_data.elemsize == (size_t) 4u) { ret += quantize_weight(q_weight_data, q_weight_scales, opt); } - if (k_weight_data.elemsize == (size_t)4u) + if (k_weight_data.elemsize == (size_t) 4u) { ret += quantize_weight(k_weight_data, k_weight_scales, opt); } - if (v_weight_data.elemsize == (size_t)4u) + if (v_weight_data.elemsize == (size_t) 4u) { ret += quantize_weight(v_weight_data, v_weight_scales, opt); } - if (out_weight_data.elemsize == (size_t)4u) + if (out_weight_data.elemsize == (size_t) 4u) { ret += quantize_weight(out_weight_data, o_weight_scales, opt); } diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index 4fc963d8feb8..e81c1287a0d4 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -37,7 +37,7 @@ class MultiHeadAttention : public Layer int quantize_weight(Mat& weight, const Mat& scales, const Option& opt); - int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; + virtual int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; #endif public: diff --git a/src/layer/x86/multiheadattention_x86.cpp 
b/src/layer/x86/multiheadattention_x86.cpp index 80c82e6ef7fa..feba621c3db0 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -37,6 +37,17 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) embed_dim_per_head = embed_dim / num_head; inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); + { + softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + + ncnn::ParamDict pd; + pd.set(0, 2); + pd.set(1, 1); + + softmax->load_param(pd); + softmax->create_pipeline(opt); + } + #if NCNN_INT8 if (opt.use_int8_inference && q_weight_data.elemsize == (size_t)1u && k_weight_data.elemsize == (size_t)1u && v_weight_data.elemsize == (size_t)1u && out_weight_data.elemsize == (size_t)1u) { @@ -65,16 +76,6 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } #endif - { - softmax = ncnn::create_layer(ncnn::LayerType::Softmax); - - ncnn::ParamDict pd; - pd.set(0, 2); - pd.set(1, 1); - - softmax->load_param(pd); - softmax->create_pipeline(opt); - } if (opt.lightmode) { @@ -95,6 +96,79 @@ int MultiHeadAttention_x86::destroy_pipeline(const Option& opt) return 0; } + +int MultiHeadAttention_x86::affine_input( + const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, + const Mat& input_scale, const Mat& weight_scales, const float transform_scale, + const int num_head, const Option& opt, bool transpose) const +{ + const int embed_dim = input.w; + const int seqlen = input.h; + const int embed_dim_per_head = embed_dim / num_head; + const float scale = 1.0 / input_scale[0]; + + Mat input_int8; + if (input.elemsize != 1) + { + quantize_to_int8(input, input_int8, input_scale, opt); + } + + Mat buffer(out_int8.w, out_int8.h, out_int8.c, 4u, opt.workspace_allocator); + + if (transpose) + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < embed_dim_per_head; i++) + { + for (int j = 0; j < seqlen; j++) + { + const int8_t* ptr = input_int8.row(j); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + i); + + const int32_t sum = mul_add_reduce_no_align(ptr, kptr, embed_dim); + + const int32_t index = q * embed_dim_per_head + i; + + float* outptr = outm.row(i); + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + else + { + for (int q = 0; q < num_head; q++) + { + Mat outm = buffer.channel(q); + + for (int i = 0; i < seqlen; i++) + { + float* outptr = outm.row(i); + + for (int j = 0; j < embed_dim_per_head; j++) + { + const int8_t* ptr = input_int8.row(i); + const int8_t* kptr = (int8_t*)(weight.data) + embed_dim * (q * embed_dim_per_head + j); + + const int32_t index = q * embed_dim_per_head + j; + + const int32_t sum = mul_add_reduce_no_align(ptr, kptr, embed_dim); + + outptr[j] = (float)sum * scale / weight_scales[index] + bias[index]; + } + } + } + } + + Mat transform(1, 4u, opt.workspace_allocator); + transform[0] = transform_scale; + quantize_to_int8(buffer, out_int8, transform, opt); + return 0; +} + int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& q_blob = bottom_blobs[0]; @@ -141,11 +215,7 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob const int8_t* qptr = xqm.row(i); const int8_t* kptr = xkm.row(j); - int32_t sum = 0; - for (int k = 0; k < embed_dim_per_head; k++) - { - sum += *qptr++ * *kptr++; - } + const int32_t sum = mul_add_reduce_no_align(qptr, kptr, embed_dim_per_head); 
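// Recap of the requantization chain in affine_input() above, written out as a
// scalar sketch (variable names mirror the code; this is illustrative only):
//
//   // acc: int32 accumulator of the int8 x int8 dot product
//   float fp32 = (float)acc / (input_scale[0] * weight_scales[index]) + bias[index];
//   // re-quantize with the next stage's internal scale
//   int8_t q = float2int8(fp32 * transform_scale);
//
// so xq, xk and xv stay int8 between the projections and the attention matmuls.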
outptr[j] = sum * out_scale; } @@ -156,7 +226,6 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < num_head; q++) { - // softmax(xqk) { Mat outm = xqk.channel(q); @@ -186,7 +255,6 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob } } - // xqkv int4 @ int8, implement by shift Mat xqkv(embed_dim_per_head, num_head, seqlen, 1u, opt.workspace_allocator); const float xqkv_out_scale = internal_scales[4] / internal_scales[2]; @@ -238,11 +306,7 @@ int MultiHeadAttention_x86::forward_int8_x86(const std::vector& bottom_blob const int8_t* ptr = xqkv.channel(i); const int8_t* kptr = (const int8_t*)out_weight_data + embed_dim * j; - int32_t sum = 0; - for (int k = 0; k < embed_dim; k++) - { - sum += *ptr++ * *kptr++; - } + const int32_t sum = mul_add_reduce_no_align(ptr, kptr, embed_dim); outptr[j] = sum * out_scale / o_weight_scales[j] + out_bias_data[j]; } diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h index 966dbf558bdc..4190269ea9c8 100644 --- a/src/layer/x86/multiheadattention_x86.h +++ b/src/layer/x86/multiheadattention_x86.h @@ -35,6 +35,9 @@ class MultiHeadAttention_x86 : virtual public MultiHeadAttention #if NCNN_INT8 int create_pipeline_int8_x86(const Option& opt); int forward_int8_x86(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + virtual int affine_input(const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, const int num_head, const Option& opt, bool transpose) const; + #endif private: diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 10a14bf5fe4e..f19b165f0461 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -287,6 +287,23 @@ static NCNN_FORCEINLINE float _mm256_reduce_add_ps(__m256 x) return _mm_cvtss_f32(x32); } +static NCNN_FORCEINLINE int32_t hsum_epi32_avx(__m128i x) +{ + __m128i hi64 = _mm_unpackhi_epi64(x, x); // 3-operand non-destructive AVX lets us save a byte without needing a movdqa + __m128i sum64 = _mm_add_epi32(hi64, x); + __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // Swap the low two elements + __m128i sum32 = _mm_add_epi32(sum64, hi32); + return _mm_cvtsi128_si32(sum32); // movd +} + +static NCNN_FORCEINLINE int32_t _mm256_hsum_8x32_i(__m256i v) +{ + __m128i sum128 = _mm_add_epi32( + _mm256_castsi256_si128(v), + _mm256_extracti128_si256(v, 1)); // silly GCC uses a longer AXV512VL instruction if AVX512 is enabled :/ + return hsum_epi32_avx(sum128); +} + static NCNN_FORCEINLINE float _mm256_reduce_max_ps(__m256 x) { const __m128 x128 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x)); @@ -495,7 +512,7 @@ static NCNN_FORCEINLINE float _mm512_comp_reduce_max_ps(__m512 x) #endif // __SSE2__ /** - * @brief A wrapper for simd computation + * @brief A wrapper for fp32 simd computation * * result = reduce(a[] * b[]) * @@ -551,4 +568,54 @@ static NCNN_FORCEINLINE float mul_add_reduce_no_align(const float* a, const floa return sum; } +/** + * @brief A wrapper for int8 simd computation + * + * result = reduce(a[] * b[]) + * + * @param a + * @param b + * @param size + * @return int32_t + */ +static NCNN_FORCEINLINE int32_t mul_add_reduce_no_align(const int8_t* a, const int8_t* b, const int size) +{ + int32_t sum = 0.f; + int align = 0; + +#if __AVXVNNI__ || __AVX512VNNI__ + 
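// Scalar semantics of this int8 overload, i.e. what the SIMD branches below are
// intended to compute (a minimal reference sketch, not part of the patch):
//
//   int32_t acc = 0;
//   for (int i = 0; i < size; ++i)
//       acc += (int32_t)a[i] * (int32_t)b[i];
//   return acc;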
align = (size >> 5) << 5; + __m256i _sum = _mm256_setzero_si256(); + for (int i = 0; i < align; i+=32) + { + __m256i val0 = _mm256_lddqu_si256((const __m256i*)(a + i)); + __m256i val1 = _mm256_lddqu_si256((const __m256i*)(b + i)); + _sum = _mm256_dpbusd_epi32(_sum, val0, val1); + } + sum += _mm256_hsum_8x32_i(_sum); + +#elif __AVX2__ + align = (size >> 4) << 4; + __m256i _sum = _mm256_setzero_si256(); + for (int i = 0; i < align; i += 16) + { + __m256i val0 = _mm256_cvtepi8_epi16(_mm_lddqu_si128((const __m128i*)(a + i))); + __m256i val1 = _mm256_cvtepi8_epi16(_mm_lddqu_si128((const __m128i*)(b + i))); + + __m256i lo = _mm256_mullo_epi16(val0, val1); + __m256i hi = _mm256_mulhi_epi16(val0, val1); + _sum = _mm256_add_epi32(_sum, _mm256_unpacklo_epi16(lo, hi)); + _sum = _mm256_add_epi32(_sum, _mm256_unpackhi_epi16(lo, hi)); + } + sum += _mm256_hsum_8x32_i(_sum); + +#endif + + for (int i = align; i < size; ++i) + { + sum += a[i] * b[i]; + } + return sum; +} + #endif // X86_USABILITY_H From 42ad426cb9d1dc8f7a18d33435b6af64cf4dcc01 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 4 Aug 2022 09:11:43 +0000 Subject: [PATCH 15/15] apply code-format changes --- src/layer/multiheadattention.cpp | 8 ++++---- src/layer/x86/multiheadattention_x86.cpp | 2 -- src/layer/x86/x86_usability.h | 16 ++++++++-------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index 2d582b0809de..4f77ff862b9d 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -32,19 +32,19 @@ int MultiHeadAttention::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - if (q_weight_data.elemsize == (size_t) 4u) + if (q_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(q_weight_data, q_weight_scales, opt); } - if (k_weight_data.elemsize == (size_t) 4u) + if (k_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(k_weight_data, k_weight_scales, opt); } - if (v_weight_data.elemsize == (size_t) 4u) + if (v_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(v_weight_data, v_weight_scales, opt); } - if (out_weight_data.elemsize == (size_t) 4u) + if (out_weight_data.elemsize == (size_t)4u) { ret += quantize_weight(out_weight_data, o_weight_scales, opt); } diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index feba621c3db0..def263558bfa 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -76,7 +76,6 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } #endif - if (opt.lightmode) { q_weight_data.release(); @@ -96,7 +95,6 @@ int MultiHeadAttention_x86::destroy_pipeline(const Option& opt) return 0; } - int MultiHeadAttention_x86::affine_input( const Mat& input, const Mat& weight, const Mat& bias, Mat& out_int8, const Mat& input_scale, const Mat& weight_scales, const float transform_scale, diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index f19b165f0461..c1865e211c73 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -289,18 +289,18 @@ static NCNN_FORCEINLINE float _mm256_reduce_add_ps(__m256 x) static NCNN_FORCEINLINE int32_t hsum_epi32_avx(__m128i x) { - __m128i hi64 = _mm_unpackhi_epi64(x, x); // 3-operand non-destructive AVX lets us save a byte without needing a movdqa + __m128i hi64 = _mm_unpackhi_epi64(x, x); // 3-operand non-destructive AVX lets us save a byte without needing a 
movdqa __m128i sum64 = _mm_add_epi32(hi64, x); - __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // Swap the low two elements + __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // Swap the low two elements __m128i sum32 = _mm_add_epi32(sum64, hi32); - return _mm_cvtsi128_si32(sum32); // movd + return _mm_cvtsi128_si32(sum32); // movd } static NCNN_FORCEINLINE int32_t _mm256_hsum_8x32_i(__m256i v) { - __m128i sum128 = _mm_add_epi32( - _mm256_castsi256_si128(v), - _mm256_extracti128_si256(v, 1)); // silly GCC uses a longer AXV512VL instruction if AVX512 is enabled :/ + __m128i sum128 = _mm_add_epi32( + _mm256_castsi256_si128(v), + _mm256_extracti128_si256(v, 1)); // silly GCC uses a longer AXV512VL instruction if AVX512 is enabled :/ return hsum_epi32_avx(sum128); } @@ -586,7 +586,7 @@ static NCNN_FORCEINLINE int32_t mul_add_reduce_no_align(const int8_t* a, const i #if __AVXVNNI__ || __AVX512VNNI__ align = (size >> 5) << 5; __m256i _sum = _mm256_setzero_si256(); - for (int i = 0; i < align; i+=32) + for (int i = 0; i < align; i += 32) { __m256i val0 = _mm256_lddqu_si256((const __m256i*)(a + i)); __m256i val1 = _mm256_lddqu_si256((const __m256i*)(b + i)); @@ -596,7 +596,7 @@ static NCNN_FORCEINLINE int32_t mul_add_reduce_no_align(const int8_t* a, const i #elif __AVX2__ align = (size >> 4) << 4; - __m256i _sum = _mm256_setzero_si256(); + __m256i _sum = _mm256_setzero_si256(); for (int i = 0; i < align; i += 16) { __m256i val0 = _mm256_cvtepi8_epi16(_mm_lddqu_si128((const __m128i*)(a + i)));
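As a closing illustration, a minimal sketch of how the int8 path added by this
series would be exercised from user code. This assumes the usual ncnn
Net/Extractor API; the file and blob names are placeholders, and a model
quantized with the updated ncnn2int8 tool is assumed:

#include "net.h"

int run_int8_mha(const ncnn::Mat& in)
{
    ncnn::Net net;
    net.opt.use_int8_inference = true; // selects the int8 forward paths added above
    net.opt.lightmode = true;

    if (net.load_param("model_int8.param"))
        return -1;
    if (net.load_model("model_int8.bin"))
        return -1;

    ncnn::Mat out;
    ncnn::Extractor ex = net.create_extractor();
    ex.input("input", in);   // placeholder blob names
    ex.extract("output", out);
    return 0;
}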