diff --git a/docs/toolchain/appendix/operators.md b/docs/toolchain/appendix/operators.md index e2c62cd..be53deb 100644 --- a/docs/toolchain/appendix/operators.md +++ b/docs/toolchain/appendix/operators.md @@ -4,105 +4,264 @@ Table below shows the list of operators supports base on ONNX operators. *The operators NPU supports* -| Node | 520 | 720 | 530 | 630 | 730 | -| ----------------------------- | --- | --- | --- | --- |-----| -| Add | Y | Y | Y | Y | Y | -| AveragePool1 | Y | Y | Y | Y | Y | -| BatchNormalization | Y | Y | Y | Y | Y | -| BitShift | N | N | N | N | N | -| Cast | N | N | N | N | N | -| Clip2 | Y | Y | Y | Y | Y | -| Concat | Y | Y | Y | Y | Y | -| Conv | Y | Y | Y | Y | Y | -| ConvTranspose3 | N | Y | Y | Y | Y | -| DepthToSpace4 | N | N | Y | Y | Y | -| Div | N | N | N | N | Y | -| Dropout | Y | Y | Y | Y | Y | -| Elu | N | N | Y | Y | Y | -| Erf | N | N | Y | Y | Y | -| Exp | N | N | Y | Y | Y | -| Expand5 | N | N | Y | Y | Y | -| Flatten6 | Y | Y | Y | Y | Y | -| Floor | N | N | N | N | N | -| Gather | N | N | N | N | N | -| GatherElements | N | N | N | N | N | -| GatherND | N | N | N | N | N | -| Gemm | Y | Y | Y | Y | Y | -| GlobalAveragePool7 | Y | Y | Y | Y | Y | -| GlobalLpPool | N | N | N | N | N | -| GlobalMaxPool | Y | Y | Y | Y | Y | -| GRU | N | Y | Y | Y | Y | -| Hardmax | N | N | N | N | N | -| HardSigmoid | Y | Y | Y | Y | Y | -| InstanceNormalization | N | N | N | N | N | -| LeakyRelu | Y | Y | Y | Y | Y | -| LpNormalization | N | N | N | N | N | -| LRN | N | N | N | N | N | -| LSTM | N | N | N | N | Y | -| MatMul | N | Y | Y | Y | Y | -| MaxPool8 | Y | Y | Y | Y | Y | -| MaxRoiPool | N | Y | Y | Y | Y | -| MaxUnpool | N | N | N | N | N | -| Mean | Y | Y | Y | Y | Y | -| Min | N | N | N | N | N | -| Mod | N | N | N | N | N | -| Mul | N | Y | Y | Y | Y | -| Multinomial | N | N | N | N | N | -| Neg | Y | Y | Y | Y | Y | -| NonMaxSuppression | N | N | N | N | N | -| NonZero | N | N | N | N | N | -| Not | N | N | N | N | N | -| OneHot 
| N | N | N | N | N | -| Or | N | N | N | N | N | -| Pad9 | Y | Y | Y | Y | Y | -| Pow10 | N | Y | Y | Y | Y | -| PRelu | Y | Y | Y | Y | Y | -| RandomUniformLike | N | N | N | N | N | -| Reciprocal | N | Y | Y | Y | Y | -| ReduceLogSum | N | N | N | N | N | -| ReduceLogSumExp | N | N | N | N | N | -| ReduceMax | N | N | N | N | Y | -| ReduceMean11 | Y | Y | Y | Y | Y | -| ReduceSum | Y | Y | Y | Y | Y | -| ReduceSumSquare | N | Y | Y | Y | Y | -| Relu | Y | Y | Y | Y | Y | -| Reshape | N | Y | Y | Y | Y | -| Resize12 | N | Y | Y | Y | Y | -| RNN | N | Y | Y | Y | Y | -| RoiAlign | N | Y | Y | Y | Y | -| Selu | N | N | N | N | N | -| Shrink | N | N | N | N | N | -| Sigmoid | N | Y | Y | Y | Y | -| Slice | N | Y | Y | Y | Y | -| Softmax | N | N | N | N | Y | -| Softplus | N | N | Y | Y | Y | -| Softsign | N | N | Y | Y | Y | -| SpaceToDepth13 | N | N | Y | Y | Y | -| Split | Y | Y | Y | Y | Y | -| Squeeze | N | N | N | N | N | -| Sub | Y | Y | Y | Y | Y | -| Sum | Y | Y | Y | Y | Y | -| Tanh | N | Y | Y | Y | Y | -| ThresholdedRelu | N | N | N | N | N | -| Tile | N | N | N | N | N | -| Transpose | N | Y | Y | Y | Y | -| Unsqueeze | N | N | N | N | N | -| Upsample14 | N | Y | Y | Y | Y | +| Node | 520 | 720 | 530 | 630 | 730 | +| -------------------------- | ------------------ | ------------------ | ------------------ | ------------------ | ------------------ | +| Abs | N | N | N | N | Y | +| Acos | N | N | N | N | N | +| Acosh | N | N | N | N | N | +| Add | Y1 | Y1 | Y1 | Y1 | Y | +| And | N | N | N | N | N | +| ArgMax | N | N | N | N | N | +| ArgMin | N | N | N | N | N | +| Asin | N | N | N | N | N | +| Asinh | N | N | N | N | N | +| Atan | N | N | N | N | N | +| Atanh | N | N | N | N | N | +| AveragePool | Y2 | Y3 | Y3 | Y3 | Y4 | +| BatchNormalization | Y1 | Y1 | Y1 | Y1 | Y1 | +| Bernoulli | N | N | N | N | N | +| BitShift | N | N | N | N | N | +| BitwiseAnd | N | N | N | N | N | +| BitwiseNot | N | N | N | N | N | +| BitwiseOr | N | N | N | N | N | +| 
BitwiseXor | N | N | N | N | N | +| BlackmanWindow | N | N | N | N | N | +| CastLike | N | N | N | N | N | +| Cast | N | N | N | N | N | +| Ceil | N | N | N | N | N | +| Celu | N | N | N | N | N | +| CenterCropPad | N | N | N | N | N | +| Clip | Y5 | Y5 | Y5 | Y5 | Y5 | +| Col2lm | N | N | N | N | N | +| Compress | N | N | N | N | N | +| ConcatFromSequence | N | N | N | N | N | +| Concat | Y1 | Y1 | Y1 | Y1 | Y | +| ConvInteger | N | N | N | N | N | +| Conv | Y6 | Y7 | Y8 | Y8 | Y8 | +| ConvTranspose | N | Y9 | Y10 | Y10 | Y | +| Cos | N | N | N | N | N | +| Cosh | N | N | N | N | N | +| CumSum | N | N | N | N | N | +| DFT | N | N | N | N | N | +| DepthToSpace | Y11 | Y11 | Y12 | Y12 | Y12 | +| DequantizeLinear | N | N | N | N | N | +| Det | N | N | N | N | N | +| Div | N | N | Y | Y | Y | +| Dropout | N | N | N | N | N | +| DynamicQuantizeLinear | N | N | N | N | N | +| Einsum | N | N | N | N | N | +| Elu | N | N | Y | Y | Y | +| Equal | N | N | N | N | N | +| Erf | N | N | Y | Y | Y | +| Exp | N | Y14 | Y | Y | Y | +| Expand | N | Y15 | Y16 | Y16 | Y16 | +| EyeLike | N | N | N | N | N | +| Flatten | N | Y17 | Y17 | Y17 | Y17 | +| Floor | N | N | N | N | N | +| GRU | N | N | N | N | N | +| GatherElements | N | N | N | N | N | +| GatherND | N | N | N | N | N | +| Gather | Y18 | Y18 | Y18 | Y18 | Y18 | +| Gemm | Y | Y | Y | Y | Y | +| GlobalAveragePool | Y19 | Y20 | Y20 | Y20 | Y20 | +| GlobalLpPool | N | N | N | N | N | +| GlobalMaxPool | Y21 | Y | Y | Y | Y | +| Greater | N | N | N | N | N | +| GreaterOrEqual | N | N | N | N | N | +| GridSample | N | N | N | N | N | +| GroupNormalization | N | N | N | N | N | +| HammingWindow | N | N | N | N | N | +| HannWindow | N | N | N | N | N | +| HardSigmoid | N | N | Y | Y | Y | +| HardSwish | N | N | N | N | N | +| Hardmax | N | N | N | N | N | +| Identity | N | N | N | N | N | +| InstanceNormalization | N | N | N | N | Y | +| IsInf | N | N | N | N | N | +| IsNaN | N | N | N | N | N | +| LRN | N | N | N | N | N | +| LSTM | 
N | N | N | N | N | +| LayerNormalization | N | N | N | N | Y | +| LeakyRelu | Y | Y | Y | Y | Y | +| Less | N | N | N | N | N | +| LessOrEqual | N | N | N | N | N | +| Log | N | N | N | N | Y | +| LogSoftmax | N | N | N | N | N | +| Loop | N | N | N | N | Y | +| LpNormalization | N | N | N | N | N | +| LpPool | N | N | N | N | N | +| MatMulInteger | N | N | N | N | N | +| MatMul | N | Y22 | Y22 | Y22 | Y23 | +| Max | N | N | N | N | Y | +| MaxPool | Y24 | Y24 | Y25 | Y25 | Y25 | +| MaxRoiPool | N | Y | Y | Y | Y | +| MaxUnpool | N | N | N | N | N | +| Mean | N | N | N | N | N | +| MeanVarianceNormalization | N | N | N | N | N | +| MelWeightMatrix | N | N | N | N | N | +| Min | N | N | N | N | N | +| Mish | N | N | N | N | N | +| Mod | N | N | N | N | N | +| Mul | N | Y1 | Y1 | Y1 | Y | +| Multinomial | N | N | N | N | N | +| Neg | Y | Y | Y | Y | Y | +| NegativeLogLikelihoodLoss | N | N | N | N | N | +| NonMaxSuppression | N | N | N | N | N | +| NonZero | N | N | N | N | N | +| Not | N | N | N | N | N | +| OneHot | N | N | N | N | N | +| Or | N | N | N | N | N | +| PRelu | Y | Y | Y | Y | Y | +| Pad | Y26 | Y26 | Y26 | Y26 | Y27 | +| Pow | N | Y29 | Y29 | Y29 | Y29 | +| QLinearConv | N | N | N | N | N | +| QLinearMatMul | N | N | N | N | N | +| QuantizeLinear | N | N | N | N | N | +| RandomNormalLike | N | N | N | N | N | +| RandomNormal | N | N | N | N | N | +| RandomUniformLike | N | N | N | N | N | +| RandomUniform | N | N | N | N | N | +| Range | N | N | N | N | N | +| Reciprocal | N | N | Y | Y | Y | +| ReduceL1 | N | N | N | N | N | +| ReduceL2 | N | N | N | N | N | +| ReduceLogSumExp | N | N | N | N | N | +| ReduceLogSum | N | N | N | N | N | +| ReduceMax | Y30 | Y30 | Y30 | Y30 | Y30 | +| ReduceMean | N | Y31 | Y31 | Y31 | Y31 | +| ReduceMin | N | N | N | N | Y | +| ReduceProd | N | N | N | N | N | +| ReduceSum | Y32 | Y31 | Y31 | Y31 | Y | +| ReduceSumSquare | N | N | N | N | N | +| Relu | Y | Y | Y | Y | Y | +| Reshape | N | Y | Y | Y | Y | +| Resize | 
Y34 | Y35 | Y36 | Y36 | Y36 | +| ReverseSequence | N | N | N | N | N | +| RNN | N | N | N | N | N | +| RoiAlign | N | N | N | N | N | +| Round | N | N | N | N | N | +| STFT | N | N | N | N | N | +| ScatterElements | N | N | N | N | N | +| ScatterND | N | N | N | N | N | +| Scatter | N | N | N | N | N | +| Selu | N | N | N | N | N | +| SequenceAt | N | N | N | N | N | +| SequenceConstruct | N | N | N | N | N | +| SequenceEmpty | N | N | N | N | N | +| SequenceErase | N | N | N | N | N | +| SequenceInsert | N | N | N | N | N | +| SequenceLength | N | N | N | N | N | +| Shape | N | N | N | N | N | +| Shrink | N | N | N | N | N | +| Sigmoid | N | Y | Y | Y | Y | +| Sign | N | N | N | N | Y | +| Sin | N | N | N | N | N | +| Sinh | N | N | N | N | N | +| Size | N | N | N | N | N | +| SliceHeader | N | N | N | N | N | +| Slice | N | Y37 | Y37 | Y37 | Y38 | +| SliceTail | N | N | N | N | N | +| SoftmaxCrossEntropyLoss | N | N | N | N | N | +| Softmax | N | N39 | Y40 | Y40 | Y40 | +| Softplus | N | N | N | N | N | +| Softsign | N | N | N | N | N | +| SpaceToDepth | N | N | Y41 | Y41 | Y41 | +| Split | N | N | N | N | Y | +| Sqrt | N | N | Y | Y | Y | +| Squeeze | N | N | N | N | N | +| Sub | Y | Y | Y | Y | Y | +| Sum | N | N | N | N | N | +| Tan | N | N | N | N | N | +| Tanh | N | Y | Y | Y | Y | +| ThresholdedRelu | N | N | N | N | N | +| Tile | N | N | N | N | N | +| TopK | N | N | N | N | N | +| Transpose | N | Y42 | Y43 | Y43 | Y | +| Trilu | N | N | N | N | N | +| Unique | N | N | N | N | N | +| Unsqueeze | N | N | N | N | N | +| Upsample | Y44 | Y45 | Y46 | Y46 | Y46 | +| Where | N | N | N | N | N | +| Xor | N | N | N | N | N | Notes: -1. For AveragePool kernel size, 520 and 720 support square kernel up to 3x3. while 530 and 630 also support non-square kernel up to 3x3. -2. All hardware only support Clip with min set to 0. -3. 720 only supports ConvTranspose with stride set to 2. -4. 530 and 630 only support DepthToSpace with blocksize set to 2 or 4. -5. 
530 and 630 only support Expand on channel or column and row. -6. 520 and 720 only support Flatten before Gemm. -7. For GlobalAveragePool, 520 and 530 support up to 524888 pixels (input and output together, 8 bit, same for the platforms following). 630 supports up to 262144 pixels. 720 supports up to 1048576 pixels. -8. For MaxPool kernel size, 520 and 720 support square kernel up to 3x3. while 530 and 630 also support non-square kernel up to 3x3. -9. NPUs only support constant pad mode and constant value set to 0. -10. NPUs only support power set to 2. -11. NPUs only support ReduceMean nodes that behave the same as GlobalAveragePool. And it has the same limitation, too. -12. 720 only supports Resize nodes which work as upsampleing. Same limitation as the Upsample. -13. 530 and 630 only support SpaceToDepth with blocksize set to 2 or 4. -14. 720 only supports Upsample with mode set to bilinear or nearest. -15. LSTM, GRU and RNN currently are only supported in onnx opset 13 after using ONNX Converter conversion. +1. Conditions: rank <= 4D +2. AveragePool 520 conditions: + - (ceil_mode=0, count_include_pad=0, kernel is nxn, stride is nxn where n is power of 2 and n > 3) or + - (ceil_mode=0, dilations={1,1}, kernel = 1 and stride > 1) or + - (ceil_mode=0, count_include_pad=0, stride is sxs where s > 3) or + - (2D pool, dilation == 1, kernel is kxk & stride is sxs where k <= 3 and s <= k) +3. AveragePool 720/530/630 conditions: + - (ceil_mode=0, count_include_pad=0, kernel is nxn, stride is nxn where n is power of 2 and n > 3) or + - (ceil_mode=0, dilations={1,1}, kernel = 1 and stride > 1) or + - (ceil_mode=0, count_include_pad=0, stride is sxs where s > 3) or + - (2D pool, dilation == 1, kernel is kxk & stride is sxs where k and s <= 3 or kernel_w & stride_w are 1 and kernel_h & stride_h <= 3 or kernel_h & stride_h are 1 and kernel_w = stride_w <= 3) +4. 
AveragePool 730 conditions: + - (ceil_mode=0, count_include_pad=0, kernel is nxn and stride is nxn where n is power of 2 and n > 3) or + - (ceil_mode=0, count_include_pad=0, dilation = 1, kernel = 1 and stride > 1) or + - (ceil_mode=0, count_include_pad=0, stride is sxs and s > 3) or + - (1D/2D pool, dilation == 1, + - kernel is kxk & stride is sxs where k and s <= 3 or + - kernel is kx1 & stride is sx1 where k and s <= 3 or + - kernel is 1xn & stride is 1xn where n <= 3) +5. conditions: min = 0 && max >= 0 +6. conditions: rank <= 4 && kernel <= 12 && stride_w <= 16 && stride_h <= 4 +7. conditions: rank <= 4 && stride_w <= 16 && stride_h <= 4 +8. conditions: rank <= 4 +9. condition: stride is sxs where s = 2 +10. condition: stride is sxs +11. decompose condition: in_shape = 1x4x?x? && out_shape = 1x1x?x? && mode = CRD && blocksize = 2 +12. conditions: blocksize = 2 or 4 +14. decompose to constant + log2 + mul + pow2 +15. conditions: expand on column or row +16. conditions: expand on channel or column or row +17. conditions: axis = 1 +18. conditions: single index +19. conditions: rank <= 4 && row * col <= 256 +20. conditions: rank <= 4 && row * col <= 16384 +21. conditions: row > 3 +22. conditions: + - if second input is const + - const input shape must be WxV or 1x1xWxV + - else + - rank = 4 +23. conditions: + - if second input is const + - const input shape must be WxV or 1x1xWxV + - else + - 3 <= rank <= 5 +24. MaxPool conditions: + - (ceil_mode=0, dilations={1,1}, kernel = 1 and stride > 1) or + - (ceil_mode=0, dilations={1,1}, kernel_h=kernel_w=stride_h=stride_w=K where K is power of 2 and K > 3) or + - (ceil_mode=0, kernel > 3) or + - (2D pool, dilation == 1, kernel is kxk & stride is sxs where 2 <= k <= 3 and s <= k) +25. 
MaxPool conditions: + - (ceil_mode=0, dilations={1,1}, kernel = 1 and stride > 1) or + - (ceil_mode=0, dilations={1,1}, kernel_h=kernel_w=stride_h=stride_w=K where K is power of 2 and K > 3) or + - (ceil_mode=0, kernel > 3) or + - (2D pool, + - dilation == 1, + - kernel is kxk & stride is sxs where k and s <= 3 or + - kernel_w & stride_w are 1 and kernel_h & stride_h <= 3 or + - kernel_h & stride_h are 1 and kernel_w = stride_w <= 3) +26. conditions: not pad in batch && any pad in spatial < 32 && constant mode with 0 const_val +27. conditions: not pad in batch && any of pad < 32 && constant mode with 0 const_val +29. conditions: power is 2 +30. conditions: keepdims = 1 +31. conditions: keepdims = 1 && reduce not in batch +32. conditions: keepdims = 1 && reduce in ch +34. conditions: mode != cubic && extrapolation_value is 0 && rank is 4 && phase_init is {0,0} && nearest_mode is floor if mode is nearest && coordinate_transformation_mode != tf_crop_and_resize +35. conditions: mode != cubic && extrapolation_value is 0 && rank is 4 && not both vus_en and hus_en enabled && phase_init_v >= 0 and delta_v <= 1 if vus_en enabled && phase_init_h >= 0 and delta_h <= 1 if hus_en enabled +36. conditions: mode != cubic && extrapolation_value is 0 +37. conditions: rank <= 4 && all of steps are 1 +38. conditions: all of steps are 1 +39. will be decomposed to ReduceSum + Div + Exp +40. will be decomposed to ReduceSum + Div + Neg + Add + Exp +41. conditions: blocksize is 2 or 4 +42. conditions: row_col_transpose && ch_row_transpose +43. conditions: transpose not in batch +44. conditions: rank is 4 && upsample in spatial && mode is nearest or linear or bilinear or align_corner +45. conditions: rank is 4 && upsample in row or column but not both && mode is nearest or linear or bilinear or align_corner +46. 
conditions: rank is 4 && upsample in spatial && mode is nearest or linear or bilinear or align_corner + diff --git a/docs/toolchain/appendix/yolo_example.md b/docs/toolchain/appendix/yolo_example.md index bdeb8d2..4fe8175 100644 --- a/docs/toolchain/appendix/yolo_example.md +++ b/docs/toolchain/appendix/yolo_example.md @@ -4,6 +4,27 @@ In this document, we provide a step by step example on how to utilize our tools > This document is writen for toolchain v0.30.0. If any description is not consistent with the latest toolchain, please refer to the main toolchain manual. +## Tricks for deploying yolo-type detection models and anchor-based detection models + +1. Set quantization config (for 730 only) + 1. input 8bit + 2. const input 16bit + 3. output 8bit + 4. weight mixlight or 8bit + 5. data mixlight or mixbalance + +2. Model structure + 1. An anchor-based detection model has the following outputs + 1. class scores with shape 1x class # x pixel # at scale 0, 1x class # x pixel # at scale 1, ..., 1x class # x pixel # at scale S. + 2. bbox coordinates with shape 1x4xpixel # at scale 0, 1x4xpixel # at scale 1, ..., 1x4xpixel # at scale S + 3. Trick: Do NOT concat class scores at different scales. Output class scores for each scale separately. + 4. Trick: Do NOT concat class scores and coordinates at the same scale. Output class scores and bbox coordinates separately. + 5. Trick: Do NOT concat bbox coordinates at different scales. Output bbox coordinates for each scale separately. + 6. Trick: Typically, class scores need to pass through activation functions such as exp, sigmoid or even softmax. Make sure these activation functions are in the model so that the quantization algorithm can optimize the quantization setting accordingly. + 7. Trick: Sometimes, bbox coordinates need to pass through an exp function or another activation function. Make sure these activation functions are in the model so that the quantization algorithm can optimize the quantization setting accordingly. + 8. 
Trick: Do NOT concat some outputs and then split them in the model. Make sure the computation of each of these outputs is separate. If these computations need to use the same op, the quantization algorithm can detect this situation and share the weights of the same op. + + ## Step 0: Prepare environment and data We need to download the latest toolchain docker image which contains all the tools we need.