From f6b095324a74902cf0da2f6936879b213454ee8f Mon Sep 17 00:00:00 2001
From: POI-WX
Date: Wed, 29 May 2024 17:34:07 +0800
Subject: [PATCH 1/3] reimpl linear using opplugin

---
Review notes (free text, not part of the diff):
* Moves diopiLinear / diopiLinearBackward from the `ascend` list to the
  `ascend_npu` list in ascend_config.yaml and adds diopi_impl/linear.cpp.
* Forward: input and out are viewed as 2-D when their rank is > 2, then
  aclnnAddmm is issued when bias is defined, aclnnMm otherwise.
* Backward: gradInput = gradOutput2D x weight and
  gradWeight = gradOutput2D^T x input2D; gradBias is op_api::sum_out over the
  leading (outDim - biasDim) dims of gradOutput, or a plain copy_ when
  outDim is not greater than biasDim.
* NOTE(review): std::accumulate is seeded with the int literal `1`, so the
  row count is accumulated as int before being stored in int64_t;
  `int64_t{1}` as the seed would avoid the narrowing.
* NOTE(review): the `<int64_t>` element type of the std::vector declarations
  was missing from the copy of this series under review and is restored here
  as an assumption (tensor.sizes() elements, int64_t product) - confirm
  against the repository.
* NOTE(review): std::accumulate / std::iota / std::vector are used without a
  direct standard-library include; presumably they arrive via helper.hpp -
  confirm.

 impl/ascend_npu/ascend_config.yaml    |  4 +-
 impl/ascend_npu/diopi_impl/linear.cpp | 87 +++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 impl/ascend_npu/diopi_impl/linear.cpp

diff --git a/impl/ascend_npu/ascend_config.yaml b/impl/ascend_npu/ascend_config.yaml
index 91ad35e42..c72f43fe4 100755
--- a/impl/ascend_npu/ascend_config.yaml
+++ b/impl/ascend_npu/ascend_config.yaml
@@ -5,8 +5,6 @@ ascend:
 - diopiGather
 - diopiLayerNorm
 - diopiLayerNormBackward
-- diopiLinear
-- diopiLinearBackward
 - diopiNormalScalarTensor
 - diopiNormalTensor
 - diopiNormalTensorScalar
@@ -156,6 +154,8 @@ ascend_npu:
 - diopiLeInpScalar
 - diopiLeScalar
 - diopiLinalgVecNorm
+- diopiLinear
+- diopiLinearBackward
 - diopiLt
 - diopiLtInp
 - diopiLtInpScalar
diff --git a/impl/ascend_npu/diopi_impl/linear.cpp b/impl/ascend_npu/diopi_impl/linear.cpp
new file mode 100644
index 000000000..0f84b98b4
--- /dev/null
+++ b/impl/ascend_npu/diopi_impl/linear.cpp
@@ -0,0 +1,87 @@
+/**
+ * @file
+ * @author DeepLink
+ * @copyright (c) 2024, DeepLink.
+ */
+
+#include "helper.hpp"
+#include "op_plugin/OpApiInterface.h"
+#include "op_plugin/utils/op_api_common.h"
+
+namespace {
+
+at::Tensor transTensorTo2D(const at::Tensor& tensor) {
+    std::vector<int64_t> dims;
+    std::vector<int64_t> shape(tensor.sizes().begin(), tensor.sizes().end());
+    int64_t product = std::accumulate(shape.begin(), shape.end() - 1, 1, std::multiplies<>());
+    dims = {product, shape.back()};
+    return impl::aten::viewStorage(tensor, dims);
+}
+}  // namespace
+
+namespace OP_IMPL_NS {
+
+diopiError_t diopiLinear(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight,
+                         diopiConstTensorHandle_t bias) {
+    BEGIN_CALL_ACL_OP(input, weight, bias, out);
+
+    at::Tensor inputAt2D = inputAt;
+    at::Tensor outAt2D = outAt;
+    if (inputAt.dim() > 2) {
+        inputAt2D = transTensorTo2D(inputAt);
+    }
+    if (outAt.dim() > 2) {
+        outAt2D = transTensorTo2D(outAt);
+    }
+
+    at::Tensor weightAtT = weightAt.t();
+    int8_t cubeMathType = at_npu::native::OpPreparation::get_cube_math_type(at_npu::native::env::IsAllowMatmulHF32());
+    if (biasAt.defined()) {
+        const at::Scalar beta = 1;
+        const at::Scalar alpha = 1;
+        EXEC_NPU_CMD(aclnnAddmm, biasAt, inputAt2D, weightAtT, beta, alpha, outAt2D, cubeMathType);
+    } else {
+        EXEC_NPU_CMD(aclnnMm, inputAt2D, weightAtT, outAt2D, cubeMathType);
+    }
+    END_CALL_ACL_OP();
+}
+
+diopiError_t diopiLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias,
+                                 diopiConstTensorHandle_t gradOutput, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight) {
+    BEGIN_CALL_ACL_OP(input, weight, gradOutput, gradInput, gradWeight, gradBias);
+    at::Tensor gradOutputAt2D = gradOutputAt;
+    if (gradOutputAt.dim() > 2) {
+        gradOutputAt2D = transTensorTo2D(gradOutputAt);
+    }
+
+    at::Tensor gradInputAt2D = gradInputAt;
+    if (gradInputAt.dim() > 2) {
+        gradInputAt2D = transTensorTo2D(gradInputAt);
+    }
+
+    at::Tensor inputAt2D = inputAt;
+    if (inputAt.dim() > 2) {
+        inputAt2D = transTensorTo2D(inputAt);
+    }
+
+    int8_t cubeMathType = at_npu::native::OpPreparation::get_cube_math_type(at_npu::native::env::IsAllowMatmulHF32());
+    EXEC_NPU_CMD(aclnnMm, gradOutputAt2D, weightAt, gradInputAt2D, cubeMathType);
+
+    at::Tensor gradOutputAt2DT = gradOutputAt2D.t();
+    EXEC_NPU_CMD(aclnnMm, gradOutputAt2DT, inputAt2D, gradWeightAt, cubeMathType);
+
+    if (gradBiasAt.defined()) {
+        auto outDim = gradOutputAt.dim();
+        auto biasDim = gradBiasAt.dim();
+        if (outDim > biasDim) {
+            std::vector<int64_t> sumDims(outDim - biasDim);
+            std::iota(sumDims.begin(), sumDims.end(), 0);
+            op_api::sum_out(gradOutputAt, sumDims, false, gradBiasAt.scalar_type(), gradBiasAt);
+        } else {
+            gradBiasAt.copy_(gradOutputAt);
+        }
+    }
+    END_CALL_ACL_OP();
+}
+
+}  // namespace OP_IMPL_NS

From 2c5055914b83f5d4b6d2646aec9440a9d8544de5 Mon Sep 17 00:00:00 2001
From: POI-WX
Date: Wed, 29 May 2024 18:21:24 +0800
Subject: [PATCH 2/3] update

---
Review notes (free text, not part of the diff):
* Drops the op_plugin/OpApiInterface.h include; the gradBias reduction now
  goes through EXEC_NPU_CMD(aclnnReduceSum, ...) instead of op_api::sum_out.
* Replaces the copy-then-if rank checks with conditional expressions and
  additionally views weight (forward and backward) and gradWeight as 2-D
  when their rank is > 2.
* NOTE(review): the `outDim > biasDim` guard and its copy_ fallback are
  removed, so sumDims is sized by (outDim - biasDim) unconditionally. It is
  empty when the ranks are equal and the size is negative when biasDim is
  larger - confirm that callers exclude these cases and how aclnnReduceSum
  treats an empty dim list.

 impl/ascend_npu/diopi_impl/linear.cpp | 63 ++++++++++-----------------
 1 file changed, 24 insertions(+), 39 deletions(-)

diff --git a/impl/ascend_npu/diopi_impl/linear.cpp b/impl/ascend_npu/diopi_impl/linear.cpp
index 0f84b98b4..32b3583e0 100644
--- a/impl/ascend_npu/diopi_impl/linear.cpp
+++ b/impl/ascend_npu/diopi_impl/linear.cpp
@@ -5,16 +5,14 @@
  */
 
 #include "helper.hpp"
-#include "op_plugin/OpApiInterface.h"
 #include "op_plugin/utils/op_api_common.h"
 
 namespace {
 
 at::Tensor transTensorTo2D(const at::Tensor& tensor) {
-    std::vector<int64_t> dims;
     std::vector<int64_t> shape(tensor.sizes().begin(), tensor.sizes().end());
     int64_t product = std::accumulate(shape.begin(), shape.end() - 1, 1, std::multiplies<>());
-    dims = {product, shape.back()};
+    std::vector<int64_t> dims = {product, shape.back()};
     return impl::aten::viewStorage(tensor, dims);
 }
 }  // namespace
@@ -25,62 +23,49 @@ diopiError_t diopiLinear(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop
                          diopiConstTensorHandle_t bias) {
     BEGIN_CALL_ACL_OP(input, weight, bias, out);
 
-    at::Tensor inputAt2D = inputAt;
-    at::Tensor outAt2D = outAt;
-    if (inputAt.dim() > 2) {
-        inputAt2D = transTensorTo2D(inputAt);
-    }
-    if (outAt.dim() > 2) {
-        outAt2D = transTensorTo2D(outAt);
-    }
-
-    at::Tensor weightAtT = weightAt.t();
+    at::Tensor inputAt2D = (inputAt.dim() > 2) ? transTensorTo2D(inputAt) : inputAt;
+    at::Tensor outAt2D = (outAt.dim() > 2) ? transTensorTo2D(outAt) : outAt;
+    at::Tensor weightAt2D = (weightAt.dim() > 2) ? transTensorTo2D(weightAt) : weightAt;
+    at::Tensor weightAt2DT = weightAt2D.t();
     int8_t cubeMathType = at_npu::native::OpPreparation::get_cube_math_type(at_npu::native::env::IsAllowMatmulHF32());
     if (biasAt.defined()) {
-        const at::Scalar beta = 1;
-        const at::Scalar alpha = 1;
-        EXEC_NPU_CMD(aclnnAddmm, biasAt, inputAt2D, weightAtT, beta, alpha, outAt2D, cubeMathType);
+        at::Scalar beta = 1;
+        at::Scalar alpha = 1;
+        EXEC_NPU_CMD(aclnnAddmm, biasAt, inputAt2D, weightAt2DT, beta, alpha, outAt2D, cubeMathType);
     } else {
-        EXEC_NPU_CMD(aclnnMm, inputAt2D, weightAtT, outAt2D, cubeMathType);
+        EXEC_NPU_CMD(aclnnMm, inputAt2D, weightAt2DT, outAt2D, cubeMathType);
     }
+
     END_CALL_ACL_OP();
 }
 
 diopiError_t diopiLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias,
                                  diopiConstTensorHandle_t gradOutput, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight) {
     BEGIN_CALL_ACL_OP(input, weight, gradOutput, gradInput, gradWeight, gradBias);
-    at::Tensor gradOutputAt2D = gradOutputAt;
-    if (gradOutputAt.dim() > 2) {
-        gradOutputAt2D = transTensorTo2D(gradOutputAt);
-    }
 
-    at::Tensor gradInputAt2D = gradInputAt;
-    if (gradInputAt.dim() > 2) {
-        gradInputAt2D = transTensorTo2D(gradInputAt);
-    }
-
-    at::Tensor inputAt2D = inputAt;
-    if (inputAt.dim() > 2) {
-        inputAt2D = transTensorTo2D(inputAt);
-    }
+    at::Tensor inputAt2D = (inputAt.dim() > 2) ? transTensorTo2D(inputAt) : inputAt;
+    at::Tensor weightAt2D = (weightAt.dim() > 2) ? transTensorTo2D(weightAt) : weightAt;
+    at::Tensor gradOutputAt2D = (gradOutputAt.dim() > 2) ? transTensorTo2D(gradOutputAt) : gradOutputAt;
+    at::Tensor gradInputAt2D = (gradInputAt.dim() > 2) ? transTensorTo2D(gradInputAt) : gradInputAt;
+    at::Tensor gradWeightAt2D = (gradWeightAt.dim() > 2) ? transTensorTo2D(gradWeightAt) : gradWeightAt;
 
     int8_t cubeMathType = at_npu::native::OpPreparation::get_cube_math_type(at_npu::native::env::IsAllowMatmulHF32());
-    EXEC_NPU_CMD(aclnnMm, gradOutputAt2D, weightAt, gradInputAt2D, cubeMathType);
+    EXEC_NPU_CMD(aclnnMm, gradOutputAt2D, weightAt2D, gradInputAt2D, cubeMathType);
 
     at::Tensor gradOutputAt2DT = gradOutputAt2D.t();
-    EXEC_NPU_CMD(aclnnMm, gradOutputAt2DT, inputAt2D, gradWeightAt, cubeMathType);
+    EXEC_NPU_CMD(aclnnMm, gradOutputAt2DT, inputAt2D, gradWeightAt2D, cubeMathType);
 
     if (gradBiasAt.defined()) {
         auto outDim = gradOutputAt.dim();
         auto biasDim = gradBiasAt.dim();
-        if (outDim > biasDim) {
-            std::vector<int64_t> sumDims(outDim - biasDim);
-            std::iota(sumDims.begin(), sumDims.end(), 0);
-            op_api::sum_out(gradOutputAt, sumDims, false, gradBiasAt.scalar_type(), gradBiasAt);
-        } else {
-            gradBiasAt.copy_(gradOutputAt);
-        }
+        std::vector<int64_t> sumDims(outDim - biasDim);
+        std::iota(sumDims.begin(), sumDims.end(), 0);
+        bool keepDim = false;
+        auto dtype = gradBiasAt.scalar_type();
+        at::IntArrayRef sumDimsArrayRef(sumDims);
+        EXEC_NPU_CMD(aclnnReduceSum, gradOutputAt, sumDimsArrayRef, keepDim, dtype, gradBiasAt);
     }
+
     END_CALL_ACL_OP();
 }
 

From 7785a31a8ba010b885d7da9909da38a56c0fbbfa Mon Sep 17 00:00:00 2001
From: POI-WX
Date: Thu, 30 May 2024 12:28:59 +0800
Subject: [PATCH 3/3] update

---
Review notes (free text, not part of the diff):
* Forward: aclnnAddmm / aclnnMm are replaced by an unconditional aclnnMatmul
  into the 2-D view of out, followed by aclnnInplaceAdd of bias onto out
  (alpha = 1) when bias is defined.
* Backward: gradInput is now produced by aclnnMatmul on the unflattened
  gradOutput, weight and gradInput; gradWeight still uses the 2-D views of
  gradOutput^T and input.
* NOTE(review): gradInput and gradWeight are used without a defined() check,
  unlike gradBias - confirm that callers always supply both.

 impl/ascend_npu/diopi_impl/linear.cpp | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/impl/ascend_npu/diopi_impl/linear.cpp b/impl/ascend_npu/diopi_impl/linear.cpp
index 32b3583e0..098d150c0 100644
--- a/impl/ascend_npu/diopi_impl/linear.cpp
+++ b/impl/ascend_npu/diopi_impl/linear.cpp
@@ -27,13 +27,13 @@ diopiError_t diopiLinear(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop
     at::Tensor outAt2D = (outAt.dim() > 2) ? transTensorTo2D(outAt) : outAt;
     at::Tensor weightAt2D = (weightAt.dim() > 2) ? transTensorTo2D(weightAt) : weightAt;
     at::Tensor weightAt2DT = weightAt2D.t();
+
     int8_t cubeMathType = at_npu::native::OpPreparation::get_cube_math_type(at_npu::native::env::IsAllowMatmulHF32());
+    EXEC_NPU_CMD(aclnnMatmul, inputAt2D, weightAt2DT, outAt2D, cubeMathType);
+
     if (biasAt.defined()) {
-        at::Scalar beta = 1;
         at::Scalar alpha = 1;
-        EXEC_NPU_CMD(aclnnAddmm, biasAt, inputAt2D, weightAt2DT, beta, alpha, outAt2D, cubeMathType);
-    } else {
-        EXEC_NPU_CMD(aclnnMm, inputAt2D, weightAt2DT, outAt2D, cubeMathType);
+        EXEC_NPU_CMD(aclnnInplaceAdd, outAt, biasAt, alpha);
     }
 
     END_CALL_ACL_OP();
@@ -43,17 +43,14 @@ diopiError_t diopiLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g
                                  diopiConstTensorHandle_t gradOutput, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight) {
     BEGIN_CALL_ACL_OP(input, weight, gradOutput, gradInput, gradWeight, gradBias);
 
+    int8_t cubeMathType = at_npu::native::OpPreparation::get_cube_math_type(at_npu::native::env::IsAllowMatmulHF32());
+    EXEC_NPU_CMD(aclnnMatmul, gradOutputAt, weightAt, gradInputAt, cubeMathType);
+
     at::Tensor inputAt2D = (inputAt.dim() > 2) ? transTensorTo2D(inputAt) : inputAt;
-    at::Tensor weightAt2D = (weightAt.dim() > 2) ? transTensorTo2D(weightAt) : weightAt;
     at::Tensor gradOutputAt2D = (gradOutputAt.dim() > 2) ? transTensorTo2D(gradOutputAt) : gradOutputAt;
-    at::Tensor gradInputAt2D = (gradInputAt.dim() > 2) ? transTensorTo2D(gradInputAt) : gradInputAt;
     at::Tensor gradWeightAt2D = (gradWeightAt.dim() > 2) ? transTensorTo2D(gradWeightAt) : gradWeightAt;
-
-    int8_t cubeMathType = at_npu::native::OpPreparation::get_cube_math_type(at_npu::native::env::IsAllowMatmulHF32());
-    EXEC_NPU_CMD(aclnnMm, gradOutputAt2D, weightAt2D, gradInputAt2D, cubeMathType);
-
     at::Tensor gradOutputAt2DT = gradOutputAt2D.t();
-    EXEC_NPU_CMD(aclnnMm, gradOutputAt2DT, inputAt2D, gradWeightAt2D, cubeMathType);
+    EXEC_NPU_CMD(aclnnMatmul, gradOutputAt2DT, inputAt2D, gradWeightAt2D, cubeMathType);
 
     if (gradBiasAt.defined()) {
         auto outDim = gradOutputAt.dim();